In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error

In [2]:
import sys
# NOTE(review): absolute, machine-specific path to a local treatlearn checkout; the
# treatlearn imports below depend on it unless the package is importable some other way
sys.path.append('/Users/hauptjoh/projects/treatment-learn')

from treatlearn.policy import bayesian_targeting_policy
from treatlearn.evaluation import transformed_outcome_loss, expected_policy_profit

## Load Data

In [3]:
X = pd.read_csv("../data/fashionB_clean_linear.csv")

# Downsampling for debugging
# NOTE(review): the fold "idx" arrays loaded below index c, g and y by position, so
# enabling the downsample presumably breaks that alignment — confirm before using it
#X = X.sample(5000)

# Remove the outcome and treatment columns from X and keep them as numpy arrays
c = X.pop('converted').to_numpy()
g = X.pop('TREATMENT').to_numpy()
y = X.pop('checkoutAmount').to_numpy()
# The treatment effect columns are removed from X as well but stay pandas Series
# (no .to_numpy()); tau_response is later passed as tau_true to calc_prediction_error
tau_conversion = X.pop('TREATMENT_EFFECT_CONVERSION')
tau_basket = X.pop('TREATMENT_EFFECT_BASKET')
tau_response = X.pop('TREATMENT_EFFECT_RESPONSE')

In [4]:
# Name of the sub-directory of ../results/ from which the model predictions are loaded
DATE = "20200217"

In [5]:
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can execute
# code on load — only read prediction files that come from a trusted source
predictions = np.load(f"../results/{DATE}/treatment_model_predictions.npy", allow_pickle=True)
# Every element of predictions is one fold with a "train" and a "test" entry
predictions_train = [fold["train"] for fold in predictions]
predictions_test = [fold["test"] for fold in predictions]

## Conversion Predictions C(T=1)

In [6]:
# ##### Comparison on AUC

def calc_classification_error(prediction_dict, y_true, g):
    """
    Score predicted probabilities against binary outcomes on a subset of the observations
    prediction_dict : dict
        Dictionary with the model predictions in the form model_name: array of predictions
    y_true : 1D array-like
        Observed outcomes, aligned with the prediction arrays
    g : index
        Selector applied as y_true[g] and pred[g] to pick the observations that are
        scored, e.g. the output of np.nonzero on a group indicator

    Returns a dictionary in the form model_name: {"ROC-AUC": ..., "brier": ...}, where
    "brier" is the mean squared error between the predictions and the outcomes
    """
    def _score(pred):
        # Restrict outcomes and predictions to the selected observations
        observed, predicted = y_true[g], pred[g]
        return {
            "ROC-AUC": roc_auc_score(y_true=observed, y_score=predicted),
            "brier": mean_squared_error(y_pred=predicted, y_true=observed),
        }

    return {model_name: _score(pred) for model_name, pred in prediction_dict.items()}

In [7]:
# Score the conversion predictions on the test part of every fold. np.nonzero turns the
# treatment indicator of the fold into positions, so only observations with a non-zero
# indicator enter the metrics
eval_conversion = [calc_classification_error(outcome_dict["conversion"], y_true=c[outcome_dict["idx"]], g=np.nonzero(g[outcome_dict["idx"]]))
             for outcome_dict in predictions_test]

In [8]:
# Stack the per-fold score tables (rows: metric, columns: model); the fold number
# becomes the outer level of the row index
# NOTE(review): eval_conversion is rebound from a list to a DataFrame here, so this cell
# cannot be re-run on its own without re-running the previous cell first
eval_conversion = pd.concat([pd.DataFrame(x) for x in eval_conversion], axis=0, keys=range(len(eval_conversion)))
eval_conversion.index.rename(["fold","metric"], inplace=True)

In [9]:
# Average every metric over the folds, then transpose: models as rows, metrics as columns
eval_conversion = eval_conversion.groupby("metric").mean().T

In [10]:
# Split every model label at "_" and use the parts as the levels of the row index
eval_conversion.index = pd.MultiIndex.from_tuples(eval_conversion.index.str.split("_", expand=True).tolist())
# Relabel the metric columns for presentation ("ROC-AUC" keeps its name)
eval_conversion = eval_conversion.rename(mapper={"ROC-AUC": "ROC-AUC", "brier": "Brier Score"}, axis=1)

In [11]:
eval_conversion

Unnamed: 0,Unnamed: 1,metric,ROC-AUC,Brier Score
single-model,outcome,linear,0.635528,0.102788
single-model,outcome,gbt,0.639185,0.102333
single-model,hurdle,gbt,0.635589,0.102454
two-model,hurdle,linear,0.635528,0.102788
two-model,hurdle,gbt,0.640081,0.102288
Conversion-Rate,,,0.5,0.105741


In [12]:
# Write the conversion score table to a LaTeX file, formatted with three decimals
eval_conversion.to_latex(buf="../results/conversion_prediction_quality.tex", float_format="%.3f")

## Treatment Effect Precision

In [13]:
# ##### Comparison on transformed outcome loss

def calc_prediction_error(prediction_dict, y_true, g, prob_treatment=None, tau_true=None):
    """
    Score treatment effect predictions of several models
    prediction_dict : dict
        Dictionary with the model predictions in the form model_name: array of predictions
    y_true : 1D array-like
        Observed outcomes
    g : 1D array-like
        Binary group indicator
    prob_treatment : array-like or int
        Treatment propensity handed to transformed_outcome_loss. If None, the mean of
        the group indicator g is used for all observations
    tau_true : 1D array-like
        Array of the true treatment effect. If given, the root mean squared error and
        the mean absolute error against it are reported as well

    Returns a dictionary in the form model_name: {metric_name: value}
    """
    propensity = g.mean() if prob_treatment is None else prob_treatment
    has_truth = tau_true is not None

    output = {}
    for model_name, pred in prediction_dict.items():
        scores = {
            "transformed_outcome_loss": transformed_outcome_loss(
                tau_pred=pred, y_true=y_true, g=g, prob_treatment=propensity
            )
        }
        # Error against the true effect is only available when the truth is known
        if has_truth:
            scores["root_mean_squared_error"] = np.sqrt(mean_squared_error(y_pred=pred, y_true=tau_true))
            scores["mean_absolute_error"] = mean_absolute_error(y_pred=pred, y_true=tau_true)
        output[model_name] = scores

    return output

In [14]:
# Score the spending treatment effect predictions on the test part of every fold;
# tau_response is passed as the true treatment effect
eval_test = [calc_prediction_error(outcome_dict["treatment_spending"], 
                                   y[outcome_dict["idx"]], g[outcome_dict["idx"]], tau_true=tau_response[outcome_dict["idx"]]) 
             for outcome_dict in predictions_test]

In [15]:
# Stack the per-fold score tables (rows: metric, columns: model); the fold number
# becomes the outer level of the row index
eval_test_dataframe = pd.concat([pd.DataFrame(x) for x in eval_test], axis=0, keys=range(len(eval_test)))
eval_test_dataframe.index.rename(["fold","metric"], inplace=True)

In [16]:
eval_test_dataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,single-model_outcome_gbt,single-model_hurdle_gbt,two-model_outcome_linear,two-model_outcome_gbt,two-model_hurdle_linear,two-model_hurdle_gbt,dr_outcome_linear,dr_outcome_gbt,oracle__,ATE__
fold,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,transformed_outcome_loss,3339.285382,3337.084017,3332.145207,3335.05765,3335.431009,3335.284076,3332.302591,3333.517357,3325.946777,3336.97671
0,root_mean_squared_error,2.803669,2.393461,1.831453,2.084505,2.59634,1.97969,1.845807,2.082083,0.0,2.744631
0,mean_absolute_error,1.993172,1.913756,1.306013,1.418477,1.454465,1.256889,1.319718,1.366142,0.0,1.756179
1,transformed_outcome_loss,3438.300981,3437.442961,3553.62629,3432.127788,3532.357514,3435.488224,3548.442752,3439.076736,3428.348985,3440.108731
1,root_mean_squared_error,2.803779,2.394457,11.126856,1.917549,10.05153,2.013198,10.893226,2.733849,0.0,2.774646
1,mean_absolute_error,2.007792,1.903654,1.361465,1.301229,1.458747,1.296864,1.363837,1.25323,0.0,1.773758
2,transformed_outcome_loss,3337.067429,3337.698004,3335.200013,3332.807979,3380.334308,3333.156114,3335.242158,3335.800881,3327.097329,3339.052633
2,root_mean_squared_error,2.68242,2.336092,2.616154,1.912157,8.182439,1.965586,2.616593,1.991868,0.0,2.748715
2,mean_absolute_error,1.978477,1.848734,1.314805,1.246675,1.494406,1.223837,1.324399,1.200945,0.0,1.750067
3,transformed_outcome_loss,3405.485561,3402.915728,3404.839029,3397.403781,3398.221501,3396.779459,3404.701742,3402.905399,3391.317195,3406.138622


In [17]:
# Average every metric over the folds, then transpose: models as rows, metrics as columns
eval_precision = eval_test_dataframe.groupby("metric").mean().T
#print(eval_test_dataframe.groupby("metric").std().T)

In [18]:
# Split every model label at "_" and use the parts as the levels of the row index
eval_precision.index = pd.MultiIndex.from_tuples(eval_precision.index.str.split("_", expand=True).tolist())
# Shorten the metric names for presentation
eval_precision = eval_precision.rename(mapper={"transformed_outcome_loss": "TOL", "root_mean_squared_error": "RMSE", "mean_absolute_error": "MAE"}, axis=1)

In [19]:
eval_precision

Unnamed: 0,Unnamed: 1,metric,TOL,RMSE,MAE
single-model,outcome,gbt,3387.692547,2.772372,1.989062
single-model,hurdle,gbt,3385.06208,2.365343,1.872875
two-model,outcome,linear,3407.129678,4.157082,1.329631
two-model,outcome,gbt,3381.755356,1.941999,1.310739
two-model,hurdle,linear,3410.776394,5.154396,1.446003
two-model,hurdle,gbt,3381.785386,1.944311,1.245225
dr,outcome,linear,3406.095868,4.111919,1.336267
dr,outcome,gbt,3385.448058,2.372327,1.277614
oracle,,,3374.988552,0.0,0.0
ATE,,,3387.896434,2.74981,1.751105


In [20]:
# Write the RMSE and TOL columns (MAE is left out) to a LaTeX file with two decimals
eval_precision[["RMSE","TOL"]].to_latex(buf="../results/treatment_prediction_quality.tex", float_format="%.2f")

In [21]:
predictions[0]['train']

{'idx': array([     1,      2,      4, ..., 118898, 118899, 118901]),
 'conversion': {'single-model_outcome_linear': array([0.07463045, 0.10396556, 0.08264694, ..., 0.11412579, 0.07461111,
         0.07766156]),
  'single-model_outcome_gbt': array([0.09199969, 0.10255665, 0.09849085, ..., 0.13575878, 0.10880719,
         0.08518385]),
  'single-model_hurdle_gbt': array([0.09557734, 0.09713367, 0.0888524 , ..., 0.12878386, 0.11329535,
         0.07200926]),
  'two-model_hurdle_linear': array([0.07463045, 0.10396556, 0.08264694, ..., 0.11412579, 0.07461111,
         0.07766156]),
  'two-model_hurdle_gbt': array([0.08430627, 0.10646553, 0.09863738, ..., 0.11965201, 0.11066057,
         0.0835836 ]),
  'Conversion-Rate__': array([0.12018978, 0.12018978, 0.12018978, ..., 0.12018978, 0.12018978,
         0.12018978])},
 'treatment_conversion': {'single-model_hurdle_gbt': array([0.03981723, 0.03853449, 0.03163743, ..., 0.03435922, 0.02678827,
         0.03582944]),
  'two-model_hurdle_linear'

## Profit Comparison

In [60]:
# Factor applied to the observed checkout amount when computing profit (passed as margin)
MARGIN_RATIO = 0.3
# Amount subtracted from the profit of a targeted observation that converted (passed as offer_cost)
OFFER_COST = 10

In [61]:
# # #### Comparison in terms of profit

# treatment_conversion_model_map = {
#         "ATE_logit":('ATE','logit'), "oracle_gbt":('oracle','gbt'), "oracle_gbt":('oracle','logit')
#         "dr_reg-logit":('dr_reg','logit'), "dr_gbt-gbt":('dr_gbt','gbt'), 
#         # Two-Model
#         "two_model_rf-rf":('two_model_rf','rf'), "two_model_gbt-gbt":('two_model_gbt','gbt'), "two_model_reg-logit":('two_model_reg','logit'),
#         "two_model_hurdle_rf":('two_model_hurdle_rf','two_model_hurdle_rf'), "two_model_hurdle_gbt":('two_model_hurdle_gbt','two_model_hurdle_gbt'), 
#         "two_model_hurdle_linear":('two_model_hurdle_linear','two_model_hurdle_linear'),
#         # Single model
#         #"single_model_bart-gbt":('single_model_bart','gbt'),
#         "single_model_gbt-gbt":('single_model_gbt','gbt'), "single_model_rf-rf":('single_model_gbt','rf'),
#         "single_model_hurdle_rf*2":('hurdle_rf','hurdle_rf'), "single_model_hurdle_gbt*2":('hurdle_gbt','hurdle_gbt')
# }

In [62]:
def tune_threshold(treatment_dict, y_true, c_true, g, margin, contact_cost, offer_cost, prob_treatment=None):
    """
    Pick, for every treatment model, the targeting threshold with the highest expected profit
    treatment_dict : dict
        Dictionary with the treatment effect predictions in the form model_name: array of predictions
    y_true : 1D array-like
        Observed outcomes; multiplied by margin to obtain the gross profit
    c_true : 1D array-like
        Observed conversion indicator; the offer cost is only charged where it is non-zero
    g : 1D array-like
        Binary group indicator
    margin : float
        Factor applied to y_true
    contact_cost : float
        Not used in the calculation; kept so that the signature stays unchanged
    offer_cost : float
        Cost charged for targeted observations that converted
    prob_treatment : array-like or int
        Treatment propensity handed to expected_policy_profit. If None, the mean of
        the group indicator g is used

    Returns a dictionary in the form model_name: best threshold. The candidates are the
    quantiles 1, 0.975, ..., 0 of the model's own predictions; on ties the first
    (highest-quantile) candidate wins.
    """
    if prob_treatment is None:
        prob_treatment = g.mean()

    # Threshold candidates [1, 0.975,...,0]
    step_size = 25
    quantiles = np.array(range(1000, -1, -step_size)) / 1000

    # The margin on the observed outcome does not depend on the candidate decision
    gross_profit = y_true * margin

    threshold_dict = {}
    for treatment_model, treatment_pred in treatment_dict.items():
        quantile_candidates = np.quantile(treatment_pred, quantiles)

        best_profit = -np.inf
        best_threshold = None
        for threshold in quantile_candidates:
            # Use >= exactly like calc_threshold_policy, which applies the tuned threshold.
            # With a strict > the profit would be tuned for a different decision rule than
            # the one deployed (for a constant prediction: nobody instead of everybody).
            decision = (treatment_pred >= threshold) * 1
            profit = expected_policy_profit(
                targeting_decision=decision,
                g=g,
                observed_profit=gross_profit - (offer_cost * decision * c_true),
                prob_treatment=prob_treatment,
            )
            if profit > best_profit:
                best_threshold = threshold
                best_profit = profit

        threshold_dict[treatment_model] = best_threshold

    return threshold_dict
    

In [63]:
def calc_bayesian_policy(treatment_dict, conversion_dict, margin, contact_cost, offer_cost):
    """
    Apply bayesian_targeting_policy to every pair of treatment model and conversion model
    treatment_dict : dict
        Dictionary with the treatment effect predictions in the form model_name: array of predictions
    conversion_dict : dict
        Dictionary with the conversion predictions in the form model_name: array of predictions
    margin : float
        Factor applied to the treatment effect predictions before they are passed on as tau_pred
    contact_cost, offer_cost : float
        Passed on to bayesian_targeting_policy unchanged

    Returns a dictionary in the form "Bayesian+<treatment model>+<conversion model>": policy
    """
    return {
        "+".join(("Bayesian", str(treatment_model), str(conversion_model))): bayesian_targeting_policy(
            tau_pred=treatment_pred * margin,
            offer_accept_prob=conversion_pred,
            contact_cost=contact_cost,
            offer_cost=offer_cost,
        )
        for treatment_model, treatment_pred in treatment_dict.items()
        for conversion_model, conversion_pred in conversion_dict.items()
    }

In [64]:
def calc_naive_policy(treatment_dict):
    """
    Build the two benchmark policies that target every observation or none
    treatment_dict : dict
        Dictionary with the model predictions in the form model_name: array of predictions.
        Only the length of the last array is used, to size the decisions

    Returns a dictionary {"Treat-all+": array of ones, "Treat-none+": array of zeros},
    or an empty dictionary if treatment_dict is empty
    """
    if not treatment_dict:
        return {}

    # The benchmark decisions do not depend on the model, so build them once instead
    # of once per model; the last array sets the length, as the per-model loop did
    n_obs = list(treatment_dict.values())[-1].shape[0]
    return {
        "Treat-all+": np.ones(n_obs, dtype="int"),
        "Treat-none+": np.zeros(n_obs, dtype="int"),
    }

In [65]:
def calc_threshold_policy(treatment_dict, threshold=0):
    """
    Target every observation whose predicted treatment effect reaches the threshold
    treatment_dict : dict
        Dictionary with the model predictions in the form model_name: array of predictions
    threshold : number or mapping
        Either one threshold for all models, or a mapping (dict, pandas Series) in the
        form model_name: threshold with an entry for every model in treatment_dict

    Returns a dictionary of 0/1 decision arrays. The key is "Threshold+<model>" for a
    per-model mapping and "Threshold<threshold>+<model>" for a single threshold.

    Raises KeyError if a per-model mapping has no entry for one of the models.
    """
    # Decide the branch from the threshold itself rather than with a bare except:
    # mappings expose .keys(), plain numbers and arrays do not
    if hasattr(threshold, "keys"):
        return {
            "Threshold+" + str(treatment_model): (treatment_pred >= threshold[treatment_model]) * 1
            for treatment_model, treatment_pred in treatment_dict.items()
        }

    return {
        "Threshold" + str(threshold) + "+" + str(treatment_model): (treatment_pred >= threshold) * 1
        for treatment_model, treatment_pred in treatment_dict.items()
    }

In [66]:
def calc_policy_profit(policy_dict, y_true, c_true, g, margin, contact_cost, offer_cost, prob_treatment=None):
    """
    Evaluate the expected profit and the share of targeted observations of every policy
    policy_dict : dict
        Dictionary with the targeting decisions in the form policy_name: array of decisions
    y_true : 1D array-like
        Observed outcomes; multiplied by margin to obtain the gross profit
    c_true : 1D array-like
        Observed conversion indicator; the offer cost is only charged where it is non-zero
    g : 1D array-like
        Binary group indicator
    margin : float
        Factor applied to y_true
    contact_cost : float
        Not used in the calculation
    offer_cost : float
        Cost charged for targeted observations that converted
    prob_treatment : array-like or int
        Treatment propensity handed to expected_policy_profit. If None, the mean of
        the group indicator g is used

    Returns {"profit": {policy_name: profit rounded to 0 decimals},
             "ratio_treated": {policy_name: mean decision rounded to 2 decimals}}
    """
    propensity = g.mean() if prob_treatment is None else prob_treatment

    profit = {}
    ratio_treated = {}
    for key, decision in policy_dict.items():
        # Margin on the observed outcome minus the offer cost of targeted converters
        net_profit = y_true * margin - (offer_cost * decision * c_true)
        profit[key] = expected_policy_profit(
            targeting_decision=decision, g=g, observed_profit=net_profit, prob_treatment=propensity
        ).round(0)
        ratio_treated[key] = decision.mean().round(2)

    return {"profit": profit, "ratio_treated": ratio_treated}

In [67]:
# Benchmark policies
#policy_train["naive_none"] = np.zeros(X.shape[0], dtype="int")
#policy_train["naive_all"] = np.ones(X.shape[0], dtype="int")
#policy["naive_none"] = np.zeros(X_val.shape[0], dtype = "int")
#policy["naive_all"] = np.ones(X_val.shape[0], dtype = "int")

In [68]:
# For every fold: build all targeting policies from the test predictions, tune the
# per-model thresholds on the training part, and evaluate the profit on the test part
eval_profit = []
for prediction_dict in predictions:
    outcome_dict_train = prediction_dict["train"]    
    outcome_dict = prediction_dict["test"]
    
    # Bayesian policies: every spending effect model combined with every conversion model
    policy_dict = calc_bayesian_policy(
        treatment_dict = outcome_dict["treatment_spending"], 
        conversion_dict = outcome_dict["conversion"],
    margin =MARGIN_RATIO, contact_cost=0, offer_cost=OFFER_COST) 
    
    # Benchmarks (treat all / treat none) and the default threshold of 0 (keys "Threshold0+...")
    policy_dict.update(calc_naive_policy(treatment_dict=outcome_dict["treatment_spending"]))
    policy_dict.update(calc_threshold_policy(treatment_dict=outcome_dict["treatment_spending"]))
    
    # Threshold policy
    # Tune one threshold per model on the training predictions and training outcomes
    threshold_dict = tune_threshold(outcome_dict_train["treatment_spending"],        
                   y_true = y[outcome_dict_train["idx"]], c_true = c[outcome_dict_train["idx"]], 
                   g=g[outcome_dict_train["idx"]],
                   margin =MARGIN_RATIO, contact_cost=0, offer_cost=OFFER_COST)
    # Apply the tuned thresholds to the test predictions (keys "Threshold+...")
    policy_dict.update(calc_threshold_policy(treatment_dict=outcome_dict["treatment_spending"], 
                      threshold = threshold_dict))
    
    # Profit and share of targeted observations of every policy on the test part
    profit = calc_policy_profit(
        policy_dict=policy_dict,
        y_true = y[outcome_dict["idx"]], c_true = c[outcome_dict["idx"]], g=g[outcome_dict["idx"]],
        margin =MARGIN_RATIO, contact_cost=0, offer_cost=OFFER_COST)
    
    eval_profit.append(profit)

In [69]:
# Stack the per-fold tables (rows: policy label, columns: profit and ratio_treated);
# the fold number becomes the outer level of the row index
eval_profit = pd.concat([pd.DataFrame(x) for x in eval_profit], axis=0, keys=range(len(eval_profit)))
eval_profit.index.rename(["fold","model"], inplace=True)

In [70]:
# Average profit and ratio_treated of every policy over the folds
eval_profit = eval_profit.groupby("model").mean()

In [71]:
# Split every policy label into its parts and use them as index levels. The pattern is
# a character class, so labels are split at "+", "_" and also at a literal "|"
eval_profit.index = pd.MultiIndex.from_tuples(eval_profit.index.str.split("[+|_]", expand=True).tolist())

In [72]:
# Name the index levels: policy type, three parts of the treatment model label, three
# parts of the conversion model label (assumes the split produced exactly seven levels)
eval_profit.index.names = ["Policy","Causal","Stages","Estimator","conversion_architecture","conversion_stages","conversion_estimator"]

In [73]:
# Turn the index levels into ordinary columns so that they can be queried and sorted
eval_profit.reset_index(drop=False, inplace=True)

In [74]:
# Reorder the columns for presentation; profit and ratio_treated go last
eval_profit = eval_profit.reindex(columns= ["Policy","Stages","Causal","Estimator",
                                            "conversion_stages","conversion_architecture","conversion_estimator",
                                           "profit","ratio_treated"])

In [75]:
# Sort the rows for presentation; the two estimator columns are sorted in descending order
eval_profit.sort_values(["Policy","Stages","Causal","Estimator", "conversion_architecture", "conversion_estimator"],
                        ascending=[True, True, True, False, True, False], inplace=True)

#### Conversion Tables

In [76]:
# Keep the Bayesian and default-threshold policies that use the oracle or ATE treatment
# effect (except conversion models of the "outcome" type), plus the treat-all/none rows.
# NOTE(review): the two adjacent string literals are joined without a space in between
# ('..."outcome"or (Policy...'); the output below shows the query still evaluates
temp = eval_profit.query(('(Policy in ["Bayesian","Threshold0"] and Causal in ["oracle","ATE"]) and conversion_stages!="outcome"'
                         'or (Policy in ["Treat-all", "Treat-none"])'),
                  inplace=False).drop(["Stages", "Estimator","conversion_stages"], axis=1)#.sort_values(["Policy","Causal", "conversion_architecture", "conversion_estimator"])

# Write the selection to a LaTeX file; missing values are left blank
temp.to_latex("../results/profit_oracle_conversion_models.tex", na_rep="", index=False,formatters={'profit':'{:.0f}'.format,"ratio_treated":'{:.2f}'.format})

In [77]:
temp

Unnamed: 0,Policy,Causal,conversion_architecture,conversion_estimator,profit,ratio_treated
59,Bayesian,ATE,Conversion-Rate,,50830.2,1.0
56,Bayesian,ATE,single-model,gbt,52930.6,0.838
57,Bayesian,ATE,two-model,linear,51936.2,0.756
58,Bayesian,ATE,two-model,gbt,52402.4,0.79
53,Bayesian,oracle,Conversion-Rate,,55492.6,0.706
50,Bayesian,oracle,single-model,gbt,56696.8,0.718
51,Bayesian,oracle,two-model,linear,57360.8,0.69
52,Bayesian,oracle,two-model,gbt,57022.6,0.688
71,Threshold0,ATE,,,50830.2,1.0
70,Threshold0,oracle,,,52996.2,0.94


#### Treatment Effect Architectures

Average conversion rate

In [78]:
# Keep the policies that use the constant "Conversion-Rate" conversion model, plus the
# default-threshold ("Threshold0") policies
temp = eval_profit.query('(conversion_architecture=="Conversion-Rate") or (Policy=="Threshold0")',
                  inplace=False)

In [40]:
# Write the selection, without the two dropped conversion columns, to a LaTeX file
temp.drop(["conversion_stages", "conversion_estimator"], axis=1).to_latex("../results/profit_treatment-models_avg-conversion.tex", na_rep="-", index=False,formatters={'profit':'{:.0f}'.format,"ratio_treated":'{:.2f}'.format})

In [80]:
temp.drop(["conversion_stages", "conversion_estimator"], axis=1)

Unnamed: 0,Policy,Stages,Causal,Estimator,conversion_architecture,profit,ratio_treated
59,Bayesian,,ATE,,Conversion-Rate,50830.2,1.0
53,Bayesian,,oracle,,Conversion-Rate,55492.6,0.706
11,Bayesian,hurdle,single-model,gbt,Conversion-Rate,48840.0,0.204
29,Bayesian,hurdle,two-model,linear,Conversion-Rate,54550.4,0.656
35,Bayesian,hurdle,two-model,gbt,Conversion-Rate,55590.2,0.696
41,Bayesian,outcome,dr,linear,Conversion-Rate,54458.6,0.656
47,Bayesian,outcome,dr,gbt,Conversion-Rate,54629.4,0.832
5,Bayesian,outcome,single-model,gbt,Conversion-Rate,52795.4,0.41
17,Bayesian,outcome,two-model,linear,Conversion-Rate,54456.0,0.656
23,Bayesian,outcome,two-model,gbt,Conversion-Rate,55146.2,0.72


Model-based conversion rate

In [81]:
# Keep the policies whose conversion model matches the treatment model (same label
# parts, or the pairings spelled out per clause), plus the tuned-threshold policies
# (Policy=="Threshold").
# NOTE(review): no Estimator value "reg" shows up in the result tables of this
# notebook, so the last clause may never match — confirm
temp = eval_profit.query(('(Causal==conversion_architecture and Stages==conversion_stages and Estimator==conversion_estimator) or'
                   '(Policy=="Threshold") or'
                   '(Causal=="dr" and conversion_stages!="hurdle" and Estimator==conversion_estimator) or'
                   '(Stages=="outcome" and conversion_stages=="outcome" and Estimator==conversion_estimator) or'
                   '(Stages=="outcome" and conversion_stages=="outcome" and Estimator=="reg" and conversion_estimator=="linear")'),
                  inplace=False)#.sort_values(["Policy","Stages", "Causal", "Estimator"])

In [83]:
temp.drop(["conversion_stages", "Estimator"], axis=1)

Unnamed: 0,Policy,Stages,Causal,conversion_architecture,conversion_estimator,profit,ratio_treated
8,Bayesian,hurdle,single-model,single-model,gbt,54665.4,0.528
27,Bayesian,hurdle,two-model,two-model,linear,56172.0,0.712
34,Bayesian,hurdle,two-model,two-model,gbt,56084.2,0.712
36,Bayesian,outcome,dr,single-model,linear,56028.0,0.656
43,Bayesian,outcome,dr,single-model,gbt,55160.4,0.75
1,Bayesian,outcome,single-model,single-model,gbt,52881.2,0.488
12,Bayesian,outcome,two-model,single-model,linear,56010.0,0.658
19,Bayesian,outcome,two-model,single-model,gbt,55942.4,0.682
81,Threshold,,ATE,,,50830.2,1.0
80,Threshold,,oracle,,,55391.8,0.642


In [43]:
# Write the selection, without the two dropped columns, to a LaTeX file
temp.drop(["conversion_stages", "Estimator"], axis=1).to_latex("../results/profit_treatment_models.tex", na_rep="-", index=False, formatters={'profit':'{:.0f}'.format,"ratio_treated":'{:.2f}'.format})

## Prediction distribution analysis

In [44]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

In [45]:
##### Evaluation of distribution of predicted treatment effects

# Axis limits per treatment level, passed to plt.axis as [xmin, xmax, ymin, ymax]
axes_limits = {"treatment_spending":[-20,20,0,0.4],
               "treatment_basket_value":[-20,20,0,0.3],
               "treatment_conversion":[-0.15, 0.25, 0, 60]}

# Value range per treatment level, passed as clip to sns.kdeplot in the combined plots
clip_limits = {"treatment_spending":[-50,50],
               "treatment_basket_value":[-50,50],
               "treatment_conversion":[-0.5, 0.5]}

In [46]:
# One PDF per treatment level and fold; every page overlays the density of the oracle
# values with the density of one model's predictions
for treatment_level in ["treatment_spending","treatment_basket_value","treatment_conversion"]:
    for fold_index in range(len(predictions_test)):
        fold_predictions = predictions_test[fold_index][treatment_level]
        # Resolve the oracle key explicitly ("oracle__", else "oracle") instead of wrapping
        # the plot call in a bare except, which would also hide genuine plotting errors
        oracle_key = "oracle__" if "oracle__" in fold_predictions else "oracle"
        with PdfPages(f"../results/{treatment_level}_distribution_fold{fold_index}.pdf") as pdf:
            for model in fold_predictions.keys():
                if model not in ["oracle__", "ATE__"]:
                    plt.figure()
                    plt.title(model)
                    plt.xlabel("Model Estimate")
                    plt.ylabel("Kernel Density")
                    plt.axis(axes_limits[treatment_level])
                    sns.kdeplot(fold_predictions[oracle_key])
                    sns.kdeplot(fold_predictions[model])
                    pdf.savefig()
                    # Close every figure after saving so that they do not pile up in memory
                    plt.close()

Look at range of treatment effect predictions

In [47]:
# Pool the spending effect predictions of all test folds (one column per model)
for treatment_level in ["treatment_spending"]:  
    predictions_combined = pd.concat([pd.DataFrame(fold[treatment_level]) for fold in predictions_test])
# Smallest and largest prediction of every model over all folds
print(predictions_combined.min())
print(predictions_combined.max())

single-model_outcome_gbt      -3.096472
single-model_hurdle_gbt       -3.636512
two-model_outcome_linear   -1678.827528
two-model_outcome_gbt        -27.358335
two-model_hurdle_linear     -151.376817
two-model_hurdle_gbt         -23.977496
dr_outcome_linear          -1643.319431
dr_outcome_gbt              -305.398171
oracle__                     -17.881361
ATE__                          4.625705
dtype: float64
single-model_outcome_gbt       7.569022
single-model_hurdle_gbt       17.938571
two-model_outcome_linear     272.052807
two-model_outcome_gbt         50.861363
two-model_hurdle_linear     1506.642720
two-model_hurdle_gbt          38.647268
dr_outcome_linear            269.113883
dr_outcome_gbt               107.173000
oracle__                      36.782805
ATE__                          4.669789
dtype: float64


In [48]:
predictions_combined["oracle__"][predictions_combined["two-model_hurdle_linear"]>100]

19411    4.449368
5569     5.311719
Name: oracle__, dtype: float64

In [49]:
##### Merge all folds
for treatment_level in ["treatment_spending","treatment_basket_value","treatment_conversion"]:  
    predictions_combined = pd.concat([pd.DataFrame(fold[treatment_level]) for fold in predictions_test])

    with PdfPages(f"../results/{treatment_level}_distribution_combined.pdf") as pdf:
        for model in predictions_combined.columns.values:
            if model not in ["oracle__", "ATE__"]:
                plt.figure()
                plt.title(model)
                plt.xlabel("Model Estimate")
                plt.ylabel("Kernel Density")
                plt.axis(axes_limits[treatment_level]) #
                try:
                    sns.kdeplot(predictions_combined[["oracle__"]].values.flatten(), linestyle="--", color='grey')
                except:
                    sns.kdeplot(predictions_combined[["oracle"]].values.flatten(), linestyle="--", color='grey')
                sns.kdeplot(predictions_combined[[model]].values.flatten(), 
                            clip=clip_limits[treatment_level], color="blue")
                pdf.savefig()
                plt.close()

### Hurdle Model Parts Prediction Accuracy

In [50]:
# RMSE of every model's predictions against the "oracle__" column, separately for the
# basket value and the conversion treatment effect, pooled over all test folds
results = {}

for treatment_level in ["treatment_basket_value","treatment_conversion"]:  
    predictions_combined = pd.concat([pd.DataFrame(fold[treatment_level]) for fold in predictions_test])
    results[treatment_level] = {}
    for model in predictions_combined.columns.values:
        results[treatment_level][model] = np.sqrt(mean_squared_error(y_pred=predictions_combined[[model]], 
                                                                     y_true=predictions_combined[["oracle__"]]))

In [51]:
# Rows: model, columns: treatment level (in the insertion order of the dict above)
results = pd.DataFrame(results)
# Split the model labels into their parts and use them as index levels
results.index = pd.MultiIndex.from_tuples(results.index.str.split("[+|_]", expand=True).tolist())
# Positional relabelling: relies on basket value being the first and conversion the second column
results.columns = ["RMSE Basket Value", "RMSE Conversion"]
results.index.names = ["Causal","Stages","Estimator"]

# The oracle row compares the oracle with itself, so it is removed
results.drop('oracle', level="Causal", inplace=True)
results.reset_index(drop=False, inplace=True)

In [52]:
results

Unnamed: 0,Causal,Stages,Estimator,RMSE Basket Value,RMSE Conversion
0,single-model,hurdle,gbt,3.071577,0.025373
1,two-model,hurdle,linear,14.487432,0.022042
2,two-model,hurdle,gbt,3.632286,0.020456


In [53]:
# Write the table to a LaTeX file: two decimals for basket value, four for conversion
results.to_latex("../results/stepwise_performance_hurdle_models.tex", na_rep="-", index=False, formatters={'RMSE Basket Value':'{:.2f}'.format,"RMSE Conversion":'{:.4f}'.format})

In [54]:
# Pool the basket value effect predictions of all test folds for the summary cells below
treatment_level= "treatment_basket_value"
predictions_combined = pd.concat([pd.DataFrame(fold[treatment_level]) for fold in predictions_test])

In [55]:
predictions_combined[['two-model_hurdle_linear']].mean()

two-model_hurdle_linear    2.292251
dtype: float64

In [56]:
predictions_combined[['two-model_hurdle_linear']].quantile()

two-model_hurdle_linear    2.67448
Name: 0.5, dtype: float64

In [57]:
predictions_combined[['two-model_hurdle_gbt']].mean()

two-model_hurdle_gbt    1.972801
dtype: float64

In [58]:
predictions_combined[['oracle__']].mean()

oracle__    1.015758
dtype: float64

In [59]:
predictions_combined[['oracle__']].quantile(q=[0,0.05,0.5,0.95,1])

Unnamed: 0,oracle__
0.0,-10.0
0.05,-4.354376
0.5,1.13091
0.95,5.645624
1.0,10.0
