In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, average_precision_score
from sklearn.metrics import roc_auc_score, roc_curve, auc, make_scorer, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.utils import resample
from xgboost import XGBClassifier, XGBRegressor, plot_tree
from xgboost import plot_importance

In [2]:
# Load in our uplift model library
from main import uplift_model

In [3]:
#==================================================================#
# Prepare the data for modelling. The dataframe used for training  #
# (called `train` below) must contain every feature, the treatment #
# column and the target column. It is then split into X, t and y   #
# respectively.                                                    #
#                                                                  #
# The sample dataset used here comes from Kaggle:                  #
# https://www.kaggle.com/davinwijaya/customer-retention            #
#==================================================================#

# Load csv file and remember each row's original position as party_id
df = pd.read_csv("Data/data.csv")
df['party_id'] = df.index

# Keep only the 'Discount' and 'No Offer' rows. The explicit .copy() makes df
# an independent frame, so the column assignment below never writes into a
# slice of the raw data (and does not rely on warnings being silenced).
df = df[df['offer'].isin(['Discount', 'No Offer'])].copy()

# Treatment feature: 1 for 'Discount', 0 otherwise (vectorised comparison)
df['treatment'] = df['offer'].eq('Discount').astype('int64')

# Create a dataframe of features, treatment, target that is ready for training.
# All three pieces share df's index, so they align row-for-row. reset_index()
# (without drop=True) keeps the original row label in a leading 'index' column.
train = pd.concat([df[['party_id', 'recency', 'history', 'used_discount', 'used_bogo',
                       'is_referral']],
                   pd.get_dummies(df[['zip_code', 'channel']]),
                   df[['treatment', 'conversion']]], sort=False, axis=1).reset_index()

# Create X, t and y. The non-feature columns are dropped by name rather than
# by position, so the split stays correct if the column order ever changes.
X = train.drop(columns=['index', 'party_id', 'treatment', 'conversion'])
t = train['treatment']
y = train['conversion']

In [4]:
# Invoke our uplift_model class.
# uplift_model is imported from the local `main` module; it is constructed
# here with no arguments, and the resulting instance is what the next cell
# calls xlearner_uplift on.
mdl = uplift_model()

In [5]:
# Run xlearner_uplift in uplift_model class on the features (X), treatment
# flag (t) and target (y) prepared above.
# `final` is the scored frame: the cells below read its mu0, mu1, tau0_ite,
# tau1_ite, uplift and conversion columns.
# NOTE(review): clf0/clf1 and reg0/reg1 are presumably the fitted mu0/mu1
# classifiers and tau0/tau1 regressors named in the printed metrics - confirm
# against main.uplift_model.
final, clf0, clf1, reg0, reg1 = mdl.xlearner_uplift(X, t, y)

mu0 XGBoost Classifier for control: [Accuracy: 0.7491, F1-score: 0.7648, Gini: 0.6476]
mu1 XGBoost Classifier for treatment: [Accuracy: 0.6678, F1-score: 0.6760, Gini: 0.4744]
tau0 XGBoost Regressor: [Mean Squared Error: 0.1028, R2-score: 0.0606]
tau1 XGBoost Regressor: [Mean Squared Error: 0.1699, R2-score: 0.0599]


In [6]:
# For demonstration, bucket the population into ten equal-sized groups by
# predicted uplift. pd.qcut numbers the buckets 0 (lowest uplift) through 9
# (highest); subtracting from 10 flips the scale so that decile 1 holds the
# highest predicted uplift and decile 10 the lowest.
final['decile'] = 10 - pd.qcut(final['uplift'], q=10, labels=False)

In [7]:
# Summary table: per-decile mean of the model outputs and the observed
# conversion rate. agg(['mean']) is given a list so the result keeps the
# two-level (column, 'mean') header.
(
    final
    .loc[:, ['mu0', 'mu1', 'tau0_ite', 'tau1_ite', 'uplift', 'conversion', 'decile']]
    .groupby(by='decile')
    .agg(['mean'])
)

Unnamed: 0_level_0,mu0,mu1,tau0_ite,tau1_ite,uplift,conversion
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean
decile,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,0.261386,0.581696,0.469899,0.02486,0.24738,0.241671
2,0.29267,0.505546,0.41264,-0.100496,0.156072,0.152781
3,0.303523,0.464505,0.376971,-0.132932,0.12202,0.122506
4,0.310775,0.433536,0.354977,-0.155993,0.099492,0.11552
5,0.348867,0.428755,0.345081,-0.187002,0.079039,0.112597
6,0.374741,0.417335,0.329273,-0.2144,0.057437,0.106992
7,0.40383,0.405946,0.310858,-0.24225,0.034304,0.113823
8,0.452053,0.406307,0.298166,-0.282605,0.007781,0.125323
9,0.514443,0.412325,0.285462,-0.340256,-0.027397,0.138967
10,0.59793,0.380775,0.177819,-0.400768,-0.111475,0.214403
