In [2]:
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_curve
from sklift.viz import plot_qini_curve
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

from causalml.inference.meta import BaseXRegressor, BaseTClassifier, BaseSClassifier, BaseRClassifier
from causalml.dataset import *
from causalml.metrics import *

from classifierNN import *
from lightgbm import LGBMClassifier

In [3]:
import torch
import random
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [64]:
def MetaLearners_NN(X_train,treat_train,y_train,X_val,treat_val,y_val,hid_size,epoch = 5, lr=1e-3):
    learner_t = BaseTClassifier(learner=Classifier_NN(X_train.shape[1],hid_size,epoch,lr))
    learner_t.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_t = np.squeeze(learner_t.predict(X_val))
    score_t = uplift_at_k(y_true=y_val, uplift=cate_t, treatment=treat_val, strategy='by_group', k=0.3)
    
    learner_s = BaseSClassifier(learner=Classifier_NN(X_train.shape[1]+1,hid_size,epoch,lr))
    learner_s.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_s = np.squeeze(learner_s.predict(X_val))
    score_s = uplift_at_k(y_true=y_val, uplift=cate_s, treatment=treat_val, strategy='by_group', k=0.3)
    
    learner_x = BaseXRegressor(Classifier_NN(X_train.shape[1],hid_size,epoch,lr),Classifier_NN(X_train.shape[1],hid_size,epoch,lr))
    learner_x.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_x = np.squeeze(learner_x.predict(X_val))
    score_x = uplift_at_k(y_true=y_val, uplift=cate_x, treatment=treat_val, strategy='by_group', k=0.3)
    
    #learner_r = BaseRClassifier(Classifier_NN(X_train.shape[1],4),Classifier_NN(X_train.shape[1],4))
    #learner_r.fit(X=X_train, treatment=treat_train, y=y_train)
    #cate_r = np.squeeze(learner_x.predict(X_val.values))
    #score_r = uplift_at_k(y_true=y_val, uplift=cate_r, treatment=treat_val, strategy='by_group', k=0.3)
    
    return score_t, score_s, score_x

In [5]:
def MetaLearners(X_train,treat_train,y_train,X_val,treat_val,y_val):
    learner_t = BaseTClassifier(learner=LGBMClassifier())
    learner_t.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_t = np.squeeze(learner_t.predict(X_val))
    score_t = uplift_at_k(y_true=y_val, uplift=cate_t, treatment=treat_val, strategy='by_group', k=0.3)
    
    learner_s = BaseSClassifier(learner=LGBMClassifier())
    learner_s.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_s = np.squeeze(learner_s.predict(X_val))
    score_s = uplift_at_k(y_true=y_val, uplift=cate_s, treatment=treat_val, strategy='by_group', k=0.3)
    
    learner_x = BaseXRegressor(LGBMClassifier(),LGBMClassifier())
    learner_x.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_x = np.squeeze(learner_x.predict(X_val))
    score_x = uplift_at_k(y_true=y_val, uplift=cate_x, treatment=treat_val, strategy='by_group', k=0.3)
    
    #learner_r = BaseRClassifier(LGBMClassifier(),LGBMClassifier())
    #learner_r.fit(X=X_train, treatment=treat_train, y=np.array(y_train,dtype=np.int64))
    #cate_r = np.squeeze(learner_x.predict(X_val.values))
    #score_r = uplift_at_k(y_true=y_val, uplift=cate_r, treatment=treat_val, strategy='by_group', k=0.3)
    
    return score_t, score_s, score_x

In [6]:
met = np.array(['T','S','X'])[:,None]

# X5 Retail Hero dataset

In [33]:
df_clients = pd.read_csv('clients.csv', index_col='client_id')
df_train = pd.read_csv('uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('uplift_test.csv', index_col='client_id')

df_features = df_clients.copy()
df_features['first_issue_time'] = \
    (pd.to_datetime(df_features['first_issue_date'])
     - pd.to_datetime(df_features['first_issue_date']).min()) / pd.Timedelta('365d')

df_features['first_redeem_time'] = \
    (pd.to_datetime(df_features['first_redeem_date'])
     - pd.to_datetime(df_features['first_redeem_date']).min()) / pd.Timedelta('365d')

df_features['issue_redeem_delay'] = df_features['first_redeem_time'] \
    - df_features['first_issue_time']

df_features = df_features.join(pd.get_dummies(df_features['gender']))
df_features['first_redeem_time'] = df_features['first_redeem_time'].fillna(df_features['first_redeem_time'].mean())
df_features['issue_redeem_delay'] = df_features['issue_redeem_delay'].fillna(df_features['issue_redeem_delay'].mean())

df_features = df_features.drop(['first_issue_date', 'first_redeem_date', 'gender'], axis=1)

indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

X_train = df_features.loc[indices_learn, :]
y_train = df_train.loc[indices_learn, 'target']
treat_train = df_train.loc[indices_learn, 'treatment_flg']

X_val = df_features.loc[indices_valid, :]
y_val = df_train.loc[indices_valid, 'target']
treat_val =  df_train.loc[indices_valid, 'treatment_flg']

X_train_full = df_features.loc[indices_train, :]
y_train_full = df_train.loc[:, 'target']
treat_train_full = df_train.loc[:, 'treatment_flg']

X_test = df_features.loc[indices_test, :]

cat_features = ['gender']

In [35]:
res_x5 = np.array(['X5 Retail Hero']*3)[:,None]
res = np.array(MetaLearners(X_train.values,treat_train,y_train,X_val.values,treat_val,y_val))[:,None]

In [36]:
res_NN = np.array(MetaLearners_NN(X_train.values,treat_train,y_train,X_val.values,treat_val,y_val,[5,3],6))[:,None]
res_x5 = np.concatenate((res_x5,met,np.round(res,3),np.round(res_NN,3)),axis=1)

In [37]:
pd.DataFrame(res_x5)

Unnamed: 0,0,1,2,3
0,X5 Retail Hero,T,0.053,0.045
1,X5 Retail Hero,S,0.04,0.041
2,X5 Retail Hero,X,0.038,0.038


# Hillstrom dataset

In [46]:
df = pd.read_csv('Hillstrom.csv')
df.drop(['history_segment', "conversion", "spend"], axis=1, inplace=True)

cat_cols = ['zip_code', 'channel']
df_ohe = pd.get_dummies(df, columns=cat_cols)
df_ohe.segment = df_ohe.segment.map({'Womens E-Mail': 1, 'Mens E-Mail': 1, 'No E-Mail': 0})

X = df_ohe.drop('visit', axis=1)
y = df_ohe['visit'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

treat_train = X_train['segment']
treat_test = X_test['segment']

X_train.drop(['segment'], axis=1, inplace=True)
X_test.drop(['segment'], axis=1, inplace=True)

In [47]:
X_test.shape

(21120, 11)

In [48]:
res_hill = np.array(['Hillstorm']*3)[:,None]
res = np.array(MetaLearners(X_train.values,treat_train,y_train,X_test.values,treat_test,y_test))[:,None]
res_NN = np.array(MetaLearners_NN(X_train.values,treat_train,y_train,X_test.values,treat_test,y_test,[8,4],6))[:,None]
res_hill = np.concatenate((res_hill,met,np.round(res,3),np.round(res_NN,3)),axis=1)

In [49]:
pd.DataFrame(res_hill)

Unnamed: 0,0,1,2,3
0,Hillstorm,T,0.061,0.069
1,Hillstorm,S,0.067,0.069
2,Hillstorm,X,0.073,0.073


# Kuusito dataset

In [40]:
df = pd.read_csv('Kuusito.csv')
df.drop(['customer_type'], axis=1, inplace=True)

df = df.replace(r'Value', '', regex=True)
df['target_control'] = df['target_control'].map({'control': 1, 'target': 0})
df['outcome'] = df['outcome'].map({'negative': 0, 'positive': 1})

df = pd.get_dummies(df,drop_first=True)

X = df.drop('outcome', axis=1).astype('int64')
y = df['outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

treat_train = X_train['target_control']
treat_test = X_test['target_control']

X_train.drop(['target_control'], axis=1, inplace=True)
X_test.drop(['target_control'], axis=1, inplace=True)
X_train.drop(['customer_id'], axis=1, inplace=True)
X_test.drop(['customer_id'], axis=1, inplace=True)

In [44]:
res_kuusito = np.array(['Kuusito']*3)[:,None]
res = np.array(MetaLearners(X_train.values,treat_train,y_train,X_test.values,treat_test,y_test))[:,None]
res_NN = np.array(MetaLearners_NN(X_train.values,treat_train,y_train,X_test.values,treat_test,y_test,[38,19],400,1e-4))[:,None]
res_kuusito = np.concatenate((res_kuusito,met,np.round(res,3),np.round(res_NN,3)),axis=1)

In [45]:
pd.DataFrame(res_kuusito) #400 epoch

Unnamed: 0,0,1,2,3
0,Kuusito,T,0.279,0.087
1,Kuusito,S,0.31,0.026
2,Kuusito,X,0.239,0.091


In [32]:
pd.DataFrame(res_kuusito) #300 epoch

Unnamed: 0,0,1,2,3
0,Kuusito,T,0.279,0.085
1,Kuusito,S,0.31,0.06
2,Kuusito,X,0.239,0.077


# Synthetic

In [65]:
y, X, treatment, tau, b, e = synthetic_data(mode=2, n=10000, p=8, sigma=1.0)
y = (y > np.median(y)).astype(int)
X_train, X_test, y_train, y_test, treat_train, treat_test= train_test_split(X, y, treatment, test_size=0.33, random_state=0)

In [66]:
res_syn = np.array(['Synthetic']*3)[:,None]
res = np.array(MetaLearners(X_train,treat_train,y_train,X_test,treat_test,y_test))[:,None]
res_NN = np.array(MetaLearners_NN(X_train,treat_train,y_train,X_test,treat_test,y_test,[6,4,2],5))[:,None]
res_syn = np.concatenate((res_syn,met,np.round(res,3),np.round(res_NN,3)),axis=1)

In [67]:
pd.DataFrame(res_syn)

Unnamed: 0,0,1,2,3
0,Synthetic,T,0.459,0.43
1,Synthetic,S,0.513,0.316
2,Synthetic,X,0.458,0.167


# ALL

In [68]:
res_all = np.concatenate((res_x5,res_hill,res_kuusito,res_syn),axis=0)
pd.DataFrame(res_all, columns=['Dataset','Learner','LGBMClassifier', 'MLP'])

Unnamed: 0,Dataset,Learner,LGBMClassifier,MLP
0,X5 Retail Hero,T,0.053,0.045
1,X5 Retail Hero,S,0.04,0.041
2,X5 Retail Hero,X,0.038,0.038
3,Hillstorm,T,0.061,0.069
4,Hillstorm,S,0.067,0.069
5,Hillstorm,X,0.073,0.073
6,Kuusito,T,0.279,0.087
7,Kuusito,S,0.31,0.026
8,Kuusito,X,0.239,0.091
9,Synthetic,T,0.459,0.43
