## Prepare Notebook

In [1]:
from causalml.inference.meta import(
    BaseRClassifier,
    BaseSClassifier,
    BaseTClassifier,
    BaseXClassifier,
)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier

plt.style.use("bmh")
plt.rcParams["figure.figsize"] = [10, 6]
plt.rcParams["figure.dpi"] = 100

%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = "svg"

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


## Read Data

Data set: [RetailHero.ai contest data](https://ods.ai/competitions/x5-retailhero-uplift-modeling/data)

Related notebook: [scikit-uplift RetailHero example](https://nbviewer.org/github/maks-sh/scikit-uplift/blob/master/notebooks/RetailHero_EN.ipynb)

In [2]:
import os
from pathlib import Path

# Directory holding the RetailHero CSV files. Set the RETAILHERO_DATA_PATH
# environment variable to point at your own copy of the data; when it is not
# set, the original local path is used so existing runs behave as before.
data_path = Path(
    os.environ.get(
        "RETAILHERO_DATA_PATH",
        "/Users/juanitorduz/Downloads/retailhero-uplift/data",
    )
)

# Client attributes; the two date columns are parsed into datetimes on load.
clients_df = pd.read_csv(
    data_path / "clients.csv",
    parse_dates=["first_issue_date", "first_redeem_date"],
)
# Training labels and the (unlabelled) test client list.
uplift_train_df = pd.read_csv(data_path / "uplift_train.csv")
uplift_test_df = pd.read_csv(data_path / "uplift_test.csv")

- `clients_df` data:

In [3]:
# Column dtypes, non-null counts and memory footprint of the clients table.
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400162 entries, 0 to 400161
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   client_id          400162 non-null  object        
 1   first_issue_date   400162 non-null  datetime64[ns]
 2   first_redeem_date  364693 non-null  datetime64[ns]
 3   age                400162 non-null  int64         
 4   gender             400162 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 15.3+ MB


- `uplift_train_df` data:

In [4]:
# Column dtypes, non-null counts and memory footprint of the training labels.
uplift_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200039 entries, 0 to 200038
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   client_id      200039 non-null  object
 1   treatment_flg  200039 non-null  int64 
 2   target         200039 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


- `uplift_test_df` data:

In [5]:
# Column dtypes, non-null counts and memory footprint of the test client list.
uplift_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200123 entries, 0 to 200122
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   client_id  200123 non-null  object
dtypes: object(1)
memory usage: 1.5+ MB


## EDA

In [6]:
# Every table must contain exactly one row per client_id.
client_tables = {
    "clients_df": clients_df,
    "uplift_train_df": uplift_train_df,
    "uplift_test_df": uplift_test_df,
}
unique_client_counts = {
    table_name: table["client_id"].nunique()
    for table_name, table in client_tables.items()
}
for table_name, table in client_tables.items():
    assert table.shape[0] == unique_client_counts[table_name]

# Report the number of distinct clients per table.
print(f"""
clients_id
----------
clients_df: {unique_client_counts["clients_df"]}
uplift_train_df: {unique_client_counts["uplift_train_df"]}
uplift_test_df: {unique_client_counts["uplift_test_df"]}
""")


clients_id
----------
clients_df: 400162
uplift_train_df: 200039
uplift_test_df: 200123



In [7]:
# Frequency of each gender code in the clients table.
clients_df["gender"].value_counts()

U    185706
F    147649
M     66807
Name: gender, dtype: int64

In [8]:
# Frequency of each age value.
# NOTE(review): the output includes implausible ages (negative values and
# values in the hundreds/thousands); the cells visible here do not clean them.
clients_df["age"].value_counts()

 33      9437
 36      9394
 32      9354
 31      9286
 35      9152
         ... 
 537        1
-953        1
 1841       1
-2          1
 827        1
Name: age, Length: 290, dtype: int64

In [9]:
# Count and share of clients for every (treatment_flg, target) combination.
(
    uplift_train_df
    .groupby(["treatment_flg", "target"], as_index=False)
    .agg(combination_count=("client_id", "count"))
    .assign(
        # `share` relies on `total` being assigned first within the same call.
        total=lambda counts: counts["combination_count"].sum(),
        share=lambda counts: counts["combination_count"] / counts["total"],
    )
)

Unnamed: 0,treatment_flg,target,combination_count,total,share
0,0,0,39695,200039,0.198436
1,0,1,60363,200039,0.301756
2,1,0,36342,200039,0.181675
3,1,1,63639,200039,0.318133


In [10]:
# Count and share of clients in each treatment group.
(
    uplift_train_df
    .groupby(["treatment_flg"], as_index=False)
    .agg(combination_count=("client_id", "count"))
    .assign(
        # `share` relies on `total` being assigned first within the same call.
        total=lambda counts: counts["combination_count"].sum(),
        share=lambda counts: counts["combination_count"] / counts["total"],
    )
)

Unnamed: 0,treatment_flg,combination_count,total,share
0,0,100058,200039,0.500192
1,1,99981,200039,0.499808


## Prepare Data

In [11]:
# Client-level features indexed by client_id. The two raw dates are replaced
# by the number of whole days between first issue and first redeem (NaN when
# there is no redeem date).
features_df = (
    clients_df
    .copy()
    .set_index("client_id")
    .assign(
        issue_redeem_delay_days=lambda clients: (
            clients["first_redeem_date"] - clients["first_issue_date"]
        ).dt.days
    )
    .drop(columns=["first_issue_date", "first_redeem_date"])
)

# Attach treatment flag and target; the inner join keeps only clients that
# appear in the training labels.
train_labels_df = uplift_train_df.copy().set_index("client_id")
data_df = features_df.merge(
    train_labels_df,
    left_index=True,
    right_index=True,
    how="inner",
)

In [12]:
from sklearn.model_selection import train_test_split

target_col = "target"
treatment_col = "treatment_flg"

# Separate features (x), treatment assignment (w) and outcome (y).
x = data_df.drop(columns=[treatment_col, target_col])
w = data_df[treatment_col]
y = data_df[target_col]

# Split the client ids once (70% train / 30% validation, fixed seed) and reuse
# the same ids for x, w and y so the three stay aligned.
idx_train, idx_val = train_test_split(
    data_df.index,
    test_size=0.3,
    random_state=123,
)

x_train, x_val = x.loc[idx_train], x.loc[idx_val]
w_train, w_val = w.loc[idx_train], w.loc[idx_val]
y_train, y_val = y.loc[idx_train], y.loc[idx_val]

In [13]:
# Preprocessing, fitted on the training split only and then applied to both splits:
#   - missing `issue_redeem_delay_days` values are replaced by 0,
#   - `gender` categories are mapped to ordinal codes,
#   - every remaining column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "issue_redeem_delay_days_imputer",
            # strategy="constant" is required for fill_value to take effect.
            # With the default strategy ("mean") fill_value is ignored and the
            # missing delays would be filled with the column mean instead of 0.
            SimpleImputer(strategy="constant", fill_value=0),
            ["issue_redeem_delay_days"],
        ),
        ("gender_ordinal_encoder", OrdinalEncoder(), ["gender"]),
    ],
    remainder="passthrough",
)

preprocessor.fit(x_train)
x_train_transformed = preprocessor.transform(x_train)
x_val_transformed = preprocessor.transform(x_val)

## Models

### S-Learner

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier

# Base classifier handed to the meta-learner; `xgb` is reused by a later cell.
xgb = XGBClassifier(use_label_encoder=False)

# S-learner wrapping the XGBoost classifier.
learner_s = BaseSClassifier(learner=xgb)

# Average treatment effect estimated on the transformed training split.
s_ate = learner_s.estimate_ate(
    X=x_train_transformed,
    treatment=w_train,
    y=y_train,
)



### T-Learner

In [15]:
# T-learner built on the XGBoost classifier `xgb` defined in the S-learner cell.
learner_t = BaseTClassifier(learner=xgb)

# `estimate_ate` returns (ate, lower_bound, upper_bound) in that order, so the
# point estimate must be unpacked first; unpacking the lower bound first would
# swap the point estimate and the lower bound.
t_ate, t_ate_lwr, t_ate_upr = learner_t.estimate_ate(
    X=x_train_transformed,
    treatment=w_train,
    y=y_train,
)



### X-Learner

In [16]:
from sklearn.linear_model import LinearRegression

# X-learner: XGBoost classifier for the outcome models and a linear regression
# for the treatment-effect models.
learner_x = BaseXClassifier(
    outcome_learner=XGBClassifier(),
    effect_learner=LinearRegression(),
)

# `estimate_ate` returns (ate, lower_bound, upper_bound) in that order, so the
# point estimate must be unpacked first; unpacking the lower bound first would
# swap the point estimate and the lower bound.
x_ate, x_ate_lwr, x_ate_upr = learner_x.estimate_ate(
    X=x_train_transformed,
    y=y_train,
    treatment=w_train,
    # Constant propensity score of 0.5 for every training client.
    p=pd.Series(0.5, index=y_train.index),
)



