# Uplift Models using EconML

Project Goal: Predict the people most likely to click on a display ad given

## Import Libraries & Data

In [15]:
# General Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For saving models
import joblib

# EconML Libraries
from econml.metalearners import SLearner, TLearner, XLearner
from econml.policy import PolicyTree

# Dataset
from sklift.datasets import fetch_criteo

# Scikit-learn Libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, roc_auc_score

# Display settings
%matplotlib inline
sns.set(style="whitegrid")

# random state
rs = 481516234

In [3]:
# Load Dataset
# dataset = fetch_criteo(target_col='conversion', treatment_col='exposure')
# data, target, treatment = dataset.data, dataset.target, dataset.treatment

# alternative option
data, target, treatment = fetch_criteo(target_col='conversion', treatment_col='exposure', return_X_y_t=True)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13979592 entries, 0 to 13979591
Data columns (total 12 columns):
 #   Column  Dtype  
---  ------  -----  
 0   f0      float64
 1   f1      float64
 2   f2      float64
 3   f3      float64
 4   f4      float64
 5   f5      float64
 6   f6      float64
 7   f7      float64
 8   f8      float64
 9   f9      float64
 10  f10     float64
 11  f11     float64
dtypes: float64(12)
memory usage: 1.2 GB


In [5]:
target.info()

<class 'pandas.core.series.Series'>
RangeIndex: 13979592 entries, 0 to 13979591
Series name: conversion
Non-Null Count     Dtype
--------------     -----
13979592 non-null  Int8 
dtypes: Int8(1)
memory usage: 26.7 MB


In [7]:
target.value_counts()

conversion
0    13938818
1       40774
Name: count, dtype: Int64

In [8]:
treatment.value_counts()

exposure
0    13551380
1      428212
Name: count, dtype: Int64

## Clean Data

In [9]:
# Split into training and test sets
X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(data,target,treatment, test_size=0.3, random_state=rs)

## Exploratory Data Analysis (EDA)

## Train Models

### S-Learner

https://econml.azurewebsites.net/_autosummary/econml.metalearners.SLearner.html
- The S-Learner in EconML 

In [9]:
# S-learner
s_learner = SLearner(overall_model=RandomForestRegressor(random_state=rs))
s_learner.fit(Y_train, T_train, X=X_train)

# Predict treatment effects
s_te = s_learner.effect(X_test)

CPU times: total: 1h 58min 40s
Wall time: 2h 22s


In [18]:
# Dump out predictions
df_s_te = pd.DataFrame(s_te, columns=['s_te'])
df_s_te.to_csv('s_te.csv', index=False)

In [16]:
# Save the SLearner model to a file
joblib.dump(s_learner, 's_learner_model.pkl')

['s_learner_model.pkl']

In [10]:
# Load the SLearner model from the file
s_learner = joblib.load('s_learner_model.pkl')

# You can now use the loaded model
s_te = s_learner.effect(X_test)

### T-Learner

### X-Learner

## Evaluate Models

### Treatment Effect Estimation
- The `effect` method calculates the Conditional Average Treatment Effect (CATE) for each observation. This provides an overall understanding of the treatment effect distribution.

In [11]:
# Example: Evaluate mean predicted treatment effect
print(f"Mean Treatment Effect: {np.mean(s_te)}")

Mean Treatment Effect: 0.49414088911998494


### Causal Model Inference
- `EconML` allows you to calculate confidence intervals and p-values for treatment effect estimates. This is especially useful for determining the statistical significance of the effects.

In [12]:
# Calculate treatment effects with confidence intervals
predicted_effects, lower_bound, upper_bound = s_learner.effect_interval(X_test, alpha=0.05)

# Example: Display treatment effects with intervals
print(f"Predicted Effects: {predicted_effects[:5]}")
print(f"95% CI Lower Bound: {lower_bound[:5]}")
print(f"95% CI Upper Bound: {upper_bound[:5]}")

AttributeError: Can't call 'effect_interval' because 'inference' is None

### Policy Evaluation
- `EconML` supports policy evaluation through policy trees and other methods, which help determine how a treatment policy would perform based on predicted effects. This is particularly useful for binary treatment variables to decide who should receive the treatment.

In [14]:
# Fit a policy tree based on treatment effects
policy_tree = PolicyTree()
policy_tree.fit(X_test, T_test, Y_test)

# Visualize the policy tree
policy_tree.plot()

TypeError: PolicyTree.fit() takes 3 positional arguments but 4 were given

### Metrics for Model Evaluation

In [18]:
# AUC Score (if ground-truth treatment effect is available)
auc_score = roc_auc_score(T_test, s_te)
print(f"AUC Score: {auc_score}")

AUC Score: 0.28602605214927895


### Individual Treatment Effect (ITE) Evaluation

In [None]:
# Calculate RMSE between true and predicted treatment effects
true_effects = some_ground_truth_function(X_test)
rmse = mean_squared_error(true_effects, predicted_effects, squared=False)
print(f"RMSE: {rmse}")

### Model Diagnostics

***