# Imports

In [13]:
# Data Wrangling and Loading
import pandas as pd
import numpy as np
import sqlite3

# Common
import os
from dotenv import find_dotenv, load_dotenv
from pathlib import Path

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklift.models import SoloModel, TwoModels, ClassTransformation

# Evaluation
from sklift.metrics import uplift_by_percentile, weighted_average_uplift, uplift_auc_score, qini_auc_score 

In [3]:
# Load environment variables from the nearest .env file
load_dotenv(find_dotenv())

# Seaborn style and context
sns.set_style(
    style='whitegrid',
    rc={'axes.labelcolor': 'b', 'axes.edgecolor': 'r', 'xtick.color': 'g'},
)
sns.set_context(context='notebook')

# Data

Read the final canonical dataset for modeling, `megafon_processed`.

Split the columns into three categories:
1. Target: `conversion`
2. Treatment: `treatment_group`
3. Modeling features: `X_1` … `X_50`

In [4]:
# sqlite3's connection context manager only commits or rolls back the
# transaction; it does not close the connection, so close it explicitly.
conn = sqlite3.connect(os.environ['DATABASE_URL'])
try:
    query = 'SELECT * FROM megafon_processed'
    data = pd.read_sql(query, conn)
finally:
    conn.close()

# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appends a stray "None" line to the cell output.
data.info(verbose=False)
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Columns: 52 entries, X_1 to conversion
dtypes: float64(50), int64(2)
memory usage: 238.0 MB
None


Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15,X_16,X_17,X_18,X_19,X_20,X_21,X_22,X_23,X_24,X_25,X_26,X_27,X_28,X_29,X_30,X_31,X_32,X_33,X_34,X_35,X_36,X_37,X_38,X_39,X_40,X_41,X_42,X_43,X_44,X_45,X_46,X_47,X_48,X_49,X_50,treatment_group,conversion
0,39.396577,-0.186548,19.524505,21.250208,55.291264,182.966712,-5.385606,144.573379,-12.534344,-58.279429,283.54475,-3.297557,74.491728,11.674656,-4.852843,99.324038,159.686346,-5.83057,-4.554391,-75.704888,-4.54072,107.891557,-0.592276,0.513937,-3.415607,-7.454581,-38.519996,-116.186909,18.253466,0.775597,-66.833169,121.204278,-58.588803,0.266334,0.758562,-13.254177,200.71558,77.227063,-230.59102,-3.023398,90.877638,134.363458,-213.584582,-2.092461,-93.973258,-0.155597,-312.130733,44.798182,-125.682413,16.231365,0,0
1,38.987694,0.819522,-42.064512,-48.270949,-33.171257,179.459341,-87.15181,-162.693257,20.651652,181.635081,114.382486,10.277246,132.852421,-14.088731,82.011396,-19.29041,-155.851825,3.904401,26.194543,87.908221,115.714068,-179.564296,-1.202563,1.827663,87.042091,6.771393,12.481973,304.514135,3.618649,-0.422986,76.520982,67.402922,-407.89651,-1.269765,-34.027547,-91.572702,61.215866,-80.652713,-782.791784,-33.541388,-183.840746,72.864779,559.783584,1.142391,80.037124,-1.216185,-111.473936,-127.737977,-117.501171,10.732234,0,0
2,-16.693093,1.844558,-8.615192,-18.81874,-22.271188,-116.290369,-63.816746,-38.340763,24.968496,-136.340629,628.310139,-7.714403,-4.863992,-41.63877,187.244122,16.716381,103.805977,15.248096,-7.96361,-18.581579,247.603883,-60.794763,-0.707689,-0.293637,186.251855,4.644561,-13.456976,52.765776,-95.096983,-2.191094,41.936533,17.929746,-243.263646,7.630227,-27.235726,38.757461,168.534929,94.97924,-177.641632,-40.559792,-203.637766,2.480242,96.998504,1.100962,-33.275159,0.920926,-679.492242,-91.009397,-18.173358,14.367636,1,0
3,-72.040154,-0.226921,39.802607,16.441262,-1.112509,68.128008,23.073147,4.688858,-49.383641,-91.866107,374.208539,-5.197953,-16.972788,-20.513819,26.733826,-5.05117,266.620852,-1.003867,5.644895,101.563496,24.222458,15.445679,-0.193439,-0.537621,18.220545,-19.826745,-9.969067,-176.056848,-34.994979,-0.169379,20.194249,97.909989,41.396933,-4.572652,73.594603,59.882086,-18.703877,109.139592,-4.272029,18.375161,172.906875,83.951551,-323.642557,-0.369182,93.221948,-1.96238,-442.466684,-22.298302,-75.916603,11.634299,1,0
4,18.296973,0.996437,24.465307,-34.151971,24.623458,-155.455558,-12.159787,26.705778,105.864805,258.607252,-555.11913,14.632472,100.997894,-10.130204,52.278128,-6.223072,-91.566718,24.962987,16.902873,-125.359051,77.880711,65.76335,-0.250273,0.470302,58.583197,24.301061,7.895425,64.681136,0.208348,1.215403,-32.063615,1.182104,-15.530264,9.247841,-40.267539,27.330542,-78.737771,-79.762654,-101.12323,-30.324588,125.577535,-208.531112,118.902324,-0.808578,-117.497906,1.770635,627.395611,122.019189,194.091195,-11.883858,1,0


In [7]:
# Column roles: binary outcome, treatment flag, and everything else as features
target = 'conversion'
treatment = 'treatment_group'
not_features = [target, treatment]
features = data.columns.difference(not_features).to_list()

# Index.difference returns the labels sorted lexicographically
# (X_1, X_10, X_11, ...), so re-sort by the numeric suffix to get X_1, X_2, ...
# The suffix is taken after the last underscore; str.lstrip('X_') would strip
# a *set of characters* rather than the literal prefix.
features = sorted(features, key=lambda name: int(name.rpartition('_')[2]))

# Fails if a target/treatment column is missing from `data`
assert len(not_features + features) == len(data.columns), 'sanity check'

# Validation Schema

Use a hold-out set for the final estimate of model quality:

- 70% for training
- 30% for testing

In [8]:
# Stratify jointly on the target and the treatment flag so both splits keep
# the same target/treatment proportions.
strat_cols = [target, treatment]
train_idx, test_idx = train_test_split(
    data.index,
    test_size=0.3,
    stratify=data[strat_cols],
    random_state=2022,
)

# Training part
X_train = data.loc[train_idx, features]
y_train = data.loc[train_idx, target]
treatment_train = data.loc[train_idx, treatment]

# Hold-out part
X_test = data.loc[test_idx, features]
y_test = data.loc[test_idx, target]
treatment_test = data.loc[test_idx, treatment]

In [9]:
def print_summary(y, prefix):
    """Print the number of rows and the mean of target vector `y`.

    Parameters
    ----------
    y : object supporting ``len()`` and ``.mean()`` (e.g. a pandas Series)
        Target values of one data split.
    prefix : str
        Name of the split, used as the first word of the printed line.
    """
    n_rows = len(y)
    mean_target = y.mean()
    print(f'{prefix} dataset has {n_rows} - Avg. target {mean_target:.2f}')

# Compare size and average target of the two splits
print_summary(y=y_train, prefix='Train')
print_summary(y=y_test, prefix='Test')

Train dataset has 420000 - Avg. target 0.20
Test dataset has 180000 - Avg. target 0.20


# Modeling

In [14]:
def eval_uplift_summary(y_true, uplift_pred, treatment, label):
    """Collect three uplift quality metrics into a single labelled row.

    `y_true`, `uplift_pred` and `treatment` are passed unchanged to
    `weighted_average_uplift`, `uplift_auc_score` and `qini_auc_score`.

    Returns
    -------
    pd.Series
        Named `label`, indexed by 'weighted_average_uplift',
        'uplift_curve_auc' and 'qini_curve_auc'.
    """
    metrics = {
        'weighted_average_uplift': weighted_average_uplift(y_true, uplift_pred, treatment),
        'uplift_curve_auc': uplift_auc_score(y_true, uplift_pred, treatment),
        'qini_curve_auc': qini_auc_score(y_true, uplift_pred, treatment),
    }
    # A dict keeps insertion order, so the index order matches the metric order above
    return pd.Series(metrics, name=label)

In [23]:
# One evaluation row per model is appended here as each approach is scored
modeling_results = list()

# Solo model

Also known as the Treatment Dummy approach, the Single model approach, or the S-Learner.

Fit solo model on whole dataset with ‘treatment’ as an additional feature.

Each object from the test sample is scored twice: with the treatment flag equal to 1 and equal to 0. Subtracting the probabilities for each observation, we get the uplift.

Return delta of predictions for each example.

In [12]:
base_model = RandomForestClassifier(n_estimators=10, criterion='entropy', max_depth=10)
s_learner = SoloModel(base_model, method='dummy')

%time s_learner.fit(X_train, y_train, treatment_train)

CPU times: user 50.6 s, sys: 1.04 s, total: 51.7 s
Wall time: 1min 1s


In [24]:
# Score the hold-out set and record the metrics row for this model
uplift_pred = s_learner.predict(X_test)
row = eval_uplift_summary(y_test, uplift_pred, treatment_test, 's_learner+rf')
modeling_results.append(row)

# Show the metrics as a one-row table
display(row.to_frame().T.round(2))

Unnamed: 0,weighted_average_uplift,uplift_curve_auc,qini_curve_auc
s_learner+rf,0.05,0.1,0.14
