# Imports

In [65]:
# Data Wrangling and Loading
import pandas as pd
import numpy as np
import sqlite3

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklift.models import ClassTransformation

# Evaluation
from sklift.metrics import uplift_by_percentile

# Read Data

The Megafon data is stored in a SQLite database.

In [4]:
# Load the whole processed Megafon table into a DataFrame.
# sqlite3's connection context manager only wraps a transaction
# (commit/rollback) and does NOT close the connection, so the handle is
# closed explicitly here to avoid leaking it for the lifetime of the kernel.
conn = sqlite3.connect('../data/lake.db')
try:
    query = 'SELECT * FROM megafon_processed'
    df = pd.read_sql(query, conn)
finally:
    conn.close()

In [44]:
# Column roles: binary outcome, treatment flag, and every remaining column
# as a model feature.
target = 'conversion'
treatment = 'treatment_group'
# Order features by their numeric suffix (X_1, X_2, ..., X_10, ...) rather
# than lexicographically. The suffix is taken with rsplit instead of
# str.lstrip('X_'): lstrip treats its argument as a *set of characters* to
# strip, not as a prefix, and only worked by coincidence for these names.
features = sorted(
    df.columns.difference([target, treatment]).to_list(),
    key=lambda name: int(name.rsplit('_', 1)[-1]),
)


# EDA

## Target and Treatment group Analysis

In [51]:
# Conversion x treatment contingency tables: raw counts first, then the
# same table normalised over everything, per row, and per column.
normalize_opt = [False, 'all', 'index', 'columns']
freq_tables = list(
    map(
        lambda mode: pd.crosstab(
            index=df[target],
            columns=df[treatment],
            margins=True,
            normalize=mode,
        ),
        normalize_opt,
    )
)

display(*freq_tables)

treatment_group,control,treatment,All
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,245878,231608,477486
1,53754,68760,122514
All,299632,300368,600000


treatment_group,control,treatment,All
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.409797,0.386013,0.79581
1,0.08959,0.1146,0.20419
All,0.499387,0.500613,1.0


treatment_group,control,treatment
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.514943,0.485057
1,0.438758,0.561242
All,0.499387,0.500613


treatment_group,control,treatment,All
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8206,0.771081,0.79581
1,0.1794,0.228919,0.20419


## Descriptive statistics

In [36]:
# Summary statistics for all features, shown in five column chunks so each
# rendered table stays narrow enough to read.
descriptive_stats = [
    df.loc[:, chunk].describe()
    for chunk in np.array_split(features, indices_or_sections=5)
]
display(*descriptive_stats)

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,-3.758503,0.000405,0.356208,-1.004378,3.376919,-6.396371,-2.253712,-6.432606,-0.061507,-1.659301
std,54.881882,0.999419,31.804123,45.291429,53.397644,140.873734,59.810396,74.840857,44.912292,199.625148
min,-271.659497,-4.372119,-148.870768,-244.446728,-302.574049,-683.126343,-322.731683,-506.202937,-218.466369,-920.347783
25%,-40.693313,-0.673108,-20.758308,-30.644608,-31.865404,-100.762161,-42.313674,-54.840796,-30.32733,-134.565241
50%,-3.954771,0.000915,0.372583,-0.585368,3.720738,-6.357443,-2.26369,-6.416419,-0.103742,-1.628897
75%,33.174835,0.673056,21.49553,29.02786,38.98894,88.159514,37.70978,41.962767,30.144501,130.948487
max,250.81228,5.062006,170.053291,235.095937,284.915947,656.482242,293.909622,550.52578,219.628423,1251.776972


Unnamed: 0,X_11,X_12,X_13,X_14,X_15,X_16,X_17,X_18,X_19,X_20
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,19.093492,-0.093886,-0.002533,-2.46328,13.161988,7.063375,6.858253,-0.020896,-0.985809,-1.11477
std,455.894665,11.295157,56.797326,39.922589,114.277207,77.628539,190.951808,11.47603,26.239683,72.371005
min,-2289.294108,-52.074964,-313.255431,-195.45735,-636.553079,-533.40467,-913.01161,-55.14061,-125.733156,-330.755792
25%,-286.292508,-7.613948,-38.401625,-29.205585,-62.727814,-36.893267,-121.868113,-7.752689,-18.645383,-50.050451
50%,17.651122,-0.092166,-0.011893,-2.438805,13.42678,6.988854,6.63997,-0.030727,-1.007271,-1.084433
75%,324.226048,7.409305,38.291691,24.24816,89.130548,50.993029,135.756702,7.731675,16.690977,47.694238
max,2295.788999,70.827835,261.237844,203.081747,611.409759,517.044947,937.368874,54.307199,124.52959,365.716395


Unnamed: 0,X_21,X_22,X_23,X_24,X_25,X_26,X_27,X_28,X_29,X_30
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,14.835209,-9.909231,0.00247,-0.002707,11.159296,0.024798,-0.978981,4.599121,-6.263516,0.000554
std,138.517373,160.879307,0.999777,1.000431,104.195126,28.332643,13.729246,142.981246,75.071084,1.000134
min,-770.324798,-774.266979,-4.545333,-4.708793,-579.451426,-126.515623,-64.651485,-732.116727,-399.41446,-4.553086
25%,-77.058591,-117.394644,-0.671963,-0.676781,-57.964784,-19.118055,-10.247693,-91.19092,-56.366783,-0.674265
50%,15.207025,-9.837197,0.001802,-0.003722,11.438983,0.023834,-0.946478,4.772537,-6.364402,-0.000372
75%,106.78831,97.891879,0.677217,0.671145,80.327985,19.116412,8.309784,100.845315,43.792144,0.675415
max,742.670375,791.684238,5.287804,4.544524,558.649298,128.644637,63.130701,809.863509,361.286171,4.971995


Unnamed: 0,X_31,X_32,X_33,X_34,X_35,X_36,X_37,X_38,X_39,X_40
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,2.704003,0.027546,-6.51474,-0.005655,3.002134,5.325791,7.7399,-4.721989,-9.206852,2.728692
std,51.108902,107.851987,153.130771,6.964937,47.529482,70.70051,134.675174,131.789351,197.493595,39.501936
min,-346.728398,-515.521884,-743.800243,-32.088261,-251.277376,-348.076864,-668.793144,-700.110808,-1016.301591,-179.58937
25%,-30.975631,-72.732677,-109.888686,-4.711524,-26.971012,-42.250017,-82.922581,-93.661192,-142.413854,-23.338528
50%,2.732262,0.114877,-6.661874,-0.000436,2.966662,5.500324,7.516796,-4.835168,-9.003721,2.779691
75%,36.369149,72.890914,96.255945,4.690724,32.898811,52.905711,98.388563,84.208344,124.149483,28.739188
max,313.726266,524.545647,749.276255,32.27218,254.178146,372.691295,639.969009,666.145553,851.439466,201.330919


Unnamed: 0,X_41,X_42,X_43,X_44,X_45,X_46,X_47,X_48,X_49,X_50
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,6.882868,-6.632834,8.454493,0.001296,0.007967,-0.000966,-22.2596,-5.759041,6.24113,-1.176456
std,163.776538,137.025868,262.840194,1.000368,71.553713,0.999902,500.900364,130.952113,141.211999,21.363662
min,-810.842593,-633.575178,-1345.838757,-4.75672,-360.713742,-4.516004,-2506.960013,-687.526201,-702.184241,-98.094323
25%,-102.65838,-99.033996,-167.634846,-0.67378,-48.250836,-0.675549,-357.547278,-93.163915,-88.803657,-15.580688
50%,7.1906,-6.78476,8.77328,0.001639,0.045537,-0.002251,-20.695017,-5.774627,6.286783,-1.199895
75%,116.7556,85.621324,185.38237,0.675779,48.221733,0.673638,313.295748,81.636824,101.558007,13.23041
max,867.08669,689.626208,1488.759454,4.727996,384.665348,5.086304,2534.503855,595.321844,630.727101,112.233293


# Validation Scheme

In [60]:
# Encode the treatment flag as 1 (label 'treatment') / 0 (anything else).
# The guard keeps the cell idempotent: without it, re-running the cell would
# compare the already-encoded integers with the string 'treatment' and
# silently overwrite the whole column with zeros.
if not pd.api.types.is_integer_dtype(df[treatment]):
    df[treatment] = df[treatment].eq('treatment').astype('int')

In [61]:
# Hold out 20% of the rows, stratifying jointly on outcome and treatment.
train_idx, test_idx = train_test_split(
    df.index,
    test_size=0.2,
    stratify=df[[target, treatment]],
    random_state=2022,
)

# Slice features / outcome / treatment flag for each side of the split.
X_train, y_train, treatment_train = (
    df.loc[train_idx, cols] for cols in (features, target, treatment)
)
X_test, y_test, treatment_test = (
    df.loc[test_idx, cols] for cols in (features, target, treatment)
)

# Modeling

In [62]:
# Random forest base learner wrapped in sklift's class-transformation
# uplift model.
rf_model = RandomForestClassifier(
    random_state=2022,
    max_depth=8,
    min_samples_leaf=124,
)
ct_model = ClassTransformation(estimator=rf_model)

In [63]:
# Fit the uplift model on the training split.
ct_model.fit(X=X_train, y=y_train, treatment=treatment_train)

# Evaluation

In [99]:
# Predict uplift on the hold-out split and summarise it in 10 percentile bins.
y_uplift_pred = ct_model.predict(X_test)

uplift_table = uplift_by_percentile(
    y_true=y_test,
    uplift=y_uplift_pred,
    treatment=treatment_test,
    bins=10,
    total=True,
)
uplift_table

  return asarray(a).ndim


Unnamed: 0_level_0,n_treatment,n_control,response_rate_treatment,response_rate_control,uplift
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0-10,6081,5919,0.416708,0.16219,0.254518
10-20,5960,6040,0.285906,0.163576,0.12233
20-30,5993,6007,0.282997,0.174963,0.108034
30-40,6048,5952,0.279101,0.164147,0.114954
40-50,5957,6043,0.257009,0.154063,0.102946
50-60,6042,5958,0.20854,0.150386,0.058154
60-70,5944,6056,0.180855,0.16397,0.016885
70-80,6018,5982,0.183283,0.204948,-0.021665
80-90,5999,6001,0.129188,0.23846,-0.109272
90-100,6032,5968,0.063992,0.217493,-0.153501


In [83]:
# Bin the predicted uplift into deciles and attach the bin label to the
# hold-out outcome / treatment columns.
quantiles = pd.qcut(y_uplift_pred, q=10)
y_pred_df = pd.concat([y_test, treatment_test], axis=1).assign(quantiles=quantiles)

In [92]:
# Mean conversion per predicted-uplift decile, split by treatment flag.
# Uses the `target` / `treatment` constants instead of repeating the column
# names, so this cell stays in sync with the rest of the notebook.
y_pred_df.pivot_table(index='quantiles', columns=treatment, values=target)

treatment_group,0,1
quantiles,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-0.229, -0.0121]",0.217493,0.063992
"(-0.0121, 0.0288]",0.23846,0.129188
"(0.0288, 0.0378]",0.204948,0.183283
"(0.0378, 0.0416]",0.16397,0.180855
"(0.0416, 0.046]",0.150386,0.20854
"(0.046, 0.0535]",0.154063,0.257009
"(0.0535, 0.0657]",0.164147,0.279101
"(0.0657, 0.0797]",0.174963,0.282997
"(0.0797, 0.0977]",0.163576,0.285906
"(0.0977, 0.437]",0.16219,0.416708


In [98]:
# 10% quantile of the predicted uplift, for comparison with the upper edge
# of the lowest qcut bin.
np.quantile(y_uplift_pred, q=0.1)

-0.012146129649857248