In [1]:
%matplotlib inline

import pandas as pd
from sklearn.model_selection import train_test_split
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel
import xgboost as xgb

In [2]:
df = pd.read_csv('HW_data.csv')
df.head(3)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0


In [3]:
df.shape

(64000, 9)

In [4]:
df = df.rename(columns={'conversion': 'target'})

In [5]:
df = df.rename(columns={'offer': 'treatment'})

In [6]:
df.head(3)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0


In [7]:
df['treatment'].value_counts()

Buy One Get One    21387
Discount           21307
No Offer           21306
Name: treatment, dtype: int64

In [8]:
df['treatment'] = df['treatment'].replace({'No Offer':0,'Discount':1,'Buy One Get One':1})

In [9]:
df['treatment'].value_counts()

1    42694
0    21306
Name: treatment, dtype: int64

In [10]:
df.dtypes

recency            int64
history          float64
used_discount      int64
used_bogo          int64
zip_code          object
is_referral        int64
channel           object
treatment          int64
target             int64
dtype: object

In [11]:
df.describe(include='object')

Unnamed: 0,zip_code,channel
count,64000,64000
unique,3,3
top,Surburban,Web
freq,28776,28217


In [12]:
object_cols = [col for col in df.columns if df[col].dtype == "object"]
for obj in object_cols:
    print('\n', obj)
    for unique in df[obj].unique():
        print("{} {}".format(unique,sum(df[obj] == unique)))


 zip_code
Surburban 28776
Rural 9563
Urban 25661

 channel
Phone 28021
Web 28217
Multichannel 7762


In [13]:
# One-Hot Encoding:
df = pd.get_dummies(df)
df.dtypes

recency                   int64
history                 float64
used_discount             int64
used_bogo                 int64
is_referral               int64
treatment                 int64
target                    int64
zip_code_Rural            uint8
zip_code_Surburban        uint8
zip_code_Urban            uint8
channel_Multichannel      uint8
channel_Phone             uint8
channel_Web               uint8
dtype: object

In [14]:
X = df.drop(['target'],axis=1)
y = df['target']
X_train, X_test,y_train, y_test  = train_test_split(X,y,test_size=0.3,random_state=42,stratify=df['treatment'])

In [19]:
result = pd.DataFrame(columns=['uplift@10%', 'uplift@20%'])

In [20]:
treat_train = X_train['treatment']
treat_val = X_test['treatment']

**Одна модель с признаком коммуникации**

In [21]:
sm = SoloModel(xgb.XGBClassifier(random_state=42))
sm = sm.fit(X_train, y_train, treat_train)

uplift_sm = sm.predict(X_test)

**uplift@10%**

In [23]:
sm_score_10 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.1)
print(f'uplift@10%: {sm_score_10:.4f}')


uplift@10%: 0.0865


**uplift@20%**

In [24]:
sm_score_20 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.2)
print(f'uplift@20%: {sm_score_20:.4f}')

uplift@20%: 0.0896


In [25]:
result.loc['SoloModel'] = [sm_score_10,sm_score_20]

In [26]:
result

Unnamed: 0,uplift@10%,uplift@20%
SoloModel,0.086504,0.08956


**Модель с трансформацией классов**

In [27]:
from sklift.models import ClassTransformation


ct = ClassTransformation(xgb.XGBClassifier(random_state=42))
ct = ct.fit(X_train, y_train, treat_train)

uplift_ct = ct.predict(X_test)

ct_score_10 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.1)


  """


In [28]:
ct_score_20 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.2)

result.loc['ClassTransformation'] = [ct_score_10,ct_score_20]

**Две независимые модели**

In [29]:
from sklift.models import TwoModels


tm = TwoModels(
    estimator_trmnt=xgb.XGBClassifier(random_state=42), 
    estimator_ctrl=xgb.XGBClassifier(random_state=42), 
    method='vanilla'
)
tm = tm.fit(
    X_train, y_train, treat_train 
)

uplift_tm = tm.predict(X_test)

tm_score_10 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.1)


In [30]:
tm_score_20 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.2)

result.loc['TwoModels'] = [tm_score_10,tm_score_20]

**Посмотрим на результаты**

In [31]:
result.sort_values(by=['uplift@10%', 'uplift@20%'], ascending=False)

Unnamed: 0,uplift@10%,uplift@20%
ClassTransformation,0.221833,0.18343
SoloModel,0.086504,0.08956
TwoModels,0.083388,0.059074
