In [1]:
import pandas as pd
from sklearn.metrics import classification_report,confusion_matrix
from datetime import datetime, timedelta,date
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
from sklearn.cluster import KMeans
%matplotlib inline

In [4]:
import sklearn
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import warnings
warnings.filterwarnings("ignore")

In [5]:
df = pd.read_csv('/content/data.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   recency        64000 non-null  int64  
 1   history        64000 non-null  float64
 2   used_discount  64000 non-null  int64  
 3   used_bogo      64000 non-null  int64  
 4   zip_code       64000 non-null  object 
 5   is_referral    64000 non-null  int64  
 6   channel        64000 non-null  object 
 7   offer          64000 non-null  object 
 8   conversion     64000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 4.4+ MB


In [7]:
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0
3,9,675.83,1,0,Rural,1,Web,Discount,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0


In [8]:
df['history'].value_counts(normalize=True)

29.99      0.124172
53.79      0.000141
81.20      0.000141
33.18      0.000125
44.14      0.000125
             ...   
433.13     0.000016
196.43     0.000016
1108.21    0.000016
1025.31    0.000016
552.94     0.000016
Name: history, Length: 34833, dtype: float64

In [9]:
df['conversion'].value_counts()

0    54606
1     9394
Name: conversion, dtype: int64

In [11]:
df.groupby('offer')['conversion'].value_counts()

offer            conversion
Buy One Get One  0             18149
                 1              3238
Discount         0             17413
                 1              3894
No Offer         0             19044
                 1              2262
Name: conversion, dtype: int64

In [12]:
df[df['offer']=='No Offer']['conversion'].mean()

0.10616727682343002

In [15]:
len(df[df.offer == 'Discount'])

21307

In [19]:
len(df[df.offer == 'Buy One Get One']['conversion'])

21387

In [16]:
def calc_uplift(df):
    #assigning 25$ to the average order value
    avg_order_value = 25
    
    #calculate conversions for each offer type
    base_conv = df[df.offer == 'No Offer']['conversion'].mean()
    disc_conv = df[df.offer == 'Discount']['conversion'].mean()
    bogo_conv = df[df.offer == 'Buy One Get One']['conversion'].mean()
    
    #calculate conversion uplift for discount and bogo
    disc_conv_uplift = disc_conv - base_conv
    bogo_conv_uplift = bogo_conv - base_conv
    
    #calculate order uplift
    disc_order_uplift = disc_conv_uplift * len(df[df.offer == 'Discount']['conversion'])
    bogo_order_uplift = bogo_conv_uplift * len(df[df.offer == 'Buy One Get One']['conversion'])
    
    #calculate revenue uplift
    disc_rev_uplift = disc_order_uplift * avg_order_value
    bogo_rev_uplift = bogo_order_uplift * avg_order_value
    
    
    print('Discount Conversion Uplift: {0}%'.format(np.round(disc_conv_uplift*100,2)))
    print('Discount Order Uplift: {0}'.format(np.round(disc_order_uplift,2)))
    print('Discount Revenue Uplift: ${0}\n'.format(np.round(disc_rev_uplift,2)))
          
    print('-------------- \n')

    print('BOGO Conversion Uplift: {0}%'.format(np.round(bogo_conv_uplift*100,2)))
    print('BOGO Order Uplift: {0}'.format(np.round(bogo_order_uplift,2)))
    print('BOGO Revenue Uplift: ${0}'.format(np.round(bogo_rev_uplift,2)))

In [17]:
calc_uplift(df)

Discount Conversion Uplift: 7.66%
Discount Order Uplift: 1631.89
Discount Revenue Uplift: $40797.35

-------------- 

BOGO Conversion Uplift: 4.52%
BOGO Order Uplift: 967.4
BOGO Revenue Uplift: $24185.01


In [20]:
df['campaign_group'] = 'treatment'
df.loc[df.offer == 'No Offer', 'campaign_group'] = 'control'

In [21]:
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion,campaign_group
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0,treatment
1,6,329.08,1,1,Rural,1,Web,No Offer,0,control
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0,treatment
3,9,675.83,1,0,Rural,1,Web,Discount,0,treatment
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0,treatment


In [23]:
df['target_class'] = 0 #CN
df.loc[(df.campaign_group == 'control') & (df.conversion > 0),'target_class'] = 1 #CR
df.loc[(df.campaign_group == 'treatment') & (df.conversion == 0),'target_class'] = 2 #TN
df.loc[(df.campaign_group == 'treatment') & (df.conversion > 0),'target_class'] = 3 #TR

In [24]:
df.target_class.value_counts()

2    35562
0    19044
3     7132
1     2262
Name: target_class, dtype: int64

In [25]:
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final

In [26]:
kmeans = KMeans(n_clusters=5)
kmeans.fit(df[['history']])
df['history_cluster'] = kmeans.predict(df[['history']])
df['history_cluster']

0        1
1        0
2        0
3        2
4        1
        ..
63995    1
63996    1
63997    1
63998    4
63999    4
Name: history_cluster, Length: 64000, dtype: int32

In [27]:
df['history_cluster'].value_counts()

1    32399
0    17904
4     9038
2     3738
3      921
Name: history_cluster, dtype: int64

In [33]:
df_data = order_cluster('history_cluster', 'history',df,True)

In [28]:
#df_new = df.groupby('history_cluster')['history'].mean().reset_index()
#df_new

Unnamed: 0,history_cluster,history
0,0,247.469951
1,1,74.231862
2,2,810.557515
3,3,1409.772009
4,4,479.030523


In [31]:
# df_new = df_new.sort_values(by='history',ascending=True)
# df_new

Unnamed: 0,history_cluster,history
0,1,74.231862
1,0,247.469951
2,4,479.030523
3,2,810.557515
4,3,1409.772009


In [32]:
#df_new['index'] = df_new.index
#df_new

Unnamed: 0,history_cluster,history,index
0,1,74.231862,0
1,0,247.469951,1
2,4,479.030523,2
3,2,810.557515,3
4,3,1409.772009,4


In [34]:
df_data

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion,campaign_group,target_class,history_cluster
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0,treatment,2,0
1,2,45.34,1,0,Urban,0,Web,Buy One Get One,0,treatment,2,0
2,6,134.83,0,1,Surburban,0,Phone,Buy One Get One,1,treatment,3,0
3,9,46.42,0,1,Urban,0,Phone,Buy One Get One,0,treatment,2,0
4,10,32.84,0,1,Urban,1,Web,Buy One Get One,0,treatment,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
63995,2,1399.97,1,1,Surburban,1,Phone,No Offer,1,control,1,4
63996,1,1584.31,1,0,Surburban,1,Multichannel,Discount,0,treatment,2,4
63997,1,1252.19,1,0,Urban,1,Phone,Buy One Get One,1,treatment,3,4
63998,2,1183.05,1,1,Rural,1,Phone,No Offer,0,control,0,4


In [35]:
df_model = df_data.drop(['offer','campaign_group','conversion'],axis=1)
df_model

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,target_class,history_cluster
0,10,142.44,1,0,Surburban,0,Phone,2,0
1,2,45.34,1,0,Urban,0,Web,2,0
2,6,134.83,0,1,Surburban,0,Phone,3,0
3,9,46.42,0,1,Urban,0,Phone,2,0
4,10,32.84,0,1,Urban,1,Web,2,0
...,...,...,...,...,...,...,...,...,...
63995,2,1399.97,1,1,Surburban,1,Phone,1,4
63996,1,1584.31,1,0,Surburban,1,Multichannel,2,4
63997,1,1252.19,1,0,Urban,1,Phone,3,4
63998,2,1183.05,1,1,Rural,1,Phone,0,4


In [36]:
df_model = pd.get_dummies(df_model)
df_model

Unnamed: 0,recency,history,used_discount,used_bogo,is_referral,target_class,history_cluster,zip_code_Rural,zip_code_Surburban,zip_code_Urban,channel_Multichannel,channel_Phone,channel_Web
0,10,142.44,1,0,0,2,0,0,1,0,0,1,0
1,2,45.34,1,0,0,2,0,0,0,1,0,0,1
2,6,134.83,0,1,0,3,0,0,1,0,0,1,0
3,9,46.42,0,1,0,2,0,0,0,1,0,1,0
4,10,32.84,0,1,1,2,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63995,2,1399.97,1,1,1,1,4,0,1,0,0,1,0
63996,1,1584.31,1,0,1,2,4,0,1,0,1,0,0
63997,1,1252.19,1,0,1,3,4,0,0,1,0,1,0
63998,2,1183.05,1,1,1,0,4,1,0,0,0,1,0


In [37]:
#create feature set and labels
X = df_model.drop(['target_class'],axis=1)
y = df_model.target_class

In [38]:
X.columns

Index(['recency', 'history', 'used_discount', 'used_bogo', 'is_referral',
       'history_cluster', 'zip_code_Rural', 'zip_code_Surburban',
       'zip_code_Urban', 'channel_Multichannel', 'channel_Phone',
       'channel_Web'],
      dtype='object')

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [41]:
from xgboost import XGBClassifier

In [42]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
class_probs = xgb_model.predict_proba(X_test)
class_probs

array([[0.3169099 , 0.02089888, 0.6017264 , 0.06046481],
       [0.30947167, 0.02082642, 0.58143246, 0.08826943],
       [0.31923392, 0.01209301, 0.59602344, 0.07264965],
       ...,
       [0.30159032, 0.01800183, 0.60367113, 0.07673673],
       [0.30539408, 0.02549697, 0.59427786, 0.07483111],
       [0.26355934, 0.04965046, 0.4195396 , 0.26725063]], dtype=float32)

In [43]:
xgb_model.classes_

array([0, 1, 2, 3])

In [48]:
pd.DataFrame(xgb_model.predict_proba(X_test), columns=xgb_model.classes_)

Unnamed: 0,0,1,2,3
0,0.316910,0.020899,0.601726,0.060465
1,0.309472,0.020826,0.581432,0.088269
2,0.319234,0.012093,0.596023,0.072650
3,0.259263,0.058820,0.442380,0.239536
4,0.299989,0.025295,0.598412,0.076303
...,...,...,...,...
12795,0.294134,0.034117,0.559721,0.112028
12796,0.293092,0.034625,0.583875,0.088408
12797,0.301590,0.018002,0.603671,0.076737
12798,0.305394,0.025497,0.594278,0.074831


In [49]:
X_test['proba_CN'] = class_probs[:,0] 
X_test['proba_CR'] = class_probs[:,1] 
X_test['proba_TN'] = class_probs[:,2] 
X_test['proba_TR'] = class_probs[:,3] 

In [50]:
X_test['uplift_score'] = X_test.eval('proba_CN + proba_TR - proba_TN - proba_CR')

In [52]:
overall_proba = xgb_model.predict_proba(df_model.drop(['target_class'],axis=1))
df_model['proba_CN'] = overall_proba[:,0] 
df_model['proba_CR'] = overall_proba[:,1] 
df_model['proba_TN'] = overall_proba[:,2] 
df_model['proba_TR'] = overall_proba[:,3] 

In [53]:
df_model['uplift_score'] = df_model.eval('proba_CN + proba_TR - proba_TN - proba_CR')

In [54]:
df_data['uplift_score'] = df_model['uplift_score']

In [55]:
df_data.groupby('offer').uplift_score.mean()

offer
Buy One Get One   -0.182761
Discount          -0.182034
No Offer          -0.178188
Name: uplift_score, dtype: float32

In [57]:
df_data_lift = df_data.copy()
uplift_q_75 = df_data_lift.uplift_score.quantile(0.75)
df_data_lift = df_data_lift[(df_data_lift.offer != 'Buy One Get One') & (df_data_lift.uplift_score > uplift_q_75)]
len(df_data_lift)

10861

In [58]:
calc_uplift(df_data_lift)

Discount Conversion Uplift: 12.41%
Discount Order Uplift: 648.46
Discount Revenue Uplift: $16211.62

-------------- 

BOGO Conversion Uplift: nan%
BOGO Order Uplift: nan
BOGO Revenue Uplift: $nan


In [59]:
df_data_lift = df_data.copy()
uplift_q_5 = df_data_lift.uplift_score.quantile(0.5)
df_data_lift = df_data_lift[(df_data_lift.offer != 'Buy One Get One') & (df_data_lift.uplift_score < uplift_q_5)].reset_index()

In [60]:
calc_uplift(df_data_lift)

Discount Conversion Uplift: 5.83%
Discount Order Uplift: 627.78
Discount Revenue Uplift: $15694.62

-------------- 

BOGO Conversion Uplift: nan%
BOGO Order Uplift: nan
BOGO Revenue Uplift: $nan
