In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import dmba
from dmba import classificationSummary
import matplotlib.pyplot as plt
import matplotlib

from PIL import Image # i was having issues with the "Interactive backend" and this was the only way to display the visual

no display found. Using non-interactive Agg backend


In [4]:
data = dmba.load_data('Hair-Care-Product.csv')
data.columns = [d.replace('.', '_').replace(' ', '_').replace('__', '_') for d in data.columns]
data.head()

Unnamed: 0,Purchase,Age,Hair_Color,U_S_Region,Validation,Promotion_ord,Gender_ord,Residence_ord
0,0,25,Black,Southwest,1,1,0,1
1,0,30,Black,Northwest,1,0,0,1
2,0,45,Red,Northeast,1,0,0,0
3,0,35,Blond,Southwest,0,0,0,1
4,0,33,Brown,Southwest,0,1,0,1


In [6]:
print("Number of purchases made", sum(data['Purchase'] * data['Promotion_ord']))

print("Total Number of Records", sum(data['Promotion_ord']))

print("80 / 4976 = ", sum(data['Purchase'] * data['Promotion_ord']) / sum(data['Promotion_ord']))

Number of purchases made 80
Total Number of Records 4976
80 / 4976 =  0.01607717041800643


In [8]:
print("Number of purchases made", sum(data['Purchase'] * (1 - data['Promotion_ord'])))

print("Total Number of Records", sum(1 - data['Promotion_ord']))

print("32 / 5024 = ", sum(data['Purchase'] * (1 - data['Promotion_ord'])) / sum(1 - data['Promotion_ord']))

Number of purchases made 32
Total Number of Records 5024
32 / 5024 =  0.006369426751592357


In [10]:
data.columns

Index(['Purchase', 'Age', 'Hair_Color', 'U_S_Region', 'Validation',
       'Promotion_ord', 'Gender_ord', 'Residence_ord'],
      dtype='object')

In [12]:
data.Hair_Color = data.Hair_Color.astype('category')
data.U_S_Region = data.U_S_Region.astype('category')
data = pd.get_dummies(data, drop_first = True)
data.head()

Unnamed: 0,Purchase,Age,Validation,Promotion_ord,Gender_ord,Residence_ord,Hair_Color_Blond,Hair_Color_Brown,Hair_Color_Red,U_S_Region_Northwest,U_S_Region_Southeast,U_S_Region_Southwest
0,0,25,1,1,0,1,False,False,False,False,False,True
1,0,30,1,0,0,1,False,False,False,True,False,False
2,0,45,1,0,0,0,False,False,True,False,False,False
3,0,35,0,0,0,1,True,False,False,False,False,True
4,0,33,0,1,0,1,False,True,False,False,False,True


In [14]:
y = data['Purchase']
x = data.drop(columns = ['Purchase'])

scaler = preprocessing.StandardScaler()
x_norm = scaler.fit_transform(x * 1.0)

data_norm = pd.concat([pd.DataFrame(x_norm, columns = data.columns[1:]),
                       data['Purchase']], axis = 1)
train, valid = train_test_split(data_norm, test_size = 0.4, random_state = 1)

In [16]:
rfModel = RandomForestClassifier(n_estimators = 100)
rfModel.fit(train.drop(columns = ['Purchase']), train.Purchase)

pred = rfModel.predict(valid.drop(columns = ['Purchase']))
classificationSummary(valid.Purchase, pred)

Confusion Matrix (Accuracy 0.9840)

       Prediction
Actual    0    1
     0 3936   12
     1   52    0


In [18]:
upliftDF = valid.drop(columns = ['Purchase']).copy()

upliftDF.Promotion_ord = 1
predTreatment = rfModel.predict_proba(upliftDF)
upliftDF.Promotion_ord = 0
predControl = rfModel.predict_proba(upliftDF)

upliftResult_knn = pd.DataFrame({
    'probMessage': predTreatment[:,1],
    'probNoMessage': predControl[:,1],
    'uplift': predTreatment[:,1] - predControl[:,1],
    }, index = upliftDF.index)

upliftResult = upliftResult_knn.sort_values(by=['uplift'], ascending=False)
upliftResult.reset_index().plot(x = None, y = 'uplift')

fig = plt.gcf()
fig.savefig('plot.jpg', format = 'jpg', dpi = 300)

image = Image.open('plot.jpg')
image.show() #it'll create a pop up of the visual

In [19]:
upliftResult.head(3)

Unnamed: 0,probMessage,probNoMessage,uplift
6751,0.723333,0.02,0.703333
201,0.723333,0.02,0.703333
4814,0.66,0.03,0.63


In [22]:
upliftResult_knn.head(3)

Unnamed: 0,probMessage,probNoMessage,uplift
9953,0.0,0.0,0.0
3850,0.033333,0.0,0.033333
4962,0.0,0.0,0.0
