In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from dmba import classificationSummary

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt

In [2]:
# Read the hair-care promotion data set and preview its first five rows.
hairdf = pd.read_csv(filepath_or_buffer='DMBA-R-datasets/Hair-Care-Product.csv')
hairdf.head(n=5)

Unnamed: 0,Purchase,Age,Hair Color,U.S. Region,Validation,Promotion_ord,Gender_ord,Residence_ord
0,0,25,Black,Southwest,1,1,0,1
1,0,30,Black,Northwest,1,0,0,1
2,0,45,Red,Northeast,1,0,0,0
3,0,35,Blond,Southwest,0,0,0,1
4,0,33,Brown,Southwest,0,1,0,1


### a. What is the purchase propensity among those who received the promotion and among those who did not?

In [3]:
# Purchase propensity in the promoted group: the share of customers with
# Promotion_ord == 1 whose Purchase flag equals 1.
prom = hairdf[hairdf['Promotion_ord'] == 1]
purch_prom = prom[prom['Purchase'] == 1]
pruch_prop = len(purch_prom) / len(prom)
print(round(pruch_prop, 4))

0.0161


In [4]:
# Purchase propensity in the control group: the share of customers who did
# not receive the promotion (Promotion_ord == 0) whose Purchase flag equals 1.
unprom = hairdf[hairdf['Promotion_ord'] == 0]
purch_unprom = unprom[unprom['Purchase'] == 1]
pruch_prop_unprom = len(purch_unprom) / len(unprom)
print(round(pruch_prop_unprom, 4))

0.0064


In [5]:
# Treat hair color and U.S. region as categorical variables so that
# get_dummies expands them into indicator columns.
hairdf['Hair Color'] = hairdf['Hair Color'].astype('category')
hairdf['U.S. Region'] = hairdf['U.S. Region'].astype('category')

# One-hot encoding. drop_first=True omits the first level of each category,
# which then acts as the reference level.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 emits booleans
# by default, which would change the displayed table across pandas versions.
hairdf_dummies = pd.get_dummies(hairdf, prefix_sep='_', drop_first=True, dtype=int)
hairdf_dummies.head()

Unnamed: 0,Purchase,Age,Validation,Promotion_ord,Gender_ord,Residence_ord,Hair Color_Blond,Hair Color_Brown,Hair Color_Red,U.S. Region_Northwest,U.S. Region_Southeast,U.S. Region_Southwest
0,0,25,1,1,0,1,0,0,0,0,0,1
1,0,30,1,0,0,1,0,0,0,1,0,0
2,0,45,1,0,0,0,0,0,1,0,0,0
3,0,35,0,0,0,1,1,0,0,0,0,1
4,0,33,0,1,0,1,0,1,0,0,0,1


In [6]:
# b. Partition the data: 60% for training, 40% held out for validation.
# Purchase is the target; every other column of hairdf_dummies is a predictor.
# NOTE(review): the predictors therefore include 'Validation' — presumably a
# partition flag rather than a customer attribute; confirm it belongs here.
y = hairdf_dummies['Purchase']
x = hairdf_dummies.drop(columns='Purchase')
train_x, valid_x, train_y, valid_y = train_test_split(
    x, y, test_size=0.4, random_state=0
)

### b. Uplift using a Random Forest

In [7]:
# Fit a 100-tree random forest on the training partition; the fixed
# random_state keeps the fit repeatable across runs.
rf = RandomForestClassifier(random_state=1, n_estimators=100).fit(train_x, train_y)
rf

RandomForestClassifier(random_state=1)

In [10]:
# Accuracy of the random forest on the validation partition.
rf_valid_pred = rf.predict(valid_x)
accuracy_score(y_true=valid_y, y_pred=rf_valid_pred)

0.982

In [11]:
# Uplift from the random forest: score every validation record twice, once
# with the promotion flag forced on and once with it forced off, then take the
# difference of the two predicted purchase probabilities.
rf_pred_prom = rf.predict_proba(valid_x.assign(Promotion_ord=1))

# uplift_rf ends up holding the "no promotion" version of the validation set.
uplift_rf = valid_x.assign(Promotion_ord=0)
rf_pred_control = rf.predict_proba(uplift_rf)

# Column 1 of predict_proba is taken as P(Purchase = 1)
# (assumes the fitted classes are ordered [0, 1] — TODO confirm via rf.classes_).
rf_prob_treated = rf_pred_prom[:, 1]
rf_prob_control = rf_pred_control[:, 1]
rf_uplift = pd.DataFrame(
    {
        'probPromotion': rf_prob_treated,
        'probNoPromotion': rf_prob_control,
        'uplift': rf_prob_treated - rf_prob_control,
    },
    index=uplift_rf.index,
)

### b. Uplift using KNN

In [12]:
# normalization: the scaler learns each column's mean and standard deviation
# from the training partition only.
scaler = preprocessing.StandardScaler()
scaler.fit(train_x)

# Transform the full dataset.
# scaler.transform returns a bare array, so x's row labels are re-attached
# explicitly. Without index=x.index the frame gets a fresh 0..n-1 index and
# the label-based .loc lookups below line up only when x already has a
# default RangeIndex.
norm_x = pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)

# Recover the standardized training / validation partitions by row label.
train_x_norm = norm_x.loc[train_x.index]
valid_x_norm = norm_x.loc[valid_x.index]

In [13]:
# Fit one k-nearest-neighbours classifier per k in 1..14 on the standardized
# training data and record its accuracy on the validation partition.
records = []
for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_x_norm, train_y)
    valid_accuracy = accuracy_score(valid_y, knn.predict(valid_x_norm))
    records.append({'k': k, 'accuracy': valid_accuracy})

# Best-performing k values first.
results = pd.DataFrame(records)
print(results.sort_values(by='accuracy', ascending=False))

     k  accuracy
3    4   0.98875
5    6   0.98875
6    7   0.98875
7    8   0.98875
8    9   0.98875
9   10   0.98875
10  11   0.98875
11  12   0.98875
12  13   0.98875
13  14   0.98875
1    2   0.98800
4    5   0.98800
2    3   0.98725
0    1   0.97400


In [14]:
# Distinct standardized values of the promotion flag in the validation
# partition, in order of first appearance.
promo_scaled_values = valid_x_norm['Promotion_ord'].unique()
list(promo_scaled_values)

[-1.0060181084889306, 0.9940178924830985]

In [15]:
# Preview the standardized validation predictors.
valid_x_norm.head(n=5)

Unnamed: 0,Age,Validation,Promotion_ord,Gender_ord,Residence_ord,Hair Color_Blond,Hair Color_Brown,Hair Color_Red,U.S. Region_Northwest,U.S. Region_Southeast,U.S. Region_Southwest
9394,-1.24442,-0.703398,-1.006018,1.468412,0.634707,-0.78982,1.28596,-0.255248,-0.590952,-0.564519,1.72515
898,-0.813507,-0.703398,-1.006018,-0.681008,0.634707,-0.78982,1.28596,-0.255248,-0.590952,-0.564519,-0.57966
2398,-0.641142,1.421671,-1.006018,-0.681008,0.634707,-0.78982,1.28596,-0.255248,-0.590952,1.771421,-0.57966
5906,1.168695,-0.703398,-1.006018,-0.681008,0.634707,-0.78982,-0.777629,-0.255248,1.692185,-0.564519,-0.57966
2343,1.599608,-0.703398,-1.006018,-0.681008,0.634707,-0.78982,1.28596,-0.255248,-0.590952,-0.564519,1.72515


In [16]:
# Preview the same validation rows on their original (unscaled) scale.
valid_x.head(n=5)

Unnamed: 0,Age,Validation,Promotion_ord,Gender_ord,Residence_ord,Hair Color_Blond,Hair Color_Brown,Hair Color_Red,U.S. Region_Northwest,U.S. Region_Southeast,U.S. Region_Southwest
9394,26,0,0,1,1,0,1,0,0,0,1
898,31,0,0,0,1,0,1,0,0,0,0
2398,33,1,0,0,1,0,1,0,0,1,0
5906,54,0,0,0,1,0,0,0,1,0,0
2343,59,0,0,0,1,0,1,0,0,0,1


In [17]:
# k = 4
knn4 = KNeighborsClassifier(n_neighbors=4)
knn4.fit(train_x_norm, train_y)

# Standardized codes for Promotion_ord = 0 (no promotion) and 1 (promotion),
# derived from the scaler that was fitted on train_x.
# Taking them from valid_x_norm['Promotion_ord'].unique() is fragile: unique()
# returns values in order of first appearance, so which entry means "promoted"
# would depend on the first validation row, and a validation set containing
# only one of the two groups would raise IndexError.
promo_pos = train_x.columns.get_loc('Promotion_ord')
noprom_scaled, prom_scaled = [
    (raw_flag - scaler.mean_[promo_pos]) / scaler.scale_[promo_pos]
    for raw_flag in (0, 1)
]
# Kept under the original name: index 0 = no promotion, index 1 = promotion.
lst = [noprom_scaled, prom_scaled]

uplift_knn4 = valid_x_norm.copy()
# promoted: every validation record scored as if it received the promotion
uplift_knn4['Promotion_ord'] = prom_scaled
knn4_pred_prom = knn4.predict_proba(uplift_knn4)
# no promotion: the same records scored as if they did not
uplift_knn4['Promotion_ord'] = noprom_scaled
knn4_pred_noprom = knn4.predict_proba(uplift_knn4)

# uplift = P(purchase | promotion) - P(purchase | no promotion)
# Column 1 of predict_proba is taken as P(Purchase = 1)
# (assumes the fitted classes are ordered [0, 1] — TODO confirm via knn4.classes_).
knn4_uplift = pd.DataFrame({'probPromotion': knn4_pred_prom[:, 1],
                            'probNoPromotion': knn4_pred_noprom[:, 1],
                            'uplift': knn4_pred_prom[:, 1] - knn4_pred_noprom[:, 1]},
                           index=uplift_knn4.index)

### c. Report the two models' recommendations for the first three members.

In [18]:
# Three validation records with the largest random-forest uplift.
# NOTE(review): the section header asks for the "first three members"; if that
# means the first three validation records rather than the top-ranked ones,
# rf_uplift.head(3) would be the intended call — confirm.
rf_uplift_ranked = rf_uplift.sort_values(by='uplift', ascending=False)
rf_uplift_ranked.iloc[:3]

Unnamed: 0,probPromotion,probNoPromotion,uplift
4615,0.7775,0.05,0.7275
3343,0.727857,0.01,0.717857
9751,0.727857,0.01,0.717857


In [19]:
# Three validation records with the largest KNN (k = 4) uplift.
# NOTE(review): the section header asks for the "first three members"; if that
# means the first three validation records rather than the top-ranked ones,
# knn4_uplift.head(3) would be the intended call — confirm.
knn4_uplift_ranked = knn4_uplift.sort_values(by='uplift', ascending=False)
knn4_uplift_ranked.iloc[:3]

Unnamed: 0,probPromotion,probNoPromotion,uplift
3807,0.5,0.0,0.5
4615,0.5,0.0,0.5
9357,0.5,0.0,0.5
