### Summary

In [2]:
import pandas as pd

In [3]:
coupon = pd.read_excel('http://byungwan.com/class/Coupon_Sample.xlsx')
coupon.head()

Unnamed: 0,CID,Gender,Age,CClass,Discount,Visits,Email,SMS,MStatus,Purchases,Sales
0,1,0,41,2,0,1,0,1,1,1,0.92
1,2,0,41,1,0,0,0,0,0,0,0.0
2,3,0,30,2,0,3,0,0,0,1,1.54
3,4,0,28,2,0,2,0,0,0,2,0.88
4,5,1,24,2,1,5,0,0,0,2,0.53


In [4]:
coupon['Discount'].value_counts()

Discount
0    377
1    122
Name: count, dtype: int64

### 1.1 Preprocessing

#### 1.1. Balancing

In [5]:
from sklearn.utils import resample

In [6]:
coupon0 = coupon[coupon['Discount']==0]

In [7]:
coupon1 = coupon[coupon['Discount']==1]

In [9]:
coupon1_upsample = resample(coupon1, replace=True, n_samples = 377, random_state=0)

In [10]:
coupon_upsample = pd.concat([coupon0, coupon1_upsample], axis=0)

In [11]:
coupon_upsample.head()

Unnamed: 0,CID,Gender,Age,CClass,Discount,Visits,Email,SMS,MStatus,Purchases,Sales
0,1,0,41,2,0,1,0,1,1,1,0.92
1,2,0,41,1,0,0,0,0,0,0,0.0
2,3,0,30,2,0,3,0,0,0,1,1.54
3,4,0,28,2,0,2,0,0,0,2,0.88
6,7,0,30,2,0,1,0,0,0,1,0.58


In [15]:
cols = coupon_upsample.columns.tolist()
print(cols)

['CID', 'Gender', 'Age', 'CClass', 'Discount', 'Visits', 'Email', 'SMS', 'MStatus', 'Purchases', 'Sales']


In [16]:
cols[1:4]

['Gender', 'Age', 'CClass']

In [17]:
cols1 = [cols[4]] + cols[1:4] + cols[5:]
print(cols1)

['Discount', 'Gender', 'Age', 'CClass', 'Visits', 'Email', 'SMS', 'MStatus', 'Purchases', 'Sales']


In [18]:
coupon_upsample[cols1]

Unnamed: 0,Discount,Gender,Age,CClass,Visits,Email,SMS,MStatus,Purchases,Sales
0,0,0,41,2,1,0,1,1,1,0.920
1,0,0,41,1,0,0,0,0,0,0.000
2,0,0,30,2,3,0,0,0,1,1.540
3,0,0,28,2,2,0,0,0,2,0.880
6,0,0,30,2,1,0,0,0,1,0.580
...,...,...,...,...,...,...,...,...,...,...
347,1,0,26,2,1,0,0,0,1,0.145
168,1,0,45,2,10,0,0,1,3,1.460
168,1,0,45,2,10,0,0,1,3,1.460
307,1,0,26,2,6,0,0,0,1,0.275


In [19]:
coupon2 = coupon_upsample[cols1]
coupon2.head()

Unnamed: 0,Discount,Gender,Age,CClass,Visits,Email,SMS,MStatus,Purchases,Sales
0,0,0,41,2,1,0,1,1,1,0.92
1,0,0,41,1,0,0,0,0,0,0.0
2,0,0,30,2,3,0,0,0,1,1.54
3,0,0,28,2,2,0,0,0,2,0.88
6,0,0,30,2,1,0,0,0,1,0.58


In [21]:
y = coupon2.iloc[:, 0]
x = coupon2.iloc[:, 1:]

### 1.3 Scaling

In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler_model = scaler.fit(x)
scaled_x = scaler_model.transform(x)

### 2. Models

In [25]:
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [26]:
models = []

In [27]:
lr = LogisticRegression()
models.append(("LR", lr))

In [28]:
models

[('LR', LogisticRegression())]

In [29]:
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5)
# gini index, ID3 difference is minimal, not much difference in performance
# more important thing is max_depth -> we have to decide

In [31]:
models.append(("DT", dt))

In [32]:
models

[('LR', LogisticRegression()),
 ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=5))]

In [33]:
rf = RandomForestClassifier(n_estimators=5, random_state=0)
# n_estimators: number of trees
# we have to do some simulation to decide
models.append(("RF", rf))

In [40]:
ann = MLPClassifier(activation='relu', hidden_layer_sizes=(5), max_iter=3000, random_state=0)
models.append(("ANN", ann))
# guideline for hidden_layer_size: (number of features + number of classes) / 2 --> 확인 필요함

In [41]:
svm = SVC(kernel='rbf', gamma=1, C=1.0, random_state=0)
models.append(("SVM", svm))

In [42]:
nb = GaussianNB()
models.append(("NB", nb))

In [43]:
knn = KNeighborsClassifier(n_neighbors=5)
models.append(("KNN", knn))

In [44]:
models

[('LR', LogisticRegression()),
 ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=5)),
 ('RF', RandomForestClassifier(n_estimators=5, random_state=0)),
 ('ANN', MLPClassifier(hidden_layer_sizes=5, max_iter=3000, random_state=0)),
 ('SVM', SVC(gamma=1, random_state=0)),
 ('NB', GaussianNB()),
 ('KNN', KNeighborsClassifier())]

In [46]:
for name, model in models:
    scores = cross_val_score(model, scaled_x, y, cv=5)
    mean = np.mean(scores)
    std = np.std(scores)
    print((name + ": " + str(mean) + " ( " + str(std) + ")"))

LR: 0.7466666666666667 ( 0.02678918683831544)
DT: 0.897916114790287 ( 0.017410788450942637)
RF: 0.9456335540838852 ( 0.02023712380670089)
ANN: 0.7506843267108168 ( 0.05298263423085062)
SVM: 0.8700485651214128 ( 0.012780236095219557)
NB: 0.6631611479028697 ( 0.02624008019414461)
KNN: 0.7917439293598234 ( 0.01985066734274962)


### 3. Final Model

In [35]:
RF = RandomForestClassifier(n_estimators=5, random_state=0)

In [36]:
RF_model = RF.fit(scaled_x, y)

In [47]:
coupon_test = pd.read_excel("http://byungwan.com/class/Coupon_Test.xlsx")

In [48]:
coupon_test.head()

Unnamed: 0,CID,Gender,Age,CClass,Discount,Visits,Email,SMS,MStatus,Purchases,Sales
0,1,1,34,2,0,1,1,1,0,1,0.51
1,2,1,23,2,0,12,0,0,0,4,1.72
2,3,1,20,1,0,1,0,0,0,0,0.0
3,4,0,17,1,0,3,0,0,0,0,0.0
4,5,0,16,2,0,3,0,0,1,1,0.37


In [50]:
coupon_test[cols1].head()

Unnamed: 0,Discount,Gender,Age,CClass,Visits,Email,SMS,MStatus,Purchases,Sales
0,0,1,34,2,1,1,1,0,1,0.51
1,0,1,23,2,12,0,0,0,4,1.72
2,0,1,20,1,1,0,0,0,0,0.0
3,0,0,17,1,3,0,0,0,0,0.0
4,0,0,16,2,3,0,0,1,1,0.37


In [51]:
coupon_test2 = coupon_test[cols1]
coupon_test2.head()

Unnamed: 0,Discount,Gender,Age,CClass,Visits,Email,SMS,MStatus,Purchases,Sales
0,0,1,34,2,1,1,1,0,1,0.51
1,0,1,23,2,12,0,0,0,4,1.72
2,0,1,20,1,1,0,0,0,0,0.0
3,0,0,17,1,3,0,0,0,0,0.0
4,0,0,16,2,3,0,0,1,1,0.37


In [52]:
y_test = coupon_test2.iloc[:, 0]
x_test = coupon_test2.iloc[:, 1:]

In [53]:
scaled_x_test = scaler_model.transform(x_test)

In [54]:
y_pred = RF_model.predict(scaled_x_test)

In [56]:
from sklearn import metrics

In [57]:
metrics.accuracy_score(y_test, y_pred)

0.725

In [60]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0])

In [58]:
coupon_test["Predicted_Discount"] = y_pred

In [59]:
coupon_test.head()

Unnamed: 0,CID,Gender,Age,CClass,Discount,Visits,Email,SMS,MStatus,Purchases,Sales,Predicted_Discount
0,1,1,34,2,0,1,1,1,0,1,0.51,0
1,2,1,23,2,0,12,0,0,0,4,1.72,0
2,3,1,20,1,0,1,0,0,0,0,0.0,0
3,4,0,17,1,0,3,0,0,0,0,0.0,0
4,5,0,16,2,0,3,0,0,1,1,0.37,0
