##### Importing required packages

In [1]:
import pathlib
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from datetime import date

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve

import pickle

##### Setting notebook and plotting options

In [2]:
pd.set_option("display.max_columns", None)

In [3]:
## Setting plotting styles and context
sns.set(font_scale = 1)
sns.set(rc={"figure.dpi":100})

sns.set_style('whitegrid')
sns.set_context('notebook')

sns.set_palette("deep")

In [4]:
repo_path = pathlib.Path().resolve()

os.chdir(str(repo_path))

##### Reading in the data for modeling

In [5]:
modeling_data = pd.read_csv("Data/Kmeans_Labeled_Customer_Data.csv")

In [6]:
modeling_data.shape

(2206, 30)

# Classification


### Checking correlations

In [7]:
def _color_red_or_green(val):
    color = 'red' if val < -0.6 else 'light'
    color = 'green' if val > 0.6 else color
    return 'background-color: %s' % color

modeling_data.corr().style.applymap(_color_red_or_green)

Unnamed: 0,ID,Relationship_Status,Income,Children,Kidhome,Teenhome,Enroll_Age,Wines,Fruits,Meat,Fish,Sweets,Gold,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumDealsPurchases,NumWebVisitsMonth,Age,Family_Size,CampAccepted,Education,spend_per_unit_income,labels
ID,1.0,0.019864,0.00029,-0.001268,0.001412,-0.003139,0.005361,-0.019631,0.009701,-0.003731,-0.022914,-0.004489,-0.008646,-0.019163,-0.014859,-0.035572,-0.023577,-0.006213,-0.023985,-0.017537,-0.001373,-0.011724,-0.040457,-0.00813,-0.005122,0.00944,-0.034613,-0.007692,-0.0092,-0.032583
Relationship_Status,0.019864,1.0,-0.00452,0.041079,0.024734,0.032142,-0.009134,-0.009533,-0.027238,-0.025813,-0.016564,-0.01765,-0.023951,0.010822,-0.003326,-0.019725,-0.007546,0.018878,-0.149034,0.0029,-0.010892,0.003663,0.026861,0.00293,0.003364,0.562065,7e-06,0.006291,-0.001016,-0.930589
Income,0.00029,-0.00452,1.0,-0.3423,-0.514377,0.036372,-0.026953,0.687753,0.506868,0.692107,0.520041,0.524777,0.388138,0.327117,0.104356,-0.014733,0.220382,0.393279,0.161289,0.459606,0.695795,0.6323,-0.106969,-0.649718,0.199838,-0.28574,0.36485,0.12885,0.25325,0.040467
Children,-0.001268,0.041079,-0.3423,1.0,0.687878,0.697832,-0.031119,-0.353026,-0.394706,-0.504198,-0.426711,-0.390576,-0.266566,-0.229646,-0.070402,-0.020179,-0.089197,-0.282914,-0.168059,-0.149209,-0.442225,-0.324552,0.435971,0.414305,0.095395,0.849484,-0.244694,0.049114,-0.219919,-0.061829
Kidhome,0.001412,0.024734,-0.514377,0.687878,1.0,-0.039858,-0.052533,-0.497615,-0.373551,-0.439618,-0.389087,-0.379286,-0.355109,-0.174073,-0.082251,0.01552,-0.162912,-0.203716,-0.077037,-0.372101,-0.505156,-0.502481,0.217313,0.447709,-0.237484,0.582481,-0.211261,-0.037825,-0.242802,-0.043686
Teenhome,-0.003139,0.032142,0.036372,0.697832,-0.039858,1.0,0.009001,0.005066,-0.174742,-0.260278,-0.203471,-0.163398,-0.016537,-0.144362,-0.015752,-0.043095,0.037972,-0.188441,-0.155336,0.161789,-0.110288,0.049066,0.385728,0.128542,0.36568,0.594633,-0.12838,0.104939,-0.063149,-0.042006
Enroll_Age,0.005361,-0.009134,-0.026953,-0.031119,-0.052533,0.009001,1.0,0.154695,0.054988,0.080289,0.068045,0.073471,0.14354,-0.040316,-0.00074,-0.010799,0.009475,-0.024009,0.169663,0.16794,0.085508,0.096879,0.184532,0.253238,-0.02174,-0.030583,-0.024291,-0.040702,0.106264,0.008103
Wines,-0.019631,-0.009533,0.687753,-0.353026,-0.497615,0.005066,0.154695,1.0,0.386177,0.567244,0.39855,0.39112,0.393906,0.352942,0.206882,0.061853,0.374511,0.471038,0.247344,0.55369,0.633673,0.641721,0.010713,-0.320949,0.163548,-0.297266,0.510291,0.163565,0.3796,0.061913
Fruits,0.009701,-0.027238,0.506868,-0.394706,-0.373551,-0.174742,0.054988,0.386177,1.0,0.547173,0.591841,0.571793,0.391286,0.189976,-0.009858,0.014856,0.006799,0.206662,0.12212,0.30343,0.484369,0.460115,-0.133386,-0.416226,0.014956,-0.341118,0.153798,-0.087068,0.251001,0.051516
Meat,-0.003731,-0.025813,0.692107,-0.504198,-0.439618,-0.260278,0.080289,0.567244,0.547173,1.0,0.574237,0.536829,0.359048,0.315688,0.043748,0.018711,0.092135,0.374541,0.240397,0.307204,0.73413,0.487546,-0.120077,-0.538704,0.03094,-0.431002,0.306603,0.005546,0.47124,0.064087


##### Dropping some of the correlated attributes and other unwanted attributes such as clustering labels etc.

In [8]:
# 'Kidhome', 'Teenhome', 
# 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4','AcceptedCmp5'

# c_to_drop = ['ID','CampAccepted', 'Family_Size', 'Children']
c_to_drop = ['Kidhome', 'Teenhome', 'Family_Size', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4','AcceptedCmp5','labels']
c_to_drop.extend(['NumCatalogPurchases','NumStorePurchases','Meat','NumWebVisitsMonth'])
c_temp = modeling_data.drop(c_to_drop, axis = 1)


In [9]:
c_temp.shape

(2206, 17)

In [10]:
c_temp.columns

Index(['ID', 'Relationship_Status', 'Income', 'Children', 'Enroll_Age',
       'Wines', 'Fruits', 'Fish', 'Sweets', 'Gold', 'Response',
       'NumWebPurchases', 'NumDealsPurchases', 'Age', 'CampAccepted',
       'Education', 'spend_per_unit_income'],
      dtype='object')

##### Splitting response and features from the data

In [11]:
c_X = c_temp.drop('Response', axis = 1)
c_y = c_temp['Response']



### Train and test split of data

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(c_X.values, c_y, test_size = 0.2, stratify = c_y, random_state=1995)


In [13]:
train_ids = X_train[:,0]
test_ids = X_test[:,0]

X_train =  X_train[:,1:]
X_test =  X_test[:,1:]

##### Checking if the train and test ids conform to the original dataset

In [14]:
print(sum(c_X.ID))
print(sum(train_ids) + sum(test_ids))


12317053
12317053.0


In [15]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1764, 15)
(442, 15)
(1764,)
(442,)


In [16]:
print("Training Target Class Distribution:")
print(pd.Series(y_train).value_counts().sort_values().to_string(),"\n\n")

print("Test Target Class Distribution:")
print(pd.Series(y_test).value_counts().sort_values().to_string())


Training Target Class Distribution:
1     265
0    1499 


Test Target Class Distribution:
1     66
0    376


In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1764, 15)
(442, 15)
(1764,)
(442,)


### Setting up the Grid Search for best model

##### Initializing the estimator

In [28]:
# Setting up the pipleline steps and initializing the grid search object
c_scaler = MinMaxScaler(feature_range=(-1,1))

# Initializing the estimator
# clf = LogisticRegression()
# clf = RandomForestClassifier(random_state=21)
clf = SVC(random_state=21)

steps = [('Scale', c_scaler),('Clf', clf)]
pipeline = Pipeline(steps)

##### Building the parameters dictionary for grid search

In [29]:
# Logistic Parameters for GridSearchCV

# param_grid = { 
#     'Clf__C': np.arange(0.1,1,0.1),
#     'Clf__penalty':['l2','elasticnet']
# }

# # SVM Parameters for GridSearchCV
param_grid = { 
    'Clf__C': np.arange(0.1,2,0.05),
    'Clf__kernel':['rbf', 'linear']

}

# param_grid = { 
#     'Clf__kernel': ['rbf'],
# #     'Clf__max_iter':[1000,2000,3000],
#     'Clf__gamma' : np.arange(0.1,1,0.1)    
# }


# Random Forest Parameters for GridSearchCV

# param_grid = { 
#     'Clf__n_estimators': [100, 200, 300],
#     'Clf__max_features': ['sqrt', 'log2', 'auto'],
#     'Clf__max_depth' : [None,2,4,6],
#     'Clf__criterion' :['gini','logloss'],
#     'Clf__min_samples_split':[2,3,4]
# }


##### Initializing the Grid Search Object

In [30]:
cv = GridSearchCV(pipeline, param_grid=param_grid, cv=4, scoring = 'f1')

##### Fitting the GridSearchCV object and predicting the class labels for both train and test set.

In [31]:
cv.fit(X_train, y_train)
y_tpred = cv.predict(X_train)
y_pred = cv.predict(X_test)

In [32]:
print(cv.best_estimator_.get_params)

<bound method Pipeline.get_params of Pipeline(steps=[('Scale', MinMaxScaler(feature_range=(-1, 1))),
                ('Clf', SVC(C=1.8500000000000008, random_state=21))])>


##### Evaluation metrics for the best estimator and calculating test metrics

In [33]:

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(cv.best_params_)

print("Training F1 Score: ", round(cv.score(X_train, y_train),2))
print("Testing F1 Score: ", round(cv.score(X_test, y_test),2))

print("\n")

print("Training Accuracy Score: ", round(cv.best_estimator_.score(X_train, y_train)*100,2))
print("Testing Accuracy Score: ", round(cv.best_estimator_.score(X_test, y_test)*100,2))

print("\n")

print("Training Data Confusion Matrix")
print(confusion_matrix(y_train, y_tpred))

print("\n")

print("Testing Data Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("\n")

print("Testing Data Classification Report")
print(classification_report(y_test, y_pred))

{'Clf__C': 1.8500000000000008, 'Clf__kernel': 'rbf'}
Training F1 Score:  0.46
Testing F1 Score:  0.31


Training Accuracy Score:  89.0
Testing Accuracy Score:  86.65


Training Data Confusion Matrix
[[1489   10]
 [ 184   81]]


Testing Data Confusion Matrix
[[370   6]
 [ 53  13]]


Testing Data Classification Report
              precision    recall  f1-score   support

           0       0.87      0.98      0.93       376
           1       0.68      0.20      0.31        66

    accuracy                           0.87       442
   macro avg       0.78      0.59      0.62       442
weighted avg       0.85      0.87      0.83       442



##### Concatenating all the data (features, response, predictions and IDs) and creating flags to identify the observation split (train or test)

In [34]:
temp_1 = pd.concat([
    pd.DataFrame(data = X_train, columns = c_X.columns[1:]).reset_index(drop = True),
    pd.DataFrame(data = y_train, columns = ['Response']).reset_index(drop = True),
    pd.DataFrame(data = y_tpred, columns = ['Predictions']).reset_index(drop = True),
    pd.DataFrame(data = train_ids, columns = ['ID']).reset_index(drop = True)]
    ,axis = 1)

temp_1["split"] = 'train'

temp_2 = pd.concat([
    pd.DataFrame(data = X_test, columns = c_X.columns[1:]).reset_index(drop = True),
    pd.DataFrame(data = y_test, columns = ['Response']).reset_index(drop = True),
    pd.DataFrame(data = y_pred, columns = ['Predictions']).reset_index(drop = True),
    pd.DataFrame(data = test_ids, columns = ['ID']).reset_index(drop = True)]
    ,axis = 1)   

temp_2["split"] = 'test'

In [35]:
campaign_data_w_predictions = pd.concat([temp_1,temp_2])

##### ID sum check to make sure of concatenated data sanity

In [36]:
print(campaign_data_w_predictions.ID.sum())
print(c_X.ID.sum())

12317053.0
12317053


##### Saving the modeling results (predictions to csv and the cv estimator as a pickled file)

In [37]:
campaign_data_w_predictions.to_csv("campaign_data_w_predictions.csv")

In [39]:
filename = 'Model/svc_cv_obj.sav'
pickle.dump(cv, open(filename, 'wb'))

# filename = 'Model/svc_cv_obj.sav'
# pickle.load(open(filename, 'rb'))