# Download data and import libraries

In [5]:
%matplotlib inline 

import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

with open('df_data.pickle', 'rb') as f:
    df_data = pickle.load(f)

with open('X_train.pickle', 'rb') as f:
    X_train = pickle.load(f)

with open('y_train.pickle', 'rb') as f:
    y_train = pickle.load(f)

with open('X_test.pickle', 'rb') as f:
    X_test = pickle.load(f)

with open('y_test.pickle', 'rb') as f:
    y_test = pickle.load(f)

In [6]:
# print('X_train shape:', X_train.shape)
# print('y_train shape:', y_train.shape)

# print('X_test shape:', X_test.shape)
# print('y_test shape:', y_test.shape)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(6523, 14) (6523,)
(2035, 14) (2035,)


In [7]:
#Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train_st = scaler.fit(X_train).transform(X_train)
X_test_st = scaler.transform(X_test)

# Linear models
Since labels are very skewed towards "False" hyper-parameter "class_weight" value is "balanced".

Model selection process:
1. for each model candidate find hyper-parameters that guarantee its best performance
2. among models traind using best parameters select one that delivers best recall for "Revenue"=True (Purchase) class

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [9]:
# creating recall scoribg function for CV that focuses on just "Purchase" class
recall_score2 = make_scorer(recall_score, pos_label=1)

In [10]:
# creating a dict of linear models to choose from
model_dict = {'Logistic Regression' : LogisticRegression(),
              "Linear SVM": LinearSVC()}

In [11]:
# creating a dict of parameters for the models
params_LR = dict(penalty=['l1', 'l2', 'elasticnet'],
                 C=[0.2,0.5,1,2,5,10],
                 fit_intercept=[True, False],
                 class_weight = ['balanced']
                 )
params_SVM = dict(penalty=['l1', 'l2'],
                  loss=['hinge', 'squared_hinge'],
                  C=[0.2,0.5,1,2,5,10],
                  fit_intercept=[True, False],
                  class_weight = ['balanced']
                  )
parameters_dict = {'Logistic Regression': params_LR,
                  "Linear SVM" : params_SVM}

In [12]:
# 5-fold gridsearch through parameters maximazing Recall for Purchases
model_name, model_parameters, model_best_score = [], [], []

model_names = model_dict.keys()

for MN in model_names:
    print("="*45)
    print(MN)
    clf = model_dict[MN]
    params = parameters_dict[MN]

    gridsearch = GridSearchCV(clf,
                              params,
                              scoring=recall_score2,
                              cv=5,
                              verbose=1,
                              n_jobs=-1)
    
    best_model = gridsearch.fit(X_train, y_train)
    model_name.append(MN)
    model_parameters.append(gridsearch.best_params_)
    model_best_score.append(gridsearch.best_score_ )

Logistic Regression
Fitting 5 folds for each of 36 candidates, totalling 180 fits


120 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jayde\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jayde\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\jayde\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbf

Linear SVM
Fitting 5 folds for each of 48 candidates, totalling 240 fits


120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jayde\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jayde\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "c:\Users\jayde\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss

In [13]:
# comparing different models
model_comparison_df = pd.DataFrame([model_name, model_best_score, model_parameters]).T
model_comparison_df.columns = ['model_name', 'recall','parameters']

model_comparison_df = model_comparison_df.sort_values(by='recall', ascending=False)
clf_name = model_comparison_df['model_name'].iloc[0]
clf_params = model_comparison_df['parameters'].iloc[0]
clf_accuracy = model_comparison_df['recall'].iloc[0]

print('Best model:', clf_name)
model_comparison_df[['model_name', 'recall', 'parameters']]

Best model: Logistic Regression


Unnamed: 0,model_name,recall,parameters
0,Logistic Regression,0.678992,"{'C': 2, 'class_weight': 'balanced', 'fit_inte..."
1,Linear SVM,0.547059,"{'C': 2, 'class_weight': 'balanced', 'fit_inte..."


In [14]:
#check parameter values
model_comparison_df['parameters'].iloc[0]

{'C': 2, 'class_weight': 'balanced', 'fit_intercept': False, 'penalty': 'l2'}

Cross validation parametrs are chosen so that for any model best performance is achieved on parameter values that are not on a boundary.

In [15]:
# getting predictions from the best model
clf = model_dict[clf_name]
clf.set_params(**clf_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(clf_name)
print("Model performance:")
print(classification_report(y_test, y_pred))
print("Confusion matrix")
print(cm)

Logistic Regression
Model performance:
              precision    recall  f1-score   support

       False       0.98      0.92      0.95      1855
        True       0.51      0.84      0.63       180

    accuracy                           0.91      2035
   macro avg       0.75      0.88      0.79      2035
weighted avg       0.94      0.91      0.92      2035

Confusion matrix
[[1708  147]
 [  28  152]]


# Non-Linear models


In [16]:
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [17]:
# creating a dict of models
model_dict = {'Stochastic Gradient Descent' : SGDClassifier(),
              'Random Forest': RandomForestClassifier(),
              'Neural Network': MLPClassifier()
             }

In [18]:
# creating a dict of parameters for the models
params_SGD = dict(loss=['log'],
                  penalty=['l2','l1'],
                  alpha=[1e-6, 1e-3, 1e-1, 1e0],
                  max_iter=[5, 1000, 10000],
                  tol=[None, 1e-3],
                  random_state=[3])
params_RF = dict(bootstrap = [True, False],
                  max_depth = [10, 50, 100, None],
                  max_features = ['auto', 'sqrt'],
                  min_samples_leaf = [1, 2, 4],
                  n_estimators = [100, 500, 1000],
                  random_state=[3])
params_NN = dict(hidden_layer_sizes = [(100,),(150,), (200,), (250,),(500,),(750,), (1000,)],
                 alpha=[0.00001, 0.00005, 0.0001, 0.0005],
                 activation=['relu'],
                 solver=['adam'],
                  random_state=[3])

parameters_dict = {'Stochastic Gradient Descent' : params_SGD,
                   'Random Forest': params_RF,
                   'Neural Network': params_NN
                  }

In [19]:
# 5-fold gridsearch through parameters maximazing Recall for Purchases
model_name, model_parameters, model_best_score = [], [], []

model_names = model_dict.keys()

for MN in model_names:
    print("="*45)
    print(MN)
    clf = model_dict[MN]
    params = parameters_dict[MN]

    gridsearch = GridSearchCV(clf,
                              params,
                              scoring=recall_score2,
                              cv=5,
                              verbose=1,
                              n_jobs=-1)
    
    best_model = gridsearch.fit(X_train, y_train)
    model_name.append(MN)
    model_parameters.append(gridsearch.best_params_)
    model_best_score.append(gridsearch.best_score_ )

Stochastic Gradient Descent
Fitting 5 folds for each of 48 candidates, totalling 240 fits




Random Forest
Fitting 5 folds for each of 144 candidates, totalling 720 fits


  warn(


Neural Network
Fitting 5 folds for each of 28 candidates, totalling 140 fits


KeyboardInterrupt: 

In [None]:
# comparing different models
model_comparison_df = pd.DataFrame([model_name, model_best_score, model_parameters]).T
model_comparison_df.columns = ['model_name', 'recall','parameters']

model_comparison_df = model_comparison_df.sort_values(by='recall', ascending=False)
clf_name = model_comparison_df['model_name'].iloc[0]
clf_params = model_comparison_df['parameters'].iloc[0]
clf_accuracy = model_comparison_df['recall'].iloc[0]

print('Best model:', clf_name)
model_comparison_df[['model_name', 'recall', 'parameters']]

Best model: Neural Network


Unnamed: 0,model_name,recall,parameters
2,Neural Network,0.689076,"{'activation': 'relu', 'alpha': 5e-05, 'hidden..."
0,Stochastic Gradient Descent,0.568067,"{'alpha': 0.001, 'loss': 'log', 'max_iter': 10..."
1,Random Forest,0.490756,"{'bootstrap': True, 'max_depth': 50, 'max_feat..."


In [None]:
#check parameter values
model_comparison_df['parameters'].iloc[0]

{'activation': 'relu',
 'alpha': 5e-05,
 'hidden_layer_sizes': (500,),
 'random_state': 3,
 'solver': 'adam'}

Cross validation parametrs are chosen so that for any model best performance is achieved on parameter values that are not on a boundary.

In [None]:
# getting predictions from the best model
clf = model_dict[clf_name]
clf.set_params(**clf_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(clf_name)
print("Model performance:")
print(classification_report(y_test, y_pred))
print("Confusion matrix")
print(cm)

Neural Network
Model performance:
              precision    recall  f1-score   support

       False       0.99      0.91      0.95      1855
        True       0.50      0.93      0.65       180

    accuracy                           0.91      2035
   macro avg       0.75      0.92      0.80      2035
weighted avg       0.95      0.91      0.92      2035

Confusion matrix
[[1691  164]
 [  13  167]]


# Classification performance on data entries corresponding to Feb-March:
**Best Models:**

|         Classifier        | Accuracy |Precision| Recall | F1-score |
|---------------------------|----------|---------|--------|----------|
**Linear Classifier**
|        Logistic Regression|   0.91   |  0.51   |  0.84  |  0.63    |
**Non Linear Classiifer**
|             Neural Network|   0.91   |  0.50   |  0.93  |  0.65    |


Note: Precision, Recall and F1-score are measured for "purchase" class

