In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score ,roc_curve, auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
# Load the pre-split feature and target tables from the data/ directory
X_train, X_test = pd.read_csv('data/X_train.csv'), pd.read_csv('data/X_test.csv')
y_train, y_test = pd.read_csv('data/y_train.csv'), pd.read_csv('data/y_test.csv')

In [3]:
# Count the training rows in each target class to gauge class imbalance
y_train.loc[:, 'repurchase_again'].value_counts()

1    3798
0    1231
Name: repurchase_again, dtype: int64

In [4]:
# Random seed passed as random_state so shuffled splits are reproducible
seed: int = 123

# Logistic Regression

In [None]:
# Baseline logistic regression fitted on the full training set.
# y_train is loaded with pd.read_csv and is therefore a DataFrame; pass the
# target column as a 1-D Series, which is the shape scikit-learn expects for y.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train['repurchase_again'])

In [None]:
# Stratified 5-fold cross-validation of logistic regression on the training set.
# Each fold's model also scores X_test; those probabilities are summed in
# pred_test_full and averaged into pred_test_avg after the loop.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
y_train_labels = y_train['repurchase_again']  # 1-D target for split() and fit()
pred_test_full = 0  # running sum of positive-class probabilities on X_test
cv_score = []
for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train_labels), start=1):
    print('{} of KFold {}'.format(i, kf.n_splits))
    # split() yields positional indices, so rows must be selected with .iloc
    # (.loc would look the numbers up as index labels instead).
    xtr, xvl = X_train.iloc[train_index], X_train.iloc[test_index]
    ytr, yvl = y_train_labels.iloc[train_index], y_train_labels.iloc[test_index]

    # model
    lr = LogisticRegression()
    lr.fit(xtr, ytr)
    # ROC AUC is a ranking metric: score it on positive-class probabilities,
    # not on hard 0/1 predictions.
    score = roc_auc_score(yvl, lr.predict_proba(xvl)[:, 1])
    print('ROC AUC score:', score)
    cv_score.append(score)
    pred_test = lr.predict_proba(X_test)[:, 1]
    pred_test_full += pred_test

# Average the summed test-set probabilities over the folds
pred_test_avg = pred_test_full / kf.n_splits
print('Mean CV ROC AUC:', np.mean(cv_score))

# SVM

In [5]:
# Tuning hyperparameters
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

# Initialise GridSearch for SVC.
# probability=True is required for the predict_proba call below; SVC does not
# expose predict_proba otherwise. random_state makes the probability
# calibration reproducible.
# NOTE(review): this is 48 candidates x 5 folds and the captured run was
# interrupted; consider trimming the grid if runtime is a problem.
grid = GridSearchCV(SVC(probability=True, random_state=seed), param_grid, refit=True, verbose=2, n_jobs = -1, cv = 5)

# Fit GridSearchCV Model (target passed as a 1-D Series rather than a DataFrame)
svm = grid.fit(X_train, y_train['repurchase_again'])

# Optimal parameters
print(svm.best_estimator_)

# Model Prediction on the test set: hard labels and positive-class probabilities
y_pred_svm = svm.predict(X_test)
y_pred_svm_proba = svm.predict_proba(X_test)[:, 1]

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-2e20a752448a>", line 8, in <module>
    svm = grid.fit(X_train,y_train)
  File "/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 710, in fit
    self._run_search(evaluate_candidates)
  File "/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 1151, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 689, in evaluate_candidates
    cv.split(X, y, groups)))
  File "/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1017, in __call__
    self.retrieve()
  File "/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 909, in retrieve
    self._output.extend(job.g

KeyboardInterrupt: 

In [None]:
# Evaluation of the tuned SVM.
# y_pred_svm / y_pred_svm_proba were computed from X_test, so they must be
# scored against y_test (scoring against y_train is a length mismatch).
# assumes y_test holds a single target column — TODO confirm
y_test_true = np.ravel(y_test)

print('Accuracy:', accuracy_score(y_test_true, y_pred_svm))
print('Precision:', precision_score(y_test_true, y_pred_svm))
print('Recall:', recall_score(y_test_true, y_pred_svm))
print('ROC_AUC:', roc_auc_score(y_test_true, y_pred_svm_proba))

# Compute the ROC curve once and derive the area under it from the same points
fpr, tpr, thresholds = roc_curve(y_test_true, y_pred_svm_proba)
print('ROC Curve:', (fpr, tpr, thresholds))
print('AUC:', auc(fpr, tpr))

print('\nConfusion Matrix:')
print(confusion_matrix(y_test_true, y_pred_svm))

print('\nClassification Report:')
print(classification_report(y_test_true, y_pred_svm))

# Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier and predict class labels for the
# training and test feature matrices.
# NOTE(review): X_train_dtm_c / X_test_dtm_c are not defined in the visible
# part of this notebook — confirm where these matrices are built.
nb = MultinomialNB().fit(X_train_dtm_c, y_train)
y_pred_class_train_c, y_pred_class_test_c = (
    nb.predict(X_train_dtm_c),
    nb.predict(X_test_dtm_c),
)

In [None]:
def _print_classification_metrics(title, y_true, y_pred):
    """Print accuracy, precision, recall, ROC AUC and the confusion matrix.

    Parameters
    ----------
    title : str
        Header line printed before the metrics.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    """
    print(title)
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Precision:', precision_score(y_true, y_pred))
    print('Recall:', recall_score(y_true, y_pred))
    # Computed from hard class labels, so this reflects a single operating
    # point rather than a ranking over predicted probabilities.
    print('ROC_AUC:', roc_auc_score(y_true, y_pred))

    print('\nConfusion Matrix:')
    print(confusion_matrix(y_true, y_pred))


# Uses the metric functions imported at the top of the notebook; the original
# `metrics.` prefix referred to a module that was never imported.
_print_classification_metrics('--- Train ---', y_train, y_pred_class_train_c)
_print_classification_metrics('\n--- Test ---', y_test, y_pred_class_test_c)