In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from scipy.stats.mstats import winsorize
from scipy.stats import boxcox
from scipy.stats import jarque_bera
from scipy.stats import normaltest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from statsmodels.tools.eval_measures import mse, rmse
from wordcloud import WordCloud
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVC
from sklearn import tree
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle

from IPython.display import Image


import pydotplus
from sklearn import ensemble

import warnings

%matplotlib inline
sns.set()

warnings.filterwarnings('ignore')
import time

In [88]:
#assign data frame
df_telco = (pd.read_excel('Data/Telco_Churn.xlsx'))

In [89]:
#examine the data frame

print(
    df_telco.head(),
    df_telco.info(),
    df_telco.describe()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

In [90]:
#df_telco.select_dtypes(['object']).columns

In [91]:
df_telco.TotalCharges=pd.to_numeric(df_telco.TotalCharges, errors='coerce')

In [92]:
#Check for missing values

missing_values_ratios = (df_telco.isnull().sum()/df_telco.isnull().count())
missing_values_ratios.sort_values(ascending=False).head()

TotalCharges      0.001562
Churn             0.000000
OnlineSecurity    0.000000
gender            0.000000
SeniorCitizen     0.000000
dtype: float64

In [93]:
df_telco.dropna(inplace=True)

In [94]:
missing_values_ratios = (df_telco.isnull().sum()/df_telco.isnull().count())
missing_values_ratios.sort_values(ascending=False).head()

Churn             0.0
OnlineSecurity    0.0
gender            0.0
SeniorCitizen     0.0
Partner           0.0
dtype: float64

In [95]:
df_telco = pd.concat([df_telco,pd.get_dummies(
    df_telco[['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService',
                'OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV',
                'StreamingMovies','Contract','PaperlessBilling','PaymentMethod']], drop_first=True)], axis=1)


## Feature Engineering

In [218]:
#Make a boolean version of the target for algorithmic friendliness
churn_bool = []

for idx, val in enumerate(df_telco['Churn']):
    if val == 'Yes':
        churn_bool.append(1)
    else:
        churn_bool.append(0)
        
df_telco['churn_bool'] = churn_bool

In [142]:
#check the correlation of various features with the target

np.abs(df_telco[df_telco.select_dtypes(['int64', 'float64', 'uint8']).columns].iloc[:,1:].corr().loc[:,'churn_bool']).sort_values(ascending=False).head(35)

#NTS: the ^above^ logic fails if the target is categorical

churn_bool                               1.000000
tenure                                   0.354049
InternetService_Fiber optic              0.307463
Contract_Two year                        0.301552
PaymentMethod_Electronic check           0.301455
StreamingTV_No internet service          0.227578
InternetService_No                       0.227578
OnlineSecurity_No internet service       0.227578
DeviceProtection_No internet service     0.227578
TechSupport_No internet service          0.227578
OnlineBackup_No internet service         0.227578
StreamingMovies_No internet service      0.227578
TotalCharges                             0.199484
MonthlyCharges                           0.192858
PaperlessBilling_Yes                     0.191454
Contract_One year                        0.178225
OnlineSecurity_Yes                       0.171270
TechSupport_Yes                          0.164716
Dependents_Yes                           0.163128
Partner_Yes                              0.149982


In [None]:
#INCLUDE HERE! correlation map of features 

In [190]:
#assigning features

#X is standardized
X = StandardScaler().fit_transform(pd.concat([df_telco[['SeniorCitizen','tenure','MonthlyCharges','TotalCharges']], 
                                              df_telco.select_dtypes(['uint8'])], axis=1, sort=False)
                                  )

#XX is the same data as X but not standardized
XX = pd.concat([df_telco[['SeniorCitizen','tenure','MonthlyCharges','TotalCharges']], 
                df_telco.select_dtypes(['uint8'])], axis=1, sort=False
              )

#Y is the target
Y = df_telco.churn_bool

In [174]:
#NTS: these functions are useful diagnostic tools

#np.isinf(X).any()
#np.isinf(Y).any()

#np.isnan(X).any()
#np.isnan(Y).any()

## Feature Engineering: PCA

In [175]:
#setting up parameters for GridSearchCV(X)... the plan is PCA(GridSearchCV(X))

n_comps = np.arange(0, 25)
param_grid_pca = [{'pca__n_components':n_comps}]
pipe_tree_pca = make_pipeline(PCA())

In [176]:
gs_pca = GridSearchCV(pipe_tree_pca, param_grid=param_grid_pca, cv=10)

In [177]:
gs_pca.fit(X,Y)
print(gs_pca.best_params_)

{'pca__n_components': 22}


In [241]:
#using the results of GridSearchCV to perform PCA

sklearn_pca = PCA(n_components=22)  
X_pca = sklearn_pca.fit_transform(X)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA:\n',
    sklearn_pca.explained_variance_ratio_
)

#X_pca is now our PCA engineered feature set.
#NTS: try a feature set applying PCA to the four numerical variables and adding the raw uint8 features separately?

The percentage of total variance in the dataset explained by each component from Sklearn PCA:
 [0.33160138 0.12009062 0.09014536 0.04754993 0.04143011 0.04120627
 0.03815289 0.03336784 0.03123044 0.02954298 0.02659613 0.02379643
 0.02249445 0.02055908 0.02024829 0.01748569 0.01555265 0.01525414
 0.01467132 0.00923412 0.00784362 0.00191635]


## Feature Engineering: SelectKBest

In [205]:
#SelectKBest contrasts with PCA as a feature selection tool

selection = SelectKBest(score_func=f_regression, k='all') 
X_kbest = selection.fit(XX, Y).transform(XX)

#we can compare the performance of models with a PCA derived feature set (X_pca(X)),
#    vs a SelectKBest derived feature set (X_kbest(XX))

In [274]:
#splitting the data into testing and training sets
X_train_skb, X_test_skb, Y_train_skb, Y_test_skb = train_test_split(
    X_kbest, Y, test_size = 0.20, random_state = 1
)

X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(
    X_pca, Y, test_size = 0.20, random_state = 1
)


## Model 1: Decision Tree(SelectKBest)

In [228]:
#setting up parameters for GridSearchCV(DecisionTreeClassifier())

depths = np.arange(1, 25)
num_features = [1, 2, 3, 4, 5, 6, 7, 8]

param_grid_dt_skb = [{'decisiontreeclassifier__max_depth':depths,
               'decisiontreeclassifier__max_features':num_features}]

pipe_tree_dt_skb = make_pipeline(tree.DecisionTreeClassifier(criterion='entropy'))

In [229]:
gs_dt_skb = GridSearchCV(pipe_tree_dt_skb, param_grid=param_grid_dt_skb, cv=10)

In [230]:
gs_dt_skb.fit(X_train_skb,Y_train_skb)
print(gs_dt_skb.best_params_)

{'decisiontreeclassifier__max_depth': 6, 'decisiontreeclassifier__max_features': 7}


In [262]:
#initialize and train the decision tree model using GridSearchCV results
#    {'decisiontreeclassifier__max_depth': 6, 'decisiontreeclassifier__max_features': 7}

decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=7,
    max_depth=6
)

decision_tree.fit(X_train_skb, Y_train_skb)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
            max_features=7, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [263]:
#making predictions for model evaluation
Y_pred_dt = decision_tree.fit(X_train_skb, Y_train_skb).predict(X_test_skb)

In [264]:
#evaluating the model performance through xvalidation of the training set

start_time = time.time()

print(cross_val_score(decision_tree, X_train_skb, Y_train_skb, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(decision_tree, X_train_skb, Y_train_skb, cv=10).mean())

print("--- %s seconds ---" % (time.time() - start_time))

[0.78900709 0.78723404 0.78152753 0.79715302 0.76868327 0.79003559
 0.78469751 0.76868327 0.78291815 0.77580071]
The 10-fold cross validation average for the training set is  0.7752786448222126
--- 0.6514401435852051 seconds ---


In [265]:
#evaluating the model performance through xvalidation of the testing set

print(cross_val_score(decision_tree, X_test_skb, Y_test_skb, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(decision_tree, X_test_skb, Y_test_skb, cv=10).mean())

[0.78873239 0.75177305 0.71631206 0.78723404 0.76595745 0.75177305
 0.76428571 0.78571429 0.77142857 0.75714286]
The 10-fold cross validation average for the testing set  is  0.761233642992708


In [266]:
#classification report

print(classification_report(Y_test_skb, Y_pred_dt, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1041
           1       0.61      0.50      0.55       366

   micro avg       0.79      0.79      0.79      1407
   macro avg       0.72      0.69      0.70      1407
weighted avg       0.77      0.79      0.78      1407



In [267]:
#confusion matrix
print('Test set confusion matrix:', '\n', 
      confusion_matrix(Y_test_skb, Y_pred_dt, labels=None, sample_weight=None)
     )

Test set confusion matrix: 
 [[923 118]
 [184 182]]


## Model 2: Decision Tree(PCA)

In [288]:
#setting up parameters for GridSearchCV(DecisionTreeClassifier())

depths = np.arange(1, 25)
num_features = [1, 2, 3, 4, 5, 6, 7, 8]

param_grid_dt_pca = [{'decisiontreeclassifier__max_depth':depths,
               'decisiontreeclassifier__max_features':num_features}]

pipe_tree_dt_pca = make_pipeline(tree.DecisionTreeClassifier(criterion='entropy'))

In [289]:
gs_dt_pca = GridSearchCV(pipe_tree_dt_pca, param_grid=param_grid_dt_pca, cv=10)

In [290]:
gs_dt_pca.fit(X_train_pca,Y_train_pca)
print(gs_dt_pca.best_params_)

{'decisiontreeclassifier__max_depth': 4, 'decisiontreeclassifier__max_features': 8}


In [291]:
#initialize and train the decision tree model using GridSearchCV results
#    {'decisiontreeclassifier__max_depth': 4, 'decisiontreeclassifier__max_features': 8}

decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=4,
    max_depth=8
)

decision_tree.fit(X_train_pca, Y_train_pca)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
            max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [292]:
#making predictions for model evaluation
Y_pred_dt_pca = decision_tree.fit(X_train_pca, Y_train_pca).predict(X_test_pca)

In [293]:
#evaluating the model performance through xvalidation of the training set

start_time = time.time()

print(cross_val_score(decision_tree, X_train_pca, Y_train_pca, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(decision_tree, X_train_pca, Y_train_pca, cv=10).mean())

print("--- %s seconds ---" % (time.time() - start_time))

[0.75886525 0.79255319 0.76198934 0.78647687 0.77224199 0.75978648
 0.77935943 0.75266904 0.77402135 0.77402135]
The 10-fold cross validation average for the training set is  0.7688871607773172
--- 0.5928928852081299 seconds ---


In [294]:
#evaluating the model performance through xvalidation of the testing set

print(cross_val_score(decision_tree, X_test_pca, Y_test_pca, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(decision_tree, X_test_pca, Y_test_pca, cv=10).mean())

[0.72535211 0.74468085 0.72340426 0.78723404 0.75886525 0.75177305
 0.78571429 0.75       0.78571429 0.72142857]
The 10-fold cross validation average for the testing set  is  0.7534015440158683


In [295]:
#classification report

print(classification_report(Y_test_pca, Y_pred_dt_pca, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      1041
           1       0.54      0.42      0.47       366

   micro avg       0.76      0.76      0.76      1407
   macro avg       0.67      0.65      0.66      1407
weighted avg       0.74      0.76      0.75      1407



In [280]:
#confusion matrix
print('Test set confusion matrix:', '\n', 
      confusion_matrix(Y_test_pca, Y_pred_dt_pca, labels=None, sample_weight=None)
     )

Test set confusion matrix: 
 [[963  78]
 [232 134]]


## Model 3: Random Forest Classifier(SelectKBest)

In [268]:
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_train_skb,Y_train_skb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [269]:
#making predictions for model evaluation
Y_preds_rfc_skb=rfc.predict(X_test_skb)

In [270]:
#evaluating the model performance through xvalidation of the training set

start_time = time.time()

print(cross_val_score(rfc, X_train_skb, Y_train_skb, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(rfc, X_train_skb, Y_train_skb, cv=10).mean())

print("--- %s seconds ---" % (time.time() - start_time))

[0.7712766  0.75       0.76376554 0.78469751 0.76512456 0.77402135
 0.78291815 0.76156584 0.76512456 0.77935943]
The 10-fold cross validation average for the training set is  0.7701415460780414
--- 2.888967990875244 seconds ---


In [271]:
#evaluating the model performance through xvalidation of the testing set

print(cross_val_score(rfc, X_test_skb, Y_test_skb, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(rfc, X_test_skb, Y_test_skb, cv=10).mean())

[0.77464789 0.76595745 0.73758865 0.77304965 0.82269504 0.75886525
 0.78571429 0.75       0.8        0.75714286]
The 10-fold cross validation average for the testing set  is  0.7804281718680879


In [272]:
#classification report

print(classification_report(Y_test_skb, Y_preds_rfc_skb, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85      1041
           1       0.58      0.41      0.48       366

   micro avg       0.77      0.77      0.77      1407
   macro avg       0.69      0.65      0.66      1407
weighted avg       0.75      0.77      0.75      1407



In [273]:
#confusion matrix
print('Test set confusion matrix:', '\n', 
      confusion_matrix(Y_test_skb, Y_preds_rfc_skb, labels=None, sample_weight=None)
     )

Test set confusion matrix: 
 [[932 109]
 [217 149]]


## Model 4: Random Forest Classifier(PCA)

In [281]:
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_train_pca,Y_train_pca)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [282]:
#making predictions for model evaluation
Y_preds_rfc_pca=rfc.predict(X_test_pca)

In [283]:
#evaluating the model performance through xvalidation of the training set

start_time = time.time()

print(cross_val_score(rfc, X_train_pca, Y_train_pca, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(rfc, X_train_pca, Y_train_pca, cv=10).mean())

print("--- %s seconds ---" % (time.time() - start_time))

[0.77836879 0.7748227  0.75666075 0.79181495 0.76512456 0.75978648
 0.78291815 0.7633452  0.77935943 0.79003559]
The 10-fold cross validation average for the training set is  0.7688953612566097
--- 3.2938950061798096 seconds ---


In [284]:
#evaluating the model performance through xvalidation of the testing set

print(cross_val_score(rfc, X_test_pca, Y_test_pca, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(rfc, X_test_pca, Y_test_pca, cv=10).mean())

[0.76760563 0.72340426 0.75177305 0.78014184 0.80851064 0.75886525
 0.77142857 0.77857143 0.86428571 0.8       ]
The 10-fold cross validation average for the testing set  is  0.771932374388173


In [285]:
#classification report

print(classification_report(Y_test_pca, Y_preds_rfc_pca, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85      1041
           1       0.59      0.40      0.47       366

   micro avg       0.77      0.77      0.77      1407
   macro avg       0.70      0.65      0.66      1407
weighted avg       0.75      0.77      0.75      1407



In [286]:
#confusion matrix
print('Test set confusion matrix:', '\n', 
      confusion_matrix(Y_test_pca, Y_preds_rfc_pca, labels=None, sample_weight=None)
     )

Test set confusion matrix: 
 [[939 102]
 [221 145]]


## Model 5: SVM(SelectKBest)

In [296]:
#kernels = ['linear', 'poly', 'rbf', 'sigmoid']
degrees = [1, 2, 3, 4]
param_grid_svm_skb = [{'svc__degree':degrees}]
pipe_tree_svm_skb = make_pipeline(SVC(kernel = 'poly'))

In [297]:
gs_svm = GridSearchCV(pipe_tree_svm_skb, param_grid=param_grid_svm_skb, cv=10)

#gs_svm.get_params().keys()

In [299]:
gs_svm.fit(X_train_skb,Y_train_skb)
print(gs_svm.best_params_)

{'svc__degree': 1}


In [300]:
svm = SVC(kernel='poly',degree=1)
svm.fit(X_train_skb, Y_train_skb)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1, gamma='auto_deprecated',
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [306]:
#Setting the predictions for analysis
Y_pred_svm_skb = svm.fit(X_train_skb, Y_train_skb).predict(X_test_skb)

In [303]:
#evaluating the model performance through xvalidation of the training set

start_time = time.time()

print(cross_val_score(svm, X_train_skb, Y_train_skb, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(svm, X_train_skb, Y_train_skb, cv=10).mean())

print("--- %s seconds ---" % (time.time() - start_time))

[0.80851064 0.80319149 0.79040853 0.81672598 0.78825623 0.80960854
 0.82206406 0.79359431 0.78825623 0.79181495]
The 10-fold cross validation average for the training set is  0.8012430938111967
--- 8.648271083831787 seconds ---


In [304]:
#evaluating the model performance through xvalidation of the testing set

print(cross_val_score(svm, X_test_skb, Y_test_skb, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(svm, X_test_skb, Y_test_skb, cv=10).mean())

[0.76760563 0.73049645 0.78014184 0.80851064 0.78723404 0.78014184
 0.8        0.79285714 0.85714286 0.75      ]
The 10-fold cross validation average for the testing set  is  0.7854130456497852


In [307]:
#classification report

print(classification_report(Y_test_skb, Y_pred_svm_skb, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1041
           1       0.62      0.54      0.58       366

   micro avg       0.79      0.79      0.79      1407
   macro avg       0.73      0.71      0.72      1407
weighted avg       0.79      0.79      0.79      1407



In [308]:
#confusion matrix
print('Test set confusion matrix:', '\n', 
      confusion_matrix(Y_test_skb, Y_pred_svm_skb, labels=None, sample_weight=None)
     )

Test set confusion matrix: 
 [[919 122]
 [169 197]]


## Model 6: SVM(PCA)

In [310]:
#kernels = ['linear', 'poly', 'rbf', 'sigmoid']
degrees = [1, 2, 3, 4]
param_grid_svm_pca = [{'svc__degree':degrees}]
pipe_tree_svm_pca = make_pipeline(SVC(kernel = 'poly'))

In [311]:
gs_svm_pca = GridSearchCV(pipe_tree_svm_pca, param_grid=param_grid_svm_pca, cv=10)

#gs_svm.get_params().keys()

In [312]:
gs_svm_pca.fit(X_train_pca,Y_train_pca)
print(gs_svm_pca.best_params_)

{'svc__degree': 1}


In [313]:
svm = SVC(kernel='poly',degree=1)
svm.fit(X_train_pca, Y_train_pca)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1, gamma='auto_deprecated',
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [314]:
#Setting the predictions for analysis
Y_pred_svm_pca = svm.fit(X_train_pca, Y_train_pca).predict(X_test_pca)

In [315]:
#evaluating the model performance through xvalidation of the training set

start_time = time.time()

print(cross_val_score(svm, X_train_pca, Y_train_pca, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(svm, X_train_pca, Y_train_pca, cv=10).mean())

print("--- %s seconds ---" % (time.time() - start_time))

[0.80851064 0.80319149 0.79040853 0.81672598 0.78825623 0.80960854
 0.82206406 0.79359431 0.78825623 0.79181495]
The 10-fold cross validation average for the training set is  0.8012430938111967
--- 9.206144571304321 seconds ---


In [316]:
#evaluating the model performance through xvalidation of the testing set

print(cross_val_score(svm, X_test_pca, Y_test_pca, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(svm, X_test_pca, Y_test_pca, cv=10).mean())

[0.76760563 0.73049645 0.78014184 0.80851064 0.78723404 0.78014184
 0.8        0.79285714 0.85714286 0.75      ]
The 10-fold cross validation average for the testing set  is  0.7854130456497852


In [317]:
#classification report

print(classification_report(Y_test_pca, Y_pred_svm_pca, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1041
           1       0.62      0.54      0.58       366

   micro avg       0.79      0.79      0.79      1407
   macro avg       0.73      0.71      0.72      1407
weighted avg       0.79      0.79      0.79      1407



In [318]:
#confusion matrix
print('Test set confusion matrix:', '\n', 
      confusion_matrix(Y_test_pca, Y_pred_svm_pca, labels=None, sample_weight=None)
     )

Test set confusion matrix: 
 [[919 122]
 [169 197]]
