In [33]:
import pandas as pd
import numpy as np

#Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

#Accuracy Metrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_auc_score

#Stacking and SMOTE
!pip install vecstack
from vecstack import stacking
from imblearn.over_sampling import SMOTE 
from collections import Counter

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [2]:
#Read training data file
trainfile = r'/gdrive/My Drive/Data Mining/Assignment 6/churn_train.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe

#Read test data file
testfile = r'/gdrive/My Drive/Data Mining/Assignment 6/churn_test.csv'
testData = pd.read_csv(testfile) #creates a dataframe

In [None]:
Exploratory Data Analysis

In [3]:
trainData.head()

Unnamed: 0,State,Account Length,Area Code,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,TARGET
0,KS,128,415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [4]:
testData.head()

Unnamed: 0,State,Account Length,Area Code,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,TARGET
0,VT,50,415,yes,yes,26,307.1,94,52.21,289.4,78,24.6,174.9,109,7.87,8.0,3,2.16,0,0
1,UT,72,415,no,no,0,118.2,106,20.09,167.2,136,14.21,214.2,106,9.64,12.2,3,3.29,3,0
2,KS,130,510,no,no,0,154.0,95,26.18,205.9,106,17.5,233.7,75,10.52,12.9,1,3.48,1,0
3,NV,143,408,no,no,0,155.5,101,26.44,213.4,89,18.14,237.9,61,10.71,7.6,11,2.05,1,0
4,DE,89,510,yes,no,0,125.6,108,21.35,213.0,90,18.11,181.7,108,8.18,5.4,5,1.46,1,0


In [5]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1165 entries, 0 to 1164
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   State           1165 non-null   object 
 1   Account Length  1165 non-null   int64  
 2   Area Code       1165 non-null   int64  
 3   Int'l Plan      1165 non-null   object 
 4   VMail Plan      1165 non-null   object 
 5   VMail Message   1165 non-null   int64  
 6   Day Mins        1165 non-null   float64
 7   Day Calls       1165 non-null   int64  
 8   Day Charge      1165 non-null   float64
 9   Eve Mins        1165 non-null   float64
 10  Eve Calls       1165 non-null   int64  
 11  Eve Charge      1165 non-null   float64
 12  Night Mins      1165 non-null   float64
 13  Night Calls     1165 non-null   int64  
 14  Night Charge    1165 non-null   float64
 15  Intl Mins       1165 non-null   float64
 16  Intl Calls      1165 non-null   int64  
 17  Intl Charge     1165 non-null   f

One Hot Encode

In [8]:
#To get list of names of all Columns from a dataframe

TrainCols = list(trainData.columns.values)
TestCols = list(testData.columns.values)
print(TrainCols)
print(TestCols)

['State', 'Account Length', 'Area Code', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'TARGET']
['State', 'Account Length', 'Area Code', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'TARGET']


In [9]:
# Seperate Target column from Train Data
x_train = trainData[TrainCols[0:len(TrainCols)-1]].copy()
y_train = trainData[['TARGET']].copy()

print("Train Set shape, x_train, y_train:")
print(x_train.shape)
print(y_train.shape)

x_test = testData[TestCols[0:len(TestCols)-1]].copy()
y_test = testData[['TARGET']].copy()

print("Test Set shape, x_test, y_test:")
print(x_test.shape)
print(y_test.shape)

Train Set shape, x_train, y_train:
(1165, 19)
(1165, 1)
Test Set shape, x_test, y_test:
(1261, 19)
(1261, 1)


In [14]:
CategoricalFeatures = ["State", "Int'l Plan", "VMail Plan"]

In [15]:
# OneHotEncoding on Train (fit & transform)
# OneHotEncoding is to be done on Categorical variables.
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
x_cat = pd.DataFrame(ohe.fit_transform(x_train[CategoricalFeatures]),columns=ohe.get_feature_names(),index=x_train.index)
x_train = pd.concat([x_train,x_cat],axis=1)
x_train.drop(labels=CategoricalFeatures,axis=1,inplace=True)
x_train.sample(5)

Unnamed: 0,Account Length,Area Code,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,x0_AK,x0_AL,x0_AR,x0_AZ,x0_CA,x0_CO,x0_CT,x0_DC,x0_DE,x0_FL,x0_GA,x0_HI,x0_IA,x0_ID,x0_IL,x0_IN,x0_KS,x0_KY,x0_LA,x0_MA,x0_MD,x0_ME,x0_MI,x0_MN,x0_MO,x0_MS,x0_MT,x0_NC,x0_ND,x0_NE,x0_NH,x0_NJ,x0_NM,x0_NV,x0_NY,x0_OH,x0_OK,x0_OR,x0_PA,x0_RI,x0_SC,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY,x1_no,x1_yes,x2_no,x2_yes
33,12,408,0,249.6,118,42.43,252.4,119,21.45,280.2,90,12.61,11.8,3,3.19,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
438,113,510,0,155.0,93,26.35,330.6,106,28.1,189.4,123,8.52,13.5,3,3.65,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
183,19,408,0,186.1,98,31.64,254.3,57,21.62,214.0,127,9.63,14.6,7,3.94,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
421,51,510,0,259.9,114,44.18,176.2,94,14.98,77.2,112,3.47,15.3,1,4.13,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
469,41,510,34,194.4,63,33.05,254.9,110,21.67,160.2,115,7.21,17.2,9,4.64,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [16]:
x_train.shape

(1165, 71)

In [17]:
y_train.shape

(1165, 1)

In [18]:
# OneHotEncoding on Test (only transform)
# OneHotEncoding is to be done on Categorical variables.
x_cat = pd.DataFrame(ohe.transform(x_test[CategoricalFeatures]),columns=ohe.get_feature_names(),index=x_test.index)
x_test = pd.concat([x_test,x_cat],axis=1)
x_test.drop(labels=CategoricalFeatures,axis=1,inplace=True)
x_test.sample(5)

Unnamed: 0,Account Length,Area Code,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,x0_AK,x0_AL,x0_AR,x0_AZ,x0_CA,x0_CO,x0_CT,x0_DC,x0_DE,x0_FL,x0_GA,x0_HI,x0_IA,x0_ID,x0_IL,x0_IN,x0_KS,x0_KY,x0_LA,x0_MA,x0_MD,x0_ME,x0_MI,x0_MN,x0_MO,x0_MS,x0_MT,x0_NC,x0_ND,x0_NE,x0_NH,x0_NJ,x0_NM,x0_NV,x0_NY,x0_OH,x0_OK,x0_OR,x0_PA,x0_RI,x0_SC,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY,x1_no,x1_yes,x2_no,x2_yes
1209,157,415,0,180.4,123,30.67,194.0,98,16.49,227.3,88,10.23,8.4,5,2.27,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
937,111,415,0,246.5,108,41.91,216.3,89,18.39,179.6,99,8.08,12.7,3,3.43,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
334,86,415,0,136.4,104,23.19,202.5,110,17.21,230.7,86,10.38,11.5,1,3.11,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
531,141,510,0,215.6,113,36.65,200.6,81,17.05,153.8,107,6.92,12.4,6,3.35,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
318,107,510,0,234.1,91,39.8,163.1,105,13.86,282.5,100,12.71,10.0,3,2.7,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [19]:
x_test.shape

(1261, 71)

In [20]:
y_test.shape

(1261, 1)

**Part 1: Default Mode for all Classifiers**

Decision Tree Classifier

In [21]:
#default decision tree
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [22]:
#predict to new test dataset
dtp = dt.predict(x_test)
dtp = pd.DataFrame(dtp, columns=['TARGET'])

In [42]:
print("Test Accuracy:", metrics.accuracy_score(y_test,dtp))

print(" ------ ")

print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,dtp))

print(" ------ ")

print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test, dtp))

print(" ------ ")

print("AUC Score", roc_auc_score(y_test,dtp))

Test Accuracy: 0.9175257731958762
 ------ 
Confusion Matrix for Decision Tree:
[[1030   39]
 [  65  127]]
 ------ 
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1069
           1       0.77      0.66      0.71       192

    accuracy                           0.92      1261
   macro avg       0.85      0.81      0.83      1261
weighted avg       0.91      0.92      0.92      1261

 ------ 
AUC Score 0.8124878196133458


In [29]:
print("Max Depth",dt.get_depth())
print("Leaf",dt.get_n_leaves())

Max Depth 16
Leaf 81


Random Forest Classifier

In [35]:
#default random forest
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [36]:
#predict to new test dataset
rfp = rf.predict(x_test)
rfp = pd.DataFrame(rfp, columns=['TARGET'])

In [41]:
print("Test Accuracy:", metrics.accuracy_score(y_test,rfp))

print(" ------ ")

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfp))

print(" ------ ")

print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test, rfp))

print(" ------ ")

print("AUC Score", roc_auc_score(y_test,rfp))

Test Accuracy: 0.9064234734337827
 ------ 
Confusion Matrix for Random Forest:
[[1066    3]
 [ 115   77]]
 ------ 
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1069
           1       0.96      0.40      0.57       192

    accuracy                           0.91      1261
   macro avg       0.93      0.70      0.76      1261
weighted avg       0.91      0.91      0.89      1261

 ------ 
AUC Score 0.6991176527907702


MLP Classifier

In [43]:
#default MLP
mlp = MLPClassifier()
mlp.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [44]:
#predict to new test dataset
mlpp = mlp.predict(x_test)
mlpp = pd.DataFrame(mlpp, columns=['TARGET'])

In [45]:
print("Test Accuracy:", metrics.accuracy_score(y_test,mlpp))

print(" ------ ")

print("Confusion Matrix for MLP:")
print(confusion_matrix(y_test,mlpp))

print(" ------ ")

print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test, mlpp))

print(" ------ ")

print("AUC Score", roc_auc_score(y_test,mlpp))

Test Accuracy: 0.8794607454401269
 ------ 
Confusion Matrix for MLP:
[[1030   39]
 [ 113   79]]
 ------ 
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1069
           1       0.67      0.41      0.51       192

    accuracy                           0.88      1261
   macro avg       0.79      0.69      0.72      1261
weighted avg       0.87      0.88      0.87      1261

 ------ 
AUC Score 0.6874878196133457


Gradient Boosting Classifier

In [51]:
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [53]:
#predict to new test dataset
gbp = gb.predict(x_test)
gbp = pd.DataFrame(gbp, columns=['TARGET'])

In [55]:
print("Test Accuracy:", metrics.accuracy_score(y_test,gbp))

print(" ------ ")

print("Confusion Matrix for Gradient Boosting:")
print(confusion_matrix(y_test,gbp))

print(" ------ ")

print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test, gbp))

print(" ------ ")

print("AUC Score", roc_auc_score(y_test,gbp))

Test Accuracy: 0.9254559873116575
 ------ 
Confusion Matrix for Gradient Boosting:
[[1056   13]
 [  81  111]]
 ------ 
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1069
           1       0.90      0.58      0.70       192

    accuracy                           0.93      1261
   macro avg       0.91      0.78      0.83      1261
weighted avg       0.92      0.93      0.92      1261

 ------ 
AUC Score 0.7829820509822264


**Part 2: Create an enseble model (one-layer stacking) by combining the predictions from various classifier. Use at least three different base classifiers. Use random forest, in default mode, as the stacking classifier.**

In [56]:
#SMOTE==============================================================================
print("___________________________________________________________________\nSMOTE\n")
print('Original dataset shape %s' % Counter(y_train))
sm = SMOTE(sampling_strategy='float', ratio=0.5)
x_res, y_res = sm.fit_resample(x_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

___________________________________________________________________
SMOTE

Original dataset shape Counter({'TARGET': 1})
Resampled dataset shape Counter({0: 1019, 1: 509})


In [57]:
#STACKING MODELS =====================================================================
print("___________________________________________________________________________________________\nEnsemble Methods Predictions using Decision Tree, Random Forest, MLP, and Gradient Boosting\n")

models = [ DecisionTreeClassifier(), RandomForestClassifier(), MLPClassifier(), GradientBoostingClassifier() ]
      
s_train, s_test = stacking(models,                   
                           x_res, y_res, x_test,   
                           regression=False, 
     
                           mode='oof_pred_bag', 
       
                           needs_proba=False,
         
                           save_dir=None, 
            
                           metric=accuracy_score, 
    
                           n_folds=4, 
                 
                           stratified=True,
            
                           shuffle=True,  
            
                           random_state=0,    
         
                           verbose=2)

___________________________________________________________________________________________
Ensemble Methods Predictions using Decision Tree, Random Forest, MLP, and Gradient Boosting

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [DecisionTreeClassifier]
    fold  0:  [0.89790576]
    fold  1:  [0.89790576]
    fold  2:  [0.91884817]
    fold  3:  [0.91884817]
    ----
    MEAN:     [0.90837696] + [0.01047120]
    FULL:     [0.90837696]

model  1:     [RandomForestClassifier]
    fold  0:  [0.95026178]
    fold  1:  [0.93979058]
    fold  2:  [0.95811518]
    fold  3:  [0.95549738]
    ----
    MEAN:     [0.95091623] + [0.00701820]
    FULL:     [0.95091623]

model  2:     [MLPClassifier]
    fold  0:  [0.77748691]
    fold  1:  [0.79057592]
    fold  2:  [0.84031414]
    fold  3:  [0.75654450]
    ----
    MEAN:     [0.79123037] + [0.03082871]
    FULL:     [0.79123037]

model  3:     [Gra

In [60]:
#Stacking, default mode

stacked_model = RandomForestClassifier()

stacked_model = stacked_model.fit(s_train, y_res)
y_pred = stacked_model.predict(s_test)

In [61]:
print("Test Accuracy:", metrics.accuracy_score(y_test,y_pred))

print(" ------ ")

print("Confusion Matrix for Stacked Random Forest:")
print(confusion_matrix(y_test,y_pred))

print(" ------ ")

print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test, y_pred))

print(" ------ ")

print("AUC Score", roc_auc_score(y_test,y_pred))

Test Accuracy: 0.9191118160190325
 ------ 
Confusion Matrix for Stacked Random Forest:
[[1061    8]
 [  94   98]]
 ------ 
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      1069
           1       0.92      0.51      0.66       192

    accuracy                           0.92      1261
   macro avg       0.92      0.75      0.81      1261
weighted avg       0.92      0.92      0.91      1261

 ------ 
AUC Score 0.7514665185531649


 **Part 3: Do hyperparameter tuning on the random forest stacking classifier by changing at least three different parameters. Use random search for hyperparameter tuning**

In [69]:
#Hyperparameter tuning done for random forest classifier

parameters1={'min_samples_split' : range(5,100,200),'max_depth': range(1,10,2), 'n_estimators': (5,10,20,30,50)}

random_model = RandomizedSearchCV(stacked_model,parameters1,n_iter=10)
random_model.fit(x_res, y_res)
random_parm=random_model.best_params_
print("Best parameters:")
print(random_parm)

Best parameters:
{'n_estimators': 20, 'min_samples_split': 5, 'max_depth': 9}


In [70]:
#Using the parameters from hyperparameter tuning
model = RandomForestClassifier(**random_parm)

In [71]:
model.fit(s_train, y_res)
model_prediction = model.predict(s_test)

In [72]:
print("Test Accuracy:", metrics.accuracy_score(y_test,model_prediction))

print(" ------ ")

print("Confusion Matrix for Stacked Random Forest Model Prediction:")
print(confusion_matrix(y_test,model_prediction))

print(" ------ ")

print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(y_test, model_prediction))

print(" ------ ")

print("AUC Score", roc_auc_score(y_test,model_prediction))

Test Accuracy: 0.9191118160190325
 ------ 
Confusion Matrix for Stacked Random Forest Model Prediction:
[[1061    8]
 [  94   98]]
 ------ 
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      1069
           1       0.92      0.51      0.66       192

    accuracy                           0.92      1261
   macro avg       0.92      0.75      0.81      1261
weighted avg       0.92      0.92      0.91      1261

 ------ 
AUC Score 0.7514665185531649
