In [891]:
import sys 

In [892]:
!pip install imbalanced-learn



In [893]:
# check version number
import imblearn
print(imblearn.__version__)

0.7.0


In [894]:
from __future__ import print_function
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import plot_tree
from sklearn import metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV, cross_validate
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.model_selection import train_test_split
%matplotlib inline

In [895]:
df_train = pd.read_excel("train_data.xlsx")
df_train.shape

(8000, 24)

In [896]:
df_train['Target'].nunique()
pd.value_counts(df_train.Target)

3    2079
6    1540
5    1183
4    1114
2     981
0     791
1     312
Name: Target, dtype: int64

In [897]:
df_test = pd.read_excel("test_data.xlsx")
df_test.shape

(4000, 23)

In [898]:
# Creating X and y dataframes 
x = df_train.iloc[:,1:-1]
y = df_train.iloc[:,-1]

print("Shape of X and Y :\nX:",x.shape,"\nY:",y.shape)
randState = 520

# Creating training and validation data sets from X and y dataframes using train-test split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=randState)

# Balancing training data using resampling
print("Before SMOTE, Shape of X_train and Y_train :\nX:",x_train.shape,"\nY:",y_train.shape)
from imblearn.over_sampling import SMOTENC
smote_nc = SMOTENC(categorical_features=[0,1,2,3,4,5,6,7], random_state=0)
x_train, y_train = smote_nc.fit_resample(x_train, y_train)
print("After SMOTEShape of X_train and Y_train :\nX:",x_train.shape,"\nY:",y_train.shape)

Shape of X and Y :
X: (8000, 22) 
Y: (8000,)
Before SMOTE, Shape of X_train and Y_train :
X: (6400, 22) 
Y: (6400,)
After SMOTEShape of X_train and Y_train :
X: (11655, 22) 
Y: (11655,)


In [899]:
y_train.nunique()
pd.value_counts(y_train)

6    1665
5    1665
4    1665
3    1665
2    1665
1    1665
0    1665
Name: Target, dtype: int64

In [900]:
x_train

Unnamed: 0,b0,b1,b2,b3,b4,cat0,cat1,cat2,num0,num1,...,num4,num5,num6,num7,num8,num9,num10,num11,num12,num13
0,0,0,1,0,1,3,0,0,708405.012596,2468.883295,...,96831.745168,2925.400836,5.012945,6.167060,23.528146,16.164761,0.093018,0.001771,0.688312,0.510797
1,0,0,1,0,0,0,1,2,363376.744961,2001.995816,...,49792.029716,2095.188653,4.882218,6.151933,24.753519,18.287463,0.114801,0.003580,0.880955,0.510396
2,0,1,1,0,1,3,0,2,735251.592498,2469.103063,...,100227.875634,2980.317653,3.862507,6.183890,24.124929,15.861580,0.093049,0.001642,0.662735,0.510580
3,0,0,1,1,0,0,4,1,883431.455989,3071.643446,...,121643.706244,3266.863569,4.787213,6.122059,25.301346,17.818687,0.075564,0.002124,0.836370,0.505394
4,0,1,1,0,0,3,4,0,446541.483592,2177.795686,...,60994.431632,2322.605627,4.380100,6.171432,26.173060,17.924876,0.105655,0.003042,0.846368,0.509810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11650,0,1,0,1,1,2,1,0,528956.093548,2343.505270,...,72188.351203,2527.867929,4.092550,6.176847,25.837098,17.679936,0.098442,0.002683,0.823581,0.508584
11651,0,1,1,1,1,0,4,1,661430.472570,2643.386089,...,90929.965976,2826.745166,4.522836,6.131857,25.495834,17.795619,0.087446,0.002446,0.834241,0.507501
11652,0,1,1,1,1,2,2,4,497222.651747,2363.713736,...,67678.300893,2450.868551,4.437165,6.193211,26.536904,18.444286,0.097307,0.003140,0.896139,0.510013
11653,1,0,0,0,0,2,3,3,622608.400130,2612.538269,...,85184.371297,2742.534226,4.930486,6.161260,26.157023,18.205534,0.088105,0.002699,0.873143,0.509667


In [901]:
x_train.shape


(11655, 22)

In [902]:
# Creating test datset
x_test = df_test.iloc[:,1:]
x_test.shape

(4000, 22)

In [903]:
# Scaling training data
scaler = StandardScaler()
# scaler = MinMaxScaler()
x_train.iloc[:, 8:23] = scaler.fit_transform(x_train.iloc[:, 8:23])
x_train

Unnamed: 0,b0,b1,b2,b3,b4,cat0,cat1,cat2,num0,num1,...,num4,num5,num6,num7,num8,num9,num10,num11,num12,num13
0,0,0,1,0,1,3,0,0,-0.356352,-0.631140,...,-0.356862,-0.292744,1.359568,0.023139,-1.170298,-1.420250,0.630352,-0.756943,-1.372864,0.687201
1,0,0,1,0,0,0,1,2,-0.930196,-1.162053,...,-0.928762,-1.171221,0.905939,-0.518968,-0.453266,0.121536,1.807270,1.035804,0.082460,0.501484
2,0,1,1,0,1,3,0,2,-0.311701,-0.630890,...,-0.315572,-0.234634,-2.632510,0.626282,-0.821088,-1.640460,0.632038,-0.884381,-1.566088,0.586697
3,0,0,1,1,0,0,4,1,-0.065251,0.054279,...,-0.055203,0.068570,0.576267,-1.589574,-0.132703,-0.218951,-0.312680,-0.407012,-0.254362,-1.818121
4,0,1,1,0,0,3,4,0,-0.791878,-0.962145,...,-0.792565,-0.930583,-0.836437,0.179810,0.377384,-0.141823,1.313123,0.501862,-0.178830,0.229747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11650,0,1,0,1,1,2,1,0,-0.654808,-0.773711,...,-0.656471,-0.713387,-1.834250,0.373886,0.180795,-0.319730,0.923416,0.147034,-0.350974,-0.338698
11651,0,1,1,1,1,0,4,1,-0.434479,-0.432707,...,-0.428614,-0.397135,-0.341137,-1.238442,-0.018897,-0.235706,0.329312,-0.088463,-0.270442,-0.840938
11652,0,1,1,1,1,2,2,4,-0.707586,-0.750731,...,-0.711304,-0.794863,-0.638418,0.960311,0.590289,0.235441,0.862090,0.599777,0.197164,0.323672
11653,1,0,0,0,0,2,3,3,-0.499047,-0.467785,...,-0.498468,-0.486241,1.073432,-0.184707,0.368000,0.062028,0.364870,0.162793,0.023441,0.163282


In [904]:
# Performing one-hot encoding of categorical variables cat0, cat1, and cat2 in training data
cat0_one_hot = pd.get_dummies(x_train['cat0'])
cat0_one_hot.columns = ['cat0.0', 'cat0.1', 'cat0.2', 'cat0.3', 'cat0.4']
cat1_one_hot = pd.get_dummies(x_train['cat1'])
cat1_one_hot.columns = ['cat1.0', 'cat1.1', 'cat1.2', 'cat1.3', 'cat1.4']
cat2_one_hot = pd.get_dummies(x_train['cat2'])
cat2_one_hot.columns = ['cat2.0', 'cat2.1', 'cat2.2', 'cat2.3', 'cat2.4']
x_train = x_train.drop('cat0',axis = 1)
x_train = x_train.drop('cat1',axis = 1)
x_train = x_train.drop('cat2',axis = 1)
x_train = x_train.join(cat0_one_hot)
x_train = x_train.join(cat1_one_hot)
x_train = x_train.join(cat2_one_hot)
x_train


Unnamed: 0,b0,b1,b2,b3,b4,num0,num1,num2,num3,num4,...,cat1.0,cat1.1,cat1.2,cat1.3,cat1.4,cat2.0,cat2.1,cat2.2,cat2.3,cat2.4
0,0,0,1,0,1,-0.356352,-0.631140,1.572706,1.189129,-0.356862,...,1,0,0,0,0,1,0,0,0,0
1,0,0,1,0,0,-0.930196,-1.162053,-0.219370,0.065242,-0.928762,...,0,1,0,0,0,0,0,1,0,0
2,0,1,1,0,1,-0.311701,-0.630890,1.884502,1.307619,-0.315572,...,1,0,0,0,0,0,0,1,0,0
3,0,0,1,1,0,-0.065251,0.054279,0.054881,0.309383,-0.055203,...,0,0,0,0,1,0,1,0,0,0
4,0,1,1,0,0,-0.791878,-0.962145,0.033734,0.291858,-0.792565,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11650,0,1,0,1,1,-0.654808,-0.773711,0.206819,0.421054,-0.656471,...,0,1,0,0,0,1,0,0,0,0
11651,0,1,1,1,1,-0.434479,-0.432707,0.100911,0.344934,-0.428614,...,0,0,0,0,1,0,1,0,0,0
11652,0,1,1,1,1,-0.707586,-0.750731,-0.331900,-0.047158,-0.711304,...,0,0,1,0,0,0,0,0,0,1
11653,1,0,0,0,0,-0.499047,-0.467785,-0.169649,0.109170,-0.498468,...,0,0,0,1,0,0,0,0,1,0


In [905]:
# Scaling validation data
scaler = StandardScaler()
# scaler = MinMaxScaler()
x_val.iloc[:, 8:23] = scaler.fit_transform(x_val.iloc[:, 8:23])
x_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


Unnamed: 0,b0,b1,b2,b3,b4,cat0,cat1,cat2,num0,num1,...,num4,num5,num6,num7,num8,num9,num10,num11,num12,num13
2299,0,0,0,1,0,4,2,2,-0.559356,-0.695673,...,-0.563211,-0.633265,0.190003,0.753637,0.597324,-0.157386,0.810309,0.143226,-0.194760,0.622468
5422,1,0,0,1,1,2,0,0,-0.390484,0.107989,...,-0.397475,-0.390860,1.062117,1.059141,1.634222,1.653511,-0.370311,1.335972,1.716612,0.868024
6642,1,0,1,1,1,1,4,3,-0.360145,-0.998629,...,-0.366122,-0.348932,-2.912176,0.874351,-1.573744,-2.125816,1.434425,-1.186115,-1.984375,-0.506827
7293,0,0,1,0,0,2,1,3,4.944476,4.942331,...,4.935826,4.221712,1.176382,-0.193872,0.447170,0.650902,-3.131577,-1.339256,0.626999,-1.131822
376,0,1,1,0,1,3,1,4,-0.854782,-1.293206,...,-0.855523,-1.102789,-1.513305,0.484555,0.056904,-0.616272,2.087745,0.239591,-0.638780,0.452914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4584,0,1,1,0,0,3,2,4,-0.497592,-0.139733,...,-0.501671,-0.542737,0.673361,0.724563,1.318868,1.418102,-0.055017,1.301722,1.453782,0.822954
6225,1,1,1,0,1,1,4,4,0.993659,0.999008,...,0.978385,1.199569,0.714473,0.849226,-0.408187,-0.684890,-1.228434,-1.120207,-0.703773,-0.745912
1321,1,0,0,1,0,4,4,4,-0.224573,-0.097252,...,-0.228047,-0.166910,0.100947,0.411001,0.101251,0.146864,-0.083393,-0.010664,0.108624,-0.303056
2056,1,0,1,1,0,4,1,3,1.423934,1.325499,...,1.405696,1.605699,-0.053855,0.846300,-0.260624,-0.853254,-1.485484,-1.309644,-0.861700,-0.811093


In [906]:
# Performing one hot encoding of categorical variables cat0, cat1, and cat2 in validation data
cat0_one_hot = pd.get_dummies(x_val['cat0'])
cat0_one_hot.columns = ['cat0.0', 'cat0.1', 'cat0.2', 'cat0.3', 'cat0.4']
cat1_one_hot = pd.get_dummies(x_val['cat1'])
cat1_one_hot.columns = ['cat1.0', 'cat1.1', 'cat1.2', 'cat1.3', 'cat1.4']
cat2_one_hot = pd.get_dummies(x_val['cat2'])
cat2_one_hot.columns = ['cat2.0', 'cat2.1', 'cat2.2', 'cat2.3', 'cat2.4']
x_val = x_val.drop('cat0',axis = 1)
x_val = x_val.drop('cat1',axis = 1)
x_val = x_val.drop('cat2',axis = 1)
x_val = x_val.join(cat0_one_hot)
x_val = x_val.join(cat1_one_hot)
x_val = x_val.join(cat2_one_hot)
x_val

Unnamed: 0,b0,b1,b2,b3,b4,num0,num1,num2,num3,num4,...,cat1.0,cat1.1,cat1.2,cat1.3,cat1.4,cat2.0,cat2.1,cat2.2,cat2.3,cat2.4
2299,0,0,0,1,0,-0.559356,-0.695673,0.059705,0.318983,-0.563211,...,0,0,1,0,0,0,0,1,0,0
5422,1,0,0,1,1,-0.390484,0.107989,-1.438815,-1.867983,-0.397475,...,1,0,0,0,0,1,0,0,0,0
6642,1,0,1,1,1,-0.360145,-0.998629,2.586050,1.548484,-0.366122,...,0,0,0,0,1,0,0,0,1,0
7293,0,0,1,0,0,4.944476,4.942331,-0.730349,-0.540194,4.935826,...,0,1,0,0,0,0,0,0,1,0
376,0,1,1,0,1,-0.854782,-1.293206,0.545121,0.679870,-0.855523,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4584,0,1,1,0,0,-0.497592,-0.139733,-1.275912,-1.487108,-0.501671,...,0,0,1,0,0,0,0,0,0,1
6225,1,1,1,0,1,0.993659,0.999008,0.586086,0.706327,0.978385,...,0,0,0,0,1,0,0,0,0,1
1321,1,0,0,1,0,-0.224573,-0.097252,-0.262520,0.019029,-0.228047,...,0,0,0,0,1,0,0,0,0,1
2056,1,0,1,1,0,1.423934,1.325499,0.780288,0.824659,1.405696,...,0,1,0,0,0,0,0,0,1,0


In [907]:
# Scaling testing data
scaler = StandardScaler()
# scaler = MinMaxScaler()
x_test.iloc[:, 8:23] = scaler.fit_transform(x_test.iloc[:, 8:23])
x_test

Unnamed: 0,b0,b1,b2,b3,b4,cat0,cat1,cat2,num0,num1,...,num4,num5,num6,num7,num8,num9,num10,num11,num12,num13
0,1,0,1,1,1,1,3,1,0.371115,0.298802,...,0.371236,0.543787,-1.069897,-0.203958,-1.000495,-0.739514,-0.562379,-0.892992,-0.754191,0.274683
1,0,1,0,0,1,4,1,2,-0.085010,-0.585619,...,-0.087226,0.009208,-0.727765,0.171879,-1.158962,-1.704109,0.622839,-1.141329,-1.622882,0.242330
2,0,1,0,0,1,1,1,1,-0.658079,-0.123665,...,-0.661040,-0.774263,0.565399,0.661501,1.395882,2.329252,-0.073683,2.467462,2.492484,0.978259
3,0,0,1,0,0,1,4,4,-0.771615,-0.728852,...,-0.772318,-0.950909,1.528028,0.335076,1.031071,0.914079,0.868942,1.311545,0.903946,0.135406
4,0,0,0,1,0,1,4,0,-0.432676,-0.438763,...,-0.432460,-0.447247,0.239960,-0.019902,0.414755,0.044410,0.391478,0.122286,0.005283,0.234270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0,0,1,1,1,3,3,2,0.758389,0.612712,...,0.767657,0.955482,0.723179,-1.004930,-0.511841,-1.032769,-0.855787,-1.175349,-1.025973,-1.047372
3996,1,1,1,0,1,4,1,4,-0.245247,-0.809941,...,-0.242330,-0.194920,-2.810009,-0.444767,-1.922494,-1.801638,1.006612,-1.104018,-1.706672,0.392524
3997,1,0,1,0,1,2,0,4,-0.818803,-1.090171,...,-0.816486,-1.027096,0.260768,-0.259558,0.136808,-0.078396,1.569434,0.544240,-0.116861,0.154508
3998,1,1,1,1,1,3,3,3,-0.190896,-0.159291,...,-0.193641,-0.124573,-1.405494,0.287582,0.227098,-0.185494,0.014454,-0.251572,-0.222421,-0.563713


In [908]:
# Performing one hot encoding of categorical variables cat0, cat1, and cat2 in test data
cat0_one_hot = pd.get_dummies(x_test['cat0'])
cat0_one_hot.columns = ['cat0.0', 'cat0.1', 'cat0.2', 'cat0.3', 'cat0.4']
cat1_one_hot = pd.get_dummies(x_test['cat1'])
cat1_one_hot.columns = ['cat1.0', 'cat1.1', 'cat1.2', 'cat1.3', 'cat1.4']
cat2_one_hot = pd.get_dummies(x_test['cat2'])
cat2_one_hot.columns = ['cat2.0', 'cat2.1', 'cat2.2', 'cat2.3', 'cat2.4']
x_test = x_test.drop('cat0',axis = 1)
x_test = x_test.drop('cat1',axis = 1)
x_test = x_test.drop('cat2',axis = 1)
x_test = x_test.join(cat0_one_hot)
x_test = x_test.join(cat1_one_hot)
x_test = x_test.join(cat2_one_hot)
x_test

Unnamed: 0,b0,b1,b2,b3,b4,num0,num1,num2,num3,num4,...,cat1.0,cat1.1,cat1.2,cat1.3,cat1.4,cat2.0,cat2.1,cat2.2,cat2.3,cat2.4
0,1,0,1,1,1,0.371115,0.298802,0.683761,0.757393,0.371236,...,0,0,0,1,0,0,1,0,0,0
1,0,1,0,0,1,-0.085010,-0.585619,1.962445,1.332878,-0.087226,...,0,1,0,0,0,0,0,1,0,0
2,0,1,0,0,1,-0.658079,-0.123665,-1.870217,-3.238047,-0.661040,...,0,1,0,0,0,0,1,0,0,0
3,0,0,1,0,0,-0.771615,-0.728852,-0.916731,-0.798971,-0.772318,...,0,0,0,0,1,0,0,0,0,1
4,0,0,0,1,0,-0.432676,-0.438763,-0.151615,0.132102,-0.432460,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0,0,1,1,1,0.758389,0.612712,0.995743,0.929781,0.767657,...,0,0,0,1,0,0,0,1,0,0
3996,1,1,1,0,1,-0.245247,-0.809941,2.116680,1.383473,-0.242330,...,0,1,0,0,0,0,0,0,0,1
3997,1,0,1,0,1,-0.818803,-1.090171,-0.033356,0.239261,-0.816486,...,1,0,0,0,0,0,0,0,0,1
3998,1,1,1,1,1,-0.190896,-0.159291,0.053856,0.313673,-0.193641,...,0,0,0,1,0,0,0,0,1,0


In [909]:
# function to compute BER of each class
from sklearn.metrics import confusion_matrix
def computeBER(y_val,y_val_hat):
    cm = confusion_matrix(y_val, y_val_hat)
    cnf_matrix = cm
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    print('TPR : ',TPR)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    print('TNR : ',TNR)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    print('PPV : ',PPV)
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    print('FPR : ',FPR)
    # False negative rate
    FNR = FN/(TP+FN)
    print('FNR : ',FNR)
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)
    print('ACC : ',ACC)
    # Balanced Error Ratio
    BER = (FPR + FNR)/2
    print('BER : ',BER)
    return BER

In [910]:
# function to evaluate model
def evaluate_model(evaluatetype, y, y_hat, y_prob):
    
    print('Model evaluation (' + evaluatetype + ')')
    print('Accuracy:')
    print(metrics.accuracy_score(y, y_hat))
    print('Classification report:')
    print(metrics.classification_report(y, y_hat))
    print('Confusion matrix (' + evaluatetype + ')')
    df = pd.DataFrame({'y_Actual':y, 'y_Predicted':y_hat})
    confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    print (confusion_matrix)
    


In [911]:
# Decision Tree Classifier
model_tree = GridSearchCV(DecisionTreeClassifier(random_state=520),
                          cv=5,
                          param_grid={
                              "max_depth": list(range(1, 40, 2)),
                              "min_samples_split": list(range(2, 5, 2))
                          })
model_tree.fit(x_train, y_train)
print("The best parameters from CV")
print(model_tree.best_params_)
y_val_hat = model_tree.predict(x_val)
y_val_prob = model_tree.predict_proba(x_val)[:,1]

The best parameters from CV
{'max_depth': 11, 'min_samples_split': 2}


In [912]:
y_val_hat

array([6, 5, 4, ..., 0, 1, 1], dtype=int64)

In [913]:
# Computing BER of the Decision Tree Classifier
BER = computeBER(y_val,y_val_hat)
print(BER)

TPR :  [0.88590604 1.         0.81770833 0.51207729 0.76824034 0.8974359
 0.48242812]
TNR :  [0.92970365 0.97980456 0.9140625  0.99915683 0.98610095 0.9795022
 0.85081585]
PPV :  [0.56410256 0.67708333 0.5647482  0.99530516 0.9040404  0.88235294
 0.44023324]
FPR :  [0.07029635 0.02019544 0.0859375  0.00084317 0.01389905 0.0204978
 0.14918415]
FNR :  [0.11409396 0.         0.18229167 0.48792271 0.23175966 0.1025641
 0.51757188]
ACC :  [0.925625 0.980625 0.9025   0.873125 0.954375 0.9675   0.77875 ]
BER :  [0.09219515 0.01009772 0.13411458 0.24438294 0.12282935 0.06153095
 0.33337802]
[0.09219515 0.01009772 0.13411458 0.24438294 0.12282935 0.06153095
 0.33337802]


In [914]:
#  Evaluating Decision Tree Classifier
train_sc = model_tree.score(x_train, y_train)
test_sc = model_tree.score(x_val, y_val)
print("Best Parameter", model_tree.best_params_)
print('Accuracy:', metrics.accuracy_score(y_val, y_val_hat))
print('Train score:', train_sc)
print('Test score:', test_sc)

evaluate_model('Train', y_val, y_val_hat, y_val_prob)

Best Parameter {'max_depth': 11, 'min_samples_split': 2}
Accuracy: 0.69125
Train score: 0.9743457743457743
Test score: 0.69125
Model evaluation (Train)
Accuracy:
0.69125
Classification report:
              precision    recall  f1-score   support

           0       0.56      0.89      0.69       149
           1       0.68      1.00      0.81        65
           2       0.56      0.82      0.67       192
           3       1.00      0.51      0.68       414
           4       0.90      0.77      0.83       233
           5       0.88      0.90      0.89       234
           6       0.44      0.48      0.46       313

    accuracy                           0.69      1600
   macro avg       0.72      0.77      0.72      1600
weighted avg       0.75      0.69      0.69      1600

Confusion matrix (Train)
Predicted    0   1    2    3    4    5    6
Actual                                     
0          132   7    8    0    0    2    0
1            0  65    0    0    0    0    0
2        

In [915]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
model_forest = RandomForestClassifier(n_estimators=200, random_state=100, 
                                      max_depth=model_tree.best_params_['max_depth'],
                                      min_samples_split=model_tree.best_params_['min_samples_split'],
                                      n_jobs=-1, criterion = "entropy", max_features = "auto")
model_forest.fit(x_train, y_train)
y_val_hat = model_forest.predict(x_val)


In [916]:
y_val_hat

array([6, 5, 4, ..., 0, 1, 1], dtype=int64)

In [917]:
# Computing BER of the Random Forest Classifier
BER = computeBER(y_val,y_val_hat)
print(BER)

TPR :  [0.8590604  1.         0.859375   0.57729469 0.81974249 0.87179487
 0.55591054]
TNR :  [0.93452791 0.98175896 0.91335227 0.99915683 0.99414777 0.98682284
 0.87412587]
PPV :  [0.57399103 0.69892473 0.57491289 0.99583333 0.95979899 0.91891892
 0.51785714]
FPR :  [0.06547209 0.01824104 0.08664773 0.00084317 0.00585223 0.01317716
 0.12587413]
FNR :  [0.1409396  0.         0.140625   0.42270531 0.18025751 0.12820513
 0.44408946]
ACC :  [0.9275   0.9825   0.906875 0.89     0.96875  0.97     0.811875]
BER :  [0.10320584 0.00912052 0.11363636 0.21177424 0.09305487 0.07069114
 0.28498179]
[0.10320584 0.00912052 0.11363636 0.21177424 0.09305487 0.07069114
 0.28498179]


In [918]:
# Evaluating Random Forest Classifier
train_sc = model_forest.score(x_train, y_train)
test_sc = model_forest.score(x_val, y_val)
print('Accuracy:', metrics.accuracy_score(y_val, y_val_hat))
print('Train score:', train_sc)
print('Test score:', test_sc)
evaluate_model('Train', y_val, y_val_hat, y_val_prob)

Accuracy: 0.72875
Train score: 0.9881595881595882
Test score: 0.72875
Model evaluation (Train)
Accuracy:
0.72875
Classification report:
              precision    recall  f1-score   support

           0       0.57      0.86      0.69       149
           1       0.70      1.00      0.82        65
           2       0.57      0.86      0.69       192
           3       1.00      0.58      0.73       414
           4       0.96      0.82      0.88       233
           5       0.92      0.87      0.89       234
           6       0.52      0.56      0.54       313

    accuracy                           0.73      1600
   macro avg       0.75      0.79      0.75      1600
weighted avg       0.78      0.73      0.73      1600

Confusion matrix (Train)
Predicted    0   1    2    3    4    5    6
Actual                                     
0          128   7   14    0    0    0    0
1            0  65    0    0    0    0    0
2            6  21  165    0    0    0    0
3            2   0    

In [919]:
# AdaBoost Classifier
model_adaboost = AdaBoostClassifier(DecisionTreeClassifier(random_state=100,
                                                           max_depth=model_tree.best_params_['max_depth'],
                                                           min_samples_split=model_tree.best_params_['min_samples_split']), n_estimators = 50)
model_adaboost.fit(x_train, y_train)
y_val_hat = model_adaboost.predict(x_val)
y_val_hat

array([6, 5, 4, ..., 2, 1, 1], dtype=int64)

In [920]:
# Computing BER of AdaBoost Classifier
BER = computeBER(y_val,y_val_hat)
print(BER)

TPR :  [0.88590604 1.         0.84895833 0.58454106 0.83261803 0.94017094
 0.57188498]
TNR :  [0.96347347 0.97980456 0.90482955 0.99915683 0.99561083 0.9875549
 0.87334887]
PPV :  [0.71351351 0.67708333 0.54882155 0.99588477 0.97       0.92827004
 0.52339181]
FPR :  [0.03652653 0.02019544 0.09517045 0.00084317 0.00438917 0.0124451
 0.12665113]
FNR :  [0.11409396 0.         0.15104167 0.41545894 0.16738197 0.05982906
 0.42811502]
ACC :  [0.95625  0.980625 0.898125 0.891875 0.971875 0.980625 0.814375]
BER :  [0.07531025 0.01009772 0.12310606 0.20815105 0.08588557 0.03613708
 0.27738307]
[0.07531025 0.01009772 0.12310606 0.20815105 0.08588557 0.03613708
 0.27738307]


In [921]:
# Evaluating Adaboost Classifier
train_sc = model_adaboost.score(x_train, y_train)
test_sc = model_adaboost.score(x_val, y_val)

print('Accuracy:', metrics.accuracy_score(y_val, y_val_hat))
print('Train score:', train_sc)
print('Test score:', test_sc)
evaluate_model('Train', y_val, y_val_hat, y_val_prob)

Accuracy: 0.746875
Train score: 1.0
Test score: 0.746875
Model evaluation (Train)
Accuracy:
0.746875
Classification report:
              precision    recall  f1-score   support

           0       0.71      0.89      0.79       149
           1       0.68      1.00      0.81        65
           2       0.55      0.85      0.67       192
           3       1.00      0.58      0.74       414
           4       0.97      0.83      0.90       233
           5       0.93      0.94      0.93       234
           6       0.52      0.57      0.55       313

    accuracy                           0.75      1600
   macro avg       0.77      0.81      0.77      1600
weighted avg       0.80      0.75      0.75      1600

Confusion matrix (Train)
Predicted    0   1    2    3    4    5    6
Actual                                     
0          132   7   10    0    0    0    0
1            0  65    0    0    0    0    0
2            5  24  163    0    0    0    0
3            1   0    0  242    2 

In [922]:
# Bagging Classifier
model_bagging = BaggingClassifier(DecisionTreeClassifier(random_state=520,
                                                         max_depth=model_tree.best_params_['max_depth'],
                                                         min_samples_split=model_tree.best_params_['min_samples_split']),
                                  bootstrap=False,
                                  max_samples=0.6,
                                  bootstrap_features=False,
                                  max_features=0.6,
                                  n_estimators=100)
model_bagging.fit(x_train, y_train)
y_val_hat = model_bagging.predict(x_val)
y_val_hat

array([6, 5, 4, ..., 0, 1, 1], dtype=int64)

In [923]:
# Computing BER of the Bagging Classifier
BER = computeBER(y_val,y_val_hat)
print(BER)

TPR :  [0.84563758 1.         0.84375    0.53864734 0.82832618 0.87606838
 0.55910543]
TNR :  [0.94141971 0.97980456 0.90909091 1.         0.99341624 0.98682284
 0.86013986]
PPV :  [0.5971564  0.67708333 0.55862069 1.         0.95544554 0.91928251
 0.49295775]
FPR :  [0.05858029 0.02019544 0.09090909 0.         0.00658376 0.01317716
 0.13986014]
FNR :  [0.15436242 0.         0.15625    0.46135266 0.17167382 0.12393162
 0.44089457]
ACC :  [0.9325   0.980625 0.90125  0.880625 0.969375 0.970625 0.80125 ]
BER :  [0.10647135 0.01009772 0.12357955 0.23067633 0.08912879 0.06855439
 0.29037735]
[0.10647135 0.01009772 0.12357955 0.23067633 0.08912879 0.06855439
 0.29037735]


In [924]:
# Evaluating Bagging Classifier
train_sc = model_bagging.score(x_train, y_train)
test_sc = model_bagging.score(x_val, y_val)

print('Accuracy:', metrics.accuracy_score(y_val, y_val_hat))
print('Train score:', train_sc)
print('Test score:', test_sc)
evaluate_model('Train', y_val, y_val_hat, y_val_prob)

Accuracy: 0.718125
Train score: 0.9770913770913771
Test score: 0.718125
Model evaluation (Train)
Accuracy:
0.718125
Classification report:
              precision    recall  f1-score   support

           0       0.60      0.85      0.70       149
           1       0.68      1.00      0.81        65
           2       0.56      0.84      0.67       192
           3       1.00      0.54      0.70       414
           4       0.96      0.83      0.89       233
           5       0.92      0.88      0.90       234
           6       0.49      0.56      0.52       313

    accuracy                           0.72      1600
   macro avg       0.74      0.78      0.74      1600
weighted avg       0.78      0.72      0.72      1600

Confusion matrix (Train)
Predicted    0   1    2    3    4    5    6
Actual                                     
0          126   7   16    0    0    0    0
1            0  65    0    0    0    0    0
2            6  24  162    0    0    0    0
3            2   0 

In [925]:
# Support Vector Machine Classifier
seed = 100
def create_model_svc():
    model = SVC(random_state=seed, kernel = 'poly', degree = 4, probability=True, gamma = 'scale', 
                decision_function_shape = 'ovr')
    return model

create_model = create_model_svc
model_svc = create_model()
model_svc.fit(x_train, y_train)
y_val_hat = model_svc.predict(x_val)
y_val_hat

array([6, 5, 4, ..., 6, 1, 1], dtype=int64)

In [926]:
# Computing BER of Support Vector Machine Classifier
BER = computeBER(y_val,y_val_hat)
print(BER)

TPR :  [0.8590604  1.         0.83854167 0.54589372 0.82832618 0.91025641
 0.88817891]
TNR :  [0.98208132 0.98957655 0.953125   0.98988196 0.9948793  0.98828697
 0.85003885]
PPV :  [0.83116883 0.80246914 0.7092511  0.94957983 0.965      0.930131
 0.59023355]
FPR :  [0.01791868 0.01042345 0.046875   0.01011804 0.0051207  0.01171303
 0.14996115]
FNR :  [0.1409396  0.         0.16145833 0.45410628 0.17167382 0.08974359
 0.11182109]
ACC :  [0.970625 0.99     0.939375 0.875    0.970625 0.976875 0.8575  ]
BER :  [0.07942914 0.00521173 0.10416667 0.23211216 0.08839726 0.05072831
 0.13089112]
[0.07942914 0.00521173 0.10416667 0.23211216 0.08839726 0.05072831
 0.13089112]


In [927]:
# Evaluating Support Vector Machine Classifier
train_sc = model_svc.score(x_train, y_train)
test_sc = model_svc.score(x_val, y_val)

print('Accuracy:', metrics.accuracy_score(y_val, y_val_hat))
print('Train score:', train_sc)
print('Test score:', test_sc)
evaluate_model('Train', y_val, y_val_hat, y_val_prob)

Accuracy: 0.79
Train score: 0.9441441441441442
Test score: 0.79
Model evaluation (Train)
Accuracy:
0.79
Classification report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       149
           1       0.80      1.00      0.89        65
           2       0.71      0.84      0.77       192
           3       0.95      0.55      0.69       414
           4       0.96      0.83      0.89       233
           5       0.93      0.91      0.92       234
           6       0.59      0.89      0.71       313

    accuracy                           0.79      1600
   macro avg       0.83      0.84      0.82      1600
weighted avg       0.83      0.79      0.79      1600

Confusion matrix (Train)
Predicted    0   1    2    3    4    5    6
Actual                                     
0          128   3   10    4    2    1    1
1            0  65    0    0    0    0    0
2            6  13  161    8    0    3    1
3            1   0    0  226   

In [928]:
# Multi layer Perceptron Classifier
def create_model_mlpclassifier():
    model = MLPClassifier(hidden_layer_sizes=(100,), random_state=seed, activation = 'relu', solver = 'adam', 
                          alpha = 13.0, learning_rate = 'constant', max_iter = 500)
    return model
create_model = create_model_mlpclassifier
model_mlp = create_model()
model_mlp.fit(x_train, y_train)
y_val_hat = model_mlp.predict(x_val)
y_val_hat

array([6, 5, 4, ..., 6, 2, 1], dtype=int64)

In [929]:
# Computing BER of Multi layer Perceptron Classifier
BER = computeBER(y_val,y_val_hat)
print(BER)

TPR :  [0.80536913 1.         0.91145833 0.68115942 0.81974249 0.94017094
 0.73801917]
TNR :  [0.97312198 0.98697068 0.92684659 0.99662732 0.99195318 0.97657394
 0.91686092]
PPV :  [0.75471698 0.76470588 0.6294964  0.98601399 0.94554455 0.87301587
 0.68343195]
FPR :  [0.02687802 0.01302932 0.07315341 0.00337268 0.00804682 0.02342606
 0.08313908]
FNR :  [0.19463087 0.         0.08854167 0.31884058 0.18025751 0.05982906
 0.26198083]
ACC :  [0.9575   0.9875   0.925    0.915    0.966875 0.97125  0.881875]
BER :  [0.11075444 0.00651466 0.08084754 0.16110663 0.09415216 0.04162756
 0.17255996]
[0.11075444 0.00651466 0.08084754 0.16110663 0.09415216 0.04162756
 0.17255996]


In [930]:
# Evaluating Multi layer Perceptron Classifier
train_sc = model_mlp.score(x_train, y_train)
test_sc = model_mlp.score(x_val, y_val)

print('Accuracy:', metrics.accuracy_score(y_val, y_val_hat))
print('Train score:', train_sc)
print('Test score:', test_sc)
evaluate_model('Train', y_val, y_val_hat, y_val_prob)

Accuracy: 0.8025
Train score: 0.9336765336765337
Test score: 0.8025
Model evaluation (Train)
Accuracy:
0.8025
Classification report:
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       149
           1       0.76      1.00      0.87        65
           2       0.63      0.91      0.74       192
           3       0.99      0.68      0.81       414
           4       0.95      0.82      0.88       233
           5       0.87      0.94      0.91       234
           6       0.68      0.74      0.71       313

    accuracy                           0.80      1600
   macro avg       0.81      0.84      0.81      1600
weighted avg       0.83      0.80      0.80      1600

Confusion matrix (Train)
Predicted    0   1    2    3    4    5    6
Actual                                     
0          120   9   18    0    0    2    0
1            0  65    0    0    0    0    0
2            6  11  175    0    0    0    0
3            3   0    0  

In [931]:
# K nearest neighbors classifier
def create_model_knn():
    model=KNeighborsClassifier(n_neighbors=3)
    return model
create_model = create_model_knn
model_knn = create_model()
model_knn.fit(x_train, y_train)
y_val_hat = model_mlp.predict(x_val)
y_val_hat

array([6, 5, 4, ..., 6, 2, 1], dtype=int64)

In [932]:
# Computing BER of K nearest neighbors classifier
BER = computeBER(y_val,y_val_hat)
print(BER)

TPR :  [0.80536913 1.         0.91145833 0.68115942 0.81974249 0.94017094
 0.73801917]
TNR :  [0.97312198 0.98697068 0.92684659 0.99662732 0.99195318 0.97657394
 0.91686092]
PPV :  [0.75471698 0.76470588 0.6294964  0.98601399 0.94554455 0.87301587
 0.68343195]
FPR :  [0.02687802 0.01302932 0.07315341 0.00337268 0.00804682 0.02342606
 0.08313908]
FNR :  [0.19463087 0.         0.08854167 0.31884058 0.18025751 0.05982906
 0.26198083]
ACC :  [0.9575   0.9875   0.925    0.915    0.966875 0.97125  0.881875]
BER :  [0.11075444 0.00651466 0.08084754 0.16110663 0.09415216 0.04162756
 0.17255996]
[0.11075444 0.00651466 0.08084754 0.16110663 0.09415216 0.04162756
 0.17255996]


In [933]:
# Evaluating K nearest neighbors classifier
train_sc = model_knn.score(x_train, y_train)
test_sc = model_knn.score(x_val, y_val)

print('Accuracy:', metrics.accuracy_score(y_val, y_val_hat))
print('Train score:', train_sc)
print('Test score:', test_sc)
evaluate_model('Train', y_val, y_val_hat, y_val_prob)

Accuracy: 0.8025
Train score: 0.9554697554697554
Test score: 0.76
Model evaluation (Train)
Accuracy:
0.8025
Classification report:
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       149
           1       0.76      1.00      0.87        65
           2       0.63      0.91      0.74       192
           3       0.99      0.68      0.81       414
           4       0.95      0.82      0.88       233
           5       0.87      0.94      0.91       234
           6       0.68      0.74      0.71       313

    accuracy                           0.80      1600
   macro avg       0.81      0.84      0.81      1600
weighted avg       0.83      0.80      0.80      1600

Confusion matrix (Train)
Predicted    0   1    2    3    4    5    6
Actual                                     
0          120   9   18    0    0    2    0
1            0  65    0    0    0    0    0
2            6  11  175    0    0    0    0
3            3   0    0  28

In [934]:
# Predicting target values of test data using Support Vector Machine Classifier
y_test_hat = model_svc.predict(x_test)
y_test_hat

array([2, 4, 5, ..., 3, 6, 5], dtype=int64)

In [935]:
y_test_hat.shape

(4000,)

In [936]:
# Loading predicted data into a new csv file
df = pd.DataFrame(y_test_hat, columns = ['ouput'])
df.to_csv('BMI555IEE520Results2021KumarGaurav.csv') 