In [34]:
"""
Authors: Abdul Samad            samad19472002@gmail.com
         Janzaib Masood         janzaibaloch786@gmail.com
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split # ,KFold
from sklearn.cross_validation import KFold

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour, NearMiss
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalanceCascade, EasyEnsemble
from sklearn.ensemble import AdaBoostClassifier
import warnings

warnings.filterwarnings('ignore')
%pylab inline
pylab.rcParams['figure.figsize'] = (12, 6)
plt.style.use('fivethirtyeight')

Populating the interactive namespace from numpy and matplotlib


In [35]:
# Here you begin working on the data
df = pd.read_excel('Data.xlsx')

# Deleting the two mentioned columns from the dataframe
df.drop('VchMotherTounge', axis=1, inplace =True)
df.drop('MotherTongueBin', axis=1, inplace =True)
df.head()

Unnamed: 0,GenderCode,IntBrothers,IntSisters,IntSchoolBrothers,IntSchoolSisters,ClassSchoolStatus,Disability01,Lang1,Lang2,Lang3,Lang4,Religion,RESULT
0,0,3,3,3,2,1,0,1,0,0,0,1,FAIL
1,0,4,5,1,1,1,0,1,0,0,0,1,PASS
2,0,5,3,0,0,1,0,1,0,0,0,1,PASS
3,0,2,4,1,4,1,0,1,0,0,0,1,PASS
4,0,2,1,0,0,1,0,1,0,0,0,1,FAIL


In [36]:
# Slicing from main dataframe to Input Data(X) and output Data(y)
y = df.iloc[:, 12]
X = df.iloc[:,:12]
# Replacng PASS and FAIL with integers 1 and 0 respectively
y[y == 'PASS'] = 1
y[y == 'FAIL'] = 0
display(X.head())
display(y.head())

Unnamed: 0,GenderCode,IntBrothers,IntSisters,IntSchoolBrothers,IntSchoolSisters,ClassSchoolStatus,Disability01,Lang1,Lang2,Lang3,Lang4,Religion
0,0,3,3,3,2,1,0,1,0,0,0,1
1,0,4,5,1,1,1,0,1,0,0,0,1
2,0,5,3,0,0,1,0,1,0,0,0,1
3,0,2,4,1,4,1,0,1,0,0,0,1
4,0,2,1,0,0,1,0,1,0,0,0,1


0    0
1    1
2    1
3    1
4    0
Name: RESULT, dtype: object

In [37]:
a = y.values
b = X.values
del(y)
del(X)
X = []
Y = []
length = len(a)
for i in range(length):
    X.append(b[i,:])
    Y.append(a[i])

In [38]:
dTree  = DecisionTreeClassifier()  
logReg = LogisticRegression()
knn    = KNeighborsClassifier(n_neighbors=5)
rF     = RandomForestClassifier()
MLP    = MLPClassifier()

report = np.zeros((11, 5))

In [39]:
scores_dTree = cross_val_score(dTree, X, Y, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X, Y, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X, Y, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X, Y, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X, Y, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('Before Resampling of the real data: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[0, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

Before Resampling of the real data: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.495314174232 0.509189781938 0.507560502765 0.516232414359 0.518804630924


In [40]:
# Let's try Random Undersampling
rmun = RandomUnderSampler()
X_resampled, Y_resampled = rmun.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After applying Random Undersampling: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[1, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After applying Random Undersampling: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.453601843326 0.335188023706 0.495799206936 0.436014041514 0.377797287381


In [41]:
# Let's try Condensed Nearest Neighbour
cnn = CondensedNearestNeighbour()
X_resampled, Y_resampled = cnn.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After applying Condensed Nearest Neighbours: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[2, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After applying Condensed Nearest Neighbours: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.336206899984 0.402026378141 0.364855111035 0.293699916139 0.39243590188


In [42]:
# Let's try NearMiss
nm = NearMiss()
X_resampled, Y_resampled = nm.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After applying NearMiss: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[3, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After applying NearMiss: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.421589244204 0.617305627256 0.489123468717 0.452012519456 0.672929213377


In [43]:
# Let's try Edited Nearest Neighbour
enn = EditedNearestNeighbours()
X_resampled, Y_resampled = enn.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After applying Edited Nearest Neighbours: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[4, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After applying Edited Nearest Neighbours: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.668346746821 0.524898561075 0.668403219136 0.735491825571 0.572154300814


In [44]:
# Let's try Repeated Edited Nearest Neighbour
renn = RepeatedEditedNearestNeighbours()
X_resampled, Y_resampled = renn.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After applying Repeated Edited Nearest Neighbours: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[5, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After applying Repeated Edited Nearest Neighbours: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.835295529713 0.527254436502 0.860550606352 0.931867331697 0.70573517906


In [45]:
# Let's try Tomek Link Removal
tkLink = TomekLinks()
X_resampled, Y_resampled = tkLink.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After applying Tomek Link Removal: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[6, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After applying Tomek Link Removal: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.489433489293 0.511774231686 0.505590188824 0.511649330506 0.516169564212


In [46]:
# Let's try Random Oversampling
rmov = RandomOverSampler()
X_resampled, Y_resampled = rmov.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After applying Random Oversampling: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[7, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After applying Random Oversampling: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.726059273511 0.378928498061 0.7168737493 0.771856306836 0.50259778079


In [47]:
# Let's try SMOTE
sm = SMOTE(ratio = 'auto', kind = 'regular')
X_resampled, Y_resampled = sm.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After appling SMOTE: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[8, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After appling SMOTE: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.759849985679 0.359485578592 0.737199411434 0.801370105876 0.556297849749


In [48]:
# Let's try SMOTENN
sm_en = SMOTEENN()
X_resampled, Y_resampled = sm_en.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After appling SMOTENN: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[9, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After appling SMOTENN: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.848565605841 0.481939517389 0.944085405096 0.933680106329 0.706719705035


In [49]:
# Let's try SMOTE+Tomek Link removal
sm_tk = SMOTETomek()
X_resampled, Y_resampled = sm_tk.fit_sample(X, Y)

scores_dTree = cross_val_score(dTree, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_dTree = scores_dTree.mean()

scores_logReg = cross_val_score(logReg, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_logReg = scores_logReg.mean()

scores_knn = cross_val_score(knn, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_knn = scores_knn.mean()

scores_rF = cross_val_score(rF, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_rF = scores_rF.mean()

scores_MLP = cross_val_score(MLP, X_resampled, Y_resampled, cv = 10, scoring='roc_auc')
scores_MLP = scores_MLP.mean()

print('After appling SMOTE+Tomek Link Removal: ')
print('   dTree,       logReg   ,         KNN    ,       rF   ,           MLP')
print(scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP)

report[10, :] = [scores_dTree, scores_logReg, scores_knn, scores_rF, scores_MLP]

After appling SMOTE+Tomek Link Removal: 
   dTree,       logReg   ,         KNN    ,       rF   ,           MLP
0.758312107477 0.350414586899 0.73690348045 0.793735150136 0.517859539892


In [51]:
report = np.transpose(report)

print(report[0,:])
report.shape

[ 0.49531417  0.50918978  0.5075605   0.51623241  0.51880463]


(11, 5)

In [55]:
Classifiers = ['DecisionTreeClassifier', 'LogisticRegression', 'KNeighborsClassifier', 'RandomForestClassifier',  'MLPClassifier']
Samplers = ['None','Random Undersampling', 'CNN', 'NearMiss', 'ENN', 'RENN','Tomek Links','SMOTE', 'Random Oversampling', 'SMOTEENN','SMOTETomek']

report = np.transpose(report)
#del(df)
df = pd.DataFrame(report, columns = Samplers, index = Classifiers)
df

Unnamed: 0,None,Random Undersampling,CNN,NearMiss,ENN,RENN,Tomek Links,SMOTE,Random Oversampling,SMOTEENN,SMOTETomek
DecisionTreeClassifier,0.495314,0.453602,0.336207,0.421589,0.668347,0.835296,0.489433,0.726059,0.75985,0.848566,0.758312
LogisticRegression,0.50919,0.335188,0.402026,0.617306,0.524899,0.527254,0.511774,0.378928,0.359486,0.48194,0.350415
KNeighborsClassifier,0.507561,0.495799,0.364855,0.489123,0.668403,0.860551,0.50559,0.716874,0.737199,0.944085,0.736903
RandomForestClassifier,0.516232,0.436014,0.2937,0.452013,0.735492,0.931867,0.511649,0.771856,0.80137,0.93368,0.793735
MLPClassifier,0.518805,0.377797,0.392436,0.672929,0.572154,0.705735,0.51617,0.502598,0.556298,0.70672,0.51786
