In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix
from sklearn import metrics,tree
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [3]:
pd.options.display.max_columns=None   # To display all the columns

In [4]:
# Loading data
train = pd.read_csv('/content/drive/MyDrive/Projects/Classifying-employees-based-on-salary-master/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Projects/Classifying-employees-based-on-salary-master/test.csv')
test.head()

Unnamed: 0,ID,Year Type,Year,Organization Group Code,Organization Group,Department Code,Department,Union Code,Union,Job Family Code,Job Family,Job Code,Job,Employee Identifier,Overtime,Other Salaries,Retirement,Health/Dental,Other Benefits,Total Benefits,Class
0,50877,Fiscal,2016,6,General Administration & Finance,ADM,ADM Gen Svcs Agency-City Admin,790,"SEIU - Miscellaneous, Local 1021",2500,Med Therapy & Auxiliary,2578,Med Examiner's Investigator II,12805,1387.31,3644.19,6767.72,4361.97,3377.16,14506.85,1
1,161168,Fiscal,2016,1,Public Protection,POL,POL Police,911,Police Officers' Association,Q000,Police Services,Q002,Police Officer,39215,0.0,12087.57,8368.04,6977.73,922.73,16268.5,2
2,165825,Fiscal,2014,2,"Public Works, Transportation & Commerce",MTA,MTA Municipal Transprtn Agncy,253,"Transport Workers - Transit Operators, Local 2...",9100,Street Transit,9163,Transit Operator,56734,22693.82,4991.83,19958.0,13105.21,7216.69,40279.9,2
3,134872,Fiscal,2014,3,Human Welfare & Neighborhood Development,DSS,HSA Human Services Agency,250,"SEIU - Health Workers, Local 1021",7500,Semi-Skilled & General Labor,7524,Institution Utility Worker,28377,0.0,626.4,12914.72,12516.72,4291.71,29723.15,2
4,99872,Fiscal,2016,2,"Public Works, Transportation & Commerce",DPW,DPW GSA - Public Works,1,Miscellaneous Unrepresented Employees,9900,Public Service Aide,9916,Public Svc Aide-Public Works,44544,356.4,992.81,7085.63,13125.67,2637.39,22848.69,1


In [5]:
train.isnull().sum()

ID                         0
Year Type                  0
Year                       0
Organization Group Code    0
Organization Group         0
Department Code            0
Department                 0
Union Code                 0
Union                      0
Job Family Code            0
Job Family                 0
Job Code                   0
Job                        0
Employee Identifier        0
Overtime                   0
Other Salaries             0
Retirement                 0
Health/Dental              0
Other Benefits             0
Total Benefits             0
Class                      0
dtype: int64





# Feature Engineering



In [6]:
# Dropping irrelevant and duplicate columns
col = ['Year','Organization Group Code','Department Code','Union','Job Family Code',\
       'Job','Overtime','Other Salaries',\
       'Retirement','Health/Dental','Other Benefits','Total Benefits','Class']

In [7]:
# Data with new set of columns
train_new = train[col].copy()
test_new = test[col].copy()
train_new.head()

Unnamed: 0,Year,Organization Group Code,Department Code,Union,Job Family Code,Job,Overtime,Other Salaries,Retirement,Health/Dental,Other Benefits,Total Benefits,Class
0,2015,3,DSS,"SEIU - Human Services, Local 1021",2900,Senior Eligibility Worker,4091.51,2066.4,17570.65,12501.45,6740.39,36812.49,2
1,2014,2,MTA,"Transport Workers - Transit Operators, Local 2...",9100,Transit Operator,27816.21,1185.88,19338.0,13445.88,7298.74,40082.62,2
2,2016,5,LIB,"SEIU - Miscellaneous, Local 1021",3600,Librarian 2,0.0,1516.36,18149.13,13175.97,7831.28,39156.38,3
3,2014,1,ECD,"SEIU - Miscellaneous, Local 1021",8200,Public Safetycomm Supv,12235.56,13348.36,22177.39,12444.79,9566.78,44188.96,3
4,2015,1,SHF,Deputy Sheriffs' Association,8300,Deputy Sheriff,36837.03,13675.04,23377.98,11113.55,2310.65,36802.18,3


In [8]:
# number of unique values in categorical columns
for i in ['Year','Organization Group Code','Department Code','Union','Job Family Code','Job']:
  print(i,' : ',train[i].nunique())

Year  :  5
Organization Group Code  :  7
Department Code  :  56
Union  :  77
Job Family Code  :  62
Job  :  1140


## Mean Encoding 


In [9]:
import sklearn
from sklearn.model_selection import StratifiedKFold
## 10 fold cv
kf = sklearn.model_selection.KFold(n_splits = 10, shuffle = False) 

In [10]:
# Mean encoding the categorical features using kfold to minimize bias
x = train_new.copy()
cols = ['Department Code','Union','Job Family Code','Job']
for i in cols: ## Looping through all features   
  x['mean_encoded_on'] = np.nan
  for tr_ind, val_ind in kf.split(x):
      X_tr, X_val = x.iloc[tr_ind], x.iloc[val_ind] ## train-test hold out
      x.loc[x.index[val_ind], 'mean_encoded_on'] = X_val[i].map(X_tr.groupby(i).Class.mean())

  x = x.rename(index=str, columns={"mean_encoded_on": i +"_K_Encoded"})

In [11]:
# Filling the null values with mean of labels taken from the whole dataset
x['Job_K_Encoded'].fillna(x['Job'].map(train_new.groupby(['Job']).Class.mean()),inplace=True)
x['Job Family Code_K_Encoded'].fillna(x['Job Family Code'].map(train_new.groupby(['Job Family Code']).Class.mean()),inplace=True)
x['Union_K_Encoded'].fillna(x['Union'].map(train_new.groupby(['Union']).Class.mean()),inplace=True)

In [12]:
x1 = test_new.copy()
cols = ['Department Code','Union','Job Family Code','Job']
for i in cols:
  x1['mean_encoded_on'] = np.nan
  x1['mean_encoded_on'] = x1[i].map(x.groupby(i).Class.mean())
  x1 = x1.rename(index=str, columns={"mean_encoded_on": i +"_K_Encoded"})

In [13]:
x1['Job_K_Encoded'].fillna(x1['Job'].map(test_new.groupby(['Job']).Class.mean()),inplace=True)

## One hot encoding on year and organization features

In [14]:
x['group'] = 'train'
x1['group'] = 'test'

In [15]:
data = pd.concat((x,x1))
data.head()

Unnamed: 0,Year,Organization Group Code,Department Code,Union,Job Family Code,Job,Overtime,Other Salaries,Retirement,Health/Dental,Other Benefits,Total Benefits,Class,Department Code_K_Encoded,Union_K_Encoded,Job Family Code_K_Encoded,Job_K_Encoded,group
0,2015,3,DSS,"SEIU - Human Services, Local 1021",2900,Senior Eligibility Worker,4091.51,2066.4,17570.65,12501.45,6740.39,36812.49,2,1.700986,1.99364,2.066503,1.8607,train
1,2014,2,MTA,"Transport Workers - Transit Operators, Local 2...",9100,Transit Operator,27816.21,1185.88,19338.0,13445.88,7298.74,40082.62,2,1.942702,1.734379,1.866387,1.734379,train
2,2016,5,LIB,"SEIU - Miscellaneous, Local 1021",3600,Librarian 2,0.0,1516.36,18149.13,13175.97,7831.28,39156.38,3,1.698099,1.620886,1.645994,2.688889,train
3,2014,1,ECD,"SEIU - Miscellaneous, Local 1021",8200,Public Safetycomm Supv,12235.56,13348.36,22177.39,12444.79,9566.78,44188.96,3,2.407821,1.620886,1.691344,2.657143,train
4,2015,1,SHF,Deputy Sheriffs' Association,8300,Deputy Sheriff,36837.03,13675.04,23377.98,11113.55,2310.65,36802.18,3,2.552316,2.738871,2.451553,2.877804,train


In [16]:
year = pd.get_dummies(data['Year'],drop_first=True,prefix='year')
organization = pd.get_dummies(data['Organization Group Code'],drop_first=True,prefix='org')

In [17]:
data_mean_encoded = pd.concat((data,year,organization),axis=1)
data_mean_encoded.drop(['Year','Organization Group Code','Department Code','Union','Job Family Code','Job'],axis=1,inplace=True)
data_mean_encoded.head()

Unnamed: 0,Overtime,Other Salaries,Retirement,Health/Dental,Other Benefits,Total Benefits,Class,Department Code_K_Encoded,Union_K_Encoded,Job Family Code_K_Encoded,Job_K_Encoded,group,year_2014,year_2015,year_2016,year_2017,org_2,org_3,org_4,org_5,org_6,org_7
0,4091.51,2066.4,17570.65,12501.45,6740.39,36812.49,2,1.700986,1.99364,2.066503,1.8607,train,0,1,0,0,0,1,0,0,0,0
1,27816.21,1185.88,19338.0,13445.88,7298.74,40082.62,2,1.942702,1.734379,1.866387,1.734379,train,1,0,0,0,1,0,0,0,0,0
2,0.0,1516.36,18149.13,13175.97,7831.28,39156.38,3,1.698099,1.620886,1.645994,2.688889,train,0,0,1,0,0,0,0,1,0,0
3,12235.56,13348.36,22177.39,12444.79,9566.78,44188.96,3,2.407821,1.620886,1.691344,2.657143,train,1,0,0,0,0,0,0,0,0,0
4,36837.03,13675.04,23377.98,11113.55,2310.65,36802.18,3,2.552316,2.738871,2.451553,2.877804,train,0,1,0,0,0,0,0,0,0,0


### Eliminating irrelavant rows






In [18]:
df = data_mean_encoded.copy()

In [19]:
# Eliminate rows having negative values as they don't make sense
df = df[(df['Other Salaries']>=0)&(df['Overtime']>=0)&(df['Retirement']>=0)&\
            (df['Health/Dental']>=0)&(df['Other Benefits']>=0)]

In [20]:
df.shape

(148502, 22)

In [21]:
# Splitting the data into train and test datasets using group column
train = df[df['group']=='train'].copy()
test = df[df['group']=='test'].copy()

In [22]:
# dropping group column
train.drop('group',axis=1,inplace=True) 

In [23]:
# dropping group and class columns
test.drop('group',axis=1,inplace=True)

# Building Models

In [24]:
# defining target variable 'y' 
x = train.drop('Class',axis=1)
y = train['Class']

In [130]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,BaggingClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

## cross validation on all models

In [36]:
clf = tree.DecisionTreeClassifier(random_state=0,max_depth=3)
rf = RandomForestClassifier(random_state=0)
knn = KNeighborsClassifier()
gb = GradientBoostingClassifier(random_state=0,learning_rate=0.1)
bagging = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(random_state=0))
adaboost = AdaBoostClassifier(random_state=0)

In [42]:
models = [clf,rf,knn,gb,bagging,adaboost]
accuracy = []
for model in models:
  cv = []
  cv = cross_val_score(model,x,y,cv=5,scoring='accuracy')
  accuracy.append(cv.mean())
print(accuracy)

[0.937122644131134, 0.9855974543189209, 0.9691149730558284, 0.9763573820661021, 0.9840711502749953, 0.9386788616999869]


In [47]:
temp = pd.DataFrame({'Model':['Decision Tree','Random Forest','KNearestNeigbours','Gradient Boosting','Bagging','Adaboost'],
                     'Accuracy':accuracy})
temp.sort_values(by='Accuracy',ascending=False)

Unnamed: 0,Model,Accuracy
1,Random Forest,0.985597
4,Bagging,0.984071
3,Gradient Boosting,0.976357
2,KNearestNeigbours,0.969115
5,Adaboost,0.938679
0,Decision Tree,0.937123


## Hyperparameter Tuning




In [60]:
xtest = test.drop(['Class'],axis=1).copy()
ytest = test['Class']

### RandomizedSearchCV with Decision Tree

In [77]:
# Defining parameter grid
param_grid = {'criterion':['gini','entropy'],'min_samples_split':[2,10,20],\
              'max_depth':[None,2,5,10],'min_samples_leaf':[1,5,10],'max_leaf_nodes':[None,5,10,20]}

In [78]:
# Grid Search with decision tree model
dt = tree.DecisionTreeClassifier(random_state=0)  
grid_dt = RandomizedSearchCV(dt,param_grid,cv=2,n_jobs=-1,n_iter=40)   # Grid Search and Cross Validation
grid_dt.fit(x,y)    # Building model

RandomizedSearchCV(cv=2, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=0,
             

In [79]:
grid_dt.best_params_   # best parameters among parameter grid

{'criterion': 'entropy',
 'max_depth': None,
 'max_leaf_nodes': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2}

In [80]:
grid_dt.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [82]:
tuned_dt = tree.DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')
tuned_dt.fit(x,y)

dt_pred_grid = tuned_dt.predict(xtest)   # prediction using built model
dt_pred_grid

array([1, 2, 2, ..., 3, 2, 2])

In [83]:
dt_accuracy = accuracy_score(val_pred_grid,ytest)   # calculating accuracy score
dt_accuracy

0.9831593128999663

In [84]:
print(classification_report(ytest,dt_pred_grid))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99      4836
           2       0.98      0.97      0.98      4901
           3       0.99      0.99      0.99      5108

    accuracy                           0.98     14845
   macro avg       0.98      0.98      0.98     14845
weighted avg       0.98      0.98      0.98     14845



### RandomizedSearchCV on Random forest

In [85]:
param_grid = {'criterion':['gini','entropy'],'min_samples_split':[2,10,20],\
              'max_depth':[None,2,5,10],'min_samples_leaf':[1,5,10],'max_leaf_nodes':[None,5,10,20]}

In [87]:
# Grid search with Random Forest Model
rf = RandomForestClassifier(random_state=0)
grid_rf = RandomizedSearchCV(rf,param_grid,cv=2,n_jobs=4,n_iter=40)
grid_rf.fit(x,y)

RandomizedSearchCV(cv=2, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [88]:
grid_rf.best_params_   # best parameters among parameter grid

{'criterion': 'entropy',
 'max_depth': None,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 20}

In [121]:
# feature importance
importances = grid_rf.best_estimator_.feature_importances_
importances = pd.DataFrame([x.columns,importances]).transpose().sort_values(by = 1,ascending=False)
importances.columns = ['Variables','Importance']
importances

Unnamed: 0,Variables,Importance
2,Retirement,0.296347
5,Total Benefits,0.192943
4,Other Benefits,0.161613
3,Health/Dental,0.104317
9,Job_K_Encoded,0.0931139
7,Union_K_Encoded,0.05491
8,Job Family Code_K_Encoded,0.0287091
1,Other Salaries,0.0227501
6,Department Code_K_Encoded,0.0141601
0,Overtime,0.0133426


In [101]:
grid_rf.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [102]:
tuned_rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

tuned_rf.fit(x,y)
rf_pred_grid = tuned_rf.predict(xtest) # prediction using RF model
rf_pred_grid

array([1, 2, 2, ..., 3, 2, 2])

In [103]:
accuracy_score(rf_pred_grid,ytest)

0.9868642640619737

In [127]:
# Using the top_n important features for model building
top_n = importances['Variables'][:14]
tuned_rf_n = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

tuned_rf_n.fit(x[top_n],y)
rf_n_pred_grid = tuned_rf_n.predict(xtest[top_n])
rf_n_pred_grid

array([1, 2, 2, ..., 3, 2, 2])

In [157]:
rf_accuracy = accuracy_score(rf_n_pred_grid,ytest)
rf_accuracy

0.9870663523071741

### GridSearchCV on KNN

In [138]:
param_grid = {'n_neighbors' : [3,5,7,9],'metric' : ['euclidean','manhattan','minkowski']}

In [139]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
grid_knn = GridSearchCV(knn,param_grid,cv=2)
grid_knn.fit(x,y)

GridSearchCV(cv=2, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'metric': ['euclidean', 'manhattan', 'minkowski'],
                         'n_neighbors': [3, 5, 7, 9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [140]:
grid_knn.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [141]:
grid_knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
grid_knn.fit(x,y)
knn_pred_grid = grid_knn.predict(xtest)   # prediction of validation target variable using model

In [158]:
knn_accuracy = accuracy_score(knn_pred_grid,ytest)
knn_accuracy

0.9729201751431459

### Bagging Classifier

In [151]:
bagging = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
bagging.fit(x,y)
bagging_pred = bagging.predict(xtest)

In [159]:
bagging_accuracy = accuracy_score(bagging_pred,ytest)
bagging_accuracy

0.9872684405523745

### Gradient Boosting

In [153]:
boost = GradientBoostingClassifier(random_state=1)
param_grid = {'learning_rate':[0.01,0.05,0.1,0.005]}
boost_grid = GridSearchCV(boost,param_grid,cv=2,scoring='accuracy')
boost_grid.fit(x,y)

GridSearchCV(cv=2, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
         

In [155]:
boost_grid.best_estimator_

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=1, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [156]:
boost = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=1, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
boost.fit(x,y)
boost_pred = boost.predict(xtest)

In [160]:
gb_accuracy = accuracy_score(boost_pred,ytest)
gb_accuracy

0.9768272145503537

### AdaBoost Classifier

In [143]:
param_grid = {"n_estimators": [5, 10, 15, 20, 25, 50, 75, 100], 
              "learning_rate": [0.001, 0.01, 0.1, 1]}

In [145]:
adaboost = AdaBoostClassifier(random_state=1)
adaboost_grid = GridSearchCV(adaboost,param_grid,cv=2,scoring='accuracy')
adaboost_grid.fit(x,y)

GridSearchCV(cv=2, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 1],
                         'n_estimators': [5, 10, 15, 20, 25, 50, 75, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [146]:
adaboost_grid.best_estimator_

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=75, random_state=1)

In [147]:
tuned_adaboost = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=75, random_state=1)
tuned_adaboost.fit(x,y)
adaboost_pred_grid = tuned_adaboost.predict(xtest)

In [161]:
adaboost_accuracy = accuracy_score(ytest,adaboost_pred_grid)
adaboost_accuracy

0.9471202425058942

## Model Accuracies

In [162]:
accuracy = [dt_accuracy,rf_accuracy,knn_accuracy,gb_accuracy,bagging_accuracy,adaboost_accuracy]
temp = pd.DataFrame({'Model':['Decision Tree','Random Forest','KNearestNeigbours','Gradient Boosting','Bagging','Adaboost'],
                     'Accuracy':accuracy})
temp.sort_values(by='Accuracy',ascending=False)

Unnamed: 0,Model,Accuracy
4,Bagging,0.987268
1,Random Forest,0.987066
0,Decision Tree,0.983159
3,Gradient Boosting,0.976827
2,KNearestNeigbours,0.97292
5,Adaboost,0.94712


As the dataset is balanced, we can use accuracy as the metric to measure the performance of these models