In [21]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [22]:
trainfile = r'/gdrive/My Drive/CIS508/Assignment-2/Portugese Bank Data - TRAIN.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/CIS508/Assignment-2/Portugese Bank Data - TEST.csv'
testData = pd.read_csv(testfile) #creates a dataframe

#print sizes (shape) of datasets
print(trainData.shape)
print(testData.shape)

print(trainData.head())
print(testData.head())

(4521, 17)
(45211, 17)
   age          job  marital  education  ... pdays  previous poutcome   y
0   30   unemployed  married    primary  ...    -1         0  unknown  no
1   33     services  married  secondary  ...   339         4  failure  no
2   35   management   single   tertiary  ...   330         1  failure  no
3   30   management  married   tertiary  ...    -1         0  unknown  no
4   59  blue-collar  married  secondary  ...    -1         0  unknown  no

[5 rows x 17 columns]
   age           job  marital  education  ... pdays  previous poutcome   y
0   58    management  married   tertiary  ...    -1         0  unknown  no
1   44    technician   single  secondary  ...    -1         0  unknown  no
2   33  entrepreneur  married  secondary  ...    -1         0  unknown  no
3   47   blue-collar  married    unknown  ...    -1         0  unknown  no
4   33       unknown   single    unknown  ...    -1         0  unknown  no

[5 rows x 17 columns]


In [23]:
#Copy Train data excluding target
trainData_Copy = trainData.iloc[:, :-1].copy()
testData_Copy = testData.iloc[:, :-1].copy()

print(trainData_Copy.head())
print(testData_Copy.head())

   age          job  marital  education  ... campaign  pdays previous poutcome
0   30   unemployed  married    primary  ...        1     -1        0  unknown
1   33     services  married  secondary  ...        1    339        4  failure
2   35   management   single   tertiary  ...        1    330        1  failure
3   30   management  married   tertiary  ...        4     -1        0  unknown
4   59  blue-collar  married  secondary  ...        1     -1        0  unknown

[5 rows x 16 columns]
   age           job  marital  education  ... campaign  pdays previous poutcome
0   58    management  married   tertiary  ...        1     -1        0  unknown
1   44    technician   single  secondary  ...        1     -1        0  unknown
2   33  entrepreneur  married  secondary  ...        1     -1        0  unknown
3   47   blue-collar  married    unknown  ...        1     -1        0  unknown
4   33       unknown   single    unknown  ...        1     -1        0  unknown

[5 rows x 16 columns]


In [24]:
#List of Categorical Features
categoricalFeatures = []
for col in trainData_Copy.columns:
  if col in testData_Copy.columns:
    categoricalFeatures.append(col)
#Removing age from categorical features as it does not add value
categoricalFeatures.remove("age")
print (categoricalFeatures)

['job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']


In [25]:
#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData_Copy,testData_Copy], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)

#Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)

print('----------------------------------------------------------------')
print('----------------------------------------------------------------')
#print(X_train.head())
#print(X_test.head())
X_train.head()

----------------------------------------------------------------
----------------------------------------------------------------


Unnamed: 0,age,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,balance_-8019,balance_-6847,balance_-4057,balance_-3372,balance_-3313,balance_-3058,balance_-2827,balance_-2712,balance_-2604,balance_-2282,balance_-2122,balance_-2093,balance_-2082,balance_-2049,balance_-1980,balance_-1968,balance_-1965,balance_-1944,...,previous_5,previous_6,previous_7,previous_8,previous_9,previous_10,previous_11,previous_12,previous_13,previous_14,previous_15,previous_16,previous_17,previous_18,previous_19,previous_20,previous_21,previous_22,previous_23,previous_24,previous_25,previous_26,previous_27,previous_28,previous_29,previous_30,previous_32,previous_35,previous_37,previous_38,previous_40,previous_41,previous_51,previous_55,previous_58,previous_275,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,33,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,35,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,30,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,59,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [26]:
#Extracting Target column from the training data set
y_train = trainData.iloc[:,-1]
print(y_train.head())
#X_train = trainData.drop(["Contraceptive method used"], axis=1) #extracting training data without the target column
#Extracting Target column from the test data set
y_test = testData.iloc[:,-1]
print(y_test.head())
#X_test = testData.drop(["Contraceptive method used"], axis=1) #extracting training data without the target column



0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object
0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object


In [27]:
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (testing) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))



accuracy Score (testing) for Decision Tree:0.879299
Confusion Matrix for Decision Tree
[[37983  1939]
 [ 3518  1771]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.92      0.95      0.93     39922
         yes       0.48      0.33      0.39      5289

    accuracy                           0.88     45211
   macro avg       0.70      0.64      0.66     45211
weighted avg       0.86      0.88      0.87     45211



In [28]:
#Hyperparameter tuning done for decision tree classifier
#RANDOM SEARCH--------------------------------------------
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(10,200,10),
            'max_depth': range(5,40,5),
            'criterion':['gini','entropy']
            }
clf_random = RandomizedSearchCV(clf,parameters,n_iter=20,cv=10)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

#GRID SEARCH----------------------------------------
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)



RandomizedSearchCV-Decision tree
{'min_samples_leaf': 100, 'max_depth': 25, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 50}


In [29]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)


#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testing) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("accuracy Score (testing) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after Random hypertuning for Decision Tree (Testing)")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report for Random hypertesting (Testing)  ===")
print(classification_report(y_test,clf_predict))
print("Confusion Matrix after Grid hypertuning for Decision Tree (Testing)")
print(confusion_matrix(y_test,clfr_predict))
print("=== Classification Report for Grid hypertesting (Testing)  ===")
print(classification_report(y_test,clfr_predict))

clf_cv_score = cross_val_score(clf, X_train, y_train, cv=15, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')




accuracy Score (testing) after hypertuning randomized search for Decision Tree:0.892858
accuracy Score (testing) after hypertuning grid search for Decision Tree:0.892858
Confusion Matrix after Random hypertuning for Decision Tree (Testing)
[[39389   533]
 [ 4311   978]]
=== Classification Report for Random hypertesting (Testing)  ===
              precision    recall  f1-score   support

          no       0.90      0.99      0.94     39922
         yes       0.65      0.18      0.29      5289

    accuracy                           0.89     45211
   macro avg       0.77      0.59      0.61     45211
weighted avg       0.87      0.89      0.87     45211

Confusion Matrix after Grid hypertuning for Decision Tree (Testing)
[[39389   533]
 [ 4311   978]]
=== Classification Report for Grid hypertesting (Testing)  ===
              precision    recall  f1-score   support

          no       0.90      0.99      0.94     39922
         yes       0.65      0.18      0.29      5289

    accurac

In [30]:
#Normal randomforest
rand_parameters={'min_samples_leaf' : range(10,200,10),
                 'max_depth': range(1,20,2),
                 'max_features':range(10,100,10),
                 'n_estimators':[20,30,40]}
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (testing) for Default Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Default Random Forest (testing):")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report for Default Random Forest(Testing)  ===")
print(classification_report(y_test,rfc_predict))





accuracy Score (testing) for Default Random Forest:0.898565
Confusion Matrix for Default Random Forest (testing):
[[39656   266]
 [ 4320   969]]
=== Classification Report for Default Random Forest(Testing)  ===
              precision    recall  f1-score   support

          no       0.90      0.99      0.95     39922
         yes       0.78      0.18      0.30      5289

    accuracy                           0.90     45211
   macro avg       0.84      0.59      0.62     45211
weighted avg       0.89      0.90      0.87     45211



In [31]:
#RANDOMIZED SEARCH for Random Forest----------------------------------------
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=20,cv=10)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("accuracy Score (testing) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train)
print(rfc_cv_score)
print('\n')


{'n_estimators': 30, 'min_samples_leaf': 80, 'max_features': 20, 'max_depth': 17}
accuracy Score (testing) after hypertuning for Random Forest:0.883015
Confusion Matrix after hypertuning for Random Forest:
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211

[0.88461538 0.88453882 0.88512616]




In [0]:
#GRID SEARCH for RANDOM FOREST----------------------------------------
rfc_grid = GridSearchCV(rfc,rand_parameters)
rfc_grid.fit(X_train, y_train)
grid_parm_rfc=rfc_grid.best_params_
print(grid_parm_rfc)
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("accuracy Score (testing) after Grid hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after Grid hypertuning for Random Forest (Testing):")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report after Grid hypertuning for Random Forest (Testing) ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train)
print(rfc_cv_score)
print('\n')