In [51]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

**Univariate Selection**

In [37]:
df1 = pd.read_csv('bank_df.csv')
df1.head()

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,...,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,age_scaled,balance_scaled,duration_scaled,subscribed
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1.266667,1.25,0.375,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.333333,-0.308997,-0.134259,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,-0.4,-0.328909,-0.481481,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.533333,0.780236,-0.407407,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,-0.4,-0.329646,0.083333,0


In [2]:
df = pd.read_csv("bank_df.csv")
X = df.iloc[:,0:35]  #independent columns
y = df.iloc[:,-1]    #target column i.e price range

In [3]:
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(38,'Score'))  #print 10 best features

                  Specs        Score
33     poutcome_success  4113.000571
27            month_mar   749.691937
30            month_oct   734.708162
20      contact_unknown   733.354934
31            month_sep   677.273004
17          housing_yes   388.949715
28            month_may   330.368818
4           job_retired   269.699331
7           job_student   261.792961
22            month_dec   254.219350
34     poutcome_unknown   230.279723
0       job_blue-collar   184.348135
18             loan_yes   176.516137
14   education_tertiary   140.894501
12       marital_single   130.835717
11      marital_married    65.352757
23            month_feb    62.816516
25            month_jul    45.292910
32       poutcome_other    44.287113
3        job_management    38.743601
6          job_services    31.876642
13  education_secondary    29.141390
16          default_yes    22.313875
9        job_unemployed    18.254702
1      job_entrepreneur    16.903979
26            month_jun    11.259182
2

In [4]:
ordered_df = featureScores.nlargest(38,'Score')
df = df[list(ordered_df['Specs'].values)]
df.head()

Unnamed: 0,poutcome_success,month_mar,month_oct,contact_unknown,month_sep,housing_yes,month_may,job_retired,job_student,month_dec,...,month_jun,job_housemaid,month_nov,contact_telephone,education_unknown,month_jan,job_technician,month_aug,job_self-employed,job_unknown
0,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [5]:
#Create dataframes with different feature sets
top5_df = df.iloc[:,:5]
top10_df = df.iloc[:,:10]
top20_df = df.iloc[:,:20]
top30_df = df.iloc[:,:30]

**Top 5 Features Random Forest**

In [6]:
X = top5_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [7]:
#Create and gridsearch a random forest model for the data.
clf = RandomForestClassifier()

parameters = {'n_estimators': [10, 20, 50, 100, 200], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(clf, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

clf = grid_obj.best_estimator_

clf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=6, max_features='log2',
                       min_samples_leaf=2, min_samples_split=6,
                       n_estimators=10)

In [8]:
#Predict on model and get accuracy score
predictions_5 = clf.predict(X_test)
print(accuracy_score(y_test, predictions_5))

0.8936891772338543


In [9]:
#Random forest confusion matrix
confusion_matrix(y_test, predictions_5)

array([[11821,   163],
       [ 1279,   301]], dtype=int64)

**Top 10 Features Random Forest**

In [10]:
X = top10_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [11]:
#Create and gridsearch a random forest model for the data.
rf10 = RandomForestClassifier()

parameters = {'n_estimators': [10, 50, 100, 200, 500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf10, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf10 = grid_obj.best_estimator_

rf10.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features='log2', min_samples_leaf=2,
                       min_samples_split=6, n_estimators=10)

In [12]:
#Predict on model and get accuracy score
predictions_10 = rf10.predict(X_test)
print(accuracy_score(y_test, predictions_10))

0.8939840754939545


In [13]:
#Random forest confusion matrix
confusion_matrix(y_test, predictions_10)

array([[11840,   144],
       [ 1294,   286]], dtype=int64)

**Top 20 Random Forest**

In [14]:
X = top20_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [15]:
#Create and gridsearch a random forest model for the data.
rf20 = RandomForestClassifier()

parameters = {'n_estimators': [10,50,100,200,500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf20, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf20 = grid_obj.best_estimator_

rf20.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features='log2', min_samples_leaf=2,
                       min_samples_split=6, n_estimators=200)

In [16]:
#Predict on model and get accuracy score
predictions_20 = rf20.predict(X_test)
print(accuracy_score(y_test, predictions_20))

0.8938366263639045


In [20]:
#Random forest confusion matrix
confusion_matrix(y_test, predictions_20)

array([[11850,   134],
       [ 1306,   274]], dtype=int64)

**Top 30 Random Forest**

In [21]:
X = top30_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [22]:
#Create and gridsearch a random forest model for the data.
rf30 = RandomForestClassifier()

parameters = {'n_estimators': [10,50,100,200,500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf30, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf30 = grid_obj.best_estimator_

rf30.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=9, max_features='log2',
                       min_samples_leaf=2, min_samples_split=3,
                       n_estimators=10)

In [23]:
#Predict on model and get accuracy score
predictions_30 = rf30.predict(X_test)
print(accuracy_score(y_test, predictions_30))

0.8930256561486287


In [24]:
#Random forest confusion matrix
confusion_matrix(y_test, predictions_30)

array([[11876,   108],
       [ 1343,   237]], dtype=int64)

**All Features**

In [25]:
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [26]:
#Create and gridsearch a random forest model for the data.
rf_all = RandomForestClassifier()

parameters = {'n_estimators': [10,50,100,200,500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf30, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf_all = grid_obj.best_estimator_

rf_all.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features='log2', min_samples_leaf=2,
                       n_estimators=10)

In [27]:
#Predict on model and get accuracy score
predictions_all = rf_all.predict(X_test)
print(accuracy_score(y_test, predictions_all))

0.8934680035387791


In [28]:
#Random forest confusion matrix
confusion_matrix(y_test, predictions_all)

array([[11863,   121],
       [ 1324,   256]], dtype=int64)

**Logistic Regression**

In [47]:
X = top10_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [48]:
#Create and gridsearch logistic regression model
lr = LogisticRegression()

parameters = {'penalty': ['l2','none'],
              'max_iter': [100, 200, 400, 500],
              'n_jobs': [None, 2],
              'multi_class': ['auto','ovr','multinomial']}

roc_auc = make_scorer(roc_auc_score, greater_is_better=True,
                                needs_threshold=True)
grid_obj_lr = GridSearchCV(lr, parameters, scoring=roc_auc)
grid_obj_lr = grid_obj_lr.fit(X_train, y_train)

lr = grid_obj_lr.best_estimator_

lr.fit(X_train, y_train)

LogisticRegression()

In [49]:
#Predict on model and get accuracy score
predictions_lr = lr.predict(X_test)
print(accuracy_score(y_test, predictions_lr))

0.8942789737540549


In [50]:
#Random forest confusion matrix
confusion_matrix(y_test, predictions_lr)

array([[11857,   127],
       [ 1307,   273]], dtype=int64)

**KNN**

In [53]:
knn = KNeighborsClassifier()

parameters = {'n_neighbors': range(1,10),
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'p': [1,2]}

roc_auc = make_scorer(roc_auc_score, greater_is_better=True,
                                needs_threshold=True)
grid_obj_knn = GridSearchCV(knn, parameters, scoring=roc_auc)
grid_obj_lr = grid_obj_knn.fit(X_train, y_train)

knn = grid_obj_knn.best_estimator_

knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', n_neighbors=9, p=1)

In [54]:
#Predict on model and get accuracy score
predictions_knn = knn.predict(X_test)
print(accuracy_score(y_test, predictions_knn))

0.8928782070185786


In [55]:
#Random forest confusion matrix
confusion_matrix(y_test, predictions_knn)

array([[11826,   158],
       [ 1295,   285]], dtype=int64)