In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

**Univariate Selection**

In [2]:
df1 = pd.read_csv('bank_df.csv')
df1.head()

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,...,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,age_scaled,balance_scaled,subscribed
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.266667,1.249263,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,-0.309735,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.4,-0.329646,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.533333,0.779499,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.4,-0.330383,0.0


In [3]:
df = pd.read_csv("bank_df.csv")
X = df.iloc[:,0:35]  #independent columns
y = df.iloc[:,-1]    #target column i.e price range

In [4]:
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(38,'Score'))  #print 10 best features

                  Specs        Score
33     poutcome_success  4102.364079
27            month_mar   751.390676
30            month_oct   736.544197
20      contact_unknown   730.603443
31            month_sep   678.905604
17          housing_yes   386.946356
28            month_may   328.216293
4           job_retired   264.230749
7           job_student   257.473070
22            month_dec   254.830097
34     poutcome_unknown   229.873875
0       job_blue-collar   182.752541
18             loan_yes   176.351728
14   education_tertiary   141.919597
12       marital_single   130.676994
11      marital_married    65.256115
23            month_feb    63.367162
32       poutcome_other    44.948695
25            month_jul    44.564087
3        job_management    39.195984
6          job_services    31.395433
13  education_secondary    29.669913
16          default_yes    22.190861
9        job_unemployed    18.455529
1      job_entrepreneur    16.680984
29            month_nov    12.301400
2

In [5]:
ordered_df = featureScores.nlargest(38,'Score')
df = df[list(ordered_df['Specs'].values)]
df.head()

Unnamed: 0,poutcome_success,month_mar,month_oct,contact_unknown,month_sep,housing_yes,month_may,job_retired,job_student,month_dec,...,month_nov,month_jun,job_housemaid,contact_telephone,education_unknown,month_jan,job_technician,month_aug,job_self-employed,job_unknown
0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [6]:
#Create dataframes with different feature sets
top5_df = df.iloc[:,:5]
top10_df = df.iloc[:,:10]
top20_df = df.iloc[:,:20]
top30_df = df.iloc[:,:30]

**Top 5 Features Random Forest**

In [7]:
X = top5_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [8]:
#Create and gridsearch a random forest model for the data.
clf = RandomForestClassifier()

parameters = {'n_estimators': [10, 20, 50, 100, 200], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(clf, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

clf = grid_obj.best_estimator_

clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features='log2', min_samples_leaf=2,
                       min_samples_split=6, n_estimators=10)

In [9]:
roc_auc_score(y, clf.predict_proba(X)[:, 1])

0.695927934183491

**Top 10 Features Random Forest**

In [10]:
X = top10_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [11]:
#Create and gridsearch a random forest model for the data.
rf10 = RandomForestClassifier()

parameters = {'n_estimators': [10, 50, 100, 200, 500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf10, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf10 = grid_obj.best_estimator_

rf10.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=9, max_features='log2',
                       min_samples_leaf=3, n_estimators=10)

In [12]:
roc_auc_score(y, rf10.predict_proba(X)[:, 1])

0.7384991537966141

**Top 20 Random Forest**

In [13]:
X = top20_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [14]:
#Create and gridsearch a random forest model for the data.
rf20 = RandomForestClassifier()

parameters = {'n_estimators': [10,50,100,200,500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf20, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf20 = grid_obj.best_estimator_

rf20.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features='log2', min_samples_leaf=3,
                       min_samples_split=6, n_estimators=10)

In [15]:
roc_auc_score(y, rf20.predict_proba(X)[:, 1])

0.7645191729781551

**Top 30 Random Forest**

In [18]:
X = top30_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [19]:
#Create and gridsearch a random forest model for the data.
rf30 = RandomForestClassifier()

parameters = {'n_estimators': [10,50,100,200,500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf30, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf30 = grid_obj.best_estimator_

rf30.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features='log2', min_samples_leaf=2,
                       min_samples_split=3, n_estimators=10)

In [20]:
roc_auc_score(y, rf30.predict_proba(X)[:, 1])

0.7707628644699127

**All Features**

In [21]:
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [22]:
#Create and gridsearch a random forest model for the data.
rf_all = RandomForestClassifier()

parameters = {'n_estimators': [10,50,100,200,500], 
              'max_features': ['log2'], 
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1,10), 
              'min_samples_split': [2,3,6],
              'min_samples_leaf': [1,2,3],
             }

roc_auc = make_scorer(roc_auc_score)

grid_obj = GridSearchCV(rf30, parameters, scoring=roc_auc)
grid_obj = grid_obj.fit(X_train, y_train)

rf_all = grid_obj.best_estimator_

rf_all.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features='log2', n_estimators=10)

In [23]:
roc_auc_score(y, rf_all.predict_proba(X)[:, 1])

0.7772384748356422

**Logistic Regression**

In [29]:
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23)

In [30]:
#Create and gridsearch logistic regression model
lr = LogisticRegression()

parameters = {'penalty': ['l2','none'],
              'max_iter': [100, 200, 400, 500],
              'n_jobs': [None, 2],
              'multi_class': ['auto','ovr','multinomial']}

roc_auc = make_scorer(roc_auc_score, greater_is_better=True,
                                needs_threshold=True)
grid_obj_lr = GridSearchCV(lr, parameters, scoring=roc_auc)
grid_obj_lr = grid_obj_lr.fit(X_train, y_train)

lr = grid_obj_lr.best_estimator_

lr.fit(X_train, y_train)

LogisticRegression()

In [31]:
roc_auc_score(y, lr.predict_proba(X)[:, 1])

0.76234856695094

**KNN**

In [32]:
knn = KNeighborsClassifier()

parameters = {'n_neighbors': range(1,10),
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'p': [1,2]}

roc_auc = make_scorer(roc_auc_score, greater_is_better=True,
                                needs_threshold=True)
grid_obj_knn = GridSearchCV(knn, parameters, scoring=roc_auc)
grid_obj_lr = grid_obj_knn.fit(X_train, y_train)

knn = grid_obj_knn.best_estimator_

knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=9, p=1)

In [33]:
roc_auc_score(y, knn.predict_proba(X)[:, 1])

0.7762862603486558