In [1]:
# Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the prepared data set from the working directory.
# Column 0 of the CSV is skipped and columns 1..43 are kept
# (presumably column 0 is an index column written by an earlier export -- TODO confirm).
bank_data_final = pd.read_csv("bank-additional-full-final.csv", sep=",").iloc[:, 1:44]
bank_data_final.head()

Unnamed: 0,x0_admin.,x0_blue-collar,x0_entrepreneur,x0_housemaid,x0_management,x0_retired,x0_self-employed,x0_services,x0_student,x0_technician,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,261.0,1.0,1.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,226.0,1.0,1.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,151.0,1.0,1.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,307.0,1.0,1.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,139.0,1.0,1.0,0.0,1.1,93.994,-36.4,4.857,5191.0,0.0


In [3]:
# Separate predictors from the target:
#   X -> columns 0..41 as a 2-D NumPy array (independent variables)
#   y -> column 42 as a 1-D NumPy array (dependent variable; presumably the
#        `y` column shown last in the preview -- confirm against the CSV)
X, y = (
    bank_data_final.iloc[:, :42].values,
    bank_data_final.iloc[:, 42].values,
)

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Standardise columns 31..41 only (the numerical attributes); all other columns
# are left untouched. The scaler is fitted on the training rows and the same
# fitted statistics are then applied to the test rows.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train[:, 31:42])
X_train[:, 31:42] = sc.transform(X_train[:, 31:42])
X_test[:, 31:42] = sc.transform(X_test[:, 31:42])

In [6]:
# Solving imbalance output problem (accuracy paradox) by oversampling.
# Only the training split is resampled; the test split keeps its original
# class distribution. X_train / y_train are replaced by the resampled arrays.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
# `fit_sample` was renamed to `fit_resample` in imbalanced-learn 0.4 and the old
# alias was removed in later releases. Prefer the current name and fall back to
# the legacy one so the cell runs on both old and new installations.
_resample = sm.fit_resample if hasattr(sm, "fit_resample") else sm.fit_sample
X_train, y_train = _resample(X_train, y_train.ravel())

In [7]:
# Fit a 100-tree random forest (entropy criterion, fixed seed) on the training data.
# Models and their predictions are kept in lists so later cells can loop over them.
from sklearn.ensemble import RandomForestClassifier
classifier = [
    RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
]
classifier[0].fit(X_train, y_train)
# A test row is labelled positive when the probability in column 1 of
# predict_proba exceeds 0.3.
y_pred = [classifier[0].predict_proba(X_test)[:, 1] > 0.3]

In [8]:
# Build one confusion matrix per prediction vector in y_pred, then show the first
from sklearn.metrics import confusion_matrix
cm = [confusion_matrix(y_test, predicted) for predicted in y_pred]
cm[0]

array([[4677,  673],
       [ 102,  646]])

In [9]:
# Print the classification report for every prediction vector,
# preceded by its position in y_pred
from sklearn.metrics import classification_report
for position, predicted in enumerate(y_pred):
    print(position)
    print(classification_report(y_test, predicted))

0
             precision    recall  f1-score   support

        0.0       0.98      0.87      0.92      5350
        1.0       0.49      0.86      0.63       748

avg / total       0.92      0.87      0.89      6098



In [10]:
# Grid Search CV Sequential
# Exhaustive hyper-parameter search over a random forest, run through plain
# scikit-learn and timed so it can be compared with the Spark-backed search.
import time
from sklearn.ensemble import RandomForestClassifier
# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `sklearn.model_selection` (already used for train_test_split) replaces it.
from sklearn.model_selection import GridSearchCV
tStart_seq = time.time()
# NOTE(review): float values of min_samples_split are interpreted as a fraction
# of the training samples, so 0.5 / 0.9 permit very few splits per tree --
# confirm this is intended (integers such as 2, 5, 10 may have been meant).
parameters = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [0.5, 0.9],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80],
}
gs = GridSearchCV(
    # Seeded (like the forest trained earlier) so repeated runs give the same result.
    RandomForestClassifier(random_state=0),
    param_grid=parameters,
    # 3-fold was the default of the deprecated class; pinned because newer
    # scikit-learn releases default to 5-fold.
    cv=3,
)
gs.fit(X_train, y_train.astype('int'))
# Stop the clock right after fitting so only the search itself is timed.
tEnd_seq = time.time()
print(gs.best_score_)
print(gs.best_params_)
print('Time required to do Sequential Grid Search CV: {} seconds'.format(tEnd_seq - tStart_seq))



0.7940927675172705
{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 0.5, 'n_estimators': 10}
Time required to do Sequential Grid Search CV: 423.53497409820557 seconds


In [11]:
# Grid Search CV Parallel
# Same hyper-parameter search as the sequential cell, but run through
# spark_sklearn on a local Spark session, and timed for comparison.
import time
from sklearn.ensemble import RandomForestClassifier
import spark_sklearn
from spark_sklearn.util import createLocalSparkSession
# Use a dedicated name for the Spark context: `sc` already holds the fitted
# StandardScaler from the feature-scaling cell and must not be overwritten.
spark_context = createLocalSparkSession().sparkContext
# The timer starts after the Spark session exists, so session start-up is excluded.
tStart_par = time.time()
# Must stay identical to the grid of the sequential search for a fair timing comparison.
parameters = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [0.5, 0.9],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80],
}
gs = spark_sklearn.GridSearchCV(
    spark_context,
    # Seeded (like the forest trained earlier) so repeated runs give the same result.
    estimator=RandomForestClassifier(random_state=0),
    param_grid=parameters,
)
gs.fit(X_train, y_train.astype('int'))
# print(gs.best_score_)
# print(gs.best_params_)
tEnd_par = time.time()
print('Time required to do Parallel Grid Search CV: {} seconds'.format(tEnd_par - tStart_par))

Time required to do Parallel Grid Search CV: 171.53400897979736 seconds
