# UniversalBank (Target = CD Account)

## 1.0 Setup : Import and install python libraries

In [110]:
# import numpy and pandas libraries
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

# set random seed to ensure that results are repeatable
np.random.seed(100)

## 2.0 Load Data

In [111]:
# load data
UniversalBank = pd.read_csv("UniversalBank.csv")

# look at the data
UniversalBank.head(3)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0


## 3.0 Data Pocessing

### Observations:
1. ID and the ZIP Code columns will be dropped as part of feature selection and it is not needed
2. We dont have any observations with missing NA's.
3. Education we will encode it with Dummy encoding technique.

In [112]:
# Check the missing values by summing the total na's for each variable
UniversalBank.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [113]:
# create a list of these catagorical variables
category_var_list = list(UniversalBank.select_dtypes(include='object').columns)
category_var_list

[]

In [114]:
UniversalBank.drop(['ID','ZIP Code'], axis=1, inplace = True)

### Non ordered Dummy encoding
As education is label encoded we need to change it to dummy encoding.

In [115]:
UniversalBank = UniversalBank.join(pd.get_dummies(UniversalBank['Education'],prefix='Education',drop_first = True))
UniversalBank.drop('Education', axis=1, inplace = True)

## 4.0 Split Data
Splitting the data into train and test, with traning set as 70% and test set as 30%

In [116]:
# split the data into validation and training set
train_df, test_df = train_test_split(UniversalBank, test_size=0.3)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'CD Account'
predictors = list(UniversalBank.columns)
predictors.remove(target)

#### Standardize numeric values

In [117]:
# create a standard scaler and fit it to the training set of predictors
scaler = preprocessing.StandardScaler()
cols_to_stdize = ['Age','Experience','Income','Family','CCAvg','Mortgage']
               
# Transform the predictors of training and validation sets
train_df[cols_to_stdize] = scaler.fit_transform(train_df[cols_to_stdize]) # train_predictors is not a numpy array
test_df[cols_to_stdize] = scaler.transform(test_df[cols_to_stdize]) # validation_target is now a series object

In [118]:
X_train = train_df[predictors]
y_train = train_df[target] # train_target is now a series objecttrain_df.to_csv('airbnb_train_df.csv', index=False)
X_test = test_df[predictors]
y_test = test_df[target] # validation_target is now a series object

## 5.0 Model the data
Create a dataframe to load the model performance metrics into

In [119]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### 5.1 Logistic Regression using default,L1,L2,Elastic,liblinear regularization

Conduting a random and exhaustive search across a smaller range of parameters around the parameters. We will use the value of best parameters found through Random Search to perform the Grid Search

In [120]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'max_iter':np.arange(500,1000),
    'penalty': ['None','l1','l2','elasticnet'],
    'solver':['saga','liblinear']
}

log_reg_model = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = log_reg_model, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.6335365853658537
... with parameters: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 649}


In [121]:
max_iter = rand_search.best_params_['max_iter']
penalty = rand_search.best_params_['penalty']
solver = rand_search.best_params_['solver']

param_grid = {
    'max_iter': np.arange(max_iter-3,max_iter+3),  
    'penalty': [penalty],
    'solver': [solver]
}

log_reg_model = LogisticRegression()
grid_search = GridSearchCV(estimator = log_reg_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallLogistic = grid_search.best_estimator_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
The best recall score is 0.6335365853658537
... with parameters: {'max_iter': 646, 'penalty': 'l1', 'solver': 'saga'}


In [122]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### Observation:

For the Logistic Regression Classifier the best performing parameters are {'max_iter': 646, 'penalty': 'l1', 'solver': 'saga'} with the best recall score of 63.35%

### 5.2 SVM Classification using linear, rbf and poly kernal

Conduting a random and exhaustive search across a smaller range of parameters around the parameters. We will use the value of best parameters found through Random Search to perform the Grid Search

In [123]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'C': np.arange(1,30),   
    'gamma': ['scale','auto'],
    'kernel':['linear','rbf','poly']
}

svm_model = SVC()
rand_search = RandomizedSearchCV(estimator = svm_model, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 174 candidates, totalling 870 fits
The best recall score is 0.6335365853658537
... with parameters: {'kernel': 'linear', 'gamma': 'scale', 'C': 1}


In [124]:
score_measure = "recall"
kfolds = 5

C = rand_search.best_params_['C']
gamma = rand_search.best_params_['gamma']
kernel = rand_search.best_params_['kernel']

param_grid = {
    'C': np.arange(C-2,C+2),  
    'gamma': [gamma],
    'kernel': [kernel]
    
}

svm_model = SVC()
grid_search = GridSearchCV(estimator = svm_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallSVM = grid_search.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
The best recall score is 0.6335365853658537
... with parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


In [125]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVM", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### Observation:

For the SVM Classifier the best performing parameters are {'C': 1, 'gamma': 'scale', 'kernel': 'linear'} with the best recall score of 63.35%

### 5.3  Decision Tree Classifier
Conduting a random and exhaustive search across a smaller range of parameters. We will use the value of best parameters found through Random Search to perform the Grid Search


In [131]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")


Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.6485365853658537
... with parameters: {'min_samples_split': 17, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.0006000000000000001, 'max_leaf_nodes': 49, 'max_depth': 45, 'criterion': 'entropy'}


In [132]:
score_measure = "recall"
kfolds = 5

min_samples_split = rand_search.best_params_['min_samples_split']
min_samples_leaf = rand_search.best_params_['min_samples_leaf']
min_impurity_decrease = rand_search.best_params_['min_impurity_decrease']
max_leaf_nodes = rand_search.best_params_['max_leaf_nodes']
max_depth = rand_search.best_params_['max_depth']
criterion = rand_search.best_params_['criterion']

param_grid = {
    'min_samples_split': np.arange(min_samples_split-3,min_samples_split+3),  
    'min_samples_leaf': np.arange(min_samples_leaf-3,min_samples_leaf+3),
    'min_impurity_decrease': np.arange(min_impurity_decrease-0.0001, min_impurity_decrease+0.0001, 0.0001),
    'max_leaf_nodes': np.arange(max_leaf_nodes-2,max_leaf_nodes+2), 
    'max_depth': np.arange(max_depth-2,max_depth+2), 
    'criterion': [criterion]
}

grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=kfolds, scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)
_ = grid_search.fit(X_train, y_train)


print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
The best recall score is 0.6485365853658537
... with parameters: {'min_samples_split': 17, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.0006000000000000001, 'max_leaf_nodes': 49, 'max_depth': 45, 'criterion': 'entropy'}


In [133]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Decision Tree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### Observation:

For the Decision Tree Classifier the best performing parameters are {'min_samples_split': 17, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.0006000000000000001, 'max_leaf_nodes': 49, 'max_depth': 45, 'criterion': 'entropy'} with the best recall score of 64.85%

## 6.0 Summary

In [134]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic,0.978667,1.0,0.68,0.809524
0,SVM,0.978667,1.0,0.68,0.809524
0,Decision Tree,0.974667,0.907895,0.69,0.784091


#### Observation:
Considering the three models, for scoring measure recall, we can see that Decision Tree has the best recall rate of 69%. 
The SVM and Logistics model has a recall rate of 68%