## Classifier Tuning (KNN, Logistic Regression, Random Forest, XGBoost)

- Bank Customer Churn Data


In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings.
plt.style.use('ggplot')  # apply matplotlib's 'ggplot' style sheet to every figure
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline
np.set_printoptions(suppress=True)  # NumPy print option: show floats in fixed-point instead of scientific notation

from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [5]:
# Load the bank-churn table.
# The CSV carries a leftover 'Unnamed: 0' column (visible in this cell's preview
# output) -- presumably the row index written by an earlier `to_csv`; confirm.
# Left in place it would be fed to every model as a feature, so drop it here.
# `drop` is used rather than `index_col=0` so the frame keeps its default
# 0..n-1 index; `errors='ignore'` keeps the cell working if the file is later
# re-exported without that column.
bank_churn = (
    pd.read_csv('bankchurn.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
bank_churn.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gender_Male,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [6]:
# Single stratified 80/20 train/test split, stratified on the 'Exited' target
# so both parts keep the same class ratio. Seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(bank_churn, bank_churn['Exited']))

# `split()` returns *positional* row indices, so select with `.iloc`.
# (`.loc` only gives the same rows while the frame has a default 0..n-1 index.)
data_tr = bank_churn.iloc[train_index]
data_tst = bank_churn.iloc[test_index]

In [7]:
# Separate the target column ('Exited') from the feature columns
# for both the training and the test partition.
y_train = data_tr["Exited"]
y_test = data_tst["Exited"]

X_train = data_tr.drop(columns="Exited")
X_test = data_tst.drop(columns="Exited")

In [9]:

# Standardise the features.
# The scaler is fitted on the training set ONLY and then applied to the test
# set with the same mean/variance. Re-fitting on the test set (as this cell
# did before) leaks test statistics and puts the two sets on different scales.
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

# Backward-compatible alias: the original cell left `x_sc` bound to the scaled
# test matrix (it overwrote the scaled training matrix on the second line).
x_sc = X_test_sc

# Models

## KNN

In [37]:
# Candidate hyper-parameter values for the KNN search:
# both vote-weighting schemes and neighbourhood sizes 1 through 30.
weight_options = ['uniform', 'distance']
k_range = [*range(1, 31)]

In [38]:
# Parameter grid for GridSearchCV: estimator argument name -> list of values to try.
param_grid = {
    'n_neighbors': k_range,
    'weights': weight_options,
}
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}


In [39]:
# Base estimator for the search. It was previously taken from kernel state
# (`knn` is not defined anywhere in this notebook), which raises NameError on
# a fresh kernel. Its own n_neighbors/weights are irrelevant because the grid
# overrides both for every candidate.
knn = KNeighborsClassifier()

# 10-fold cross-validated grid search over `param_grid`, ranked by weighted F1.
# NOTE(review): the search is fitted on the unscaled X_train; KNN is
# distance-based, so consider fitting on standardised features -- confirm intent.
grid = GridSearchCV(knn, param_grid, cv=10, scoring='f1_weighted')
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(n_neighbors=30),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']},
             scoring='f1_weighted')

In [40]:
# Report the winning KNN configuration: the selected parameters, the estimator
# built with them, and its mean cross-validated score.
for label, attribute in (
    ("Best params: ", "best_params_"),
    ("Best estimator: ", "best_estimator_"),
    ("Best score: ", "best_score_"),
):
    print(label, getattr(grid, attribute))

Best params:  {'n_neighbors': 7, 'weights': 'uniform'}
Best estimator:  KNeighborsClassifier(n_neighbors=7)
Best score:  0.7177557607709788


In [52]:
# Train a KNN with the configuration the grid search selected
# (7 neighbours, uniform weights) on the whole training set.
knn2 = KNeighborsClassifier(weights='uniform', n_neighbors=7).fit(X_train, y_train)
knn2  # fit() returns the estimator itself; show it as the cell output

KNeighborsClassifier(n_neighbors=7)

In [53]:
# Mean accuracy of the tuned KNN on the data it was trained on, as a percentage.
knn_train_accuracy = knn2.score(X_train, y_train)
print('KNN Train score:', knn_train_accuracy * 100)

KNN Train score: 80.5875


## Logistic Regression

In [15]:
# Baseline logistic regression with default regularisation; the iteration cap
# is raised to 1000 and the seed is fixed.
lm = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
lm.fit(X_train, y_train)


LogisticRegression(max_iter=1000, random_state=42)

In [17]:
# Search the inverse regularisation strength C in a narrow band around 1,
# using 10-fold cross-validation on all available cores.
parameters = {'C': [0.8, 0.9, 1, 1.1, 1.2]}

lm_grid = GridSearchCV(estimator=lm, param_grid=parameters, cv=10, n_jobs=-1, verbose=1)
lm_grid.fit(X_train, y_train);


Fitting 10 folds for each of 5 candidates, totalling 50 fits


In [18]:
# Summarise the logistic-regression search.
print('Best parameters:', lm_grid.best_params_)

# Score the *tuned* model: GridSearchCV.score delegates to the best estimator
# refitted on the full training set (refit=True is the default). The previous
# version scored the untuned baseline `lm` here, which does not correspond to
# the parameters printed above.
print('Logistic Regression Train score:', lm_grid.score(X_train, y_train) * 100)

# Mean cross-validated accuracy of the best candidate.
print('Logistic Regression Cross validation score:', lm_grid.best_score_ * 100)



Best parameters: {'C': 0.8}
Logistic Regression Train score: 79.625
Logistic Regression Cros validation score: 79.025


In [21]:
# Final logistic regression with the C chosen by the grid search.
# max_iter=1000 is kept identical to the estimator that was grid-searched;
# without it the solver falls back to its default iteration cap, so the model
# would not match the one cross-validated in the search.
lm2 = LogisticRegression(random_state=42, max_iter=1000, C=0.8)
lm2.fit(X_train, y_train)
print('Logistic Regression Train score:', lm2.score(X_train, y_train) * 100)

Logistic Regression Train score: 79.625


In [None]:
# Training accuracy of lm2, as a percentage.
# NOTE(review): this repeats the print at the end of the cell that fits lm2.
lm2_train_accuracy = lm2.score(X_train, y_train)
print('Logistic Regression Train score:', lm2_train_accuracy * 100)

## Random Forest

In [31]:
# Base random forest for the grid search.
# random_state is fixed (same seed as the other models in this notebook) so the
# bootstrap samples and feature draws -- and therefore the search result -- are
# reproducible on re-run.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)

In [32]:
# Grid over forest size and tree depth (5 x 4 = 20 candidates), 10-fold CV.
# Note: `parameters` is rebound here; it no longer holds the logistic-regression grid.
parameters = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6],
}
rfc_grid = GridSearchCV(estimator=rfc, param_grid=parameters, cv=10, n_jobs=-1, verbose=1)
rfc_grid.fit(X_train, y_train);

Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [34]:
# Best random-forest configuration and its mean cross-validated accuracy (in %).
# The label previously read "Cros validation score" (misspelled).
print(rfc_grid.best_params_)
print('Random Forest Classifier Cross validation score:', rfc_grid.best_score_ * 100)

{'max_depth': 6, 'n_estimators': 50}
Random Forest Classifier Cros validation score: 85.8


In [36]:
# Final random forest with the configuration selected by the grid search
# (50 trees, depth 6). The seed is fixed so the reported scores can be
# reproduced; an unseeded forest gives slightly different numbers on every run.
rfc2 = RandomForestClassifier(n_jobs=-1, n_estimators=50, max_depth=6, random_state=42)
rfc2.fit(X_train, y_train)


RandomForestClassifier(max_depth=6, n_estimators=50, n_jobs=-1)

In [38]:
# Mean accuracy of the final random forest on its own training data, as a percentage.
rfc_train_accuracy = rfc2.score(X_train, y_train)
print('Random Forest Classifier Train score:', rfc_train_accuracy * 100)

Random Forest Classifier Train score: 86.47500000000001


## XGBoost

In [21]:
# Best XGBoost configuration and its mean cross-validated accuracy (in %).
# The label previously read "Cros validation score" (misspelled).
# NOTE(review): `xgb_grid` is only assigned in a later cell of this notebook,
# so on a fresh kernel run top-to-bottom this cell raises NameError. Move it
# below the XGBoost grid-search cell.
print(xgb_grid.best_params_)
print('XGB Classifier Cross validation score:', xgb_grid.best_score_ * 100)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  3.8min finished


{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
XGB Classifier Cros validation score: 86.38888888888889


In [22]:
# Base XGBoost estimator for the search. It is defined here because the only
# other assignment of `xgb` in this notebook sits in a later cell, which made
# this cell fail with NameError on a top-to-bottom run. The arguments are
# identical to that later definition.
xgb = XGBClassifier(n_jobs=-1, use_label_encoder=False, random_state=42, eval_metric='error')

# Grid over number of boosting rounds, tree depth and learning rate
# (4 x 7 x 3 = 84 candidates), evaluated with 10-fold cross-validation.
# Note: `parameters` is rebound here; it no longer holds the earlier grids.
parameters = {'n_estimators':[25,50,100,200],'max_depth':[2,3,4,5,6,7,8],'learning_rate':[.5, 0.1,0.01]}
xgb_grid = GridSearchCV(xgb, parameters, cv=10, verbose=1).fit(X_train,y_train)

XGB Classifier Classifier Train score: 87.9


In [43]:
# Base XGBoost classifier: seeded, all cores, classification error as the
# evaluation metric. The XGBoost grid-search cell earlier in this notebook
# passes `xgb` to GridSearchCV.
xgb = XGBClassifier(
    eval_metric='error',
    random_state=42,
    use_label_encoder=False,
    n_jobs=-1,
)

In [49]:
# Final XGBoost model with the reported best configuration
# (200 rounds, depth 3, learning rate 0.1), trained on the full training set.
xgb2 = XGBClassifier(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.1,
    eval_metric='error',
    random_state=42,
    n_jobs=-1,
)
xgb2.fit(X_train, y_train)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='error', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=-1,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [50]:
# Mean accuracy of the final XGBoost model on its own training data, as a percentage.
# The label previously repeated a word ("XGB Classifier Classifier Train score").
print('XGB Classifier Train score:', xgb2.score(X_train, y_train) * 100)

XGB Classifier Classifier Train score: 87.675
