In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

lr = LogisticRegression()
pipe = make_pipeline(StandardScaler(), LogisticRegression())
print(pipe.steps)

[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())]


In [3]:
churn = pd.read_csv('data/churn_ver02.csv')
churn.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,42,2,0.0,1,1,1,101348.88,1
1,608,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,39,1,0.0,2,0,0,93826.63,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0


In [4]:
x = churn.drop(columns='Exited', axis='columns')
y = churn['Exited']

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1234, stratify=y)

### Example 1. Compare PIPE with LR!

In [5]:
pipe.fit(x_train, y_train)
lr.fit(x_train, y_train)

LogisticRegression()

In [6]:
from sklearn.metrics import roc_auc_score

pred_pipe = pipe.predict(x_test)
pred_lr = lr.predict(x_test)

print('Score (Pipeline):', roc_auc_score(y_test, pred_pipe).round(4))
print('Score (Logistic Regression):', roc_auc_score(y_test, pred_lr).round(4))

Score (Pipeline): 0.5669
Score (Logistic Regression): 0.5139


### Example 2. Grid Search and Random Search

In [7]:
lr_params = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

from sklearn.model_selection import GridSearchCV

pipe = make_pipeline(StandardScaler(), LogisticRegression())
gs = GridSearchCV(pipe, lr_params, cv=5)
gs.fit(x_train, y_train)

pred_gs = gs.predict(x_test)

print('Score (Pipeline):', roc_auc_score(y_test, pred_pipe).round(4))
print('Score (Logistic Regression):', roc_auc_score(y_test, pred_lr).round(4))
print('Score (Grid Search):', roc_auc_score(y_test, pred_gs).round(4))

Score (Pipeline): 0.5669
Score (Logistic Regression): 0.5139
Score (Grid Search): 0.5633


### Example 3. Model Selection

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [9]:
pipe = Pipeline([('Data_Cleaning', StandardScaler()), ('Algorithm', LogisticRegression())])

In [11]:
params_dict = [
    
    {'Algorithm': [LogisticRegression()],
     'Data_Cleaning': [StandardScaler()],
     'Algorithm__C': [0.01, 0.1, 1, 10, 100]},
    
    {'Algorithm': [DecisionTreeClassifier()],
     'Data_Cleaning': [None],
     'Algorithm__max_depth': np.arange(1, 11)},
    
    {'Algorithm': [KNeighborsClassifier()],
     'Data_Cleaning': [StandardScaler(), None],
     'Algorithm__n_neighbors': [10, 20, 30, 40, 50, 100, 200, 300]}
    
]

In [12]:
gs = GridSearchCV(pipe, params_dict, cv=5)
gs.fit(x_train, y_train)
pred_modelselection = gs.predict(x_test)

print('Score (Pipeline):', roc_auc_score(y_test, pred_pipe).round(4))
print('Score (Logistic Regression):', roc_auc_score(y_test, pred_lr).round(4))
print('Score (GridSearch):', roc_auc_score(y_test, pred_gs).round(4))
print('Score (Model Selection):', roc_auc_score(y_test, pred_modelselection).round(4))

Score (Pipeline): 0.5669
Score (Logistic Regression): 0.5139
Score (GridSearch): 0.5633
Score (Model Selection): 0.7174


In [13]:
# Check out what is the best model!
gs.best_params_

{'Algorithm': DecisionTreeClassifier(max_depth=5),
 'Algorithm__max_depth': 5,
 'Data_Cleaning': None}