Each team will develop a machine learning model, using the estimators available in `scikit-learn`, to predict whether or not a particular customer will elect to stop services provided by a financial institution based on known attributes. The first five rows of the training data are shown below.

In [1]:
# Standard library imports
import joblib
import json
import pathlib
import warnings
warnings.filterwarnings("ignore")

# Third-party library imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training set from disk and preview its first five rows
train = pd.read_csv(pathlib.Path('train.csv'))
train.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,597,Germany,Female,35,8,131101.04,1,1,1,192852.67,0
1,523,France,Female,40,2,102967.41,1,1,0,128702.1,1
2,706,Spain,Female,42,8,95386.82,1,1,1,75732.25,0
3,788,France,Male,32,4,112079.58,1,0,0,89368.59,0
4,706,Germany,Male,38,5,163034.82,2,1,1,135662.17,0


Each team will deploy their models to Azure. Specifically, each team should use `Flask` to deploy a web service that can accept JSON payloads via a `post` request. An example of a typical JSON payload that should be expected is shown below.

In [3]:
# Every column except the label is treated as a model input
target = 'Exited'
features = [name for name in train.columns if name != target]

# Show the first customer's inputs as a dict, i.e. the shape of one JSON payload
example_row = train[features].loc[0]
example_row.to_dict()

{'CreditScore': 597,
 'Geography': 'Germany',
 'Gender': 'Female',
 'Age': 35,
 'Tenure': 8,
 'Balance': 131101.04,
 'NumOfProducts': 1,
 'HasCrCard': 1,
 'IsActiveMember': 1,
 'EstimatedSalary': 192852.67}

**This will be a graded assignment!** Points will be allocated as follows:
- 70 points for a working deployment
- 20 points for a model that achieves a predictive accuracy greater than 80%.
- 10 points based on competition

The model competition will take place on Monday, 2/21. Each model will be asked to make predictions for > 30 unseen customers. The winning model will be the one that achieves the **highest predictive accuracy**. Teams with the **best and worst** performing models will give an overview of their modeling pipeline (all teams should be prepared to speak to this). This overview should include discussions on:
- data preparation steps,
- model selection, and
- model tuning.

In [4]:
# Report the (rows, columns) dimensions of the training frame
train.shape

(9970, 11)

In [5]:
# Preview the first 25 rows of the training frame
train.head(25)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,597,Germany,Female,35,8,131101.04,1,1,1,192852.67,0
1,523,France,Female,40,2,102967.41,1,1,0,128702.1,1
2,706,Spain,Female,42,8,95386.82,1,1,1,75732.25,0
3,788,France,Male,32,4,112079.58,1,0,0,89368.59,0
4,706,Germany,Male,38,5,163034.82,2,1,1,135662.17,0
5,670,Spain,Female,57,3,175575.95,2,1,0,99061.75,1
6,590,Spain,Male,34,0,65812.35,2,0,1,160346.3,0
7,636,Spain,Female,29,6,157576.47,2,1,1,101102.39,0
8,598,France,Female,64,9,0.0,1,0,1,13181.37,1
9,456,France,Female,63,1,165350.61,2,0,0,140758.07,1


In [6]:
# List the distinct values of the Geography column
train['Geography'].unique()

array(['Germany', 'France', 'Spain'], dtype=object)

In [22]:
# Create "dummy" columns for categorical data.
#
# Each object-typed column is replaced in `train` by its one-hot columns
# (first level dropped), and the mapping {original column -> dummy columns}
# is recorded in `dummy_column_mapper`.
mapper_filepath = pathlib.Path('dummy_column_mapper.json')
categorical_columns = [col for col in train.columns if train[col].dtype == 'object']

dummy_column_mapper = {}
for col in categorical_columns:
    temp = pd.get_dummies(train[col], prefix=col, drop_first=True)
    train = train.drop(columns=[col])
    train[temp.columns] = temp
    dummy_column_mapper[col] = temp.columns.tolist()

# Save mapper for dummy columns.
# This cell rewrites `train`, so running it a second time finds no object
# columns. In that case keep the mapping already on disk (and reload it)
# instead of overwriting the saved file with an empty dict.
if dummy_column_mapper or not mapper_filepath.exists():
    with open(mapper_filepath, 'w') as fout:
        json.dump(dummy_column_mapper, fout)
else:
    with open(mapper_filepath) as fin:
        dummy_column_mapper = json.load(fin)

NameError: name 'X_train' is not defined

In [8]:
# Separate predictors from the label and hold out 25% of rows for testing
target = 'Exited'
features = [name for name in train.columns if name != target]

# A feature is treated as binary when its distinct values are exactly 0 and 1
binary_columns = []
for name in features:
    distinct_values = sorted(train[name].unique().tolist())
    if distinct_values == [0, 1]:
        binary_columns.append(name)

x = train[features].copy()
y = train[target]

# Fixed random_state keeps the split reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [9]:
# Fit a standard scaler on the training split
scaler = StandardScaler().fit(x_train)

# File that stores the per-feature scaling parameters
scaler_filepath = pathlib.Path('scaler_values.json')

# Binary features get an identity transform (mean 0, std 1) so they are left
# the same; every other feature uses the mean and scale fitted above.
scaler_dict = {
    feature: (
        {'mean': 0, 'std': 1}
        if feature in binary_columns
        else {'mean': mean, 'std': scale}
    )
    for feature, mean, scale in zip(features, scaler.mean_, scaler.scale_)
}

with scaler_filepath.open('w') as fout:
    json.dump(scaler_dict, fout)

In [10]:
# Scale data
#
# Each column is recomputed from the unscaled frame `x` (rows selected by the
# split's own index) rather than from the current contents of x_train/x_test,
# so re-running this cell gives the same result instead of scaling twice.
# Whole-column assignment replaces the column (and its dtype) outright, which
# avoids writing floats into integer columns through `.loc[:, col]`.
for col, col_params in scaler_dict.items():
    x_train[col] = (x.loc[x_train.index, col] - col_params['mean']) / col_params['std']
    x_test[col] = (x.loc[x_test.index, col] - col_params['mean']) / col_params['std']

In [11]:
# Preview the first 25 rows of the training features
x_train.head(25)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
2870,-0.209979,-1.605986,0.335593,1.001153,-0.912776,1.0,1.0,-0.621553,1.0,0.0,0.0
2818,-0.583537,0.476986,-0.354514,0.320263,0.802435,1.0,0.0,0.772192,1.0,0.0,0.0
3206,-0.230732,1.897195,1.370752,0.593799,-0.912776,1.0,1.0,0.301206,0.0,0.0,1.0
4461,0.972955,1.13975,-0.699567,0.362329,-0.912776,1.0,1.0,-1.226723,1.0,0.0,1.0
7033,-0.957096,0.950389,-1.04462,1.128129,-0.912776,1.0,0.0,0.365364,0.0,0.0,1.0
2678,0.080566,-0.185778,-1.389673,1.192672,-0.912776,1.0,1.0,0.719753,0.0,0.0,0.0
177,1.097475,-0.943222,1.025699,0.860047,-0.912776,1.0,0.0,-0.383103,0.0,0.0,1.0
1643,-0.334498,-1.037903,0.335593,0.361831,0.802435,1.0,0.0,-0.886946,1.0,0.0,0.0
9264,-1.683459,1.802514,-0.354514,-1.223838,0.802435,1.0,1.0,1.597043,0.0,1.0,1.0
6862,-1.133498,-0.943222,0.680646,-1.223838,0.802435,0.0,1.0,1.687968,0.0,0.0,0.0


In [12]:
# Preview the first 25 training labels
y_train.head(25)

2870    0
2818    1
3206    1
4461    0
7033    1
2678    0
177     0
1643    0
9264    0
6862    0
1374    0
1720    0
6057    0
3643    0
2264    0
87      0
3799    0
9141    0
403     0
8381    1
3469    0
6786    1
8972    1
2456    0
8496    0
Name: Exited, dtype: int64

In [23]:
# Persist the training column order as a JSON list
col_order_filepath = pathlib.Path('col_order.json')
with col_order_filepath.open('w') as fout:
    json.dump(list(x_train.columns), fout)

## Training the Model

## Random Forest

In [13]:
# Tune a random forest by grid search, then refit the winning configuration
# on the training split and report accuracy on the held-out split.
params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[1, 2, 5, 7, 10],
    n_estimators=[10, 100],
)

rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2*16*2022*5),
    param_grid=params,
    error_score=0,
)
search = rf.fit(x_train, y_train)
best_params = search.best_params_

# The final model is fitted on plain arrays (.values), not DataFrames
rf = RandomForestClassifier(random_state=2*16*2022*5, **best_params).fit(
    x_train.values,
    y_train.values,
)
rf.score(x_test.values, y_test.values)

0.8644203770557561

In [14]:
# Serialize the fitted random forest to disk
joblib.dump(value=rf, filename='rf_model.joblib')

['rf_model.joblib']

## AdaBoost Model

In [15]:
# Tune an AdaBoost classifier by grid search, refit the winning configuration
# on the training split and report accuracy on the held-out split.
from sklearn.ensemble import AdaBoostClassifier

params = dict(
    n_estimators=[10, 100, 200],
    learning_rate=[0.25, 0.5, 1.0, 2.0],
)

adaboost = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=0),
    param_grid=params,
    error_score=0,
)
search = adaboost.fit(x_train, y_train)
best_params = search.best_params_

adaboost = AdaBoostClassifier(random_state=0, **best_params).fit(x_train, y_train)
adaboost.score(x_test, y_test)

0.8576012835940634

In [16]:
# Serialize the fitted AdaBoost model to disk
joblib.dump(value=adaboost, filename='adaboost_model.joblib')

['adaboost_model.joblib']

## Logistic Regression

In [17]:
# Fit a logistic regression with default settings and report hold-out accuracy
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(x_train, y_train)
logreg.score(x_test, y_test)

0.8126754913758524

In [18]:
# Serialize the fitted logistic regression to disk
joblib.dump(value=logreg, filename='logreg_model.joblib')

['logreg_model.joblib']

## Voting Classification Model

In [19]:
# Combine the three fitted models in a voting ensemble. Grid search picks the
# voting mode and per-model weights; the winning configuration is then refit
# on the training split and scored on the held-out split.
from sklearn.ensemble import VotingClassifier

# Weights are listed in (rf, ada, log) order
params = {
    'voting': ['hard', 'soft'],
    'weights': [
        [0.3, 0.3, 0.4],
        [0.33, 0.33, 0.34],
        [0.70, 0.20, 0.10],
        [0.5, 0.35, 0.15],
        [0.4, 0.25, 0.35],
        [0.3, 0.4, 0.3],
        [0.33, 0.34, 0.33],
        [0.20, 0.10, 0.70],
        [0.35, 0.15, 0.5],
        [0.25, 0.35, 0.4],
        [0.4, 0.3, 0.3],
        [0.34, 0.33, 0.33],
        [0.10, 0.70, 0.20],
        [0.15, 0.5, 0.35],
        [0.35, 0.4, 0.25],
    ],
}

base_estimators = [('rf', rf), ('ada', adaboost), ('log', logreg)]

eclf = VotingClassifier(estimators=list(base_estimators))

clf = GridSearchCV(estimator=eclf, param_grid=params, error_score=0)
search = clf.fit(x_train, y_train)
best_params = search.best_params_

eclf = VotingClassifier(estimators=list(base_estimators), **best_params)
eclf = eclf.fit(x_train, y_train)
eclf.score(x_test, y_test)

0.8644203770557561

In [20]:
# Display the parameters selected by the most recently run grid search
best_params

{'voting': 'hard', 'weights': [0.7, 0.2, 0.1]}

In [21]:
# Serialize the fitted voting ensemble to disk
joblib.dump(value=eclf, filename='eclf_model.joblib')

['eclf_model.joblib']

## Run in Azure

In [18]:
import requests

In [19]:
# Root URL of the web service under test.
# NOTE(review): this is a localhost address although the section heading says
# "Run in Azure" — confirm whether the deployed URL should be used here.
base_endpoint = 'http://127.0.0.1:5000'

In [None]:
# Send a GET request to the service root and show the response body
r = requests.get(base_endpoint)
r.text

In [None]:
# URL that prediction requests are POSTed to.
# NOTE(review): identical to base_endpoint — confirm whether the service
# exposes predictions on a dedicated route.
predict_endpoint = 'http://127.0.0.1:5000'

In [None]:
# Example request body: the raw attributes of one customer, copied from the
# sample JSON payload shown near the top of this notebook. `tf` was not
# defined anywhere in the notebook, so this cell failed on a fresh kernel.
tf = {
    'CreditScore': 597,
    'Geography': 'Germany',
    'Gender': 'Female',
    'Age': 35,
    'Tenure': 8,
    'Balance': 131101.04,
    'NumOfProducts': 1,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 192852.67,
}

# POST the payload and parse the response body as an integer.
r = requests.post(predict_endpoint, json=tf)
# Fail with the HTTP error itself rather than an opaque int() ValueError
r.raise_for_status()
int(r.text)