# 5. Deploying Machine Learning Models

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Seed passed as random_state to both train_test_split and KFold further down,
# so the data splits are the same on every run.
rs = 1 # random state

In [5]:
# Load the raw churn dataset.
df = pd.read_csv("data-week-3.csv")

# Normalise column names: lower-case, underscores instead of spaces.
df.columns = [name.lower().replace(" ", "_") for name in df.columns]

# Apply the same normalisation to the values of every string (object) column.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]

for col in categorical_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(" ", "_")

# Entries of totalcharges that cannot be parsed as numbers become NaN
# (errors="coerce") and are then replaced by 0.
total_charges = pd.to_numeric(df["totalcharges"], errors="coerce")
df["totalcharges"] = total_charges.fillna(0)

# Encode the target as 1 for "yes" and 0 for everything else.
df["churn"] = df["churn"].eq("yes").astype(int)

In [6]:
# Hold out 20% of the rows as the final test set; the remaining rows
# (df_full_train) are used for cross-validation and for fitting the final model.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=rs)

In [7]:
# Numeric feature columns; train() applies no scaling to them.
numerical = ["tenure", "monthlycharges", "totalcharges"]

# Feature columns treated as categorical and vectorised by the DictVectorizer
# in train().
# NOTE(review): the sample customer further down passes seniorcitizen as the
# integer 0; DictVectorizer keeps numeric values as a single numeric column
# rather than one-hot encoding them -- confirm this is intended.
categorical = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

In [8]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the training data.

    Args:
        df_train: DataFrame containing at least the columns listed in the
            module-level ``categorical`` and ``numerical`` lists.
        y_train: Target values aligned with the rows of ``df_train``.
        C: Value forwarded to ``LogisticRegression(C=...)``.

    Returns:
        Tuple ``(dv, model)``: the fitted DictVectorizer and the fitted
        LogisticRegression. Both are needed to score new records.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient="records")

    # The vectorizer is fitted here, so it learns the feature layout from the
    # training records only.
    dv = DictVectorizer(sparse=False)
    features = dv.fit_transform(records)

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(features, y_train)

    return dv, model

In [9]:
def predict(df, dv, model):
    """Return the predicted probability of the positive class for each row.

    Args:
        df: DataFrame containing at least the columns listed in the
            module-level ``categorical`` and ``numerical`` lists.
        dv: Already-fitted DictVectorizer (as returned by ``train``).
        model: Already-fitted classifier exposing ``predict_proba``.

    Returns:
        1-D array holding column 1 of ``model.predict_proba`` for every row.
    """
    records = df[categorical + numerical].to_dict(orient="records")
    # Only transform here: the vectorizer must keep the layout learned in train().
    class_proba = model.predict_proba(dv.transform(records))
    return class_proba[:, 1]

In [10]:
# Value passed as C to LogisticRegression via train(); also embedded in the
# name of the saved model file.
C = 1.0
# Number of folds used by the KFold cross-validation.
n_splits = 5

In [24]:
# K-fold cross-validation on the training portion: for every fold, fit on the
# remaining folds and record the AUC on the held-out one.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=rs)
scores = []

for i, (train_rows, val_rows) in enumerate(kfold.split(df_full_train)):
    df_train, df_val = df_full_train.iloc[train_rows], df_full_train.iloc[val_rows]
    y_train, y_val = df_train["churn"].values, df_val["churn"].values

    # A fresh vectorizer and model are fitted per fold (see train()).
    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)
    print(f"AUC({i+1}): {auc:.3f}")

# Summary over all folds: mean AUC and its standard deviation.
print(f"C = {C:4} | {np.mean(scores):.3f} +- {np.std(scores):.3f}")

AUC(1): 0.844
AUC(2): 0.846
AUC(3): 0.832
AUC(4): 0.830
AUC(5): 0.852
C =  1.0 | 0.841 +- 0.008


In [26]:
# Final model: refit on the complete training portion and evaluate once on the
# held-out test set.
# C is taken from the configuration cell instead of being hard-coded to 1.0,
# so the trained model always matches the C value embedded in the name of the
# file the model is saved to.
dv, model = train(df_full_train, df_full_train["churn"].values, C=C)
y_pred = predict(df_test, dv, model)
y_test = df_test["churn"].values

auc = roc_auc_score(y_test, y_pred)
print(f"AUC(full): {auc:.3f}")

AUC(full): 0.857


The code up to this point was taken from previous weeks. Now we want to save the model so that it can be reused later.

## 5.2 Saving and loading the model

- Saving the model to `pickle`
- Loading the model from `pickle`
- Turning our notebook into a Python script

**Save the model with `pickle`**

In [27]:
import pickle

In [29]:
# The value of C is embedded in the file name of the saved model.
output_file = f"model_C={C}.bin"
output_file

'model_C=1.0.bin'

In [31]:
# The DictVectorizer is needed to turn raw customer records into the feature
# matrix the model expects, so it is pickled together with the model as one tuple.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

**Loading the model from `pickle`**

In [33]:
import pickle

In [34]:
# Name of the file written by the save step above (for C = 1.0).
model_file = "model_C=1.0.bin"

In [35]:
# Restore the (DictVectorizer, model) tuple in the same order it was dumped.
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open(model_file, "rb") as f_in:
    dv, model = pickle.load(f_in)


In [36]:
# Display the unpickled objects to confirm that both were restored.
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

**Turning our notebook into a Python script**

In [37]:
# A single customer record, using the same keys as the training columns.
customer = {
    # demographics
    "gender": "female",
    "seniorcitizen": 0,
    "partner": "yes",
    "dependents": "no",
    # subscribed services
    "phoneservice": "no",
    "multiplelines": "no_phone_service",
    "internetservice": "dsl",
    "onlinesecurity": "no",
    "onlinebackup": "yes",
    "deviceprotection": "no",
    "techsupport": "no",
    "streamingtv": "no",
    "streamingmovies": "no",
    # contract and billing
    "contract": "month-to-month",
    "paperlessbilling": "yes",
    "paymentmethod": "electronic_check",
    # numeric features
    "tenure": 1,
    "monthlycharges": 29.85,
    "totalcharges": 29.85,
}

Turning the customer (a single dictionary of data) into a feature matrix with the saved `DictVectorizer`

In [40]:
# transform() takes a list of records, so the single customer dict is wrapped
# in a list; the result is a feature matrix with one row.
X = dv.transform([customer])

Using the saved model for prediction

In [46]:
# predict_proba returns one row per input record; row 0 is our customer and
# column 1 is the probability of the positive class (churn).
customer_proba = model.predict_proba(X)
y_customer = customer_proba[0, 1]
# A probability of at least 0.5 is reported as churn.
print(f"p = {y_customer:.3f} | Churn: {y_customer >= 0.5}")

p = 0.636 | Churn: True


Exporting the notebook as a Python file:
- Using the "Save as" function of the web version of Jupyter Notebook
- Using the console: `jupyter nbconvert --to python notebook.ipynb`

The code from this notebook is split into the
- Training script: [train.py](./train.py)
- Prediction script: [predict.py](./predict.py)