# Baseline Model



<br>

### Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score

from src.helper_utilities import load_data
from src.modeling_utilities import Baseline, f2_scorer, classification_scores

### Random Seed

In [2]:
# Single seed reused by the stochastic steps below (e.g. the train/test split) so that re-running the notebook gives the same results.
random_state = 42

### Load the data

In [3]:
# Fetch the "user friendly" (analysis-mode) dataset as a DataFrame and write
# it straight to CSV so the EDA work can reuse the same file.
load_data(mode='analysis', format='dataframe').to_csv(
    "data/user_friendly_cats.csv", index=False
)

# Work from the CSV copy rather than the in-memory frame: per the original
# author's note, this sidesteps a pandas quirk with Categoricals.
df = pd.read_csv("data/user_friendly_cats.csv")

# Preview the last rows of the reloaded frame.
df.tail()

Unnamed: 0,tenure,amount,rate,residence,age,credits,maintenance,history,savings,employment,...,status,purpose,guarantor,installments,housing,telephone,foreign,sex,personal,label
995,12,1736,3,4,31,1,1,so far so good,"[0, 100)","[4, 7)",...,no account,furniture,none,none,ownership,none,True,female,female divorced/separated/married,0
996,30,3857,4,4,40,1,1,so far so good,"[0, 100)","[1, 4)",...,overdrawn,used car,none,none,ownership,yes,True,male,male divorced/separated,0
997,12,804,4,4,38,1,1,so far so good,"[0, 100)","[7, inf)",...,no account,television,none,none,ownership,none,True,male,male single,0
998,45,1845,4,4,23,1,1,so far so good,"[0, 100)","[1, 4)",...,overdrawn,television,none,none,without payment,yes,True,male,male single,1
999,45,4576,3,4,27,1,1,critical,"[100, 500)",unemployed,...,petty,used car,none,none,ownership,none,True,male,male single,0


## Baseline model

This baseline model is based on a simple lookup table approach. You can view the code here:
[src/modeling_utilities.py](src/modeling_utilities.py)

In [4]:
# Separate the target column from the features, then hold out a stratified
# 20% test set (seeded so the split is reproducible).
y = df['label'].copy()
X = df.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [5]:
# Build the lookup-table baseline on the three chosen features with a 0.5
# decision threshold, then fit it on the training split.
baseline = Baseline(
    threshold=0.5,
    best_features=['status', 'history', 'savings'],
)
baseline.fit(X_train, y_train)

In [6]:
# Mean F2 across 10 cross-validation folds of the training data
# (baseline threshold still at its default of 0.5).
cv_f2_scores = cross_val_score(baseline, X_train, y_train, cv=10, scoring=f2_scorer)
print("F2 =", cv_f2_scores.mean().round(2))

F2 = 0.47


In [7]:
# The default threshold of 0.5 gives us the following results on the held-out test set:
y_pred = baseline.predict(X_test)
classification_scores(y_test, y_pred)

accuracy     0.76
precision    0.60
recall       0.58
f1           0.59
f2           0.59
dtype: float64

In [8]:
# AUC on the held-out test set, computed from the baseline's predicted scores.
# NOTE(review): for a binary target roc_auc_score needs one score per sample (1-D);
# presumably Baseline.predict_proba returns exactly that — confirm in src/modeling_utilities.py.
y_score = baseline.predict_proba(X_test)
# Built-in round() works whether roc_auc_score returns a NumPy scalar or a plain
# Python float; the `.round` method exists only on NumPy scalars.
print("AUC =", round(roc_auc_score(y_test, y_score), 2))

AUC = 0.76


In [9]:
# Hyperparameter grid search over the decision threshold: 7 evenly spaced
# candidates in [0.05, 0.2], each scored by 10-fold cross-validated F2 on the
# training data. The winning threshold and its mean CV F2 are printed below
# (numbers are deliberately not repeated here so the comment cannot go stale).
gs = GridSearchCV(
    baseline,
    {'threshold': np.linspace(0.05, 0.2, num=7)},
    cv=10,
    scoring=f2_scorer,
).fit(X_train, y_train)
# Use built-in round() for both values so the line does not depend on
# best_score_ being a NumPy scalar.
print("threshold =", round(gs.best_estimator_.threshold, 3), "\tF2 =", round(gs.best_score_, 2))

threshold = 0.125 	F2 = 0.68


So the goal for the models that follow is to beat this baseline's cross-validated F2 score of 0.68 (and, ideally, its test-set AUC of 0.76).

<br>