# Model Building
This notebook contains the model building for the first iteration, including model optimization via grid search.

# Imports

In [1]:
# Switch the working directory to the project root. This is needed to import
# custom modules; the relative "./data/..." paths passed to pd.read_csv further
# down are resolved against this directory as well.
# NOTE(review): absolute, environment-specific path (presumably the work
# directory of the Jupyter Docker image) - confirm before running elsewhere.
import os

PROJECT_ROOT = "/home/jovyan/work"
os.chdir(PROJECT_ROOT)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV

%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 10)

# Construct data

In [26]:
# Load the feature table and preview its first rows. Nothing is dropped here;
# non-feature columns are removed when the train/validation sets are built.
features_path = "./data/features/second_iteration.csv"
df_raw = pd.read_csv(features_path)

df_raw.head()

Unnamed: 0,avg_GamesPlayed_home,avg_Fg2PtAtt_home,avg_Fg2PtAttPerGame_home,avg_Fg2PtMade_home,avg_Fg2PtMadePerGame_home,avg_Fg2PtPct_home,avg_Fg3PtAtt_home,avg_Fg3PtAttPerGame_home,avg_Fg3PtMade_home,avg_Fg3PtMadePerGame_home,...,avg_FoulPersPerGame_guest,avg_PlusMinus_guest,avg_PlusMinusPerGame_guest,avg_MinSeconds_guest,avg_MinSecondsPerGame_guest,team_guest,season,score_home,score_guest,home_win
0,80.8,452.8,5.2,229.2,2.4,51.0,259.2,2.6,92.0,0.6,...,2.2,-96.2,-0.8,147689.0,1818.0,Minnesota Timberwolves,2016/2017,84,92,0
1,80.5,511.0,6.0,259.0,2.75,51.25,224.75,2.25,75.5,0.5,...,1.75,-364.5,-5.0,103625.5,1479.75,Philadelphia 76ers,2016/2017,110,93,1
2,79.2,425.8,4.8,212.8,2.2,50.6,242.4,2.4,86.0,0.6,...,1.666667,-168.666667,-1.666667,123819.0,1526.666667,Milwaukee Bucks,2016/2017,107,100,1
3,80.5,511.0,6.0,259.0,2.75,51.25,224.75,2.25,75.5,0.5,...,2.25,120.75,1.25,131729.5,1949.5,Detroit Pistons,2016/2017,105,98,1
4,66.5,332.0,4.0,159.0,2.0,50.0,191.0,2.5,62.5,0.5,...,1.5,-202.0,-2.5,86731.5,1337.5,Phoenix Suns,2016/2017,95,91,1


In [21]:
# Columns that must not reach the model: team names and season are identifiers,
# and the final scores determine the target `home_win` directly.
non_feature_cols = ["team_home", "team_guest", "score_home", "score_guest", "season"]

# "Unnamed: 0" is the name pandas gives to a header-less first CSV column; in
# the preview it holds the running row number (0, 1, 2, ...), i.e. a serialized
# index rather than a statistic, so it is excluded from the features.
# Columns named after the validation season are dropped from BOTH frames so
# that train and validation always keep identical feature columns.
non_feature_cols += [
    col for col in df_raw.columns
    if col.startswith("Unnamed") or "2017/2018" in col
]

# construct training data (season 2016/2017)
df_train = df_raw[df_raw["season"].isin(["2016/2017"])].drop(non_feature_cols, axis=1)
X_train, y_train = df_train.drop(["home_win"], axis=1).values, df_train["home_win"].values

# construct final validation data (season 2017/2018)
df_val = df_raw[df_raw["season"] == "2017/2018"].drop(non_feature_cols, axis=1)
X_val, y_val = df_val.drop(["home_win"], axis=1).values, df_val["home_win"].values

# both matrices must expose the same features in the same order, otherwise a
# model fitted on X_train cannot be evaluated on X_val
assert list(df_train.columns) == list(df_val.columns)

# Baseline

### Predict always win

In [12]:
# baseline: predict a home win (probability 1) for every validation game
always_win = np.ones_like(y_val)
log_loss(y_val, always_win)

14.504364428156217

### Predict always loss

In [13]:
# baseline: predict a home loss (probability 0 of a home win) for every validation game
always_loss = np.zeros_like(y_val)
log_loss(y_val, always_loss)

20.034747745413874

### Predict always draw (constant 50% home-win probability)

In [14]:
# baseline: predict a probability of 0.5 for every validation game
always_draw = .5 * np.ones_like(y_val)
log_loss(y_val, always_draw)

0.69314718055994529

# Logistic Regression

In [18]:
# parameter grid: penalty type and C (inverse regularization strength)
pg_lr = {
    "penalty": ["l1", "l2"],
    "C": [0.001, 0.01, 0.1, 1.0, 10],
}

# cross validated estimator
# The solver is pinned to "liblinear" because it supports both penalties in the
# grid. Relying on the library default breaks the "l1" candidates on
# scikit-learn versions whose default solver ("lbfgs") only supports "l2".
# random_state fixes liblinear's data shuffling so the score is reproducible.
# NOTE(review): the features are not standardized, so the effect of C depends
# on each feature's scale - consider scaling before fitting.
GS_LR = GridSearchCV(
    estimator=LogisticRegression(solver="liblinear", random_state=42),
    param_grid=pg_lr,
    cv=5,
    n_jobs=-1,
    scoring="neg_log_loss"
)

# model fitting (the best parameter set is refit on the full training data)
GS_LR.fit(X_train, y_train)

# evaluate on validation set
log_loss(y_val, GS_LR.predict_proba(X_val))

0.64703171907337986

# Random Forest

In [16]:
# parameter grid: forest size and the two leaf/split size constraints
param_grid = {
    "n_estimators": [10, 15, 20],
    "min_samples_leaf": [1, 2, 3, 4, 5],
    "min_samples_split": [2, 3, 4, 5]
}

# cross validated estimator
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable; without it every run selects a different "best" model and reports
# a different validation loss.
GS_RF = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    scoring="neg_log_loss"
)

# model fitting (the best parameter set is refit on the full training data)
GS_RF.fit(X_train, y_train)

# evaluate on validation set
log_loss(y_val, GS_RF.predict_proba(X_val))

0.69730383803716067

***
# Advanced
In addition to a simple point average, all averaged stats shall be put into the feature vector.

### Construct data

In [28]:
# Load the advanced feature table and preview its first rows. Nothing is
# dropped here; non-feature columns are removed when the train/validation sets
# are built.
features_adv_path = "./data/features/second_iteration_advanced.csv"
df_raw_adv = pd.read_csv(features_adv_path)

df_raw_adv.head()

Unnamed: 0,avg_GamesPlayed_home,avg_Fg2PtAtt_home,avg_Fg2PtAttPerGame_home,avg_Fg2PtMade_home,avg_Fg2PtMadePerGame_home,avg_Fg2PtPct_home,avg_Fg3PtAtt_home,avg_Fg3PtAttPerGame_home,avg_Fg3PtMade_home,avg_Fg3PtMadePerGame_home,...,avg_FoulPersPerGame_guest,avg_PlusMinus_guest,avg_PlusMinusPerGame_guest,avg_MinSeconds_guest,avg_MinSecondsPerGame_guest,team_guest,season,score_home,score_guest,home_win
0,80.8,452.8,5.2,229.2,2.4,51.0,259.2,2.6,92.0,0.6,...,2.2,-96.2,-0.8,147689.0,1818.0,Minnesota Timberwolves,2016/2017,84,92,0
1,80.5,511.0,6.0,259.0,2.75,51.25,224.75,2.25,75.5,0.5,...,1.75,-364.5,-5.0,103625.5,1479.75,Philadelphia 76ers,2016/2017,110,93,1
2,79.2,425.8,4.8,212.8,2.2,50.6,242.4,2.4,86.0,0.6,...,1.666667,-168.666667,-1.666667,123819.0,1526.666667,Milwaukee Bucks,2016/2017,107,100,1
3,80.5,511.0,6.0,259.0,2.75,51.25,224.75,2.25,75.5,0.5,...,2.25,120.75,1.25,131729.5,1949.5,Detroit Pistons,2016/2017,105,98,1
4,66.5,332.0,4.0,159.0,2.0,50.0,191.0,2.5,62.5,0.5,...,1.5,-202.0,-2.5,86731.5,1337.5,Phoenix Suns,2016/2017,95,91,1


In [29]:
# Columns that must not reach the model: team names and season are identifiers,
# and the final scores determine the target `home_win` directly.
non_feature_cols = ["team_home", "team_guest", "score_home", "score_guest", "season"]

# The extra exclusions are derived from df_raw_adv itself, so this cell does
# not depend on the basic feature frame from an earlier cell.
# "Unnamed: 0" is the name pandas gives to a header-less first CSV column; in
# the preview it holds the running row number (0, 1, 2, ...), i.e. a serialized
# index rather than a statistic, so it is excluded from the features.
# Columns named after the validation season are dropped from BOTH frames so
# that train and validation always keep identical feature columns.
non_feature_cols += [
    col for col in df_raw_adv.columns
    if col.startswith("Unnamed") or "2017/2018" in col
]

# construct training data (season 2016/2017)
df_train = df_raw_adv[df_raw_adv["season"].isin(["2016/2017"])].drop(non_feature_cols, axis=1)
X_train, y_train = df_train.drop(["home_win"], axis=1).values, df_train["home_win"].values

# construct final validation data (season 2017/2018)
df_val = df_raw_adv[df_raw_adv["season"] == "2017/2018"].drop(non_feature_cols, axis=1)
X_val, y_val = df_val.drop(["home_win"], axis=1).values, df_val["home_win"].values

# both matrices must expose the same features in the same order, otherwise a
# model fitted on X_train cannot be evaluated on X_val
assert list(df_train.columns) == list(df_val.columns)

# Logistic Regression

In [30]:
# parameter grid: penalty type and C (inverse regularization strength)
pg_lr = {
    "penalty": ["l1", "l2"],
    "C": [0.001, 0.01, 0.1, 1.0, 10],
}

# cross validated estimator
# The solver is pinned to "liblinear" because it supports both penalties in the
# grid. Relying on the library default breaks the "l1" candidates on
# scikit-learn versions whose default solver ("lbfgs") only supports "l2".
# random_state fixes liblinear's data shuffling so the score is reproducible.
# NOTE(review): the features are not standardized, so the effect of C depends
# on each feature's scale - consider scaling before fitting.
GS_LR = GridSearchCV(
    estimator=LogisticRegression(solver="liblinear", random_state=42),
    param_grid=pg_lr,
    cv=5,
    n_jobs=-1,
    scoring="neg_log_loss"
)

# model fitting (the best parameter set is refit on the full training data)
GS_LR.fit(X_train, y_train)

# evaluate on validation set
log_loss(y_val, GS_LR.predict_proba(X_val))

0.62967096059043415

# Random Forest

In [31]:
# parameter grid: forest size and the two leaf/split size constraints
param_grid = {
    "n_estimators": [10, 15, 20],
    "min_samples_leaf": [1, 2, 3, 4, 5],
    "min_samples_split": [2, 3, 4, 5]
}

# cross validated estimator
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable; without it every run selects a different "best" model and reports
# a different validation loss.
GS_RF = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    scoring="neg_log_loss"
)

# model fitting (the best parameter set is refit on the full training data)
GS_RF.fit(X_train, y_train)

# evaluate on validation set
log_loss(y_val, GS_RF.predict_proba(X_val))

0.66201856067519516