In [15]:
"""

Ref: https://blog.collegefootballdata.com/talking-tech-building-an-artifical-neural-network-to/

"""
import os
import pickle
from datetime import datetime

import cfbd
import lightgbm as lgbm
import numpy as np
import optuna
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [16]:
# Load the train / validation / test feature and target frames from parquet
# files in the current working directory, one split per statement.
X_train = pd.read_parquet(path="./X_train.parquet")
X_valid = pd.read_parquet(path="./X_valid.parquet")
X_test = pd.read_parquet(path="./X_test.parquet")
y_train = pd.read_parquet(path="./y_train.parquet")
y_valid = pd.read_parquet(path="./y_valid.parquet")
y_test = pd.read_parquet(path="./y_test.parquet")

In [31]:
# Columns dropped from every feature frame before it is handed to the model
# (applied to the train, validation and test frames alike).
exclude = [
    "id",
    "season",
    "season_type",
    "start_date",
    "completed",
    "home_ml",
    "away_ml",
    "spread_open",
    "over_under_open",
    "team_home",
    "team_away",
    "home_post_wp",
    "away_post_wp",
    "home_team",
    "away_team",
    "home_points",
    "away_points",
]

# Training

In [32]:
# Model
# Hyperparameters are collected in one mapping so they can be read (or logged)
# independently of the estimator. num_leaves, max_depth and min_child_samples
# are intentionally not set, so the library defaults apply.
lgbm_params = dict(
    boosting_type="gbdt",
    learning_rate=0.1,
    n_estimators=100,
    objective="regression",
    subsample=0.5,
    subsample_freq=1,  # re-draw the row subsample on every iteration
    reg_alpha=1,
    reg_lambda=1,
    random_state=0,
    early_stopping_round=10,  # stop once the validation score stalls for 10 rounds
)
lgbm_reg = lgbm.LGBMRegressor(**lgbm_params)

# Both frames lose the same non-feature columns so train and validation
# present identical feature sets to the model.
train_features = X_train.drop(columns=exclude)
valid_features = X_valid.drop(columns=exclude)

# The validation pair is what early stopping monitors during fitting.
lgbm_reg.fit(
    X=train_features,
    y=y_train,
    eval_set=(valid_features, y_valid),
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9434
[LightGBM] [Info] Number of data points in the train set: 4145, number of used features: 58
[LightGBM] [Info] Start training from score -6.792762
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[30]	valid_0's l2: 269.276


# Eval

Feature Importance

In [33]:
# Pair each feature name with its importance score from the fitted model and
# show the table ordered from most to least important.
feature_importance = pd.DataFrame(
    {
        "feat": lgbm_reg.feature_name_,
        "imp": lgbm_reg.feature_importances_,
    }
)
feature_importance.sort_values(by="imp", ascending=False)

Unnamed: 0,feat,imp
8,spread,103
21,returning_svd_1_away,50
4,home_elo,44
19,returning_svd_1_home,37
20,returning_svd_2_home,36
22,returning_svd_2_away,35
5,away_elo,28
25,ppa_off_rush_home,27
29,ppa_def_rush_home,24
30,ppa_def_third_home,23


In [34]:
# Validation-metric history recorded during fit, keyed by eval-set name
# ('valid_0') and then by metric name ('l2'), one value per boosting round.
lgbm_reg.evals_result_

{'valid_0': OrderedDict([('l2',
               [468.5766543633693,
                430.4623920488224,
                400.8975091536229,
                377.29056014481455,
                356.22502147840726,
                339.22245919530553,
                325.1043514054295,
                313.722713761775,
                304.71204336342623,
                297.19470769135506,
                292.1891916411431,
                286.5761537438884,
                282.9261153872509,
                279.96786590107706,
                278.12272440432673,
                276.0048259672117,
                273.887842846993,
                272.18830483376433,
                270.8332283189296,
                270.6629709419395,
                270.40419242251875,
                269.7608663518943,
                269.4282877148645,
                269.59079067086157,
                269.6206887224344,
                270.2461883496833,
                270.0835067385916,
               

In [35]:
# Score the test split, dropping the same non-feature columns that were
# removed from the training frame.
test_features = X_test.drop(columns=exclude)
y_preds = lgbm_reg.predict(test_features)

In [36]:
# Wrap the raw predictions in a named Series (default 0-based index) so they
# can become a labelled column of the evaluation frame.
y_preds = pd.Series(y_preds, name="preds")

In [37]:
# Peek at the test-set target column; note that its index keeps the row labels
# of the original frame rather than starting at 0.
y_test["margin"]

5182   -11.0
5183    -3.0
5184   -31.0
5185   -11.0
5186   -32.0
        ... 
5976    12.0
5977   -20.0
5978   -21.0
5979    29.0
5980    -3.0
Name: margin, Length: 799, dtype: float64

In [38]:
# y_test keeps its original row labels while y_preds is indexed from 0, so the
# actuals are re-indexed first; otherwise the two columns would not line up.
actual_margin = y_test["margin"].reset_index(drop=True)
eval_df = pd.DataFrame({"preds": y_preds, "actual": actual_margin})

In [39]:
# Per-row absolute error between prediction and actual margin. The column is
# labelled "mae", but each value is a single game's error, not a mean.
eval_df["mae"] = (eval_df["preds"] - eval_df["actual"]).abs()

In [40]:
# Summary statistics of the per-row absolute errors; the `mean` row of this
# output is the mean absolute error over the test set.
eval_df["mae"].describe()

count    799.000000
mean      12.523260
std        9.516600
min        0.040358
25%        5.168810
50%       10.640142
75%       17.502600
max       51.628540
Name: mae, dtype: float64

Log Model Results

Compare to Previous Model

# Save Model

In [48]:
# Save model
# Serialise the fitted regressor to a pickle file in the current working
# directory, stamped with today's date (YYYY-MM-DD). `pickle` and `datetime`
# are imported in the notebook's top import cell rather than here.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# model files that come from a trusted source.
date = datetime.today().strftime("%Y-%m-%d")
model_path = f"./model-{date}.pickle"
with open(model_path, "wb") as f:
    pickle.dump(lgbm_reg, f)