In [1]:
"""

Ref: https://blog.collegefootballdata.com/talking-tech-building-an-artifical-neural-network-to/

"""
import cfbd
import numpy as np
import pandas as pd
import os
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
import optuna

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from fastai.tabular import *
from fastai.tabular.all import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = os.getenv("CFBD_API")
configuration.api_key_prefix['Authorization'] = 'Bearer'

api_config = cfbd.ApiClient(configuration)

In [3]:
teams_api = cfbd.TeamsApi(api_config)
ratings_api = cfbd.RatingsApi(api_config)
games_api = cfbd.GamesApi(api_config)
stats_api = cfbd.StatsApi(api_config)
betting_api = cfbd.BettingApi(api_config)
player_api = cfbd.PlayersApi(api_config)

In [54]:
run_type = "predict"

In [55]:
games = []
lines = []
if run_type == "train":
    start = 2015
    end = 2023

elif run_type == "predict":
    start = 2023
    end = 2024

for year in range(start, end):
    print(f"Getting year {year}")
    response = games_api.get_games(year=year)
    games = [*games, *response]

    response = betting_api.get_lines(year=year)
    lines = [*lines, *response]

Getting year 2023


Filter data

In [56]:
if run_type == "train":
    games = [g for g in games if g.home_conference is not None and g.away_conference is not None and g.home_points is not None and g.away_points is not None]
    len(games)
elif run_type == "predict":
    games = [g for g in games if g.home_conference is not None and g.away_conference is not None and g.home_points is None and g.away_points is None]
    len(games)

In [57]:
# TODO: Can keep more features as desired
games = [
    dict(
        id = g.id,
        year = g.season,
        week = g.week,
        neutral_site = g.neutral_site,
        home_team = g.home_team,
        home_conference = g.home_conference,
        home_points = g.home_points,
        home_elo = g.home_pregame_elo,
        away_team = g.away_team,
        away_conference = g.away_conference,
        away_points = g.away_points,
        away_elo = g.away_pregame_elo
    ) for g in games]

In [58]:
games[0]

{'id': 401545860,
 'year': 2023,
 'week': 1,
 'neutral_site': False,
 'home_team': 'Alderson-Broaddus',
 'home_conference': 'Mountain East',
 'home_points': None,
 'home_elo': None,
 'away_team': 'California (PA)',
 'away_conference': 'Pennsylvania State Athletic',
 'away_points': None,
 'away_elo': None}

In [59]:
# Add spread to games object if a consensus spread is available
for game in games:
    game_lines = [l for l in lines if l.id == game['id']]

    if len(game_lines) > 0:
        game_line = [l for l in game_lines[0].lines if l.provider == 'consensus']

        if len(game_line) > 0 and game_line[0].spread is not None:
            game['spread'] = float(game_line[0].spread)

        elif len(game_line) == 0:
            game_spread = np.mean([l.spread for l in game_lines[0].lines if l.spread is not None])
            game["spread"] = float(game_spread)

            

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [60]:
# Filter out games without spread
games = [g for g in games if 'spread' in g and g['spread'] is not None and not np.isnan(g["spread"])]

In [61]:
if run_type == "train":
    for game in games:
        game['margin'] = game['away_points'] - game['home_points']

In [62]:
if run_type == "train":
    df = pd.DataFrame.from_records(games).dropna()
else:
    df = pd.DataFrame.from_records(games)
df.head()

Unnamed: 0,id,year,week,neutral_site,home_team,home_conference,home_points,home_elo,away_team,away_conference,away_points,away_elo,spread
0,401520195,2023,2,False,Kansas,Big 12,,1194.0,Illinois,Big Ten,,1499.0,-2.75
1,401520216,2023,2,False,Wake Forest,ACC,,1640.0,Vanderbilt,SEC,,1238.0,-10.0
2,401520191,2023,2,False,Georgia,SEC,,1980.0,Ball State,Mid-American,,1350.0,-42.0
3,401523999,2023,2,False,Baylor,Big 12,,1643.0,Utah,Pac-12,,1804.0,3.25
4,401525436,2023,2,False,NC State,ACC,,1635.0,Notre Dame,FBS Independents,,1909.0,7.75


In [67]:
# Define feature groupings
excluded = ['id','year','week','home_team','away_team','margin', 'home_points', 'away_points']
cat_features = ['home_conference','away_conference','neutral_site']
cont_features = [c for c in df.columns.to_list() if c not in cat_features and c not in excluded]
target = ["margin"]

cont_features

['home_elo', 'away_elo', 'spread']

In [68]:
power_5 = ["Big Ten", "ACC", "SEC", "Big 12", "Pac-12"]

df["home_conference"] = np.where(df["home_conference"].isin(power_5), df["home_conference"], "Other")
df["away_conference"] = np.where(df["away_conference"].isin(power_5), df["away_conference"], "Other")

In [69]:
# Clean up features
cat_df = pd.get_dummies(df[cat_features], drop_first=True)
df = pd.concat([df, cat_df], axis=1)
df = df.drop(columns=cat_features)

# Clean column names
df.columns = [c.lower().replace(" ", "_").replace("-", "_") for c in df.columns]

Returning

In [None]:
returning = player_api.get_returning_production(year=2023)
returning_list = [
    dict(
        team = g.team,
        year = g.season,
        passing_usage = g.passing_usage,
        percent_passing_ppa = g.percent_passing_ppa,
        percent_ppa = g.percent_ppa,
        percent_receiving_ppa = g.percent_receiving_ppa,
        percent_rushing_ppa = g.percent_rushing_ppa,
        receiving_usage = g.receiving_usage,
        rushing_usage = g.rushing_usage,
        total_passing_ppa = g.total_passing_ppa,
        total_ppa = g.total_ppa,
        total_receiving_ppa = g.total_receiving_ppa,
        total_rushing_ppa = g.total_rushing_ppa,
    ) for g in returning]
returning_df = pd.DataFrame.from_records(returning_list)
returning_df = returning_df.fillna(returning_df.mean(numeric_only=True))
pl = make_pipeline(
    StandardScaler(),
    TruncatedSVD(n_components=2, random_state=0)
)
key_cols = ["team", "year"]
svd_cols = [c for c in returning_df if c not in key_cols]
pl_res = pl.fit_transform(returning_df.loc[:, svd_cols])
svd_df = pd.DataFrame(pl_res, columns=["returning_svd_1", "returning_svd_2"])
returning_df = returning_df.drop(columns=svd_cols)
returning_df = pd.concat([returning_df, svd_df], axis=1)
df = pd.merge(df, returning_df, how="left", left_on=["year", "home_team"], right_on=["year", "team"])
df = pd.merge(df, returning_df, how="left", left_on=["year", "away_team"], right_on=["year", "team"], suffixes=["_home", "_away"])

## Split data

In [70]:
test_year = df.year.max()
test_df = df.query(f"year == {test_year}")
train_df = df.query(f"year != {test_year}")

In [71]:
train_df.shape, test_df.shape

((0, 20), (111, 20))

In [72]:
if run_type == "train":
    X_train, y_train = train_df.loc[:, ~train_df.columns.isin(excluded+target)], train_df[target]
    X_test, y_test = test_df.loc[:, ~test_df.columns.isin(excluded+target)], test_df[target]
elif run_type == "predict":
    X_test = test_df.loc[:, ~test_df.columns.isin(excluded+target)]

In [42]:
# Split train and valid
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=.2, random_state=0)

# Training

In [43]:
# Model
lgbm_reg = lgbm.LGBMRegressor(
    boosting_type="gbdt",
    # num_leaves=100,
    # max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    objective="regression",
    # min_child_samples=20,
    subsample=0.5,
    subsample_freq=1, # subsample every time
    reg_alpha=1,
    reg_lambda=1,
    random_state=0,
    early_stopping_round=10
)

lgbm_reg.fit(
    X=X_train, 
    y=y_train,
    eval_set=(X_valid, y_valid),
    )

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 678
[LightGBM] [Info] Number of data points in the train set: 3919, number of used features: 13
[LightGBM] [Info] Start training from score -3.879051
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's l2: 279.956


In [44]:
lgbm_reg.evals_result_

{'valid_0': OrderedDict([('l2',
               [464.11630309318485,
                430.41512890652234,
                402.00256033373824,
                378.14938970074337,
                359.8330132227578,
                345.37759859111344,
                331.85969268301966,
                322.7248113238488,
                314.6755318550774,
                309.19075084170623,
                303.7783302397453,
                298.80726285471434,
                295.10595371406504,
                292.4528026009366,
                289.89335269997423,
                287.36800899013235,
                286.3504149212807,
                285.4380719464489,
                283.7714871136929,
                283.3571044599134,
                282.80822192298814,
                282.1031884954271,
                281.81351937223496,
                281.08816658685487,
                280.9131662714876,
                280.8243598824171,
                280.6181702948406,
         

In [73]:
X_test

Unnamed: 0,home_elo,away_elo,spread,home_conference_big_12,home_conference_big_ten,home_conference_other,home_conference_pac_12,home_conference_sec,away_conference_big_12,away_conference_big_ten,away_conference_other,away_conference_pac_12,away_conference_sec
0,1194.0,1499.0,-2.75,1,0,0,0,0,0,1,0,0,0
1,1640.0,1238.0,-10.00,0,0,0,0,0,0,0,0,0,1
2,1980.0,1350.0,-42.00,0,0,0,0,1,0,0,1,0,0
3,1643.0,1804.0,3.25,1,0,0,0,0,0,0,0,1,0
4,1635.0,1909.0,7.75,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1484.0,1791.0,-9.50,0,0,0,0,1,0,0,0,0,1
107,1459.0,1684.0,7.00,0,0,0,0,1,0,0,0,0,0
108,1718.0,2079.0,14.00,0,0,0,0,1,0,0,0,0,1
109,1558.0,1555.0,9.00,0,0,0,0,1,0,0,0,0,0


In [74]:
[c for c in lgbm_reg.feature_name_ if c not in X_test.columns]

[]

In [75]:
y_preds = lgbm_reg.predict(X_test)

In [76]:
y_preds = pd.Series(y_preds, name="preds")

In [77]:
y_test.margin

5506    -3
5509   -11
5510   -32
5513   -30
5516    18
        ..
6907   -20
6908   -17
6911    29
6912   -21
6917    -3
Name: margin, Length: 734, dtype: int64

In [78]:
eval_df = pd.DataFrame({"preds": y_preds, "actual": y_test["margin"].reset_index(drop=True)})

In [79]:
eval_df["mae"] = abs(eval_df["preds"] - eval_df["actual"])

In [80]:
eval_df["mae"].describe()

count    111.000000
mean      21.248527
std       15.558188
min        0.069578
25%        8.358684
50%       18.526744
75%       29.280529
max       68.538426
Name: mae, dtype: float64

Feature Importance

In [53]:
dict(zip(lgbm_reg.feature_name_, lgbm_reg.feature_importances_))

{'home_elo': 372,
 'away_elo': 361,
 'spread': 301,
 'home_conference_big_12': 8,
 'home_conference_big_ten': 7,
 'home_conference_other': 6,
 'home_conference_pac_12': 4,
 'home_conference_sec': 7,
 'away_conference_big_12': 7,
 'away_conference_big_ten': 11,
 'away_conference_other': 18,
 'away_conference_pac_12': 6,
 'away_conference_sec': 2}

Pred DF

In [92]:
if run_type == "predict":
    preds_df = df[["id", "home_team", "away_team"]].copy()
    preds_df["predicted"] = y_preds

In [94]:
preds_df.columns = ["id", "home", "away", "predicted"]

In [101]:
preds_df.to_clipboard(sep=",", index=False)