In [1]:
import polars as pl

pl.Config.set_tbl_cols(200)
pl.Config.set_fmt_str_lengths(20)
pl.Config.set_tbl_width_chars(1000)
pl.Config.set_tbl_rows(200)

polars.config.Config

In [2]:
import polars as pl

# Getting team stats
# team_stats = pl.read_parquet('team_stats.parquet')
# standings = pl.read_parquet('team_standings.parquet').unnest('teamName').rename({'default': 'teamFullName'})
# full_team_stats = team_stats.join(standings, on=('seasonId', 'teamFullName'))
# full_team_stats = full_team_stats.unnest('teamAbbrev').rename({'default': 'teamAbbrev'})

games = pl.read_parquet('games.parquet')
games = games.with_columns(
  pl.col('season').cast(pl.UInt64).alias('seasonId'),
  pl.when(pl.col('homeScore') > pl.col('awayScore')).then(pl.col('homeTeam')).otherwise(pl.col('awayTeam')).alias('winner')
)

playoffs = games.filter(pl.col('gameType') == 3)
playoffs = playoffs.with_columns(
  pl.col('gameId').cast(pl.String).str.head(-1).alias('series')
)
playoffs = playoffs.with_columns(
  (pl.col('winner') == pl.col('homeTeam').first()).cum_sum().over(pl.col('series')).shift(1).fill_null(0).alias('seriesHomeTeamWins'),
  (pl.col('winner') == pl.col('awayTeam').first()).cum_sum().over(pl.col('series')).shift(1).fill_null(0).alias('seriesAwayTeamWins'),
)

playoffs = playoffs.with_columns(
  pl.when(pl.col('homeTeam') == pl.col('homeTeam').first()).then(pl.col('seriesHomeTeamWins')).otherwise(pl.col('seriesAwayTeamWins')).alias('seriesHomeTeamWins'),
  pl.when(pl.col('awayTeam') == pl.col('awayTeam').first()).then(pl.col('seriesAwayTeamWins')).otherwise(pl.col('seriesHomeTeamWins')).alias('seriesAwayTeamWins')
)

playoffs = playoffs.with_columns(
  (pl.col('winner') == pl.col('homeTeam')).cast(pl.UInt64).alias('homeTeamWon'),
  (pl.col('winner') == pl.col('awayTeam')).cast(pl.UInt64).alias('awayTeamWon'),
)

playoffs = playoffs.select('gameId', 'gameType', 'seasonId', 'homeTeam', 'seriesHomeTeamWins', 'awayTeam', 'seriesAwayTeamWins', 'homeTeamWon', 'awayTeamWon')
playoffs = playoffs.with_columns(
  pl.when(pl.col('homeTeam') < pl.col('awayTeam')).then(pl.concat_str(pl.col('homeTeam'), pl.col('awayTeam'), separator='-')).otherwise(pl.concat_str(pl.col('awayTeam'), pl.col('homeTeam'), separator='-')).alias('matchup')
)


def aggregate_stats(df: pl.DataFrame, home: bool = True, matchup: bool = True) -> pl.DataFrame:
  df = df.filter(pl.col('gameType') == 2)
  for_prefix = 'home' if home else 'away'
  against_prefix = 'away' if home else 'home'
  if matchup:
    df = df.with_columns(
      pl.when(pl.col('homeTeam') < pl.col('awayTeam')).then(pl.concat_str(pl.col('homeTeam'), pl.col('awayTeam'), separator='-')).otherwise(pl.concat_str(pl.col('awayTeam'), pl.col('homeTeam'), separator='-')).alias('matchup')
    )
    groupby_cols = ('seasonId', 'matchup', f'{for_prefix}Team')
  else:
    groupby_cols = ('seasonId', f'{for_prefix}Team')
  return df.group_by(*groupby_cols).agg(
    pl.col(f'{for_prefix}Score').sum().alias('goals'),
    pl.col(f'{against_prefix}Score').sum().alias('goalsAgainst'),
    pl.col(f'{for_prefix}Sog').sum().alias('sog'),
    pl.col(f'{for_prefix}FaceoffWinningPctg').mean().alias('faceoffWinningPctg'),
    pl.col(f'{for_prefix}PowerPlay').str.split(by='/').list.first().cast(pl.UInt64).sum().alias('powerPlayGoals'),
    pl.col(f'{for_prefix}PowerPlay').str.split(by='/').list.last().cast(pl.UInt64).sum().alias('powerPlays'),
    pl.col(f'{for_prefix}Pim').sum().alias('pim'),
    pl.col(f'{for_prefix}Hits').sum().alias('hits'),
    pl.col(f'{for_prefix}BlockedShots').sum().alias('blockShots'),
    pl.col(f'{for_prefix}Giveaways').sum().alias('giveaways'),
    pl.col(f'{for_prefix}Takeaways').sum().alias('takeaways'),
    (pl.col('winner') == pl.col(f'{for_prefix}Team')).cast(pl.UInt8).sum().alias('wins'),
    (pl.col('winner') != pl.col(f'{for_prefix}Team')).cast(pl.UInt8).sum().alias('losses')
  ).rename({f'{for_prefix}Team': 'team'})

season_home_stats = aggregate_stats(games, matchup=False)
season_away_stats = aggregate_stats(games, home=False, matchup=False)
season_stats = pl.concat((season_home_stats, season_away_stats)).group_by('seasonId', 'team').agg(pl.all().sum())
season_stats = season_stats.with_columns(pl.col('faceoffWinningPctg') / 2)

home_stats = aggregate_stats(games, matchup=True)
away_stats = aggregate_stats(games, home=False, matchup=True)
matchup_stats = pl.concat((home_stats, away_stats)).group_by('seasonId', 'matchup', 'team').agg(pl.all().sum())
matchup_stats = matchup_stats.with_columns(pl.col('faceoffWinningPctg') / 2)

total_stats = matchup_stats.join(season_stats, on=('seasonId', 'team'), suffix='Season')

data = playoffs.join(total_stats, left_on=('seasonId', 'matchup', 'homeTeam'), right_on=('seasonId', 'matchup', 'team'), suffix='Home')
data = data.join(total_stats, left_on=('seasonId', 'matchup', 'awayTeam'), right_on=('seasonId', 'matchup', 'team'), suffix='Away')
# Formatting the data

data = data.sort(by='gameId')
X = data.select(pl.exclude(('gameId', 'seasonId', 'gameType', 'matchup', 'homeTeamWon', 'awayTeamWon', 'homeTeam', 'awayTeam')))
Y = data.select('homeTeamWon')
print(Y.schema)
print(X.columns)

Schema({'homeTeamWon': UInt64})
['seriesHomeTeamWins', 'seriesAwayTeamWins', 'goals', 'goalsAgainst', 'sog', 'faceoffWinningPctg', 'powerPlayGoals', 'powerPlays', 'pim', 'hits', 'blockShots', 'giveaways', 'takeaways', 'wins', 'losses', 'goalsSeason', 'goalsAgainstSeason', 'sogSeason', 'faceoffWinningPctgSeason', 'powerPlayGoalsSeason', 'powerPlaysSeason', 'pimSeason', 'hitsSeason', 'blockShotsSeason', 'giveawaysSeason', 'takeawaysSeason', 'winsSeason', 'lossesSeason', 'goalsAway', 'goalsAgainstAway', 'sogAway', 'faceoffWinningPctgAway', 'powerPlayGoalsAway', 'powerPlaysAway', 'pimAway', 'hitsAway', 'blockShotsAway', 'giveawaysAway', 'takeawaysAway', 'winsAway', 'lossesAway', 'goalsSeasonAway', 'goalsAgainstSeasonAway', 'sogSeasonAway', 'faceoffWinningPctgSeasonAway', 'powerPlayGoalsSeasonAway', 'powerPlaysSeasonAway', 'pimSeasonAway', 'hitsSeasonAway', 'blockShotsSeasonAway', 'giveawaysSeasonAway', 'takeawaysSeasonAway', 'winsSeasonAway', 'lossesSeasonAway']


In [9]:
import xgboost as xgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

def dummy_model_predict_proba(X: pl.DataFrame) -> np.ndarray:
  # winsSeason, winsSeasonAway
  return pl.DataFrame().with_columns(
    homeTeamWon=1 - (pl.col('winsSeason') >= pl.col('winsSeasonAway')).cast(pl.UInt8)
  )

y = Y['homeTeamWon']

# # Encode categorical columns (Polars → pandas → LabelEncoder → back to Polars)
# for col in ["homeTeam", "awayTeam"]:
#     le = LabelEncoder()
#     X = X.with_columns([
#         pl.Series(col, le.fit_transform(X[col].to_list()))
#     ])

# Convert to numpy arrays for XGBoost
X_np = X.to_numpy()
y_np = y.to_numpy()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.1, random_state=42)

# Train XGBoost
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", max_depth=2,
    learning_rate=0.1,       # or even 0.05
    n_estimators=100,
    subsample=0.7,           # add some randomness
    colsample_bytree=0.7)
model.fit(X_train, y_train)

# Predict + Evaluate
y_pred = model.predict_proba(X_test)
y_pred_bin = (y_pred[:, 0] < y_pred[:, 1]).astype(int)

print(y_pred)
# print(X_test)
# print(y_pred)
acc = accuracy_score(y_test, y_pred_bin)
print(f"Accuracy: {acc:.4f}")


[[0.28922606 0.71077394]
 [0.58877176 0.41122824]
 [0.51335216 0.48664784]
 [0.2628293  0.7371707 ]
 [0.66252136 0.3374786 ]
 [0.7120985  0.28790155]
 [0.20085031 0.7991497 ]
 [0.4752193  0.5247807 ]
 [0.6735428  0.3264572 ]
 [0.5508901  0.44910988]
 [0.46333402 0.536666  ]
 [0.6589198  0.34108016]
 [0.532706   0.467294  ]
 [0.23858416 0.76141584]
 [0.33125436 0.66874564]
 [0.5073751  0.4926249 ]
 [0.7684758  0.23152423]
 [0.5441625  0.45583752]
 [0.674042   0.32595804]
 [0.6245884  0.37541166]
 [0.39618218 0.6038178 ]
 [0.59621936 0.40378064]
 [0.45715714 0.54284286]
 [0.5914495  0.40855053]
 [0.31758618 0.6824138 ]
 [0.6829006  0.3170994 ]
 [0.54458004 0.45541996]
 [0.4931208  0.5068792 ]
 [0.42011    0.57989   ]
 [0.6052286  0.3947714 ]
 [0.5186461  0.48135388]
 [0.3951627  0.6048373 ]
 [0.22762012 0.7723799 ]
 [0.6360433  0.36395672]
 [0.6018958  0.39810416]
 [0.5343808  0.46561918]
 [0.4711737  0.5288263 ]
 [0.27553642 0.7244636 ]
 [0.2914399  0.7085601 ]
 [0.6701316  0.32986838]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [3]:
import polars as pl
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Load data
y = Y["homeTeamWon"]
X_input = X

# # 2. Encode categorical columns
# for col in ["homeTeam", "awayTeam"]:
#     le = LabelEncoder()
#     X_input = X_input.with_columns([
#         pl.Series(col, le.fit_transform(X[col].to_list()))
#     ])

# 3. To numpy
X_np = X_input.to_numpy().astype("float32")
y_np = y.to_numpy().astype("float32")

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

# 5. To tensors
X_train_t = torch.tensor(X_train)
X_test_t = torch.tensor(X_test)
y_train_t = torch.tensor(y_train).unsqueeze(1)  # shape [N, 1]
y_test_t = torch.tensor(y_test).unsqueeze(1)

# 6. Define model
class SimpleNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

model = SimpleNet(input_dim=X_train.shape[1])
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 7. Training loop
for epoch in range(100):
    model.train()
    pred = model(X_train_t)
    loss = loss_fn(pred, y_train_t)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        with torch.no_grad():
            acc = ((pred > 0.5) == y_train_t).float().mean().item()
            print(f"Epoch {epoch} | Loss: {loss.item():.4f} | Train Acc: {acc:.4f}")

# 8. Evaluate
model.eval()
with torch.no_grad():
    test_pred = model(X_test_t)
    test_acc = ((test_pred > 0.5) == y_test_t).float().mean().item()
    print(f"\nTest Accuracy: {test_acc:.4f}")

torch.save(model.state_dict(), "model.pth")


Epoch 0 | Loss: 32.8839 | Train Acc: 0.5296
Epoch 10 | Loss: 1.6270 | Train Acc: 0.5015
Epoch 20 | Loss: 0.9917 | Train Acc: 0.5316
Epoch 30 | Loss: 0.8108 | Train Acc: 0.5276
Epoch 40 | Loss: 0.7840 | Train Acc: 0.5366
Epoch 50 | Loss: 0.7337 | Train Acc: 0.5416
Epoch 60 | Loss: 0.7275 | Train Acc: 0.5547
Epoch 70 | Loss: 0.7096 | Train Acc: 0.5577
Epoch 80 | Loss: 0.7157 | Train Acc: 0.5627
Epoch 90 | Loss: 0.6965 | Train Acc: 0.5767

Test Accuracy: 0.5120


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [41]:
# Inference

####### Settings #######
home_team: str = 'CAR'
away_team: str = 'NJD'
home_wins: int = 3
away_wins: int = 3
season: int = 20242025
########################

games = pl.read_parquet('games.parquet')
games = games.with_columns(
  pl.col('season').cast(pl.UInt64).alias('seasonId'),
  pl.when(pl.col('homeScore') > pl.col('awayScore')).then(pl.col('homeTeam')).otherwise(pl.col('awayTeam')).alias('winner')
)

season_home_stats = aggregate_stats(games, matchup=False)
season_away_stats = aggregate_stats(games, home=False, matchup=False)
season_stats = pl.concat((season_home_stats, season_away_stats)).group_by('seasonId', 'team').agg(pl.all().sum())
season_stats = season_stats.with_columns(pl.col('faceoffWinningPctg') / 2)

home_stats = aggregate_stats(games, matchup=True)
away_stats = aggregate_stats(games, home=False, matchup=True)
matchup_stats = pl.concat((home_stats, away_stats)).group_by('seasonId', 'matchup', 'team').agg(pl.all().sum())
matchup_stats = matchup_stats.with_columns(pl.col('faceoffWinningPctg') / 2)

total_stats = matchup_stats.join(season_stats, on=('seasonId', 'team'), suffix='Season')

playoffs = pl.DataFrame([
  {
    'gameType': 3,
    'seasonId': season,
    'homeTeam': home_team,
    'seriesHomeTeamWins': home_wins,
    'awayTeam': away_team,
    'seriesAwayTeamWins': away_wins,
    'awayTeam': away_team,
    'homeTeamWon': 0,
    'awayTeamWon': 0,
  }
])
playoffs = playoffs.with_columns(
  pl.when(pl.col('homeTeam') < pl.col('awayTeam')).then(pl.concat_str(pl.col('homeTeam'), pl.col('awayTeam'), separator='-')).otherwise(pl.concat_str(pl.col('awayTeam'), pl.col('homeTeam'), separator='-')).alias('matchup')
)

data = playoffs.join(total_stats, left_on=('seasonId', 'matchup', 'homeTeam'), right_on=('seasonId', 'matchup', 'team'), suffix='Home')
input = data.join(total_stats, left_on=('seasonId', 'matchup', 'awayTeam'), right_on=('seasonId', 'matchup', 'team'), suffix='Away')
input = input.select(pl.exclude(('gameId', 'seasonId', 'gameType', 'matchup', 'homeTeamWon', 'awayTeamWon', 'homeTeam', 'awayTeam')))


# input = torch.tensor(input.to_numpy().astype("float32"))
# print(input)

prediction = model.predict_proba(input.to_numpy())[0, -1]
print(prediction)

import random
import time
random.seed(time.time())
my_val = random.random()
print(my_val)

if my_val <= prediction:
  print(home_team)
else:
  print(away_team)

0.5334963
0.6537167174781133
NJD


In [None]:
# WSH 4 - 1 MTL
# NJD 4 - 3 CAR