In [85]:
import polars as pl
import numpy as np

In [86]:
data = pl.read_csv("data/aus_sports_betting.csv")
data.head(5)

Date,Home Team,Away Team,Home Score,Away Score,Overtime?,Playoff Game?,Neutral Venue?,Home Odds Open,Home Odds Min,Home Odds Max,Home Odds Close,Away Odds Open,Away Odds Min,Away Odds Max,Away Odds Close,Home Line Open,Home Line Min,Home Line Max,Home Line Close,Away Line Open,Away Line Min,Away Line Max,Away Line Close,Home Line Odds Open,Home Line Odds Min,Home Line Odds Max,Home Line Odds Close,Away Line Odds Open,Away Line Odds Min,Away Line Odds Max,Away Line Odds Close,Total Score Open,Total Score Min,Total Score Max,Total Score Close,Total Score Over Open,Total Score Over Min,Total Score Over Max,Total Score Over Close,Total Score Under Open,Total Score Under Min,Total Score Under Max,Total Score Under Close,Notes
str,str,str,i64,i64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""2025-02-09""","""Philadelphia Eagles""","""Kansas City Chiefs""",40,22,,"""Y""","""Y""",2.1,2.0,2.15,2.0,1.77,1.73,1.83,1.83,2.0,1.0,2.0,1.0,-2.0,-2.0,-1.0,-1.0,1.91,1.87,1.91,1.91,1.91,1.91,1.91,1.91,49.5,48.5,49.5,48.5,1.91,1.87,1.91,1.91,1.91,1.87,1.91,1.91,"""Played at Caesars Superdome in…"
"""2025-01-26""","""Kansas City Chiefs""","""Buffalo Bills""",32,29,,"""Y""",,1.8,1.73,1.87,1.87,2.05,1.95,2.1,1.95,-1.5,-2.0,-1.0,-1.0,1.5,1.0,2.0,1.0,1.91,1.87,1.91,1.91,1.91,1.91,1.91,1.91,48.5,47.5,49.5,49.5,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1.91,
"""2025-01-26""","""Philadelphia Eagles""","""Washington Commanders""",55,23,,"""Y""",,1.4,1.31,1.45,1.34,3.05,2.8,3.4,3.35,-5.5,-6.0,-4.5,-6.0,5.5,4.5,6.0,6.0,1.91,1.87,1.91,1.91,1.91,1.91,1.91,1.91,48.0,46.5,48.0,47.0,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1.91,
"""2025-01-19""","""Buffalo Bills""","""Baltimore Ravens""",27,25,,"""Y""",,1.87,1.83,2.05,2.05,1.95,1.8,1.95,1.8,-1.0,-1.0,1.5,1.5,1.0,-1.5,1.0,-1.5,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1.91,51.5,51.5,52.0,52.0,1.91,1.87,1.91,1.91,1.91,1.87,1.91,1.91,
"""2025-01-19""","""Philadelphia Eagles""","""Los Angeles Rams""",28,22,,"""Y""",,1.36,1.29,1.38,1.29,3.25,3.1,3.7,3.7,-6.0,-7.0,-5.5,-7.0,6.0,5.5,7.0,7.0,2.0,1.91,1.87,1.91,1.83,1.95,1.91,1.91,43.5,42.0,44.0,43.5,1.91,1.87,1.91,1.91,1.91,1.87,1.91,1.91,


Let's clean up the data a bit, and add some important columns

In [87]:
# Drop unimportant columns
# These need to be wrapped in try-catch in case the cell is rerun
try:
    data = data.drop(["Date", "Notes"])
except pl.exceptions.ColumnNotFoundError:
    pass

# replace all "Y" strings with 1
data = data.with_columns([
    pl.when(pl.col(col) == "Y").then(1).otherwise(pl.col(col)).alias(col)
    for col in data.columns if data[col].dtype == pl.Utf8
])
data = data.with_columns([
    pl.col(col).fill_null(0).alias(col) for col in data.columns if data[col].dtype == pl.Utf8
])
# drop rows with null values, as null values here means missing important data
data = data.drop_nulls()

# Add a Home Team Win column that is either 1 or 0 based on if Home Score is greater than Away Score
data = data.with_columns(
    (pl.col("Home Score") > pl.col("Away Score")).cast(pl.Int8).alias("Home Team Win")
)
# Add a column called Over Hit that is 1 if Home Score + Away Score > Total Score Close
data = data.with_columns(
    (pl.col("Home Score") + pl.col("Away Score") > pl.col("Total Score Close")).cast(pl.Int8).alias("Over Hit")
)
# Add a column called Home Team Cover that is 1 if Home Score - Away Score > Home Spread Close
data = data.with_columns(
    (pl.col("Away Score") - pl.col("Home Score") < pl.col("Home Line Close")).cast(pl.Int8).alias("Home Team Cover")
)
# Add a column called Away Team Cover that is 1 if Away Score - Home Score > Away Spread Close
data = data.with_columns(
    (pl.col("Home Score") - pl.col("Away Score") < pl.col("Away Line Close")).cast(pl.Int8).alias("Away Team Cover")
)
# convert Overtime? Playoff Game? and Neutral Venue? to boolean values
data = data.with_columns([
    pl.col(col).cast(pl.Int8).alias(col) for col in ["Overtime?", "Playoff Game?", "Neutral Venue?"]
])
# get all unique values in Home Team and Away Team columns
home_teams = data["Home Team"].unique().to_list()
away_teams = data["Away Team"].unique().to_list()
# Combine the two lists and remove duplicates
teams = list(set(home_teams + away_teams))
# create a mapping of team names to integers
team_mapping = {team: i for i, team in enumerate(teams)}
# Map the team names to integers
data = data.with_columns([
    pl.col("Home Team").replace_strict(team_mapping).cast(pl.UInt8).alias("Home Team"),
    pl.col("Away Team").replace_strict(team_mapping).cast(pl.UInt8).alias("Away Team")
])

# drop things only known after the game, like Home Score, Away Score, and Overtime?
data = data.drop(["Home Score", "Away Score", "Overtime?"])

data

Home Team,Away Team,Playoff Game?,Neutral Venue?,Home Odds Open,Home Odds Min,Home Odds Max,Home Odds Close,Away Odds Open,Away Odds Min,Away Odds Max,Away Odds Close,Home Line Open,Home Line Min,Home Line Max,Home Line Close,Away Line Open,Away Line Min,Away Line Max,Away Line Close,Home Line Odds Open,Home Line Odds Min,Home Line Odds Max,Home Line Odds Close,Away Line Odds Open,Away Line Odds Min,Away Line Odds Max,Away Line Odds Close,Total Score Open,Total Score Min,Total Score Max,Total Score Close,Total Score Over Open,Total Score Over Min,Total Score Over Max,Total Score Over Close,Total Score Under Open,Total Score Under Min,Total Score Under Max,Total Score Under Close,Home Team Win,Over Hit,Home Team Cover,Away Team Cover
u8,u8,i8,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,i8,i8,i8
8,28,1,1,2.1,2.0,2.15,2.0,1.77,1.73,1.83,1.83,2.0,1.0,2.0,1.0,-2.0,-2.0,-1.0,-1.0,1.91,1.87,1.91,1.91,1.91,1.91,1.91,1.91,49.5,48.5,49.5,48.5,1.91,1.87,1.91,1.91,1.91,1.87,1.91,1.91,1,1,1,0
28,20,1,0,1.8,1.73,1.87,1.87,2.05,1.95,2.1,1.95,-1.5,-2.0,-1.0,-1.0,1.5,1.0,2.0,1.0,1.91,1.87,1.91,1.91,1.91,1.91,1.91,1.91,48.5,47.5,49.5,49.5,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1,1,1,0
8,30,1,0,1.4,1.31,1.45,1.34,3.05,2.8,3.4,3.35,-5.5,-6.0,-4.5,-6.0,5.5,4.5,6.0,6.0,1.91,1.87,1.91,1.91,1.91,1.91,1.91,1.91,48.0,46.5,48.0,47.0,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1,1,1,0
20,32,1,0,1.87,1.83,2.05,2.05,1.95,1.8,1.95,1.8,-1.0,-1.0,1.5,1.5,1.0,-1.5,1.0,-1.5,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1.91,51.5,51.5,52.0,52.0,1.91,1.87,1.91,1.91,1.91,1.87,1.91,1.91,1,0,1,0
8,24,1,0,1.36,1.29,1.38,1.29,3.25,3.1,3.7,3.7,-6.0,-7.0,-5.5,-7.0,6.0,5.5,7.0,7.0,2.0,1.91,1.87,1.91,1.83,1.95,1.91,1.91,43.5,42.0,44.0,43.5,1.91,1.87,1.91,1.91,1.91,1.87,1.91,1.91,1,1,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
33,3,0,0,1.43,1.34,1.43,1.36,3.1,3.05,3.56,3.46,-5.0,-6.5,-5.0,-6.5,5.0,5.0,6.5,6.5,1.95,1.91,1.97,1.93,1.95,1.93,2.0,1.98,39.5,39.5,41.0,41.0,1.92,1.92,1.98,1.95,1.99,1.99,2.0,1.95,1,0,0,1
8,23,0,0,1.2,1.18,1.21,1.19,5.23,5.08,5.54,5.35,-10.0,-10.5,-10.0,-10.0,10.0,10.0,10.5,10.0,1.91,1.93,2.02,2.02,2.0,1.88,1.98,1.88,53.0,48.5,53.0,49.0,1.95,1.91,1.99,1.91,1.95,2.0,1.95,2.0,1,1,1,0
25,18,0,0,1.37,1.32,1.42,1.41,3.39,3.14,3.73,3.16,-6.0,-7.0,-5.5,-5.5,6.0,5.5,7.0,5.5,1.92,1.97,1.95,1.93,1.99,1.95,1.93,1.97,40.0,40.0,41.5,41.5,1.85,1.85,1.97,1.94,2.06,2.06,2.0,1.96,1,1,0,1
21,7,0,0,1.5,1.5,1.73,1.65,2.81,2.24,2.81,2.4,-4.0,-4.0,-2.5,-3.0,4.0,2.5,4.0,3.0,1.95,1.95,1.88,2.02,1.95,2.02,1.95,1.88,44.0,42.5,44.0,43.5,1.95,1.88,1.95,1.88,1.95,1.98,1.95,2.03,0,0,0,1


In [88]:
correlation = data.corr()
# remove all but the last four rows (correlation with the target columns Home Team Win, Over Hit, Home Team Cover, Away Team Cover)
correlation = correlation.tail(4)
correlation

Home Team,Away Team,Playoff Game?,Neutral Venue?,Home Odds Open,Home Odds Min,Home Odds Max,Home Odds Close,Away Odds Open,Away Odds Min,Away Odds Max,Away Odds Close,Home Line Open,Home Line Min,Home Line Max,Home Line Close,Away Line Open,Away Line Min,Away Line Max,Away Line Close,Home Line Odds Open,Home Line Odds Min,Home Line Odds Max,Home Line Odds Close,Away Line Odds Open,Away Line Odds Min,Away Line Odds Max,Away Line Odds Close,Total Score Open,Total Score Min,Total Score Max,Total Score Close,Total Score Over Open,Total Score Over Min,Total Score Over Max,Total Score Over Close,Total Score Under Open,Total Score Under Min,Total Score Under Max,Total Score Under Close,Home Team Win,Over Hit,Home Team Cover,Away Team Cover
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.004076,0.018929,0.045032,-0.01662,-0.305471,-0.310003,-0.314634,-0.318237,0.31529,0.324059,0.318671,0.321436,-0.35926,-0.366416,-0.369793,-0.376975,0.35926,0.369793,0.366416,0.376975,0.070056,0.079575,0.039574,0.057839,-0.051828,-0.026393,-0.063257,-0.041318,-0.035849,-0.031838,-0.039408,-0.032873,0.014322,-2.4e-05,-0.002242,0.022912,0.023432,0.002227,0.011287,0.014622,1.0,0.027558,0.652294,-0.657245
-0.039751,-0.013469,-0.007999,-0.01103,-0.034399,-0.031131,-0.029216,-0.023851,0.001796,-0.003164,0.004017,0.000584,-0.023724,-0.023058,-0.019825,-0.020497,0.023724,0.019825,0.023058,0.020497,0.01967,0.003918,0.000292,-0.00439,-0.007488,0.005944,0.005913,0.016241,-0.02484,-0.013972,-0.019727,-0.012661,0.007546,-0.006674,0.017346,0.017803,0.010523,0.00977,0.016962,-0.002964,0.027558,1.0,0.030857,-0.03493
-0.000355,0.015966,0.012843,0.014623,0.010952,0.016453,0.009466,0.013779,0.014385,0.016801,0.010726,0.014571,-0.000136,0.005097,0.002405,0.00474,0.000136,-0.002405,-0.005097,-0.00474,0.030346,0.018332,-0.009096,0.010145,-0.026817,0.009202,-0.016961,-0.004896,-0.044704,-0.041782,-0.043272,-0.036296,2.9e-05,0.008753,-0.012592,0.0227,0.011226,-0.033254,0.01042,-0.008501,0.652294,0.030857,1.0,-0.941907
0.001592,-0.023727,-0.006583,-0.006445,-0.010053,-0.015388,-0.010285,-0.014416,-0.009113,-0.010719,-0.005747,-0.009001,-0.003867,-0.009464,-0.00459,-0.008007,0.003867,0.00459,0.009464,0.008007,-0.036261,-0.024237,-0.001395,-0.014104,0.03078,0.000973,0.019655,0.006224,0.056748,0.054262,0.057244,0.049543,0.003593,-0.005274,0.014731,-0.018132,-0.015814,0.024715,-0.010736,0.003949,-0.657245,-0.03493,-0.941907,1.0


In [89]:
feature_columns = np.array(correlation.columns[:-4])  # Exclude the last two columns which are the targets
home_team_win_correlation = correlation.row(0)[:-4]
over_hit_correlation = correlation.row(1)[:-4]
home_team_cover_correlation = correlation.row(2)[:-4]
away_team_cover_correlation = correlation.row(3)[:-4]

THRESHOLD = 0.03
# Important features for Home Team Win
# get top 5 indices for Home Team Win correlation
top_home_team_win_indices = np.argsort(np.abs(home_team_win_correlation))[-5:]
# Important features for Over Hit
top_over_hit_indices = np.argsort(np.abs(over_hit_correlation))[-5:]
# Important features for Home Team Cover
top_home_team_cover_indices = np.argsort(np.abs(home_team_cover_correlation))[-5:]
# Important features for Away Team Cover
top_away_team_cover_indices = np.argsort(np.abs(away_team_cover_correlation))[-5:]

print("Top 5 Home Team Win Features:")
print(feature_columns[top_home_team_win_indices].flatten())

print("Top 5 Over Hit Features:")
print(feature_columns[top_over_hit_indices].flatten())

print("Top 5 Home Team Cover Features:")
print(feature_columns[top_home_team_cover_indices].flatten())

print("Top 5 Away Team Cover Features:")
print(feature_columns[top_away_team_cover_indices].flatten())


Top 5 Home Team Win Features:
['Away Line Max' 'Away Line Min' 'Home Line Max' 'Home Line Close'
 'Away Line Close']
Top 5 Over Hit Features:
['Total Score Open' 'Home Odds Max' 'Home Odds Min' 'Home Odds Open'
 'Home Team']
Top 5 Home Team Cover Features:
['Total Score Under Min' 'Total Score Close' 'Total Score Min'
 'Total Score Max' 'Total Score Open']
Top 5 Away Team Cover Features:
['Home Line Odds Open' 'Total Score Close' 'Total Score Min'
 'Total Score Open' 'Total Score Max']
