In [27]:
import data_processing as dp
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import brier_score_loss


In [28]:
# Build the matchup-level table used for modelling via the project's
# DataProcessing helper (imported as `dp`).
# Create an instance of DataProcessing
data_processor = dp.DataProcessing()

# First, process the KenPom data to create self.kenpom_processed
data_processor.process_kenpom()

# Then, process the tournament data to create self.tourney_processed
data_processor.process_tourney()

# Now you can call get_data()
matchup_stats = data_processor.get_data()
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category; it runs after the processing calls above, so it affects this
# line onward and stays active for the later cells in the same kernel.
warnings.filterwarnings("ignore")
# Preview the first rows of the resulting frame.
matchup_stats.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WTeamName,WSeed,WRecord,LTeamName,...,Adjusted Temo_diff,Raw Tempo_diff,Adjusted Offensive Efficiency_diff,Raw Offensive Efficiency_diff,Adjusted Defensive Efficiency_diff,Raw Defensive Efficiency_diff,higher_seed_num,lower_seed_num,higher_record,lower_record
0,2002,134,1373,81,1108,77,Siena,16,0.470588,Alcorn St.,...,6.5,6.9,-0.7,4.6,6.8,2.9,16,16,0.689655,0.470588
1,2002,136,1104,86,1194,78,Alabama,2,0.787879,Florida Atlantic,...,-2.2,-1.5,14.4,7.7,-6.8,-3.1,2,15,0.787879,0.642857
2,2002,136,1112,86,1364,81,Arizona,3,0.709677,UC Santa Barbara,...,7.7,10.1,11.1,5.3,-0.5,7.9,3,14,0.709677,0.655172
3,2002,136,1181,84,1457,37,Duke,1,0.90625,Winthrop,...,8.6,8.6,23.7,17.0,-15.8,-6.0,1,16,0.90625,0.576923
4,2002,136,1231,75,1428,56,Indiana,5,0.633333,Utah,...,3.5,3.4,2.6,-1.3,-8.1,-3.3,5,12,0.633333,0.703704


In [29]:
# Show the column labels of matchup_stats.
matchup_stats.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore',
       'WTeamName', 'WSeed', 'WRecord', 'LTeamName', 'LSeed', 'LRecord',
       'HigherSeed', 'HigherSeedWin', 'Tempo_diff', 'AdjTempo_diff', 'OE_diff',
       'AdjOE_diff', 'DE_diff', 'AdjDE_diff', 'AdjEM_diff', 'OffFT_diff',
       'Off2PtFG_diff', 'Off3PtFG_diff', 'DefFT_diff', 'Def2PtFG_diff',
       'Def3PtFG_diff', 'eFGPct_diff', 'TOPct_diff', 'ORPct_diff',
       'FTRate_diff', 'FG2Pct_diff', 'FG3Pct_diff', 'FTPct_diff',
       'BlockPct_diff', 'OppFG2Pct_diff', 'OppFG3Pct_diff', 'OppFTPct_diff',
       'OppBlockPct_diff', 'FG3Rate_diff', 'OppFG3Rate_diff', 'ARate_diff',
       'OppARate_diff', 'StlRate_diff', 'OppStlRate_diff',
       'Adjusted Temo_diff', 'Raw Tempo_diff',
       'Adjusted Offensive Efficiency_diff', 'Raw Offensive Efficiency_diff',
       'Adjusted Defensive Efficiency_diff', 'Raw Defensive Efficiency_diff',
       'higher_seed_num', 'lower_seed_num', 'higher_record', 'lower_record'],
      

In [30]:
# Columns fed to the models: the two season records plus every "*_diff" stat.
# 'higher_seed_num' and 'lower_seed_num' are currently left out of the list.
features = [
    'higher_record', 'lower_record',
    'Tempo_diff', 'AdjTempo_diff',
    'OE_diff', 'AdjOE_diff',
    'DE_diff', 'AdjDE_diff',
    'AdjEM_diff',
    'OffFT_diff', 'Off2PtFG_diff', 'Off3PtFG_diff',
    'DefFT_diff', 'Def2PtFG_diff', 'Def3PtFG_diff',
    'eFGPct_diff', 'TOPct_diff', 'ORPct_diff', 'FTRate_diff',
    'FG2Pct_diff', 'FG3Pct_diff', 'FTPct_diff', 'BlockPct_diff',
    'OppFG2Pct_diff', 'OppFG3Pct_diff', 'OppFTPct_diff', 'OppBlockPct_diff',
    'FG3Rate_diff', 'OppFG3Rate_diff',
    'ARate_diff', 'OppARate_diff',
    'StlRate_diff', 'OppStlRate_diff',
    'Adjusted Temo_diff', 'Raw Tempo_diff',
    'Adjusted Offensive Efficiency_diff', 'Raw Offensive Efficiency_diff',
    'Adjusted Defensive Efficiency_diff', 'Raw Defensive Efficiency_diff',
]

# Season-based split: 2023 is held out for validation, 2024 for testing,
# and every other season is used for training.
season = matchup_stats["Season"]
is_validation = season == 2023
is_test = season == 2024
is_train = ~(is_validation | is_test)

x_train = matchup_stats.loc[is_train, features]
y_train = matchup_stats.loc[is_train, 'HigherSeedWin']

x_validate = matchup_stats.loc[is_validation, features]
y_validate = matchup_stats.loc[is_validation, 'HigherSeedWin']

x_test = matchup_stats.loc[is_test, features]
y_test = matchup_stats.loc[is_test, 'HigherSeedWin']

# Share of 2024 games in which HigherSeedWin is 1.
y_test.mean()

0.6865671641791045

In [31]:
# Display the 2024 (test-season) feature matrix.
x_test

Unnamed: 0,higher_record,lower_record,Tempo_diff,AdjTempo_diff,OE_diff,AdjOE_diff,DE_diff,AdjDE_diff,AdjEM_diff,OffFT_diff,...,ARate_diff,OppARate_diff,StlRate_diff,OppStlRate_diff,Adjusted Temo_diff,Raw Tempo_diff,Adjusted Offensive Efficiency_diff,Raw Offensive Efficiency_diff,Adjusted Defensive Efficiency_diff,Raw Defensive Efficiency_diff
1379,0.69697,0.6875,-5.0894,-5.9624,-8.892,-8.929,-5.0519,-3.7138,-5.2146,-3.885844,...,-1.950005,4.347552,0.025524,0.001541,-6.0,-5.0,-8.9,-8.9,-3.7,-5.1
1380,0.5,0.482759,6.3143,6.2529,8.5507,9.1909,6.498,7.578,1.61414,6.388996,...,1.17411,-10.984474,-0.007054,0.030063,6.2,6.3,9.2,8.6,7.5,6.5
1381,0.705882,0.666667,2.6535,0.9761,3.286,4.999,1.932,1.9306,3.0679,-0.434401,...,11.189726,-4.097876,-0.000124,0.020129,1.0,2.6,5.0,3.2,1.9,1.9
1382,0.548387,0.483871,-3.0147,-4.1201,-7.434,-3.443,-2.294,-1.004,-2.43928,5.159391,...,-7.581077,9.69339,-0.006578,0.025969,-4.2,-3.0,-3.4,-7.5,-1.0,-2.3
1383,0.757576,0.575758,1.2967,0.7391,11.301,12.981,-8.3951,-13.8184,26.799136,-2.36593,...,4.185346,-6.99785,0.007251,-0.001249,0.7,1.3,12.9,11.3,-13.8,-8.4
1384,0.71875,0.6875,0.9069,0.7975,8.891,14.145,0.634,-5.4864,19.63104,-4.158848,...,9.594063,-4.717034,-0.025221,-0.000645,0.8,0.9,14.1,8.9,-5.5,0.6
1385,0.774194,0.78125,-2.179,-2.2919,1.582,2.675,2.119,1.7888,0.8852,-3.92905,...,0.065872,-4.553756,-0.006289,0.016013,-2.3,-2.2,2.7,1.6,1.8,2.1
1386,0.69697,0.676471,3.2227,2.956,11.676,12.67,-0.88,1.9034,10.76689,-4.211543,...,9.621947,-5.349638,-0.025384,-0.008298,3.0,3.2,12.6,11.7,1.9,-0.9
1387,0.774194,0.896552,2.4897,2.9828,3.396,9.248,3.5186,-2.1137,11.3625,-4.109014,...,1.095471,-8.667897,-0.043212,0.009305,3.0,2.5,9.2,3.4,-2.1,3.5
1388,0.764706,0.733333,4.4361,5.5125,7.794,16.367,3.483,-3.011,19.37829,3.481374,...,-7.248428,1.440613,-0.024544,-0.010552,5.5,4.4,16.4,7.8,-3.0,3.4


In [32]:
# Random Forest fitted on the training seasons with all features.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(x_train, y_train)

# Accuracy on the 2023 validation season.
accuracy_before = rf.score(x_validate, y_validate)
print(f'Accuracy before feature selection: {accuracy_before:.2f}')

# Pair each feature with its importance and list the most important first.
feature_names = x_train.columns
importances = rf.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
feature_importance_df

Accuracy before feature selection: 0.73


Unnamed: 0,Feature,Importance
8,AdjEM_diff,0.12504
37,Adjusted Defensive Efficiency_diff,0.05088
7,AdjDE_diff,0.049591
35,Adjusted Offensive Efficiency_diff,0.048188
5,AdjOE_diff,0.043116
4,OE_diff,0.028641
38,Raw Defensive Efficiency_diff,0.028347
29,ARate_diff,0.02654
6,DE_diff,0.025924
36,Raw Offensive Efficiency_diff,0.024549


In [34]:
# Randomized hyperparameter search for the XGBoost classifier.
# The whole cell is commented out, so nothing here runs.
# NOTE(review): 'eta' is sampled only from {0, 1}; presumably a finer
# learning-rate range was intended - confirm before re-enabling.
# xgb = XGBClassifier(booster = 'gbtree', objective = 'binary:logistic', device = 'gpu', nthread = -1)

# param_grid = {
#     'eta': [0, 1],
#     'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],  
#     'gamma': [0, 1, 2, 3, 4, 5],
#     'min_child_weight': [1, 2, 3, 4, 5],
#     'max_delta_step': [0, 1, 2, 3, 4, 5],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
#     'lambda': [0, 1, 2, 3, 4, 5],
#     'alpha': [0, 1, 2, 3, 4, 5],
# }

# xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# xgb_random.fit(x_train, y_train)
# xgb_random.best_params_

In [None]:
# Saved parameter set (presumably the best_params_ reported by the disabled
# randomized search - confirm).
# NOTE(review): several values differ from those passed to XGBClassifier in
# the model cell (e.g. max_depth 10 vs 7, min_child_weight 3 vs 4).
# {'subsample': 1,
#  'min_child_weight': 3,
#  'max_depth': 10,
#  'max_delta_step': 0,
#  'lambda': 2,
#  'gamma': 5,
#  'eta': 1,
#  'alpha': 3}

In [37]:
# XGBoost classifier with hand-set hyperparameters, fitted on the training
# seasons. The constructor previously mixed native booster aliases
# (eta / alpha / nthread) with the scikit-learn wrapper name reg_lambda; all
# arguments now use the wrapper's own parameter names. Values are unchanged.
model = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    device='gpu',
    n_jobs=-1,           # wrapper name for the native `nthread`
    learning_rate=1,     # wrapper name for the native `eta`
    max_depth=7,
    min_child_weight=4,
    max_delta_step=3,
    subsample=1,
    gamma=4,
    reg_alpha=4,         # wrapper name for the native `alpha`
    reg_lambda=5,
)

model.fit(x_train, y_train)

# Pair each feature with the fitted model's importance score, highest first.
importances = model.feature_importances_
feature_names = x_train.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
# The table is built but not displayed in this cell.
# feature_importance_df

In [39]:
# Accuracy of the fitted model on the 2024 test season.
test_accuracy = model.score(x_test, y_test)
print(test_accuracy)

# Second column of predict_proba: probability assigned to class 1.
preds = model.predict_proba(x_test)[:, 1]

# Actual label next to the predicted probability, one row per test game.
outcomes = pd.DataFrame({'Actual': y_test, 'Predicted': preds})
outcomes.head()

0.8208955223880597


Unnamed: 0,Actual,Predicted
1379,0,0.155404
1380,0,0.391282
1381,1,0.238156
1382,1,0.156921
1383,1,0.955802


In [40]:
# Readable summary of the 2024 games with the model's predicted probability.
# .assign() returns a new frame, so the prediction column is never written
# onto a boolean-filtered slice of matchup_stats (the chained-assignment
# pattern that raises SettingWithCopyWarning).
# preds is attached by position, so it must come from the same Season == 2024
# rows in the same order.
tourney24 = (
    matchup_stats.loc[matchup_stats["Season"] == 2024]
    .assign(Predicted=preds)
    [['HigherSeed', 'WTeamName', 'LTeamName', 'WSeed', 'LSeed', 'HigherSeedWin', 'Predicted']]
)

In [41]:
# Lift the row cap so the full 2024 table is rendered.
# This changes the pandas display option for the rest of the session.
pd.options.display.max_rows = None
display(tourney24)

# Brier score of the predicted probabilities against the 2024 outcomes.
brier_score_loss(y_test, preds)

Unnamed: 0,HigherSeed,WTeamName,LTeamName,WSeed,LSeed,HigherSeedWin,Predicted
1379,Virginia,Colorado St.,Virginia,10,10,0,0.155404
1380,Howard,Wagner,Howard,16,16,0,0.391282
1381,Colorado,Colorado,Boise St.,10,10,1,0.238156
1382,Grambling St.,Grambling St.,Montana St.,16,16,1,0.156921
1383,Arizona,Arizona,Long Beach St.,2,15,1,0.955802
1384,Creighton,Creighton,Akron,3,14,1,0.948826
1385,Dayton,Dayton,Nevada,7,10,1,0.467391
1386,BYU,Duquesne,BYU,11,6,0,0.674869
1387,Gonzaga,Gonzaga,McNeese St.,5,12,1,0.810287
1388,Illinois,Illinois,Morehead St.,3,14,1,0.864732


0.14262705014490576