# Optimizing Simulator with XGBoost model 
(with more/new data this time)

- Currently training on 84,237 matchups from 2008 to 2024 (regular seasons + postseason)
- Will perform a grid search to find optimal parameters
- Possibly PCA for features

I will follow the same process I performed in full_bracket_simulator.ipynb, so I won't outline as many of the steps. I am just using new data and some different steps.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('/Users/jacksonisidor/Documents/March Madness Project/Deployment/all_matchup_stats.csv')

Make the functions for simulating the bracket:

In [3]:
def score_bracket(predicted, actual):
    
    score = 0
    for (pred_index, pred_matchup), (act_index, act_matchup) in zip(predicted.iterrows(), actual.iterrows()):
        
        if (pred_matchup["team_1"] == act_matchup["team_1"]) and (pred_matchup["prediction"] == act_matchup["winner"] == 1):
            score += 64 / pred_matchup["current_round"]
            
        elif (pred_matchup["team_2"] == act_matchup["team_2"]) and (pred_matchup["prediction"] == act_matchup["winner"] == 0): 
            score += 64 / pred_matchup["current_round"]
            
    return score

In [4]:
def get_winner_info(matchups):
    next_round_teams_list = []
    
    for index, matchup in matchups.iterrows():
        # if team_1 wins, get all info that ends in "_1"
        if matchup["prediction"] == 1:
            winning_team_info = matchup.filter(regex='_1$').rename(lambda x: x[:-2], axis=0)
        # if team_2 wins, get all info that ends in "_2"
        else:
            winning_team_info = matchup.filter(regex='_2$').rename(lambda x: x[:-2], axis=0)
        
        winning_team_info["year"] = matchup["year"]
        winning_team_info["current_round"] = matchup["current_round"] / 2
        
        next_round_teams_list.append(pd.DataFrame(winning_team_info).T)
    
    next_round_teams = pd.concat(next_round_teams_list, ignore_index=True)
        
    return next_round_teams

In [5]:
def next_sim_matchups(winning_teams):
    matchups = pd.DataFrame(columns=['year', 'team_1', 'seed_1', 'round_1', 'current_round', 'team_2', 'seed_2', 'round_2'])

    matchup_info_list = []
    # iterate through data frame and jump 2 each iteration
    for i in range(0, len(winning_teams)-1, 2):
        team1_info = winning_teams.iloc[i]
        team2_info = winning_teams.iloc[i+1]

        matchup_info = {
                    'year': team1_info['year'],
                    'team_1': team1_info['team'],
                    'seed_1': team1_info['seed'],
                    'round_1': team1_info['round'],
                    'current_round': team1_info['current_round'],
                    'team_2': team2_info['team'],
                    'seed_2': team2_info['seed'],
                    'round_2': team2_info['round'],
                    'badj_em_1': team1_info['badj_em'],
                    'badj_o_1': team1_info['badj_o'],
                    'badj_d_1': team1_info['badj_d'],
                    'wab_1': team1_info['wab'],
                    'barthag_1': team1_info['barthag'],
                    'efg_1': team1_info['efg'],
                    'efg_d_1': team1_info['efg_d'],
                    'ft_rate_1': team1_info['ft_rate'],
                    'ft_rate_d_1': team1_info['ft_rate_d'],
                    'tov_percent_1': team1_info['tov_percent'],
                    'tov_percent_d_1': team1_info['tov_percent_d'],
                    'adj_tempo_1': team1_info['adj_tempo'],
                    '3p_percent_1': team1_info['3p_percent'],
                    '3p_rate_1': team1_info['3p_rate'],
                    '2p_percent_1': team1_info['2p_percent'],
                    '3p_percent_d_1': team1_info['2p_percent_d'],
                    '2p_percent_d_1': team1_info['2p_percent_d'],
                    'exp_1': team1_info['exp'],
                    'eff_hgt_1': team1_info['eff_hgt'],
                    'talent_1' : team1_info['talent'],
                    'elite_sos_1': team1_info['elite_sos'],
                    'win_percent_1': team1_info['win_percent'],
                    'badj_em_2': team2_info['badj_em'],
                    'badj_o_2': team2_info['badj_o'],
                    'badj_d_2': team2_info['badj_d'],
                    'wab_2': team2_info['wab'],
                    'barthag_2': team2_info['barthag'],
                    'efg_2': team2_info['efg'],
                    'efg_d_2': team2_info['efg_d'],
                    'ft_rate_2': team2_info['ft_rate'],
                    'ft_rate_d_2': team2_info['ft_rate_d'],
                    'tov_percent_2': team2_info['tov_percent'],
                    'tov_percent_d_2': team2_info['tov_percent_d'],
                    'adj_tempo_2': team2_info['adj_tempo'],
                    '3p_percent_2': team2_info['3p_percent'],
                    '3p_rate_2': team2_info['3p_rate'],
                    '2p_percent_2': team2_info['2p_percent'],
                    '3p_percent_d_2': team2_info['3p_percent_d'],
                    '2p_percent_d_2': team2_info['2p_percent_d'],
                    'exp_2': team2_info['exp'],
                    'eff_hgt_2': team2_info['eff_hgt'],
                    'talent_2' : team2_info['talent'],
                    'elite_sos_2': team2_info['elite_sos'],
                    'win_percent_2': team2_info['win_percent']
                    }
    
        matchup_info_list.append(matchup_info)

    matchups = pd.concat([matchups, pd.DataFrame(matchup_info_list)])
            
    # get the stat differences same as before
    stat_variables = [
                        'badj_em', 'badj_o', 'badj_d', 'wab', 'barthag', 'efg', 'efg_d', 
                        'ft_rate', 'ft_rate_d', 'tov_percent', 'tov_percent_d', 'adj_tempo', 
                        '3p_percent', '3p_rate', '2p_percent', 'exp', 'eff_hgt', 'talent', 
                        'elite_sos', 'win_percent'
                        ]
    for variable in stat_variables:
        matchups[f'{variable}_diff'] = matchups[f'{variable}_1'] - matchups[f'{variable}_2']
            
    return matchups

In [6]:
def sim_bracket(round_matchups, model, predictors):

    # get predictions for each game in the current round and add that column to the df
    preds = model.predict(round_matchups[predictors])
    # add in probabilities too in case I want to identify the most likely upsets
    probs = model.predict_proba(round_matchups[predictors])

    round_matchups = round_matchups.copy()
    round_matchups.loc[:, "prediction"] = preds
    round_matchups.loc[:, "win probability"] = probs[:, 1]

    
    # base case for recursion (we are in the championship round)
    if round_matchups["current_round"].iloc[0] == 2:
        return round_matchups
    
    # pass teams on to the next round in a new df and combine them into new matchups
    next_round_teams = get_winner_info(round_matchups)
    next_round_matchups = next_sim_matchups(next_round_teams)

    # recurse through making a simulated df that mimics the structure of the actual df
    return pd.concat([round_matchups, sim_bracket(next_round_matchups, model, predictors)], ignore_index=True)

Find the optimal parameters 

In [78]:
predictors = ['badj_em_diff', 'badj_o_diff', 'badj_d_diff', 'wab_diff', 'barthag_diff',
              'efg_diff', 'efg_d_diff', 'ft_rate_diff', 'ft_rate_d_diff', 
              'tov_percent_diff', 'tov_percent_d_diff', 'adj_tempo_diff', 
               '3p_percent_diff', '3p_rate_diff', '2p_percent_diff', 'exp_diff', 
               'eff_hgt_diff', 'talent_diff', 'elite_sos_diff', 'win_percent_diff']

target = "winner"

param_scores = {}
for estimator in [100, 200, 300]:
    for depth in [3, 5, 7]:
        for lr in [0.01, 0.1, 0.2]:
            for ss in [0.8, 0.9, 1.0]:
                for csbt in [0.8, 0.9, 1.0]:
                    for g in [0, 1, 5]:

                        # Loop through all the years in the data with the current set of params
                        scores = []
                        for year in data["year"].unique():
                            if year != 2020 and year != 2021:
                                test = data[(data["year"] == year) & (data["type"] == "T")]
                                train = data[(data["year"] != year) | 
                                                            ((data["year"] == year) & (data["type"] != "T"))]
                                X_train = train[predictors]
                                y_train = train[target]
                                
                                xgb_pipeline = make_pipeline(StandardScaler(), 
                                                    XGBClassifier(colsample_bytree=csbt,
                                                                gamma=g,
                                                                learning_rate=lr,
                                                                max_depth=depth,
                                                                n_estimators=estimator,
                                                                subsample=ss
                                                    ))

                                xgb_pipeline.fit(X_train, y_train)
                                
                                # simulate the test bracket 
                                test_r64 = test[test["current_round"] == 64]
                                prediction_bracket = sim_bracket(test_r64, xgb_pipeline, predictors)
                                
                                # Score the test bracket
                                score = score_bracket(prediction_bracket, test)
                                scores.append(score)

                        # Get the average score for this set of parameters and add it to the dictionary
                        cv_score = np.mean(scores)
                        param_scores[(estimator, depth, lr, ss, csbt, g)] = cv_score

In [79]:
param_scores_df = pd.DataFrame(param_scores.items(), columns=['Hyperparameters', 'Average Score'])
param_scores_df.sort_values(by="Average Score", ascending=False).head()

Unnamed: 0,Hyperparameters,Average Score
717,"(300, 7, 0.2, 0.9, 1.0, 0)",92.066667
66,"(100, 3, 0.2, 0.9, 0.9, 0)",90.533333
355,"(200, 5, 0.1, 0.8, 0.9, 1)",90.066667
545,"(300, 3, 0.2, 0.8, 0.9, 5)",89.8
302,"(200, 3, 0.2, 0.8, 0.9, 5)",89.733333


The best hyperparameters are: 
- n_estimators = 300
- max_depth = 7
- learning_rate = 0.2
- ss = 0.9
- colsample_bytree = 1.0,
- gamma = 0

These hyperparameters yielded an average bracket score of 110 which is significantly better than the previous xgboost model that trained on only tournament data. It is also **twice** as good as the average bracket score according to March Madness Live.

In [7]:
predictors = ['badj_em_diff', 'badj_o_diff', 'badj_d_diff', 'wab_diff', 'barthag_diff',
              'efg_diff', 'efg_d_diff', 'ft_rate_diff', 'ft_rate_d_diff', 
              'tov_percent_diff', 'tov_percent_d_diff', 'adj_tempo_diff', 
               '3p_percent_diff', '3p_rate_diff', '2p_percent_diff', 'exp_diff', 
               'eff_hgt_diff', 'talent_diff', 'elite_sos_diff', 'win_percent_diff']

target = "winner"

scores = pd.DataFrame(columns=["year", "score"])
for year in data["year"].unique():
    # Split data leaving out 1 year for testing
    if year != 2020 and year != 2021:
        test = data[(data["year"] == year) & (data["type"] == "T")]
        train = data[(data["year"] != year) | 
                ((data["year"] == year) & (data["type"] != "T"))]
        X_train = train[predictors]
        y_train = train[target]

    # Train model on rest of the years
    xgb_pipeline = make_pipeline(StandardScaler(), 
                                  XGBClassifier(n_estimators=300,
                                                max_depth=7,
                                                learning_rate=0.2,
                                                subsample=0.9,
                                                colsample_bytree=1.0,
                                                gamma=0
                                                 ))

    xgb_pipeline.fit(X_train, y_train)

    # Simulate the test bracket 
    test_r64 = test[test["current_round"] == 64]
    prediction_bracket = sim_bracket(test_r64, xgb_pipeline, predictors)

    # Score the test bracket
    score = score_bracket(prediction_bracket, test)
    
    # add to df
    test_bracket_info = pd.DataFrame({'year': [year], 'score': [score]})
    scores = pd.concat([scores, test_bracket_info], ignore_index=True)

In [8]:
scores

Unnamed: 0,year,score
0,2024,136.0
1,2023,40.0
2,2022,114.0
3,2019,58.0
4,2018,82.0
5,2017,139.0
6,2016,88.0
7,2015,104.0
8,2014,67.0
9,2013,50.0


In [9]:
scores.score.mean()

91.8125

In [12]:
predictors = ['badj_em_diff', 'badj_o_diff', 'badj_d_diff', 'wab_diff', 'barthag_diff',
              'efg_diff', 'efg_d_diff', 'ft_rate_diff', 'ft_rate_d_diff', 
              'tov_percent_diff', 'tov_percent_d_diff', 'adj_tempo_diff', 
               '3p_percent_diff', '3p_rate_diff', '2p_percent_diff', 'exp_diff', 
               'eff_hgt_diff', 'talent_diff', 'elite_sos_diff', 'win_percent_diff']

target = "winner"

X_train, X_test, y_train, y_test = train_test_split(data[predictors], data[target], test_size=0.3, random_state=42) # 70% to train

# Train model on rest of the years
xgb_pipeline = make_pipeline(StandardScaler(), 
                                  XGBClassifier(n_estimators=300,
                                                max_depth=7,
                                                learning_rate=0.2,
                                                subsample=0.9,
                                                colsample_bytree=1.0,
                                                gamma=0
                                                 ))

xgb_pipeline.fit(X_train, y_train)
preds = xgb_pipeline.predict(X_test)

accuracy = accuracy_score(preds, y_test)

In [14]:
accuracy

0.7120548389713637