# Madness of March
## Jim Haines & Josh McCoy
### [Project Website](https://joshmccoy2.github.io/NCAA_March_Madness/)

## Current Datasets
[Kaggle datasets](https://www.kaggle.com/competitions/mens-march-mania-2022/data)

# ETL & EDA

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd 

# Elo Score

In [2]:
def update_elo(winner_rating, loser_rating):
    K = 20  # Maximum change per game
    expected_win = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
    new_winner_rating = winner_rating + K * (1 - expected_win)
    new_loser_rating = loser_rating - K * (1 - expected_win)
    return new_winner_rating, new_loser_rating


# Adding in Other Kaggle Data to run XG Boost On

In [27]:
detailed_tourney_data = 'MNCAATourneyDetailedResults.csv'
detailed_tourney_results = pd.read_csv(detailed_tourney_data)
detailed_tourney_results.head(1)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22


In [28]:
#Create variable for amount by which winning team wins
detailed_tourney_results['SCOREDIFF'] = detailed_tourney_results['WScore'] - detailed_tourney_results['LScore']
# Winner stats
detailed_tourney_results.dtypes
detailed_tourney_results['WFGPCT'] = detailed_tourney_results['WFGM']/detailed_tourney_results['WFGA']       # field goal pt completion %
detailed_tourney_results['W3PCT'] = detailed_tourney_results['WFGM3']/detailed_tourney_results['WFGA3']      # 3 pt completion %
detailed_tourney_results['WFTPCT'] = detailed_tourney_results['WFTM']/detailed_tourney_results['WFTA']       # free throw completion %
detailed_tourney_results['WORBCHANCE'] =  detailed_tourney_results['WOR'] + detailed_tourney_results['LDR']  # total potential rebounds
detailed_tourney_results['WORPCT'] = detailed_tourney_results['WOR']/detailed_tourney_results['WORBCHANCE']  # rebound completion %

# Losing stats
detailed_tourney_results['LFGPCT'] = detailed_tourney_results['LFGM']/detailed_tourney_results['LFGA']       
detailed_tourney_results['L3PCT'] = detailed_tourney_results['LFGM3']/detailed_tourney_results['LFGA3']
detailed_tourney_results['LFTPCT'] = detailed_tourney_results['LFTM']/detailed_tourney_results['LFTA']
detailed_tourney_results['LORBCHANCE'] =  detailed_tourney_results['LOR'] + detailed_tourney_results['WDR']
detailed_tourney_results['LORPCT'] = detailed_tourney_results['LOR']/detailed_tourney_results['LORBCHANCE']
detailed_tourney_results.head(1)


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WFGPCT,W3PCT,WFTPCT,WORBCHANCE,WORPCT,LFGPCT,L3PCT,LFTPCT,LORBCHANCE,LORPCT
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.37931,0.653846,42,0.333333,0.432836,0.387097,0.451613,47,0.361702


In [26]:
# Load the MTeams.csv file
mteams_df = pd.read_csv('MTeams.csv')

# Assuming detailed_tourney_results is your dataframe containing data about the tournament results
# and you have columns WTeamID and LTeamID in detailed_tourney_results

# Merge for WTeamID
merged_df_winning = pd.merge(detailed_tourney_results, mteams_df, left_on=['WTeamID'], right_on=['TeamID'], how='left')
merged_df_winning.rename(columns={'TeamName': 'TeamName1'}, inplace=True)

# Merge for LTeamID
merged_df_losing = pd.merge(detailed_tourney_results, mteams_df, left_on=['LTeamID'], right_on=['TeamID'], how='left')
merged_df_losing.rename(columns={'TeamName': 'TeamName2'}, inplace=True)

# Combine the two merged dataframes horizontally
merged_df_names = pd.concat([merged_df_winning, merged_df_losing[['TeamName2']]], axis=1)

# Display the resulting DataFrame
merged_df_names.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGPCT,L3PCT,LFTPCT,LORBCHANCE,LORPCT,TeamID,TeamName1,FirstD1Season,LastD1Season,TeamName2
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.432836,0.387097,0.451613,47,0.361702,1421,UNC Asheville,1987,2024,TX Southern
1,2003,136,1112,80,1436,51,N,0,31,66,...,0.3125,0.25,1.0,44,0.181818,1112,Arizona,1985,2024,Vermont
2,2003,136,1113,84,1272,71,N,0,31,59,...,0.362319,0.25,0.666667,47,0.425532,1113,Arizona St,1985,2024,Memphis
3,2003,136,1141,79,1166,73,N,0,29,53,...,0.45,0.411765,0.705882,34,0.411765,1141,C Michigan,1985,2024,Creighton
4,2003,136,1143,76,1301,74,N,1,27,64,...,0.446429,0.428571,0.75,30,0.333333,1143,California,1985,2024,NC State


In [24]:
seeds_df = pd.read_csv('MNCAATourneySeeds.csv')
seeds_df['Seed_correct'] = seeds_df['Seed'].str.extract('(\d+)', expand=False).astype(int)

seeds_df = seeds_df[seeds_df['Season'] == 2023].copy()

seeds_df.head()

Unnamed: 0,Season,Seed,TeamID,Seed_correct
2422,2023,W01,1345,1
2423,2023,W02,1266,2
2424,2023,W03,1243,3
2425,2023,W04,1397,4
2426,2023,W05,1181,5


In [25]:
# Merge for WTeamID
merged_df_winning = pd.merge(merged_df_names, seeds_df, left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
merged_df_winning.rename(columns={'Seed_correct': 'WTeamSeedCorr', 'Seed': 'WTeamSeed'}, inplace=True)

# Merge for LTeamID
merged_df_losing = pd.merge(merged_df_names, seeds_df, left_on=['LTeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
merged_df_losing.rename(columns={'Seed_correct': 'LTeamSeedCorr', 'Seed': 'LTeamSeed'}, inplace=True)

# Combine the two merged dataframes horizontally
merged_df_seeds = pd.concat([merged_df_winning, merged_df_losing[['LTeamSeedCorr', 'LTeamSeed']]], axis=1)

# Display the resulting DataFrame
merged_df_seeds.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,TeamID_x,TeamName1,FirstD1Season,LastD1Season,TeamName2,WTeamSeed,TeamID_y,WTeamSeedCorr,LTeamSeedCorr,LTeamSeed
0,2003,134,1421,92,1411,84,N,1,32,69,...,1421,UNC Asheville,1987,2024,TX Southern,,,,,
1,2003,136,1112,80,1436,51,N,0,31,66,...,1112,Arizona,1985,2024,Vermont,,,,,
2,2003,136,1113,84,1272,71,N,0,31,59,...,1113,Arizona St,1985,2024,Memphis,,,,,
3,2003,136,1141,79,1166,73,N,0,29,53,...,1141,C Michigan,1985,2024,Creighton,,,,,
4,2003,136,1143,76,1301,74,N,1,27,64,...,1143,California,1985,2024,NC State,,,,,


# Prepare Data for XGBoost

In [8]:
def prepare_data(df_data):
    df = df_data.copy()
    df.rename(columns={'WLoc': 'location'}, inplace=True)

    # Drop unnecessary columns
    df.drop(columns=['TeamID_x', 'TeamName1', 'FirstD1Season', 'LastD1Season', 'TeamName2', 'TeamID_y'], inplace=True)

    dfswap = df[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'location', 'NumOT',
                 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
                 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
                 'WTeamSeedCorr', 'LTeamSeedCorr', 'SCOREDIFF', 'WFGPCT', 'W3PCT', 'WFTPCT', 'WORBCHANCE', 'WORPCT',
                 'LFGPCT', 'L3PCT', 'LFTPCT', 'LORBCHANCE', 'LORPCT', 'WTeamSeed', 'LTeamSeed']]

    df.columns = df.columns.str.replace('W', 'T1_')
    df.columns = df.columns.str.replace('L', 'T2_')
    dfswap.columns = dfswap.columns.str.replace('L', 'T1_')
    dfswap.columns = dfswap.columns.str.replace('W', 'T2_')

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location == 'N', 'location'] = '0'
    output.loc[output.location == 'H', 'location'] = '1'
    output.loc[output.location == 'A', 'location'] = '-1'
    output.location = output.location.astype(int)
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']
    output['Outcome'] = (output['PointDiff'] > 0).astype(int)

    return output

# Predicting Winner 

In [9]:
def predict_winner(team1_id, team2_id, elo_df):
    elo1 = elo_df.loc[elo_df['TeamID'] == team1_id, 'EloRating'].iloc[0]
    elo2 = elo_df.loc[elo_df['TeamID'] == team2_id, 'EloRating'].iloc[0]
    return team1_id if elo1 > elo2 else team2_id

# Simulate Tourney

In [10]:
def simulate_tournament(seeds_df, slots_df):
    slot_winners = {}   # Initialize a dictionary to store winners of each slot
    for _, row in slots_df.iterrows():
        slot = row['Slot'] # Identify the current slot
        
        # Round 1 uses direct seeds; subsequent rounds use winners from previous rounds
        if row['Slot'].startswith('R1'):
            strong_team_seed = row['StrongSeed']
            weak_team_seed = row['WeakSeed']
        else:
            strong_team_seed = slot_winners.get(row['StrongSeed'])
            weak_team_seed = slot_winners.get(row['WeakSeed'])

        # Get team IDs and predict the winner
        strong_team_id = seeds_df.loc[seeds_df['Seed'] == strong_team_seed, 'TeamID'].iloc[0]
        weak_team_id = seeds_df.loc[seeds_df['Seed'] == weak_team_seed, 'TeamID'].iloc[0]
        winner_id = predict_winner(strong_team_id, weak_team_id, seeds_df)
        winner_seed = seeds_df.loc[seeds_df['TeamID'] == winner_id, 'Seed'].iloc[0]
        slot_winners[slot] = winner_seed

    return slot_winners

In [11]:
def calculate_elo_ratings(games_df, tournament_type):
  
    team_ratings = {team: 1500 for team in set(games_df['WTeamID']).union(games_df['LTeamID'])}
    for _, row in games_df.iterrows():
        w_rating, l_rating = team_ratings[row['WTeamID']], team_ratings[row['LTeamID']]
        team_ratings[row['WTeamID']], team_ratings[row['LTeamID']] = update_elo(w_rating, l_rating)
    
    return pd.DataFrame(list(team_ratings.items()), columns=['TeamID', 'EloRating'])

# Elo with Seeds Merge

In [12]:
def merge_elo_with_seeds(seeds_df, elo_df):
    
    seeds_df = seeds_df.copy()
    seeds_df['TeamID'] = seeds_df['TeamID'].astype(int)
    return pd.merge(seeds_df, elo_df, on='TeamID', how='left')

# Prepare Submission Data

In [13]:
def prepare_submission_data(winners, tournament_type, start_row_id=1):
    submission_data = []
    row_id = start_row_id

    for slot, winner_seed in winners.items():
        game_data = {
            'RowId': row_id,
            'Tournament': tournament_type,
            'Bracket': 1,  # Assuming a single bracket simulation
            'Slot': slot,
            'Team': winner_seed
        }
        submission_data.append(game_data)
        row_id += 1

    return submission_data, row_id 

In [14]:
df_games = pd.read_csv('MRegularSeasonCompactResults.csv')
df_games_w = pd.read_csv('WRegularSeasonCompactResults.csv')
df_seeds = pd.read_csv('2024_tourney_seeds.csv')
round_slots = pd.read_csv('MNCAATourneySlots.csv')
round_slots_w = pd.read_csv('WNCAATourneySlots.csv')

In [15]:
round_slots = round_slots.loc[(round_slots['Season'] == 2023) & (round_slots['Slot'].str.startswith('R'))]
round_slots_w = round_slots_w.loc[(round_slots_w['Season'] == 2023) & (round_slots_w['Slot'].str.startswith('R'))]

In [16]:
# Calculate and merge Elo ratings for Men's
elo_df_m = calculate_elo_ratings(df_games[df_games['Season'] == 2024], 'M')
df_seeds_m = merge_elo_with_seeds(df_seeds[df_seeds['Tournament'] == 'M'], elo_df_m)

In [17]:
df_seeds_m

Unnamed: 0,Tournament,Seed,TeamID,EloRating
0,M,W01,1163,
1,M,W02,1235,
2,M,W03,1228,
3,M,W04,1120,
4,M,W05,1361,
...,...,...,...,...
59,M,Z12,1241,
60,M,Z13,1436,
61,M,Z14,1324,
62,M,Z15,1443,


In [18]:
elo_df_w = calculate_elo_ratings(df_games_w[df_games_w['Season'] == 2024], 'W')
df_seeds_w = merge_elo_with_seeds(df_seeds[df_seeds['Tournament'] == 'W'], elo_df_w)

In [19]:
df_seeds_w

Unnamed: 0,Tournament,Seed,TeamID,EloRating
0,W,W01,3376,1748.987147
1,W,W02,3323,1671.874515
2,W,W03,3333,1634.865307
3,W,W04,3231,1643.106096
4,W,W05,3328,1620.747124
...,...,...,...,...
59,W,Z12,3162,1634.279877
60,W,Z13,3267,1649.148242
61,W,Z14,3238,1619.738361
62,W,Z15,3263,1611.326898


# Simulating Tournament

In [20]:
men_winners = simulate_tournament(df_seeds_m, round_slots)
women_winners = simulate_tournament(df_seeds_w, round_slots_w)

In [21]:
# Start with RowId 1 for Men's tournament
men_submission_data, next_row_id = prepare_submission_data(men_winners, 'M', 1)

# Continue with the next RowId for Women's tournament
women_submission_data, _ = prepare_submission_data(women_winners, 'W', next_row_id)

# Combine Men's and Women's submission data
combined_submission_data = men_submission_data + women_submission_data


# Converting to DF

In [22]:
# Convert to DataFrame
df_submission = pd.DataFrame(combined_submission_data)
df_submission.to_csv('submission.csv', index=False)

In [23]:
df_submission

Unnamed: 0,RowId,Tournament,Bracket,Slot,Team
0,1,M,1,R1W1,W16
1,2,M,1,R1W2,W15
2,3,M,1,R1W3,W14
3,4,M,1,R1W4,W13
4,5,M,1,R1W5,W12
...,...,...,...,...,...
121,122,W,1,R4Y1,Y10
122,123,W,1,R4Z1,Z03
123,124,W,1,R5WX,W01
124,125,W,1,R5YZ,Y10
