# Madness of March
## Jim Haines & Josh McCoy
### [Project Website](https://joshmccoy2.github.io/NCAA_March_Madness/)

## Current Datasets
[Kaggle datasets](https://www.kaggle.com/competitions/mens-march-mania-2022/data)

# ETL & EDA

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd 

# Elo Score

In [2]:
def update_elo(winner_rating, loser_rating, k=20):
    """Return the post-game Elo ratings for the winner and the loser.

    Parameters
    ----------
    winner_rating : float
        Rating of the winning team before the game.
    loser_rating : float
        Rating of the losing team before the game.
    k : float, optional
        Maximum number of points that can change hands in one game
        (the K-factor). Defaults to 20.

    Returns
    -------
    tuple
        ``(new_winner_rating, new_loser_rating)``.
    """
    # Probability the eventual winner was expected to win, on the 400-point logistic scale.
    expected_win = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
    # The winner gains exactly what the loser gives up, so the update is zero-sum.
    delta = k * (1 - expected_win)
    return winner_rating + delta, loser_rating - delta


Let's check the datatypes

# Predicting Winner 

In [3]:
def predict_winner(team1_id, team2_id, elo_df):
    """Pick the team with the higher Elo rating.

    ``elo_df`` must have ``TeamID`` and ``EloRating`` columns; the first
    matching row is used for each team. ``team1_id`` is returned only when its
    rating is strictly greater, so a tie goes to ``team2_id``.
    """
    first_rating, second_rating = (
        elo_df.loc[elo_df['TeamID'] == team_id, 'EloRating'].iloc[0]
        for team_id in (team1_id, team2_id)
    )
    if first_rating > second_rating:
        return team1_id
    return team2_id

# Simulate Tournament

In [4]:
def simulate_tournament(seeds_df, slots_df):
    """Walk the bracket slot by slot and record the predicted winner's seed.

    ``slots_df`` rows are processed in order. Slots whose name starts with
    ``'R1'`` take their two seeds straight from ``StrongSeed``/``WeakSeed``;
    every other slot treats those columns as names of earlier slots and looks
    up the seeds that won them. ``seeds_df`` is used both to map seeds to team
    IDs and as the rating table handed to ``predict_winner``.

    Returns a dict mapping slot name -> winning seed.
    """
    def team_for_seed(seed):
        # First team listed under this seed.
        return seeds_df.loc[seeds_df['Seed'] == seed, 'TeamID'].iloc[0]

    results = {}
    for _, game in slots_df.iterrows():
        slot_name = game['Slot']
        strong, weak = game['StrongSeed'], game['WeakSeed']

        # Past the first round the columns name feeder slots, not seeds.
        if not slot_name.startswith('R1'):
            strong, weak = results.get(strong), results.get(weak)

        picked_team = predict_winner(team_for_seed(strong), team_for_seed(weak), seeds_df)
        results[slot_name] = seeds_df.loc[seeds_df['TeamID'] == picked_team, 'Seed'].iloc[0]

    return results

In [5]:
def calculate_elo_ratings(games_df, tournament_type):
    """Replay every game in ``games_df`` and return the resulting Elo table.

    Each team appearing in ``WTeamID`` or ``LTeamID`` starts at 1500, and
    games are applied in the row order of ``games_df``. ``tournament_type`` is
    accepted but not used by this function.

    Returns a DataFrame with columns ``TeamID`` and ``EloRating``.
    """
    all_teams = set(games_df['WTeamID']).union(games_df['LTeamID'])
    ratings = dict.fromkeys(all_teams, 1500)

    for winner, loser in zip(games_df['WTeamID'], games_df['LTeamID']):
        winner_after, loser_after = update_elo(ratings[winner], ratings[loser])
        ratings[winner] = winner_after
        ratings[loser] = loser_after

    return pd.DataFrame(list(ratings.items()), columns=['TeamID', 'EloRating'])

# Elo with Seeds Merge

In [6]:
def merge_elo_with_seeds(seeds_df, elo_df):
    """Attach Elo ratings to the seed table with a left join on ``TeamID``.

    ``TeamID`` in the seed table is cast to ``int`` on a new frame, so the
    caller's ``seeds_df`` is left untouched. Seeds with no matching row in
    ``elo_df`` are kept, with missing values in the columns from ``elo_df``.
    """
    return (
        seeds_df
        .assign(TeamID=lambda frame: frame['TeamID'].astype(int))
        .merge(elo_df, on='TeamID', how='left')
    )

In [7]:
def prepare_submission_data(winners, tournament_type, start_row_id=1):
    """Turn a ``{slot: winning seed}`` mapping into submission rows.

    Rows are numbered consecutively from ``start_row_id`` in the iteration
    order of ``winners``; every row is tagged with ``tournament_type`` and
    bracket number 1 (a single bracket is assumed).

    Returns ``(rows, next_row_id)``, where ``next_row_id`` is the first id not
    used, so a follow-up call can continue the numbering.
    """
    rows = [
        {
            'RowId': start_row_id + offset,
            'Tournament': tournament_type,
            'Bracket': 1,
            'Slot': slot,
            'Team': winner_seed,
        }
        for offset, (slot, winner_seed) in enumerate(winners.items())
    ]
    return rows, start_row_id + len(rows)

In [8]:
# Regular-season game results; these feed the Elo calculation.
df_games = pd.read_csv('MRegularSeasonCompactResults.csv')
df_games_w = pd.read_csv('WRegularSeasonCompactResults.csv')

# Bracket slot tables (Season, Slot, StrongSeed, WeakSeed).
round_slots = pd.read_csv('MNCAATourneySlots.csv')
round_slots_w = pd.read_csv('WNCAATourneySlots.csv')

# 2024 seeds; later cells split this frame on its 'Tournament' column.
df_seeds = pd.read_csv('2024_tourney_seeds.csv')

# Loaded here but not referenced by the other cells shown in this notebook.
mteams_df = pd.read_csv('MTeams.csv')

In [9]:
# Keep only the 2023 rows whose slot name starts with 'R'.
# NOTE(review): the seeds and Elo ratings used further down are for 2024, so
# this pairs a 2023 slot layout with 2024 seeds -- confirm that is intended.
round_slots = round_slots[
    round_slots['Season'].eq(2023) & round_slots['Slot'].str.startswith('R')
]
round_slots_w = round_slots_w[
    round_slots_w['Season'].eq(2023) & round_slots_w['Slot'].str.startswith('R')
]

In [19]:
# Display the filtered men's slot table.
round_slots

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


In [10]:
# Men's: Elo from the 2024 regular-season games, then joined onto the 'M' seeds.
elo_df_m = calculate_elo_ratings(df_games.loc[df_games['Season'].eq(2024)], 'M')
df_seeds_m = merge_elo_with_seeds(df_seeds.loc[df_seeds['Tournament'].eq('M')], elo_df_m)

In [11]:
# Display the men's Elo table (one row per TeamID).
elo_df_m

Unnamed: 0,TeamID,EloRating
0,1101,1487.901975
1,1102,1406.095517
2,1103,1571.833179
3,1104,1590.786091
4,1105,1427.443835
...,...,...
357,1474,1451.892523
358,1475,1363.310523
359,1476,1323.238686
360,1477,1422.599588


In [12]:
# Display the men's seeds with their merged Elo ratings.
df_seeds_m

Unnamed: 0,Tournament,Seed,TeamID,EloRating
0,M,W01,1163,1709.078366
1,M,W02,1235,1667.421915
2,M,W03,1228,1645.301820
3,M,W04,1120,1658.235629
4,M,W05,1361,1595.552448
...,...,...,...,...
59,M,Z12,1241,1665.157187
60,M,Z13,1436,1644.941511
61,M,Z14,1324,1590.077150
62,M,Z15,1443,1543.195370


In [13]:
# Women's: Elo from the 2024 regular-season games, then joined onto the 'W' seeds.
elo_df_w = calculate_elo_ratings(df_games_w.loc[df_games_w['Season'].eq(2024)], 'W')
df_seeds_w = merge_elo_with_seeds(df_seeds.loc[df_seeds['Tournament'].eq('W')], elo_df_w)

In [14]:
# Display the women's seeds with their merged Elo ratings.
df_seeds_w

Unnamed: 0,Tournament,Seed,TeamID,EloRating
0,W,W01,3376,1748.987147
1,W,W02,3323,1671.874515
2,W,W03,3333,1634.865307
3,W,W04,3231,1643.106096
4,W,W05,3328,1620.747124
...,...,...,...,...
59,W,Z12,3162,1634.279877
60,W,Z13,3267,1649.148242
61,W,Z14,3238,1619.738361
62,W,Z15,3263,1611.326898


# Simulating Tournament

In [15]:
# Fill in both brackets from the merged seed/Elo frames and the filtered slot tables.
men_winners = simulate_tournament(seeds_df=df_seeds_m, slots_df=round_slots)
women_winners = simulate_tournament(seeds_df=df_seeds_w, slots_df=round_slots_w)

In [16]:
# Men's rows are numbered from 1; the returned counter is where the women's rows continue.
men_submission_data, next_row_id = prepare_submission_data(
    winners=men_winners, tournament_type='M', start_row_id=1
)

# The counter returned by the women's call is not needed afterwards.
women_submission_data, _ = prepare_submission_data(
    winners=women_winners, tournament_type='W', start_row_id=next_row_id
)

# One list: men's rows followed by women's rows.
combined_submission_data = [*men_submission_data, *women_submission_data]


# Converting to DataFrame

In [17]:
# Build the submission table from the row dicts and write it to CSV without the index column.
df_submission = pd.DataFrame(data=combined_submission_data)
df_submission.to_csv('submission.csv', index=False)

In [18]:
# Display the combined men's and women's submission table.
df_submission

Unnamed: 0,RowId,Tournament,Bracket,Slot,Team
0,1,M,1,R1W1,W01
1,2,M,1,R1W2,W02
2,3,M,1,R1W3,W03
3,4,M,1,R1W4,W04
4,5,M,1,R1W5,W05
...,...,...,...,...,...
121,122,W,1,R4Y1,Y10
122,123,W,1,R4Z1,Z03
123,124,W,1,R5WX,W01
124,125,W,1,R5YZ,Y10
