# Summary

**This notebook takes predictions formatted the same way as submissions from previous competitions to simulate n brackets.**

- The predictions should have the columns `ID` and `Pred`, where `ID` has the format `year_team1_team2` and `Pred` is the predicted probability of team1 winning against team2.

- Setting `n_brackets=1` and `sim=False` will give you a single full-chalk bracket for each tournament.

Update: Found an inefficiency in the rng of my code after looking at this [simulation notebook](https://www.kaggle.com/code/goodspellr/seed-benchmark-submission) for the Seed-Benchmark by Good Spellr. Updated version should be much faster. Further performance improvements after suggestion by Ryan Armstrong to [precompute the random-values](https://www.kaggle.com/competitions/march-machine-learning-mania-2024/discussion/482696#2690435).

In [1]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm

# --- Tournament slots ---------------------------------------------------
# Keep only the 2024 season and then the first 63 rows of that season.
# NOTE(review): presumably these are the 63 main-bracket games and any
# play-in slots come later in the file -- confirm against the CSV.
round_slots = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/MNCAATourneySlots.csv')
round_slots = round_slots.loc[round_slots["Season"] == 2024].head(63)

# --- Seeds ----------------------------------------------------------------
# One file holds both tournaments; split it on the `Tournament` column.
seeds = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/2024_tourney_seeds.csv')
seeds_m = seeds.loc[seeds['Tournament'] == 'M']
seeds_w = seeds.loc[seeds['Tournament'] == 'W']

# --- Match-up predictions ---------------------------------------------------
# NOTE(review): this cell used to be labelled "Predictions of last year's 1st
# place solution by RustyB: https://www.kaggle.com/code/rustyb/paris-madness-2023/output",
# but the path below points at a different dataset -- confirm the provenance.
preds = pd.read_csv('/kaggle/input/neuralnetpredictor/jlukas_matchup_weights.csv')
# `W%` is stored as text wrapped in square brackets; drop the brackets and parse it.
preds["W%"] = [float(raw.replace("[", "").replace("]", "")) for raw in preds["W%"]]
# Cap the probabilities to the range [0.15, 0.85].
preds["W%"] = preds["W%"].clip(lower=.15, upper=.85)

In [2]:
# Display the slot table that was filtered to the 2024 season when the data was loaded.
round_slots

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2452,2024,R1W1,W01,W16
2453,2024,R1W2,W02,W15
2454,2024,R1W3,W03,W14
2455,2024,R1W4,W04,W13
2456,2024,R1W5,W05,W12
...,...,...,...,...
2510,2024,R4Y1,R3Y1,R3Y2
2511,2024,R4Z1,R3Z1,R3Z2
2512,2024,R5WX,R4W1,R4X1
2513,2024,R5YZ,R4Y1,R4Z1


In [3]:
def prepare_data(seeds, preds):
    '''
    Builds the lookup tables consumed by the simulation.

    Parameters:
    - seeds (pd.DataFrame): Needs the columns `Seed` and `TeamID`.
    - preds (pd.DataFrame): Needs the columns `Team_A`, `Team_B` and `W%`;
      `W%` is stored as the probability of Team_A beating Team_B.

    Returns:
    - dict: Maps each seed to its team ID.
    - dict: Maps each team ID back to its seed.
    - dict: Nested dictionary where `probas[a][b]` is the stored probability
      of team `a` beating team `b` (both directions are filled in).
    '''
    seed_to_team = dict(zip(seeds['Seed'], seeds['TeamID']))
    team_to_seed = {team: seed for seed, team in seed_to_team.items()}

    matchup_probas = {}
    for team_a, team_b, win_prob in zip(preds['Team_A'], preds['Team_B'], preds['W%']):
        # Register the match-up from Team_A's point of view ...
        if team_a not in matchup_probas:
            matchup_probas[team_a] = {}
        matchup_probas[team_a][team_b] = win_prob

        # ... and the complementary probability from Team_B's point of view.
        if team_b not in matchup_probas:
            matchup_probas[team_b] = {}
        matchup_probas[team_b][team_a] = 1 - win_prob

    return seed_to_team, team_to_seed, matchup_probas


def simulate(round_slots, seeds, inverted_seeds, probas, random_values, sim=True):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round
      (read through its `Slot`, `StrongSeed` and `WeakSeed` columns).
    - seeds (dict): Dictionary mapping seed values to team IDs. It is not modified;
      the winners of each slot are tracked on an internal copy.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - probas (dict): Dictionary containing matchup probabilities, where
      probas[team1][team2] is the probability of team1 beating team2.
    - random_values (array-like): Array with precomputed random-values, one per slot.
    - sim (boolean): Simulates match if True. Chooses team with higher probability as winner otherwise.
      If no probability is available for a matchup (or on an exact tie), the team
      from the StrongSeed column is chosen.

    Returns:
    - list: List with the seeds of the winning teams for each match.
    - list: List with corresponding slot names for each match.

    Raises:
    - KeyError: If sim is True and probas has no entry for a matchup.
    '''
    # Work on a private copy: slot winners are written into this mapping so that
    # later rounds can look them up, and that must not leak into the caller's dict.
    seeds = dict(seeds)

    winners = []
    slots = []

    for slot, strong, weak, random_val in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed, random_values):
        team1, team2 = seeds[strong], seeds[weak]

        if sim:
            # Randomly determine the winner based on the probability of team1 winning
            proba = probas[team1][team2]
            winner = team1 if random_val < proba else team2
        else:
            # Determine the winner based on the higher probability. team1 is kept
            # on a tie and when the matchup has no prediction at all.
            proba = probas.get(team1, {}).get(team2)
            winner = team2 if proba is not None and proba < 0.5 else team1

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        # Later rounds reference this slot name as their StrongSeed/WeakSeed
        seeds[slot] = winner

    # Convert winners to original seeds using the inverted_seeds dictionary
    return [inverted_seeds[w] for w in winners], slots


def run_simulation(brackets=1, seeds=None, preds=None, round_slots=None, sim=True, random_state=None):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information. Required despite the None default.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
      Required despite the None default.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
      Required despite the None default.
    - sim (boolean): Simulates matches if True. Otherwise the winner is picked without using the random values.
    - random_state (int or None): Seed for the random values. With None (the default) the
      values are drawn from NumPy's global random state, exactly as before; pass an
      integer to get the same brackets on every run.

    Returns:
    - pd.DataFrame: DataFrame with the columns `Bracket`, `Slot` and `Team`,
      one row per simulated match.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict, probas_dict = prepare_data(seeds, preds)

    # Precompute one random value per (bracket, slot) in a single call
    shape = (brackets, len(round_slots))
    if random_state is None:
        random_values = np.random.random(size=shape)
    else:
        random_values = np.random.default_rng(random_state).random(size=shape)

    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets (bracket ids start at 1)
    for b, bracket_randoms in enumerate(tqdm(random_values, total=brackets), start=1):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, probas_dict, bracket_randoms, sim)

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    return pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

# Number of men's brackets to simulate.
n_brackets = 100000

# Men's tournament: n_brackets simulated brackets.
result_m = run_simulation(
    brackets=n_brackets,
    seeds=seeds_m,
    preds=preds,
    round_slots=round_slots,
    sim=True,
)
result_m['Tournament'] = 'M'

# Women's tournament: a single non-simulated bracket (sim=False).
result_w = run_simulation(
    brackets=1,
    seeds=seeds_w,
    preds=preds,
    round_slots=round_slots,
    sim=False,
)
result_w['Tournament'] = 'W'

# Stack both results, number the rows from 0 and expose that numbering as `RowId`.
submission = pd.concat([result_m, result_w], ignore_index=True)
submission.index.name = 'RowId'
submission = submission.reset_index()[["RowId", 'Tournament', 'Bracket', 'Slot', 'Team']]

100%|██████████| 100000/100000 [00:16<00:00, 5923.40it/s]
100%|██████████| 1/1 [00:00<00:00, 2347.12it/s]


In [4]:
# `RowId` is already a regular column, so the DataFrame index must not be written:
# by default `to_csv` would add it as an extra unnamed first column that
# duplicates `RowId`.
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,RowId,Tournament,Bracket,Slot,Team
0,0,M,1,R1W1,W01
1,1,M,1,R1W2,W02
2,2,M,1,R1W3,W14
3,3,M,1,R1W4,W04
4,4,M,1,R1W5,W05
...,...,...,...,...,...
6300058,6300058,W,1,R4Y1,Y01
6300059,6300059,W,1,R4Z1,Z01
6300060,6300060,W,1,R5WX,W01
6300061,6300061,W,1,R5YZ,Y01


In [5]:
# Load the sample submission shipped with the competition data for reference.
sample = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/sample_submission.csv')

In [6]:
# Display the sample submission.
sample

Unnamed: 0,RowId,Tournament,Bracket,Slot,Team
0,0,M,1,R1W1,W01
1,1,M,1,R1W8,W08
2,2,M,1,R1W5,W05
3,3,M,1,R1W4,W04
4,4,M,1,R1W6,W06
...,...,...,...,...,...
121,121,W,1,R4Y1,Y01
122,122,W,1,R4Z1,Z01
123,123,W,1,R5WX,W01
124,124,W,1,R5YZ,Y01


In [7]:
# Display our submission restricted to the five submission columns, in that order.
submission[["RowId", 'Tournament', 'Bracket', 'Slot', 'Team']]

Unnamed: 0,RowId,Tournament,Bracket,Slot,Team
0,0,M,1,R1W1,W01
1,1,M,1,R1W2,W02
2,2,M,1,R1W3,W14
3,3,M,1,R1W4,W04
4,4,M,1,R1W5,W05
...,...,...,...,...,...
6300058,6300058,W,1,R4Y1,Y01
6300059,6300059,W,1,R4Z1,Z01
6300060,6300060,W,1,R5WX,W01
6300061,6300061,W,1,R5YZ,Y01
