In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from gc import collect
import os
import sys
from tqdm import tqdm

# let pandas render up to 100 rows and 100 columns
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# global random seed
SEED = 0

# seed numpy's global random generator
np.random.seed(SEED)

In [2]:
# data directories
root = 'data/'
mroot = 'data/mens/'
wroot = 'data/womens/'

# men's slot table: keep the 2024 season, then only slots whose name contains 'R'
# (this is what drops the play-in slots), and renumber the rows
slots = (
    pd.read_csv(mroot + 'MNCAATourneySlots.csv')
    .loc[lambda df: df['Season'] == 2024]
    .loc[lambda df: df['Slot'].str.contains('R')]
    .reset_index(drop=True)
)

# 2024 seeds for both tournaments
seeds_2024 = pd.read_csv(root + '2024_tourney_seeds.csv')

# TeamIDs below 3000 go to the men's table, everything else to the women's
mseeds = seeds_2024.loc[seeds_2024['TeamID'] < 3000].reset_index(drop=True)
wseeds = seeds_2024.loc[seeds_2024['TeamID'] >= 3000].reset_index(drop=True)

# drop names that are no longer needed
del root, mroot, wroot, seeds_2024

In [3]:
# men's historical seed differential winning percentages
# Each entry is a (seed_diff, win_prob) tuple. generate_bracket looks entries up by
# seed_diff = B_seed_num - A_seed_num and uses win_prob as the probability that team A
# (the team on the slot's StrongSeed side) wins, so a positive seed_diff means team A
# has the lower seed number. Keys run from -15 to 15.
# NOTE(review): the values look mirrored, i.e. p(-d) == 1 - p(d) -- confirm before editing one side only.
mens_seed_win = [(-15, 0.013157894736842105),(-14, 0.04276315789473684),(-13, 0.07236842105263158),(-12, 0.10013769889840882),(-11, 0.12790697674418605),(-10, 0.16451214758997013),(-9, 0.2011173184357542),(-8, 0.25),(-7, 0.30364372469635625),(-6, 0.3222764078027236), 
                 (-5, 0.3409090909090909),(-4, 0.3565744192715802),(-3, 0.3722397476340694),(-2, 0.41212997482713576),(-1, 0.45202020202020204),(0, 0.5),(1, 0.547979797979798),(2, 0.5878700251728642),(3, 0.6277602523659306),(4, 0.6434255807284198),(5, 0.6590909090909092), 
                 (6, 0.6777235921972764),(7, 0.6963562753036437),(8, 0.75),(9, 0.7988826815642458),(10, 0.8354878524100299),(11, 0.872093023255814),(12, 0.8998623011015912),(13, 0.9276315789473684),(14, 0.9572368421052632),(15, 0.9868421052631579)]

# women's historical seed differential winning percentages
# Same (seed_diff, win_prob) layout and sign convention as mens_seed_win; generate_bracket
# reads this table whenever its tournament argument is anything other than 'M'.
womens_seed_win = [(-15, 0.01),(-14, 0.017182890855457225),(-13, 0.024365781710914452),(-12, 0.03154867256637168),(-11, 0.0387315634218289),(-10, 0.04591445427728613),(-9, 0.05309734513274336),(-8, 0.17006802721088435),(-7, 0.17197452229299362),(-6, 0.22679185884764624), 
                   (-5, 0.28160919540229884),(-4, 0.30250672536072387),(-3, 0.32340425531914896),(-2, 0.3844744048872972),(-1, 0.44554455445544555),(0, 0.5),(1, 0.5544554455445545),(2, 0.6155255951127028),(3, 0.676595744680851),(4, 0.6974932746392761),(5, 0.7183908045977012), 
                   (6, 0.7732081411523537),(7, 0.8280254777070064),(8, 0.8299319727891157),(9, 0.9469026548672567),(10, 0.9540855457227139),(11, 0.9612684365781711),(12, 0.9684513274336283),(13, 0.9756342182890856),(14, 0.9828171091445428),(15, 0.99)]

In [4]:
def generate_bracket(seeds_2024, tournament, num_brackets, slots_df=slots):
    """
    Simulate one or more complete brackets for the 2024 NCAA tournament.

    Every game is decided by one uniform random draw from numpy's global
    generator, compared against the seed-differential win probability of the
    team on the slot's ``StrongSeed`` side. Results are therefore reproducible
    only if ``np.random.seed`` was called beforehand.

    Parameters
    ----------
    seeds_2024 : pd.DataFrame
        Teams competing in the tournament. Only the ``Seed`` column is read;
        a seed is one leading character followed by the seed number
        (e.g. 'W01'). Used to validate the entrants of every game.
    tournament : str
        'M' selects ``mens_seed_win``; any other value selects
        ``womens_seed_win``. Also written to the ``Tournament`` column.
    num_brackets : int
        Number of brackets to generate.
    slots_df : pd.DataFrame
        Slot table with ``Slot``, ``StrongSeed`` and ``WeakSeed`` columns.
        Round ``i`` (1-6) consists of the rows whose ``Slot`` contains
        ``'R{i}'``. ``StrongSeed``/``WeakSeed`` hold either a seed label or
        the name of an earlier slot whose winner advances. Defaults to the
        notebook-level ``slots`` table as bound when this function was defined.

    Returns
    -------
    all_brackets : pd.DataFrame
        One row per game per bracket with columns ``Slot``, ``Team`` (seed
        label of the winner), ``Bracket`` (1-based bracket number) and
        ``Tournament``. The index restarts at 0 for every bracket.

    Raises
    ------
    ValueError
        If a game references a seed that is not present in ``seeds_2024``.
    KeyError
        If a seed differential has no entry in the win-probability table.

    """

    # seed differential -> win probability of the StrongSeed-side team;
    # built once so every game is a dict lookup instead of a list scan
    win_prob_by_diff = dict(mens_seed_win if tournament == 'M' else womens_seed_win)

    # only needed to check that each entrant is a known team
    known_seeds = set(seeds_2024['Seed'])

    # the slot layout is the same for every bracket, so split it by round once
    rounds = []
    for i in range(1, 7):
        in_round = slots_df[slots_df['Slot'].str.contains(f'R{i}')]
        rounds.append(list(zip(in_round['Slot'], in_round['StrongSeed'], in_round['WeakSeed'])))

    # flat output columns; the frame is built once at the end rather than
    # concatenated per bracket (which is quadratic in num_brackets)
    slot_col, team_col, bracket_col, row_index = [], [], [], []

    # loop for each bracket
    for n in tqdm(range(1, num_brackets+1), desc='Bracket Generation', file=sys.stdout):
        # slot name -> seed of the team that won that slot in this bracket
        winners = {}
        games_played = 0

        for games in rounds:
            entrants = []
            probs = np.empty(len(games))

            for k, (slot, strong_ref, weak_ref) in enumerate(games):
                # a reference is an earlier slot (take its winner) or a seed label
                team_a = winners.get(strong_ref, strong_ref)
                team_b = winners.get(weak_ref, weak_ref)

                for seed in (team_a, team_b):
                    if seed not in known_seeds:
                        raise ValueError(
                            f"slot {slot!r} references {seed!r}, which is not a seed in seeds_2024"
                        )

                # positive diff: team A has the lower seed number
                seed_diff = int(team_b[1:]) - int(team_a[1:])
                probs[k] = win_prob_by_diff[seed_diff]
                entrants.append((team_a, team_b))

            # one draw per game, one call per round (keeps the random stream stable);
            # team B wins when the draw exceeds team A's win probability
            team_b_wins = np.random.rand(len(probs)) > probs

            for (slot, _, _), (team_a, team_b), b_won in zip(games, entrants, team_b_wins):
                winner = team_b if b_won else team_a
                winners[slot] = winner

                slot_col.append(slot)
                team_col.append(winner)
                bracket_col.append(n)
                row_index.append(games_played)
                games_played += 1

    all_brackets = pd.DataFrame(
        {'Slot': slot_col, 'Team': team_col, 'Bracket': bracket_col},
        index=row_index,
    )

    # add tournament col
    all_brackets['Tournament'] = tournament

    return all_brackets

In [8]:
# simulate a single men's bracket
m_brackets = generate_bracket(mseeds, 'M', 1)

Bracket Generation: 100%|██████████| 5/5 [00:00<00:00,  5.80it/s]


In [28]:
# transposed view: one column per game
m_brackets.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
Slot,R1W1,R1W2,R1W3,R1W4,R1W5,R1W6,R1W7,R1W8,R1X1,R1X2,R1X3,R1X4,R1X5,R1X6,R1X7,R1X8,R1Y1,R1Y2,R1Y3,R1Y4,R1Y5,R1Y6,R1Y7,R1Y8,R1Z1,R1Z2,R1Z3,R1Z4,R1Z5,R1Z6,R1Z7,R1Z8,R2W1,R2W2,R2W3,R2W4,R2X1,R2X2,R2X3,R2X4,R2Y1,R2Y2,R2Y3,R2Y4,R2Z1,R2Z2,R2Z3,R2Z4,R3W1,R3W2,R3X1,R3X2,R3Y1,R3Y2,R3Z1,R3Z2,R4W1,R4X1,R4Y1,R4Z1,R5WX,R5YZ,R6CH
Team,W01,W02,W03,W13,W05,W11,W07,W09,X01,X15,X03,X04,X05,X06,X07,X08,Y01,Y02,Y03,Y04,Y05,Y06,Y07,Y08,Z01,Z02,Z14,Z04,Z05,Z06,Z10,Z08,W01,W02,W03,W05,X01,X07,X03,X05,Y08,Y02,Y03,Y05,Z01,Z02,Z06,Z04,W05,W03,X01,X03,Y05,Y03,Z01,Z06,W03,X03,Y03,Z06,X03,Y03,Y03
Bracket,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Tournament,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M


In [29]:
# simulate a single women's bracket
w_brackets = generate_bracket(wseeds, 'W', 1)



round 1:
preds before: [0.99       0.97563422 0.96126844 0.94690265 0.82802548 0.7183908
 0.67659574 0.55445545 0.99       0.97563422 0.96126844 0.94690265
 0.82802548 0.7183908  0.67659574 0.55445545 0.99       0.97563422
 0.96126844 0.94690265 0.82802548 0.7183908  0.67659574 0.55445545
 0.99       0.97563422 0.96126844 0.94690265 0.82802548 0.7183908
 0.67659574 0.55445545]
random values: [0.0191932  0.30157482 0.66017354 0.29007761 0.61801543 0.4287687
 0.13547406 0.29828233 0.56996491 0.59087276 0.57432525 0.65320082
 0.65210327 0.43141844 0.8965466  0.36756187 0.43586493 0.89192336
 0.80619399 0.70388858 0.10022689 0.91948261 0.7142413  0.99884701
 0.1494483  0.86812606 0.16249293 0.61555956 0.12381998 0.84800823
 0.80731896 0.56910074]
preds after: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1]


round 2:
preds before: [0.82802548 0.7183908  0.67659574 0.55445545 0.82802548 0.82993197
 0.67659574 0.55445545 0.82993197 0.82993197 0.82993197 0.55445545
 0.8299

In [30]:
# transposed view: one column per game
w_brackets.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
Slot,R1W1,R1W2,R1W3,R1W4,R1W5,R1W6,R1W7,R1W8,R1X1,R1X2,R1X3,R1X4,R1X5,R1X6,R1X7,R1X8,R1Y1,R1Y2,R1Y3,R1Y4,R1Y5,R1Y6,R1Y7,R1Y8,R1Z1,R1Z2,R1Z3,R1Z4,R1Z5,R1Z6,R1Z7,R1Z8,R2W1,R2W2,R2W3,R2W4,R2X1,R2X2,R2X3,R2X4,R2Y1,R2Y2,R2Y3,R2Y4,R2Z1,R2Z2,R2Z3,R2Z4,R3W1,R3W2,R3X1,R3X2,R3Y1,R3Y2,R3Z1,R3Z2,R4W1,R4X1,R4Y1,R4Z1,R5WX,R5YZ,R6CH
Team,W01,W02,W03,W04,W05,W06,W07,W08,X01,X02,X03,X04,X05,X06,X10,X08,Y01,Y02,Y03,Y04,Y05,Y11,Y10,Y09,Z01,Z02,Z03,Z04,Z05,Z11,Z10,Z09,W01,W02,W06,W04,X01,X10,X06,X05,Y01,Y02,Y03,Y04,Z01,Z02,Z03,Z04,W04,W02,X01,X06,Y04,Y02,Z01,Z03,W02,X01,Y02,Z01,X01,Z01,Z01
Bracket,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Tournament,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W


In [None]:
# how many brackets to simulate per tournament
n_brackets = 100000

# simulate the men's brackets first, then the women's
m_brackets = generate_bracket(mseeds, "M", n_brackets)
w_brackets = generate_bracket(wseeds, "W", n_brackets)

# stack both tournaments under a fresh 0..N-1 index named RowId
submission = pd.concat([m_brackets, w_brackets], ignore_index=True)
submission.index.name = 'RowId'

# put the columns in submission order
submission = submission.loc[:, ['Tournament', 'Bracket', 'Slot', 'Team']]

# preview the first rows
submission.head()