In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os

In [2]:
def parse_seed(seed_string):
    seed_num = ''
    for letter in seed_string:
        if letter.isdigit():
            seed_num += letter

    seed_int = int(seed_num)
    return seed_int

In [3]:
basepath = os.path.expanduser('~/Documents/march-madness-ML')

In [4]:
data_folder = os.path.join(basepath,'MDataFiles_Stage1')

In [5]:
teams = pd.read_csv(os.path.join(data_folder,'MTeams.csv'),index_col=0)

In [6]:
seasons = pd.read_csv(os.path.join(data_folder,'MSeasons.csv'),index_col=0)

In [7]:
tournament_results = pd.read_csv(os.path.join(data_folder,'MNCAATourneyCompactResults.csv'))

In [8]:
tourney_seeds = pd.read_csv(os.path.join(data_folder,'MNCAATourneySeeds.csv'))

In [9]:
seasons = np.arange(1985,2021)

In [10]:
seed_win_counts = {}

number_seeds = 16
for i in range(1,number_seeds):
    seed_opponents = {}
    
    for j in range(i,number_seeds+1):
        seed_opponents[j] = {}
        seed_opponents[j]['wins'] = 0
        seed_opponents[j]['losses'] = 0
        
    seed_win_counts[i] = seed_opponents

In [11]:
for season in seasons:
    if season == 2020:
        continue
        
    season_tourney_seeds = tourney_seeds[tourney_seeds['Season']==season].set_index('TeamID')
    season_tournament_results = tournament_results[tournament_results['Season']==season]
    
    for i in season_tournament_results.index:
        game_result = season_tournament_results.loc[i]
        winner_id = game_result['WTeamID']
        loser_id = game_result['LTeamID']

        winner_seed = parse_seed(season_tourney_seeds.loc[winner_id,'Seed'])
        loser_seed = parse_seed(season_tourney_seeds.loc[loser_id,'Seed'])

        if winner_seed == loser_seed:
            continue
        elif winner_seed < loser_seed:
            seed_win_counts[winner_seed][loser_seed]['wins'] += 1
        elif loser_seed < winner_seed:
            seed_win_counts[loser_seed][winner_seed]['losses'] += 1

In [12]:
number_seeds = 16
for i in range(1,number_seeds):
    for j in range(i,number_seeds+1):
        if i == j:
            pct = 0.5
        else:  
            high_seed_wins = seed_win_counts[i][j]['wins']
            low_seed_wins = seed_win_counts[i][j]['losses']

            if (high_seed_wins + low_seed_wins) >= 5:
                pct = high_seed_wins / (high_seed_wins + low_seed_wins)
                if pct == 1:
                    pct = .99
            else:
                pct = 0.5
            
        seed_win_counts[i][j]['pct'] = round(pct,3)

In [13]:
seed_win_counts

{1: {1: {'wins': 0, 'losses': 0, 'pct': 0.5},
  2: {'wins': 35, 'losses': 30, 'pct': 0.538},
  3: {'wins': 21, 'losses': 14, 'pct': 0.6},
  4: {'wins': 46, 'losses': 17, 'pct': 0.73},
  5: {'wins': 39, 'losses': 8, 'pct': 0.83},
  6: {'wins': 8, 'losses': 3, 'pct': 0.727},
  7: {'wins': 6, 'losses': 1, 'pct': 0.857},
  8: {'wins': 56, 'losses': 14, 'pct': 0.8},
  9: {'wins': 66, 'losses': 6, 'pct': 0.917},
  10: {'wins': 5, 'losses': 1, 'pct': 0.833},
  11: {'wins': 4, 'losses': 3, 'pct': 0.571},
  12: {'wins': 20, 'losses': 0, 'pct': 0.99},
  13: {'wins': 4, 'losses': 0, 'pct': 0.5},
  14: {'wins': 0, 'losses': 0, 'pct': 0.5},
  15: {'wins': 0, 'losses': 0, 'pct': 0.5},
  16: {'wins': 139, 'losses': 1, 'pct': 0.993}},
 2: {2: {'wins': 0, 'losses': 0, 'pct': 0.5},
  3: {'wins': 34, 'losses': 22, 'pct': 0.607},
  4: {'wins': 3, 'losses': 4, 'pct': 0.429},
  5: {'wins': 0, 'losses': 5, 'pct': 0.0},
  6: {'wins': 24, 'losses': 7, 'pct': 0.774},
  7: {'wins': 57, 'losses': 25, 'pct': 0.695

In [14]:
seed_win_counts.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])

In [15]:
results = []
for seed in seed_win_counts.keys():
    for opp_seed in seed_win_counts[seed]:
        pct = seed_win_counts[seed][opp_seed]['pct']
        
        pct = min(pct,0.95)
        
        pct = max(pct,0.05)
        
        results.append([seed,opp_seed,pct])
        
results = pd.DataFrame(results,columns=['Team Seed','Opponent Seed','pct'])

In [16]:
results

Unnamed: 0,Team Seed,Opponent Seed,pct
0,1,1,0.500
1,1,2,0.538
2,1,3,0.600
3,1,4,0.730
4,1,5,0.830
...,...,...,...
130,14,14,0.500
131,14,15,0.500
132,14,16,0.500
133,15,15,0.500


In [17]:
results.to_csv(os.path.join(basepath,'seed win percentage.csv'),index=False)

In [None]:
seed1 = 1
seed2 = 3

In [None]:
pct = results[(results['Team Seed']==seed1) & (results['Opponent Seed']==seed2)]

In [None]:
pct['pct'].values[0]

In [None]:
season = 2019

season_tourney_seeds = tourney_seeds[tourney_seeds['Season']==season].set_index('TeamID')

season_tourney_seeds.head()

In [None]:
season_tourney_seeds = season_tourney_seeds.sort_index() 

In [None]:
predictions = []

for team1 in season_tourney_seeds.index:
    for team2 in season_tourney_seeds.index:
        if team1 == team2 or team1 > team2:
            continue
            
        string = f'{season}_{team1}_{team2}'
        
        team1_seed = parse_seed(season_tourney_seeds.loc[team1]['Seed'])
        team2_seed = parse_seed(season_tourney_seeds.loc[team2]['Seed'])
        
        if team1_seed < team2_seed:
            pct = seed_win_counts[team1_seed][team2_seed]['pct']
        elif team2_seed < team1_seed:
            pct = 1 - seed_win_counts[team2_seed][team1_seed]['pct']
        elif team1_seed == team2_seed:
            pct = 0.5
         
        prediction = [string,pct]
        predictions.append(prediction)

In [None]:
prediction_df = pd.DataFrame(predictions,columns=['ID','Pred'])

In [None]:
prediction_df.to_csv('seed win percentage - predictions.csv',index=False)

In [None]:
prediction_df.shape

season = 2019

season_tourney_seeds = tourney_seeds[tourney_seeds['Season']==season].set_index('TeamID')
season_tournament_results = tournament_results[tournament_results['Season']==season]

for i in season_tournament_results.index:
    game_result = season_tournament_results.loc[i]
    winner_id = game_result['WTeamID']
    loser_id = game_result['LTeamID']

    winner_seed = parse_seed(season_tourney_seeds.loc[winner_id,'Seed'])
    loser_seed = parse_seed(season_tourney_seeds.loc[loser_id,'Seed'])