In [23]:
!ls ./datasets/kaggle/

MMasseyOrdinals.csv		MNCAATourneySeeds.csv		  MTeams.csv
MNCAATourneyCompactResults.csv	MRegularSeasonCompactResults.csv
team_2024.csv


In [24]:
!ls ./datasets/torvik/mens/

team_2024.csv


In [6]:
import pandas as pd
import numpy as np

In [7]:
m_teams = pd.read_csv("./datasets/kaggle/MTeams.csv")
m_teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2025
1,1102,Air Force,1985,2025
2,1103,Akron,1985,2025
3,1104,Alabama,1985,2025
4,1105,Alabama A&M,2000,2025


In [8]:
teamIDToName = {}
teamNameToID = {}

for row in range(m_teams.shape[0]):
    teamID = m_teams.iloc[row]["TeamID"].item()
    teamName = m_teams.iloc[row]["TeamName"]
    teamIDToName[teamID] = teamName
    teamNameToID[teamName] = teamID

In [9]:
m_massey_ordinals = pd.read_csv("./datasets/kaggle/MMasseyOrdinals.csv")
m_massey_ordinals[m_massey_ordinals["SystemName"] == "POM"].tail()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
5520900,2025,121,POM,1476,318
5520901,2025,121,POM,1477,330
5520902,2025,121,POM,1478,353
5520903,2025,121,POM,1479,342
5520904,2025,121,POM,1480,350


In [10]:
m_ncaa_regular_season = pd.read_csv("./datasets/kaggle/MRegularSeasonCompactResults.csv")
m_ncaa_regular_season[m_ncaa_regular_season["Season"] == 2024].head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
181682,2024,0,1101,64,1329,59,A,0
181683,2024,0,1103,81,1355,75,A,0
181684,2024,0,1104,105,1287,73,H,0
181685,2024,0,1112,122,1288,59,H,0
181686,2024,0,1114,71,1402,66,H,0


In [11]:
m_ncaa_tourney_compact = pd.read_csv("./datasets/kaggle/MNCAATourneyCompactResults.csv")
m_ncaa_tourney_compact[m_ncaa_tourney_compact["Season"] == 2024].head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
2451,2024,134,1161,67,1438,42,N,0
2452,2024,134,1447,71,1224,68,N,0
2453,2024,135,1160,60,1129,53,N,0
2454,2024,135,1212,88,1286,81,N,0
2455,2024,136,1112,85,1253,65,N,0


In [12]:
m_ncaa_tourney_seeds = pd.read_csv("./datasets/kaggle/MNCAATourneySeeds.csv")
m_ncaa_tourney_seeds[m_ncaa_tourney_seeds["Season"] == 2024].head()

Unnamed: 0,Season,Seed,TeamID
2490,2024,W01,1163
2491,2024,W02,1235
2492,2024,W03,1228
2493,2024,W04,1120
2494,2024,W05,1361


In [13]:
# For each year Y: 
# For each team T that played in the tournament in year Y (ignore teams eliminated in play-in round):
# Record the number of wins by team T in the tournament in year Y, and collect the covariates for OLRE model

# Covariates:
# winning percentage at the end of the regular season (WINPCT)
# point differential at the end of the regular season (DIFF)
# wins above bubble (WAB)
# number of wins against teams rated in the Top 30 based on Torvik's ratings at the end of the regular season (TOP30WIN)

# Let's do this for mens in year 2024

year = 2024
tourney_seeds = m_ncaa_tourney_seeds[m_ncaa_tourney_seeds["Season"] == year].reset_index(drop=True)
tourney_compact = m_ncaa_tourney_compact[m_ncaa_tourney_compact["Season"] == year].reset_index(drop=True)

teamIDs_round_64 = []
teamNames_round_64 = []

for i in range(tourney_seeds.shape[0]):
    teamID = tourney_seeds.at[i, "TeamID"]
    seed = tourney_seeds.at[i, "Seed"]
    if len(seed) > 3:
        # competed in the play-in round
        # check if they made it past this round
        team_tournament_games_won = tourney_compact[lambda df: df["WTeamID"] == teamID]
        if team_tournament_games_won.shape[0] == 0:
            # this means the play-in team won zero games (i.e. didn't make it past play-in round)
            continue
            
    teamIDs_round_64.append(teamID)
    teamNames_round_64.append(m_teams[m_teams["TeamID"] == teamID].iloc[0]["TeamName"])

if len(teamIDs_round_64) != 64:
    raise Exception(f"Expected 64 teams, found {len(teamIDs_round_64)} teams")

# For each of these 64 teams, compute how many games they won in the tournament (ignoring play-in games)
n_tournament_games_won = []

for teamID in teamIDs_round_64:
    # Tournament games won
    team_tournament_games_won = tourney_compact[lambda df: df["WTeamID"] == teamID]
    seed = tourney_seeds[tourney_seeds["TeamID"] == teamID].iloc[0]["Seed"]
    n_games_won = team_tournament_games_won.shape[0] - (0 if len(seed) == 3 else 1)
    n_tournament_games_won.append(n_games_won)  

n_tournament_games_won = np.array(n_tournament_games_won)
if n_tournament_games_won.sum() != 32 + 16 + 8 + 4 + 2 + 1:
    raise Exception(f"Expected total of {32 + 16 + 8 + 4 + 2 + 1} games won across all teams in the tournament, found {n_tournament_games_won.sum()}")

m_ncaa_tourney_wins = pd.DataFrame(np.array([teamIDs_round_64, teamNames_round_64, n_tournament_games_won]).T, columns=["TeamID", "TeamName", "Tournament Wins"])
m_ncaa_tourney_wins

Unnamed: 0,TeamID,TeamName,Tournament Wins
0,1163,Connecticut,6
1,1235,Iowa St,2
2,1228,Illinois,3
3,1120,Auburn,0
4,1361,San Diego St,2
...,...,...,...
59,1241,James Madison,1
60,1436,Vermont,0
61,1324,Oakland,1
62,1443,WKU,0


In [14]:
m_torvik_team_2024 = pd.read_csv("./datasets/torvik/mens/team_2024.csv")
m_torvik_team_2024 = m_torvik_team_2024.dropna(axis=1, how="all")
m_torvik_team_2024.iloc[0]

TeamName        UC Santa Barbara
ADJOE                 104.517757
ADJDE                 109.958647
BARTHAG                 0.358106
Record                     14–15
Wins                          14
Games Played                  29
EFG%                        53.3
EFGD%                       50.6
FTR                         39.7
FTRD                        31.0
TOR                         19.1
TORD                        14.3
ORB                         25.2
DRB                         26.6
Unnamed: 15                 68.5
2P%                         53.4
2P%D                        51.8
3P%                         35.4
3P%D                        32.1
Unnamed: 20                  7.9
Unnamed: 21                  7.1
Unnamed: 22                 50.0
Unnamed: 23                 45.9
3PR                         30.5
3PRD                        33.7
ADJ T.                      68.2
Unnamed: 30                 2024
WAB                   -10.710547
Unnamed: 35                 74.4
Unnamed: 3

In [15]:
# Iterate through all Torvik team names, and check if it exists in the Kaggle Teams dataset.
# If no match exists, find the match and populate the conversion map.
# After some thinking, I realize it's more efficient to check the names of teams in the tournament, not all D1 schools.

# torvikToKaggleTeamName = {
#     "Arkansas Pine Bluff": "Ark Pine Bluff",
#     "Central Michigan": "C Michigan",
#     "FIU": "Florida Intl",
#     "Illinois Chicago": "IL Chicago",
#     "Florida Atlantic": "FL Atlantic",
#     "Cal St. Northridge": "CS Northridge",
#     "Abilene Christian": "Abilene Chr",
#     "UMKC": "Missouri KC",
#     "Northern Illinois": "N Illinois",
#     "Kent St": "Kent",
#     "Central Arkansas": "Cent Arkansas",
#     "American": "American Univ",
#     "UTSA": "UT San Antonio",
#     "Prairie View A&M": "Prairie View",
#     "Georgia Southern": "Ga Southern",
#     "Tennessee Martin": "TN Martin",
#     "LIU": "LIU Brooklyn",
#     "Loyola Chicago": "Loyola-Chicago",
#     "Maryland Eastern Shore": "MD E Shore",
#     "Fairleigh Dickinson": "F Dickinson",
#     "Queens": "Queens NC",
#     "Nebraska Omaha": "NE Omaha",
# }

In [16]:
# Go through all Torvik team names, make sure we can match them with the team names in the Kaggle dataset
kaggleToTorvikTeamNames = {
    "FL Atlantic": "Florida Atlantic",
    "St Mary's CA": "Saint Mary's",
    "Col Charleston": "Charleston",
    "S Dakota St": "South Dakota St.",
    "Grambling": "Grambling St.",
    "NC State": "North Carolina",
    "WKU": "Western Kentucky",
}

def preprocess_torvik_dataframe(torvik_df):
    torvik_teams = torvik_df["TeamName"]
    matches_torvik_teamName = lambda kaggle_teamName: ((torvik_teams == kaggle_teamName).sum().item() == 1)
    
    for kaggle_teamName in teamNames_round_64:
        if matches_torvik_teamName(kaggle_teamName):
            continue
            
        if kaggle_teamName.startswith("St "):
            test_teamName = "Saint" + kaggle_teamName[kaggle_teamName.index("St") + len("St"):]
            print(f"No match for '{kaggle_teamName}'\t checking if '{test_teamName}' exists")
            if matches_torvik_teamName(test_teamName):
                torvik_teams[torvik_teams == test_teamName]["TeamName"] = kaggle_teamName
                continue
                
        if kaggle_teamName.endswith("St"):
            # Looks like Kaggle uses St, while Torvid uses St.
            # We'll just remove the period at the end
            test_teamName = kaggle_teamName + "."
            print(f"No match for '{kaggle_teamName}'\t checking if '{test_teamName}' exists")
            if matches_torvik_teamName(test_teamName):
                torvik_teams[torvik_teams == test_teamName]["TeamName"] = kaggle_teamName
                continue
                
        if teamName.startswith("S "):
            # Looks like Kaggle uses S, while Torvid uses South
            test_teamName = "South" + kaggle_teamName[kaggle_teamName.index("South") + len("South"):]
            print(f"No match for '{kaggle_teamName}'\t checking if '{test_teamName}' exists")
            if matches_torvik_teamName(test_teamName):
                torvik_teams[torvik_teams == test_teamName]["TeamName"] = kaggle_teamName
                continue

        if kaggle_teamName in kaggleToTorvikTeamNames:
            torvik_team = torvik_teams[torvik_teams == kaggleToTorvikTeamNames[kaggle_teamName]]
            if torvik_team.shape[0] != 1:
                raise Exception(f"Expected matching Torvik team name in dataframe as dictionary is populated with key {kaggle_teamName}")
            torvik_team["TeamName"] = kaggle_teamName
            continue
        
        similar = torvik_teams[torvik_teams.str.startswith(kaggle_teamName[0])]
        raise Exception(f"No team name conversion found for Kaggle team name '{kaggle_teamName}'\nThe following are Torvik team names that start with the same character\n{similar}")

In [17]:
preprocess_torvik_dataframe(m_torvik_team_2024)

No match for 'Iowa St'	 checking if 'Iowa St.' exists
No match for 'San Diego St'	 checking if 'San Diego St.' exists
No match for 'Washington St'	 checking if 'Washington St.' exists
No match for 'Morehead St'	 checking if 'Morehead St.' exists
No match for 'S Dakota St'	 checking if 'S Dakota St.' exists
No match for 'St Mary's CA'	 checking if 'Saint Mary's CA' exists
No match for 'Mississippi St'	 checking if 'Mississippi St.' exists
No match for 'Michigan St'	 checking if 'Michigan St.' exists
No match for 'Long Beach St'	 checking if 'Long Beach St.' exists
No match for 'Utah St'	 checking if 'Utah St.' exists
No match for 'Colorado St'	 checking if 'Colorado St.' exists
No match for 'McNeese St'	 checking if 'McNeese St.' exists
No match for 'St Peter's'	 checking if 'Saint Peter's' exists


In [20]:
m_torvik_team_2024.merge(m_ncaa_tourney_wins, on="TeamName")

Unnamed: 0,TeamName,ADJOE,ADJDE,BARTHAG,Record,Wins,Games Played,EFG%,EFGD%,FTR,...,Unnamed: 23,3PR,3PRD,ADJ T.,Unnamed: 30,WAB,Unnamed: 35,Unnamed: 36,TeamID,Tournament Wins
0,UAB,112.419573,106.883939,0.641225,22–11,22,33,50.1,51.0,40.5,...,48.9,31.3,36.7,68.7,2024,-2.790055,74.3,72.2,1412,0
1,BYU,120.241255,98.462358,0.908711,23–10,23,33,55.1,48.0,25.3,...,47.2,50.7,34.1,70.0,2024,2.305701,73.8,69.8,1140,0
2,Arizona,121.626478,93.320937,0.95463,25–8,25,33,55.0,48.7,36.7,...,52.6,32.6,38.2,73.1,2024,4.852735,71.9,71.1,1112,2
3,Baylor,122.20918,100.496602,0.904609,22–10,22,32,55.6,51.3,40.5,...,52.9,38.3,35.5,66.7,2024,4.47163,74.0,73.0,1124,1
4,Kansas,113.890691,93.348961,0.907819,21–10,21,31,53.4,47.6,32.8,...,51.1,29.7,41.1,69.9,2024,3.770634,72.9,69.9,1242,1
5,Akron,105.727669,101.866664,0.605353,22–10,22,32,52.0,48.6,33.6,...,45.4,41.5,36.8,66.7,2024,-4.64286,72.4,69.4,1103,0
6,Creighton,120.354513,96.263726,0.928808,23–9,23,32,57.5,46.4,24.5,...,40.9,48.9,27.2,67.9,2024,4.394713,78.1,78.7,1166,2
7,Duke,120.73893,96.857124,0.926527,24–8,24,32,55.2,49.0,33.8,...,50.7,37.4,35.5,67.9,2024,3.518321,72.2,69.0,1181,3
8,Texas A&M,115.107897,99.658462,0.839887,20–14,20,34,45.4,50.1,38.2,...,59.9,38.1,44.9,66.6,2024,-0.058199,70.9,70.7,1401,1
9,Nevada,114.340174,98.082814,0.853673,25–7,25,32,53.3,48.2,46.2,...,54.9,33.6,41.0,67.7,2024,2.456057,72.3,69.7,1305,0
