# Data Wrangling

In [None]:
# Copied from 2024 with minor adjustments (added link to data), didn't have much time this year

from io import StringIO
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Cap how many dataframe rows pandas prints (raise the value, e.g. to 500, to see more).
pd.options.display.max_rows = 60

# Historical team stats, stored as CSV from
# https://www.kaggle.com/datasets/andrewsundberg/college-basketball-dataset
team_stats = pd.read_csv('./data/cbb.csv')

# Rows without a POSTSEASON/SEED value (teams didn't make the tournament)
# get the label 'NIT' and the placeholder seed -1.
team_stats = team_stats.fillna({'POSTSEASON': 'NIT', 'SEED': -1})
# Snapshot taken while TEAM/CONF still hold their original string values.
team_stats_save_team_conf = team_stats.copy()

# Lookup tables mapping team names / conference names to integer codes
# (the RandomForest classifier needs numbers rather than strings).
team_dict, conf_dict = {}, {}
def getFromDict(x, dict):
    """Return the integer code for ``x``, registering it on first sight.

    Codes are handed out sequentially: a key that is not yet present is
    stored with the current size of the mapping as its code.

    Args:
        x: Hashable key to look up.
        dict: Mapping from key to integer code; mutated in place when ``x``
            is new. (The name shadows the builtin; it is kept so existing
            callers are unaffected.)

    Returns:
        The code stored for ``x``.
    """
    # len(dict) is evaluated before the insertion, so a new key receives the
    # next unused code.
    return dict.setdefault(x, len(dict))
# Replace team and conference names with their integer codes; names are
# registered in team_dict / conf_dict as they are first encountered.
team_stats['TEAM'] = team_stats['TEAM'].apply(getFromDict, args=(team_dict,))
team_stats['CONF'] = team_stats['CONF'].apply(getFromDict, args=(conf_dict,))


# Current data: scrape the ranking table from barttorvik.com and read it into a dataframe.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
response = requests.get('https://barttorvik.com/trank.php#', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Some team-name cells also contain the seed ("<name> <n> seed, ✅"); save team -> seed.
# The seed is stored as the matched digit string.
seed_pattern = re.compile(r'(.+)\s+(\d+) seed, ✅')
tds = soup.find_all('td', {'class': 'teamname'})
seeds = {}
for td in tds:
    matches = seed_pattern.match(td.text)
    if matches:
        team_name, seed = matches.groups()
        seeds[team_name.replace('\xa0\xa0', '')] = seed

# Spans contain extra data (not always the seed); remove them before parsing the table.
for span in soup(['span']):
    span.decompose()
df = pd.read_html(StringIO(str(soup.find("table"))), header=1)[0]

# Remove the repeated header rows, drop the columns with no counterpart in the
# historical data, and rename the rest to the historical column names.
stats_24 = (
    df[df['Rk'] != 'Rk']
    .drop(['Rk', '3PR', '3PRD'], axis=1)
    .rename(columns={"EFG%": "EFG_O", "EFGD%": "EFG_D", "2P%": "2P_O", "2P%D": "2P_D", "3P%": "3P_O", "3P%D": "3P_D", "Adj T.": "ADJ_T", "Rec": "W"})
)
# Reduce "W-L" style values to the leading number and strip "+" characters.
# Raw strings avoid invalid-escape warnings for \d and \+.
stats_24 = stats_24.replace(r'^(\d+)-(\d+)', r'\1', regex=True).replace(r'\+', '', regex=True)
stats_24.columns = stats_24.columns.str.upper()

# Set seed (-1 when the team has no scraped seed) and year (2024)
def setSeed(x):
    """Look up the scraped seed for team name ``x``.

    Returns the value stored under ``x`` in the module-level ``seeds``
    mapping, or -1 when ``x`` has no entry there.
    """
    return seeds.get(x, -1)
# Attach the seed and the season to every scraped row, then snapshot the frame
# while TEAM/CONF still hold their original string values.
stats_24['SEED'] = stats_24['TEAM'].apply(setSeed)
stats_24['YEAR'] = 2024
stats_24_save_team_conf = stats_24.copy()

# Encode TEAM/CONF with the same lookup tables used for the historical data;
# names not present in the tables yet are assigned new codes.
stats_24['TEAM'] = stats_24['TEAM'].apply(getFromDict, args=(team_dict,))
stats_24['CONF'] = stats_24['CONF'].apply(getFromDict, args=(conf_dict,))

team_stats

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,0,0,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,1,1,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,2,1,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,3,2,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,4,3,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3518,196,21,34,27,119.9,109.6,0.7369,56.3,52.9,13.6,...,27.5,54.6,52.1,39.7,36.1,69.5,-1.2,NIT,-1.0,2023
3519,117,11,33,27,111.4,97.3,0.8246,55.5,49.3,16.0,...,27.8,56.4,48.6,36.4,33.6,64.4,-2.0,NIT,-1.0,2023
3520,342,32,34,28,107.1,94.6,0.8065,51.7,44.0,19.3,...,28.7,52.5,42.8,33.4,31.1,69.8,-0.3,NIT,-1.0,2023
3521,157,16,38,29,112.4,97.0,0.8453,50.3,47.3,17.3,...,28.9,48.8,47.2,35.6,31.6,70.7,-0.5,NIT,-1.0,2023


# Create Train/Test Data and Classifier

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split, the fitted forest and the reported
# accuracy are reproducible from run to run (any integer works).
RANDOM_STATE = 42

# Features are every column except the label; the label is the POSTSEASON outcome.
X = team_stats.drop(columns='POSTSEASON')
y = team_stats['POSTSEASON']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Analyze Classifier Performance

In [3]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows and report how often the prediction matches the true label.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9049645390070922


# Predict 2024 March Madness

Using the same process that built the classifier above, we can fit on all previous years' data and predict on the values in the `stats_24` variable. We can then use the predicted `POSTSEASON` value for each team to determine the winner of each matchup (using the probability as the tiebreaker).

In [4]:
# Refit on every historical row (not just the training split) before predicting.
clf.fit(X, y)

# Invert team_dict (name -> id) so ids can be turned back into team names.
reversed_team_dict = {team_id: name for name, team_id in team_dict.items()}
reversed_team_dict

# Find most likely POSTSEASON outcome, returning the team name, the outcome and its probability 
def getClassAndProb(x, idx):
    """Return the team name, most likely class and its probability.

    Args:
        x: Indexable sequence of probabilities, one per entry of
            ``clf.classes_``.
        idx: Team id, translated to a name through ``reversed_team_dict``.

    Returns:
        ``[team_name, class_label, probability]`` for the first position
        holding the largest value. If no entry is greater than 0 the index
        stays at -1, so the last class is reported with probability 0.
    """
    best_idx, best_prob = -1, 0
    for position in range(len(x)):
        prob = x[position]
        # Strict comparison: on ties the earliest position wins.
        if prob > best_prob:
            best_idx, best_prob = position, prob
    return [reversed_team_dict[idx], clf.classes_[best_idx], best_prob]

# Set probabilities
stats_with_predictions = stats_24.copy()
probabilites = clf.predict_proba(stats_24)
# Turn the integer team ids back into team names.
stats_with_predictions['TEAM'] = stats_with_predictions['TEAM'].apply(lambda x: reversed_team_dict[x])

# predict_proba returns one column per entry of clf.classes_, in that order.
# Labeling the columns from classes_ (instead of hard-coded positions) keeps
# every probability under the right outcome name even if an outcome is
# missing from the training labels.
for class_idx, outcome in enumerate(clf.classes_):
    stats_with_predictions[outcome] = probabilites[:, class_idx]

# Keep only the teams that have a seed (-1 marks teams without one).
stats_with_predictions = stats_with_predictions[stats_with_predictions['SEED'] != -1]

stats_with_predictions

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,YEAR,2ND,Champions,E8,F4,NIT,R32,R64,R68,S16
0,Houston,2,34,30,119.2,85.5,.9785,49.7,44.0,13.7,...,2024,0.02,0.07,0.07,0.11,0.01,0.32,0.14,0.01,0.25
1,Connecticut,5,34,31,127.1,93.6,.9712,57.1,45.1,14.9,...,2024,0.02,0.06,0.15,0.04,0.00,0.34,0.18,0.00,0.21
2,Purdue,1,33,29,126.2,94.7,.9644,56.0,47.7,16.5,...,2024,0.01,0.06,0.17,0.02,0.01,0.34,0.23,0.00,0.16
3,Iowa St.,2,34,27,113.6,86.5,.9583,51.9,47.1,15.7,...,2024,0.00,0.03,0.11,0.02,0.02,0.45,0.17,0.00,0.20
4,Auburn,4,34,27,120.7,92.1,.9573,54.1,43.4,14.9,...,2024,0.00,0.03,0.06,0.03,0.00,0.51,0.16,0.00,0.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,Saint Peter's,20,31,19,96.8,100.2,.4038,45.0,46.6,19.6,...,2024,0.00,0.00,0.07,0.00,0.19,0.04,0.58,0.12,0.00
239,Stetson,11,31,22,108.3,114.0,.3559,53.4,52.0,15.8,...,2024,0.00,0.00,0.00,0.00,0.06,0.05,0.61,0.24,0.04
277,Howard,22,32,18,104.5,112.9,.2913,52.0,50.9,21.4,...,2024,0.00,0.00,0.00,0.00,0.10,0.03,0.32,0.54,0.01
281,Grambling St.,31,31,20,97.7,106.0,.2805,48.2,49.1,20.5,...,2024,0.00,0.00,0.02,0.00,0.12,0.02,0.48,0.35,0.01


In [5]:
# Run through each matchup

# Choose the two teams and the round they meet in; the winner is decided by
# comparing their probabilities up to that round (0-indexed, starting at the Round of 68).
team_1, team_2 = 'Purdue', 'Arizona'
current_round = 6

# Prediction rows for each side of the matchup.
team_1_stats = stats_with_predictions.loc[stats_with_predictions['TEAM'] == team_1]
team_2_stats = stats_with_predictions.loc[stats_with_predictions['TEAM'] == team_2]

def getWinner(probabilites1, probabilites2, curr_round):
    """Decide a matchup from two per-round probability sequences.

    For each team the entries at positions 0..curr_round (inclusive) are
    added up. The team with the larger total has more probability on the
    rounds up to the current one and is therefore treated as the loser.

    Args:
        probabilites1: Indexable per-round probabilities for team 1.
        probabilites2: Indexable per-round probabilities for team 2.
        curr_round: Index of the current round; both sequences need at
            least ``curr_round + 1`` entries, otherwise the indexing error
            of the sequence (IndexError for lists) propagates.

    Returns:
        2 if team 1's total is strictly greater than team 2's, otherwise 1
        (so ties go to team 1).
    """
    def total_through_round(probabilities):
        # Left-to-right accumulation starting from 0.
        total = 0
        for round_idx in range(curr_round + 1):
            total += probabilities[round_idx]
        return total

    total_1 = total_through_round(probabilites1)
    total_2 = total_through_round(probabilites2)
    return 2 if total_1 > total_2 else 1

# Probability columns in round order (position 0 = Round of 68), matching the
# indexing getWinner uses for current_round.
# NOTE(review): the last slot is 'Champions' rather than '2ND'. With
# current_round = 6 the title probability is added to the total that
# getWinner counts against a team; confirm this is intended.
round_columns = ['R68', 'R64', 'R32', 'S16', 'E8', 'F4', 'Champions']
team_1_probabilities = [team_1_stats[column].iloc[0] for column in round_columns]
team_2_probabilities = [team_2_stats[column].iloc[0] for column in round_columns]
team_1 if getWinner(team_1_probabilities, team_2_probabilities, current_round) == 1 else team_2

'Arizona'

# Final Bracket Prediction

![Justin Thoms - ESPN March Madness 2024 Bracket: Winner Arizona](Bracket.png)