# Data Wrangling

In [13]:
from io import StringIO
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Options to display more/less rows in output
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_rows', 60)

# Past data (in csv)
team_stats = pd.read_csv(f'./data/cbb.csv')


# Fill empty values (teams didn't make tournament)
team_stats['POSTSEASON'] = team_stats['POSTSEASON'].fillna('NIT')
team_stats['SEED'] = team_stats['SEED'].fillna(-1)
team_stats_save_team_conf = team_stats.copy()

# Encode strings into numerical values for classifier
# Store team names/conferences as dictionary mapping to integers (numbers needed for RandomForest)
team_dict = {}
conf_dict = {}
def getFromDict(x, dict):
    if x not in dict:
        dict[x] = len(dict)
    return dict[x]
team_stats['TEAM'] = team_stats['TEAM'].apply(lambda x: getFromDict(x, team_dict))
team_stats['CONF'] = team_stats['CONF'].apply(lambda x: getFromDict(x, conf_dict))


# Current data, scrape data from barttorvik.com and read into dataframe
soup = BeautifulSoup(requests.get('https://barttorvik.com/trank.php#').content, "html.parser")
# Spans contain extra data, they can be removed:
for data in soup(['span']):
    data.decompose()
df = pd.read_html(StringIO(str(soup.find("table"))), header=1)[0]

# Remove header rows, match columns and merge into existing dataframe (there's probably an easier way to get columns to match, oh well 🤷‍♂️)
stats_24 = df[df['Rk'] != 'Rk'].drop(['Rk', '3PR', '3PRD'], axis=1).rename(columns={"EFG%": "EFG_O", "EFGD%": "EFG_D", "2P%": "2P_O", "2P%D": "2P_D", "3P%": "3P_O", "3P%D": "3P_D", "Adj T.": "ADJ_T", "Rec": "W"})
stats_24 = stats_24.replace('^(\d+)-(\d+)', r'\1', regex=True).replace('\+', '', regex=True)
stats_24.columns = map(str.upper, stats_24.columns)

# Set seed (fill with n/a) and year (2024)
stats_24['SEED'] = -1
stats_24['YEAR'] = 2024
stats_24_save_team_conf = stats_24.copy()

# Use previously calculated team and conference values using dictionary
stats_24['TEAM'] = stats_24['TEAM'].apply(lambda x: getFromDict(x, team_dict))
stats_24['CONF'] = stats_24['CONF'].apply(lambda x: getFromDict(x, conf_dict))

team_stats

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,0,0,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,1,1,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,2,1,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,3,2,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,4,3,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3518,196,21,34,27,119.9,109.6,0.7369,56.3,52.9,13.6,...,27.5,54.6,52.1,39.7,36.1,69.5,-1.2,NIT,-1.0,2023
3519,117,11,33,27,111.4,97.3,0.8246,55.5,49.3,16.0,...,27.8,56.4,48.6,36.4,33.6,64.4,-2.0,NIT,-1.0,2023
3520,342,32,34,28,107.1,94.6,0.8065,51.7,44.0,19.3,...,28.7,52.5,42.8,33.4,31.1,69.8,-0.3,NIT,-1.0,2023
3521,157,16,38,29,112.4,97.0,0.8453,50.3,47.3,17.3,...,28.9,48.8,47.2,35.6,31.6,70.7,-0.5,NIT,-1.0,2023


# Create Train/Test Data and Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split train/test
X = team_stats.drop('POSTSEASON', axis=1)
y = team_stats['POSTSEASON']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Analyze Classifier Performance

In [15]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9177304964539007


# Predict 2024 March Madness

Using the same process to make the classifier above, we can fit using all previous years data and predict against values in the `stats_2024` variable. We can then use the predicted `POSTSEASON` value for each team to determine the winner of each matchup (using the probability as the tiebreaker).

In [16]:
clf.fit(X, y)

# Find most likely POSTSEASON outcome, returning the team name, the outcome and its probability 
def getClassAndProb(x, idx):
  max_prob = 0
  max_idx = -1
  for i in range(len(x)):
    if x[i] > max_prob:
      max_prob = x[i]
      max_idx = i
  return [stats_24_save_team_conf.iloc[idx]['TEAM'], clf.classes_[max_idx], max_prob]

[getClassAndProb(x, i) for i,x in enumerate(clf.predict_proba(stats_24))]

[['Houston', 'NIT', 0.51],
 ['Connecticut', 'NIT', 0.53],
 ['Purdue', 'NIT', 0.53],
 ['Auburn', 'NIT', 0.54],
 ['Arizona', 'NIT', 0.52],
 ['Iowa St.', 'NIT', 0.59],
 ['Tennessee', 'NIT', 0.56],
 ['North Carolina', 'NIT', 0.55],
 ['Marquette', 'NIT', 0.51],
 ['Creighton', 'NIT', 0.5],
 ['Duke', 'NIT', 0.56],
 ['Illinois', 'NIT', 0.59],
 ["Saint Mary's", 'NIT', 0.54],
 ['Alabama', 'NIT', 0.53],
 ['Baylor', 'NIT', 0.52],
 ['Gonzaga', 'NIT', 0.55],
 ['Kansas', 'NIT', 0.62],
 ['BYU', 'NIT', 0.65],
 ['Texas', 'NIT', 0.61],
 ["St. John's", 'NIT', 0.68],
 ['Wisconsin', 'NIT', 0.66],
 ['Michigan St.', 'NIT', 0.73],
 ['Florida', 'NIT', 0.64],
 ['Kentucky', 'NIT', 0.58],
 ['Nebraska', 'NIT', 0.63],
 ['Colorado', 'NIT', 0.61],
 ['Texas Tech', 'NIT', 0.66],
 ['New Mexico', 'NIT', 0.74],
 ['San Diego St.', 'NIT', 0.68],
 ['Mississippi St.', 'NIT', 0.7],
 ['Northwestern', 'NIT', 0.66],
 ['Villanova', 'NIT', 0.89],
 ['TCU', 'NIT', 0.69],
 ['Wake Forest', 'NIT', 0.94],
 ['Clemson', 'NIT', 0.67],
 ['Nev