In [29]:
# Uses the top 2 models to predict the 2025 March Madness tournament.
# This notebook also includes the web-scraping code for this year's regular-season stats
# and a manually built list of this year's tournament matchups. From these we build
# a data frame, as we did for the other tournament years, to feed to the models.
# The models first predict the First Four games, which finalizes the First Round
# matchups, and then predict the rest of the tournament.

In [5]:
# Kevin Code
# Importing March Madness data we scraped for last year's tournament

import pandas as pd

# Historical matchup data used to train both models below.
df_combined_23_24 = pd.read_csv(filepath_or_buffer="MarchMadness2023-24.csv")

In [6]:
# Kevin code
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (QuantileTransformer, MaxAbsScaler, Normalizer)
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Training target shared by both models: the "winner" column of the historical frame.
modelPredictions2025_YTrain = df_combined_23_24.loc[:, "winner"]

In [7]:
# Kevin code
# Best Model 1: GaussianNB (Tuned Hyperparameters first, then input features)

# Feature columns fed to best model 1, and the matching training matrix.
bestModel1_inputFeatures = [
    'SOS_diff', 'Tm._diff', 'Opp._diff', '3P%_diff',
    'TRB_diff', 'STL_diff', 'BLK_diff',
]
bestModel1_XTrain = df_combined_23_24.loc[:, bestModel1_inputFeatures]

# Quantile-transform every feature to a normal output distribution, then fit
# Gaussian naive Bayes on the transformed values.
bestModel1_pipeline = make_pipeline(
    QuantileTransformer(output_distribution='normal', n_quantiles=60),
    GaussianNB(var_smoothing=1e-12),
)

bestModel1_pipeline.fit(bestModel1_XTrain, modelPredictions2025_YTrain)

In [8]:
# Kevin code
# Best Model 2: RandomForest Ensemble

# Two random-forest pipelines combined by soft voting (averaged class probabilities).
# Both forests are seeded so the fitted ensemble, and therefore the predicted
# bracket, is the same on every Restart & Run All.
best_features_forests1 = ['SRS_diff', 'SOS_diff', 'Tm._diff', 'Opp._diff']
best_pipeline_forests1 = make_pipeline(
    MaxAbsScaler(),
    RandomForestClassifier(max_depth=5, min_samples_split=2, n_estimators=100, random_state=42),
)

best_features_forests2 = ['Seed_diff', 'SOS_diff', 'Opp._diff', 'TRB_diff', 'STL_diff', 'BLK_diff', 'PF_diff']
best_pipeline_forests2 = make_pipeline(
    Normalizer(),
    RandomForestClassifier(max_depth=5, min_samples_split=5, n_estimators=100, random_state=42),
)

forestsEnsemble_model = VotingClassifier(
    estimators=[('best_pipeline_forests1', best_pipeline_forests1), ('best_pipeline_forests2', best_pipeline_forests2)],
    voting='soft'
)

bestModel2_pipeline = forestsEnsemble_model

# Union of both feature lists with duplicates removed. dict.fromkeys keeps
# first-seen order; list(set(...)) would give a column order that changes between
# kernel sessions, and a seeded forest is sensitive to column order.
bestModel2_inputFeatures = list(dict.fromkeys(best_features_forests1 + best_features_forests2))
bestModel2_XTrain = df_combined_23_24[bestModel2_inputFeatures]

# NOTE(review): neither pipeline contains a column selector, so both forests are
# trained on every column of bestModel2_XTrain rather than only their own
# best_features_* list. If per-forest feature subsets were intended, add a
# selection step to each pipeline -- confirm against the tuning notebook.
bestModel2_pipeline.fit(bestModel2_XTrain, modelPredictions2025_YTrain)

In [9]:
# Mack Code (Modified by Kevin)

# code to scrape this year's team data

import requests
from bs4 import BeautifulSoup

import time

# Download the 2025 school-stats page once and parse its main table into df_2025.
url = "https://www.sports-reference.com/cbb/seasons/men/2025-school-stats.html"
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find the specific table
table = soup.find("table", {"id": "basic_school_stats"})
if table is None:
    raise RuntimeError(
        f"Table 'basic_school_stats' not found at {url}; "
        "the page layout may have changed or the request was blocked."
    )

# The last <thead> row holds the per-column labels.
thead_rows = table.find("thead").find_all("tr")
last_header_row = thead_rows[-1]
header_cells = last_header_row.find_all("th")
headers = [th.get_text(strip=True) for th in header_cells]

# Every <tbody> row is kept, including any repeated header rows, because the
# next cell patches school names by positional row label.
# No sleep is needed here: the page is already downloaded, and this loop only
# walks the parsed HTML without issuing further requests.
tbody_rows = table.find("tbody").find_all("tr")
data = [
    [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
    for row in tbody_rows
]

df_2025 = pd.DataFrame(data, columns=headers)

In [10]:
# Mack code (Modified by Kevin)
# Row label -> school name to write, so the names match the spellings used in the
# manually built bracket. The trailing "NCAA" is stripped for every row below.
# NOTE(review): these labels are positions in the scraped table; presumably they
# go stale if the source page adds, removes or reorders rows -- verify after re-scraping.
school_name_by_row = {
    5: "Alabama St.NCAA",
    31: "BYUNCAA",
    61: "Colorado St.NCAA",
    138: "Iowa St.NCAA",
    184: "McNeeseNCAA",
    192: "Michigan St.NCAA",
    199: "Mississippi St.NCAA",
    198: "Ole MissNCAA",
    280: "Saint Francis UNCAA",
    317: "Saint John'sNCAA",
    283: "Saint Mary'sNCAA",
    290: "San Diego St.NCAA",
    63: "UConnNCAA",
    362: "Utah St.NCAA",
    374: "VCUNCAA",
    224: "Norfolk St.NCAA",
}
for row_label, school_name in school_name_by_row.items():
    df_2025.loc[row_label, "School"] = school_name

# Drop the "NCAA" marker from every school name.
school_names = df_2025["School"]
df_2025["School"] = school_names.str.replace("NCAA", "")
df_2025.reset_index(inplace=True, drop=True)
df_2025

Unnamed: 0,Rk,School,G,W,L,W-L%,SRS,SOS,Unnamed: 9,W.1,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,Abilene Christian,32,16,16,.500,-5.74,-2.30,,8,...,476,666,.715,355,1081,421,331,94,476,670
1,2,Air Force,32,4,28,.125,-8.04,3.68,,1,...,363,572,.635,236,956,428,181,91,433,571
2,3,Akron,35,28,7,.800,3.30,-3.67,,17,...,445,590,.754,414,1347,621,267,131,414,623
3,4,Alabama,34,26,8,.765,25.23,15.56,,13,...,627,877,.715,431,1459,580,207,152,434,644
4,5,Alabama A&M,32,10,22,.313,-20.48,-9.90,,6,...,490,738,.664,447,1172,432,266,131,513,687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,Rk,School,G,W,L,W-L%,SRS,SOS,,W,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
396,361,Wyoming,32,12,20,.375,-0.61,4.26,,5,...,348,534,.652,337,1114,373,141,89,412,617
397,362,Xavier,34,22,12,.647,15.37,8.84,,13,...,561,711,.789,264,1131,568,263,86,371,533
398,363,Yale,30,22,8,.733,6.74,-3.02,,13,...,418,573,.729,329,1151,492,173,108,302,489


In [11]:
# Mack code (Modified by Kevin)

# code to manually make bracket for this year's march madness

import pandas as pd

first_four = [
    # first four
    {"teamA": "Alabama St.", "teamB": "Saint Francis U", "teamASeed": 16, "teamBSeed": 16},
    {"teamA": "Texas", "teamB": "Xavier", "teamASeed": 11, "teamBSeed": 11},
    {"teamA": "American", "teamB": "Mount St. Mary's", "teamASeed": 16, "teamBSeed": 16},
    {"teamA": "San Diego St.", "teamB": "North Carolina", "teamASeed": 11, "teamBSeed": 11},
]

# One placeholder label per First Four game, in the same order as first_four.
replacements = ["REPLACE 1", "REPLACE 4", "REPLACE 3", "REPLACE 2"]

bracket_columns = ["teamA", "teamB", "teamASeed", "teamBSeed"]
df_first_four_bracket = pd.DataFrame(first_four, columns=bracket_columns).assign(
    Replacements=replacements
)
df_first_four_bracket

Unnamed: 0,teamA,teamB,teamASeed,teamBSeed,Replacements
0,Alabama St.,Saint Francis U,16,16,REPLACE 1
1,Texas,Xavier,11,11,REPLACE 4
2,American,Mount St. Mary's,16,16,REPLACE 3
3,San Diego St.,North Carolina,11,11,REPLACE 2


In [12]:
# Mack code

# first four matchups --> each row is one matchup

import pandas as pd

# Example columns to compute differences for
stats_columns = [
    "W-L%", "SRS", "SOS", "Tm.", "Opp.", "MP",
    "FG", "FGA", "FG%", "3P", "3PA", "3P%",
    "FT", "FTA", "FT%", "ORB", "TRB", "AST",
    "STL", "BLK", "TOV", "PF"
]

# The scraped cells are text. Assign through df[...] rather than df.loc[:, ...]:
# the .loc form writes into the existing object-dtype columns, whereas this form
# replaces them, so the columns end up with a real numeric dtype.
# Values that cannot be parsed become NaN.
df_2025[stats_columns] = df_2025[stats_columns].apply(pd.to_numeric, errors='coerce')

matchup_rows = []

for matchup in first_four:
    teamA = matchup["teamA"]
    teamB = matchup["teamB"]
    seedA = matchup["teamASeed"]
    seedB = matchup["teamBSeed"]

    # Get each team's stats from df_2025 (requires an exact match on "School")
    teamA_stats = df_2025.loc[df_2025["School"] == teamA]
    teamB_stats = df_2025.loc[df_2025["School"] == teamB]

    # A skipped matchup would leave this frame shorter than df_first_four_bracket,
    # and the predictions made from it are attached to that bracket row by row.
    # Stop here with the offending names instead.
    missing_teams = [
        name for name, rows in ((teamA, teamA_stats), (teamB, teamB_stats)) if rows.empty
    ]
    if missing_teams:
        raise ValueError(
            f"No stats found in df_2025 for {missing_teams}; check the 'School' spelling."
        )

    # Convert each to a single row (Series)
    teamA_stats = teamA_stats.iloc[0]
    teamB_stats = teamB_stats.iloc[0]

    # Build a dictionary for the difference row
    row_dict = {
        "teamA": teamA,
        "teamB": teamB,
        "seedA": seedA,
        "seedB": seedB,
        "Seed_diff": seedA - seedB,
    }

    # Compute differences for each stat
    for col in stats_columns:
        # e.g. "SRS_diff" = TeamA_SRS - TeamB_SRS
        row_dict[f"{col}_diff"] = float(teamA_stats[col]) - float(teamB_stats[col])

    matchup_rows.append(row_dict)

# Create a new DataFrame of matchup-level differences
df_first_four_diff = pd.DataFrame(matchup_rows)

df_first_four_diff

Unnamed: 0,teamA,teamB,seedA,seedB,Seed_diff,W-L%_diff,SRS_diff,SOS_diff,Tm._diff,Opp._diff,...,FT_diff,FTA_diff,FT%_diff,ORB_diff,TRB_diff,AST_diff,STL_diff,BLK_diff,TOV_diff,PF_diff
0,Alabama St.,Saint Francis U,16,16,0,0.071,2.94,-3.45,292.0,264.0,...,137.0,226.0,-0.041,133.0,173.0,-51.0,82.0,20.0,-94.0,171.0
1,Texas,Xavier,11,11,0,-0.104,0.27,0.94,91.0,108.0,...,8.0,49.0,-0.04,92.0,117.0,-116.0,-44.0,64.0,-23.0,121.0
2,American,Mount St. Mary's,16,16,0,-0.01,0.17,-0.47,-118.0,-168.0,...,-34.0,-60.0,0.018,-96.0,-293.0,-96.0,12.0,-33.0,-149.0,62.0
3,San Diego St.,North Carolina,11,11,0,0.055,-4.54,-3.12,-793.0,-759.0,...,-199.0,-210.0,-0.077,-48.0,-267.0,-143.0,16.0,34.0,-57.0,-54.0


In [13]:
# Kevin code
# Predicting the First Four part of the tournament match ups to then fill out the First Round match ups
# Raw class labels from each model for the four First Four games.
bestModel2_predictions = bestModel2_pipeline.predict(df_first_four_diff[bestModel2_inputFeatures])
bestModel1_predictions = bestModel1_pipeline.predict(df_first_four_diff[bestModel1_inputFeatures])

# Each model gets its own copy of the bracket to annotate.
bestModel2_FirstFourBracket = df_first_four_bracket.copy()
bestModel1_FirstFourBracket = df_first_four_bracket.copy()

modelPredictions = [bestModel2_FirstFourBracket, bestModel1_FirstFourBracket]
raw_label_sets = (bestModel2_predictions, bestModel1_predictions)

for bracket, raw_labels in zip(modelPredictions, raw_label_sets):
    # Label "1" means teamA won and "0" means teamB won; swap in the team names.
    winner = pd.Series(raw_labels, index=bracket.index).astype(str)
    winner = winner.mask(winner == "1", bracket["teamA"])
    winner = winner.mask(winner == "0", bracket["teamB"])
    bracket["Predicted Winner"] = winner


In [14]:
# Mack code (Modified by Kevin)
import copy

# First Round games in bracket order. A "REPLACE n" teamB is a placeholder that
# is swapped for the predicted winner of the matching First Four game below.
first_round = [
    {"teamA": "Auburn", "teamB": "REPLACE 1", "teamASeed": 1, "teamBSeed": 16},
    {"teamA": "Louisville", "teamB": "Creighton", "teamASeed": 8, "teamBSeed": 9},
    {"teamA": "Michigan", "teamB": "UC San Diego", "teamASeed": 5, "teamBSeed": 12},
    {"teamA": "Texas A&M", "teamB": "Yale", "teamASeed": 4, "teamBSeed": 13},
    {"teamA": "Ole Miss", "teamB": "REPLACE 2", "teamASeed": 6, "teamBSeed": 11},
    {"teamA": "Iowa St.", "teamB": "Lipscomb", "teamASeed": 3, "teamBSeed": 14},
    {"teamA": "Marquette", "teamB": "New Mexico", "teamASeed": 7, "teamBSeed": 10},
    {"teamA": "Michigan St.", "teamB": "Bryant", "teamASeed": 2, "teamBSeed": 15},
    {"teamA": "Florida", "teamB": "Norfolk St.", "teamASeed": 1, "teamBSeed": 16},
    {"teamA": "UConn", "teamB": "Oklahoma", "teamASeed": 8, "teamBSeed": 9},
    {"teamA": "Memphis", "teamB": "Colorado St.", "teamASeed": 5, "teamBSeed": 12},
    {"teamA": "Maryland", "teamB": "Grand Canyon", "teamASeed": 4, "teamBSeed": 13},
    {"teamA": "Missouri", "teamB": "Drake", "teamASeed": 6, "teamBSeed": 11},
    {"teamA": "Texas Tech", "teamB": "UNC Wilmington", "teamASeed": 3, "teamBSeed": 14},
    {"teamA": "Kansas", "teamB": "Arkansas", "teamASeed": 7, "teamBSeed": 10},
    {"teamA": "Saint John's", "teamB": "Omaha", "teamASeed": 2, "teamBSeed": 15},
    {"teamA": "Duke", "teamB": "REPLACE 3", "teamASeed": 1, "teamBSeed": 16},
    {"teamA": "Mississippi St.", "teamB": "Baylor", "teamASeed": 8, "teamBSeed": 9},
    {"teamA": "Oregon", "teamB": "Liberty", "teamASeed": 5, "teamBSeed": 12},
    {"teamA": "Arizona", "teamB": "Akron", "teamASeed": 4, "teamBSeed": 13},
    {"teamA": "BYU", "teamB": "VCU", "teamASeed": 6, "teamBSeed": 11},
    {"teamA": "Wisconsin", "teamB": "Montana", "teamASeed": 3, "teamBSeed": 14},
    {"teamA": "Saint Mary's", "teamB": "Vanderbilt", "teamASeed": 7, "teamBSeed": 10},
    {"teamA": "Alabama", "teamB": "Robert Morris", "teamASeed": 2, "teamBSeed": 15},
    {"teamA": "Houston", "teamB": "SIU Edwardsville", "teamASeed": 1, "teamBSeed": 16},
    {"teamA": "Gonzaga", "teamB": "Georgia", "teamASeed": 8, "teamBSeed": 9},
    {"teamA": "Clemson", "teamB": "McNeese", "teamASeed": 5, "teamBSeed": 12},
    {"teamA": "Purdue", "teamB": "High Point", "teamASeed": 4, "teamBSeed": 13},
    {"teamA": "Illinois", "teamB": "REPLACE 4", "teamASeed": 6, "teamBSeed": 11},
    {"teamA": "Kentucky", "teamB": "Troy", "teamASeed": 3, "teamBSeed": 14},
    {"teamA": "UCLA", "teamB": "Utah St.", "teamASeed": 7, "teamBSeed": 10},
    {"teamA": "Tennessee", "teamB": "Wofford", "teamASeed": 2, "teamBSeed": 15}
]

# Each model works on its own deep copy so the two brackets can diverge.
modelsPredictions = [bestModel2_FirstFourBracket, bestModel1_FirstFourBracket]
bestModel2_FirstRound = copy.deepcopy(first_round)
bestModel1_FirstRound = copy.deepcopy(first_round)
modelsFirstRound = [bestModel2_FirstRound, bestModel1_FirstRound]

# Swap every "REPLACE n" placeholder for that model's predicted First Four winner.
for play_in_results, bracket_games in zip(modelsPredictions, modelsFirstRound):
    for game in bracket_games:
        placeholder = game["teamB"]
        if "REPLACE" not in placeholder:
            continue
        fills_slot = play_in_results["Replacements"] == placeholder
        game["teamB"] = play_in_results.loc[fills_slot, "Predicted Winner"].iloc[0]

# Flat list of every First Round team per model (teamA then teamB, game by game).
bestModel2_AllTeams, bestModel1_AllTeams = (
    [team for game in bracket_games for team in (game["teamA"], game["teamB"])]
    for bracket_games in modelsFirstRound
)
models_AllTeams = [bestModel2_AllTeams, bestModel1_AllTeams]

bestModel2_uniqueTeams = list(set(bestModel2_AllTeams))
bestModel1_uniqueTeams = list(set(bestModel1_AllTeams))

In [15]:
# Mack code (Modified by Kevin)


# Every bracket team must have a stats row; a name that does not match a
# df_2025 "School" value would otherwise be dropped silently here and only
# surface later as an IndexError while building matchups.
bracket_team_names = set(bestModel2_uniqueTeams) | set(bestModel1_uniqueTeams)
unmatched_teams = sorted(bracket_team_names - set(df_2025["School"]))
if unmatched_teams:
    raise ValueError(
        f"No row in df_2025 for bracket team(s): {unmatched_teams}; "
        "check the 'School' spelling against the bracket."
    )

# Stats rows restricted to the teams in each model's First Round field.
df_march2025_bestModel2 = (
    df_2025[df_2025["School"].isin(bestModel2_uniqueTeams)].reset_index(drop=True)
)
df_march2025_bestModel1 = (
    df_2025[df_2025["School"].isin(bestModel1_uniqueTeams)].reset_index(drop=True)
)

df_march2025_bestModel1

Unnamed: 0,Rk,School,G,W,L,W-L%,SRS,SOS,Unnamed: 9,W.1,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,3,Akron,35,28,7,0.8,3.3,-3.67,,17,...,445.0,590.0,0.754,414.0,1347.0,621.0,267.0,131.0,414.0,623.0
1,4,Alabama,34,26,8,0.765,25.23,15.56,,13,...,627.0,877.0,0.715,431.0,1459.0,580.0,207.0,152.0,434.0,644.0
2,6,Alabama St.,36,20,16,0.556,-8.9,-8.1,,12,...,456.0,665.0,0.686,417.0,1303.0,418.0,285.0,101.0,327.0,662.0
3,11,Arizona,35,23,12,0.657,23.2,13.37,,14,...,596.0,766.0,0.778,431.0,1399.0,575.0,267.0,155.0,406.0,582.0
4,13,Arkansas,35,22,13,0.629,17.0,11.4,,8,...,537.0,730.0,0.736,332.0,1244.0,500.0,267.0,199.0,404.0,570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,337,Vanderbilt,33,20,13,0.606,14.68,10.07,,8,...,507.0,680.0,0.746,379.0,1145.0,459.0,295.0,129.0,336.0,592.0
60,341,VCU,35,28,7,0.8,14.6,1.45,,15,...,498.0,670.0,0.743,463.0,1337.0,524.0,282.0,190.0,394.0,632.0
61,358,Wisconsin,37,27,10,0.73,20.74,11.5,,13,...,616.0,746.0,0.826,347.0,1350.0,537.0,190.0,87.0,356.0,611.0
62,359,Wofford,35,19,16,0.543,-0.07,-2.29,,10,...,413.0,626.0,0.66,453.0,1337.0,524.0,209.0,88.0,383.0,592.0


In [16]:
# Kevin code

# First round matchups dataframes

# One frame per model, built from that model's list of First Round game dicts.
df_bestModel2_FirstRound, df_bestModel1_FirstRound = (
    pd.DataFrame(bracket_games)
    for bracket_games in (bestModel2_FirstRound, bestModel1_FirstRound)
)

In [17]:
# Kevin code
# Predicting this year's March Madness bracket using best model 2

# A list containing data frames for the predictions of each round of march madness
# Index 0 starts as the first round of the bracket (There are 6 rounds with the final one being the championship)
bestModel2_predictedBracket = []

# Names of the six rounds, in play order; also fixes how many rounds are simulated.
bracket_round_names = ["First Round", "Second Round", "Sweet 16",
                       "Elite Eight", "Final Four", "Championship"]

# The team matchups for each round
matchups = [df_bestModel2_FirstRound]

# round_idx (not `round`) so the builtin round() is not shadowed for later cells.
for round_idx in range(len(bracket_round_names)):
    round_matchups = matchups[round_idx]
    matchup_rows = []

    for _, matchup_info in round_matchups.iterrows():
        teamA = matchup_info["teamA"]
        teamB = matchup_info["teamB"]
        seedA = matchup_info["teamASeed"]
        seedB = matchup_info["teamBSeed"]

        teamA_rows = df_march2025_bestModel2.loc[df_march2025_bestModel2["School"] == teamA]
        teamB_rows = df_march2025_bestModel2.loc[df_march2025_bestModel2["School"] == teamB]
        # Name the offending team instead of failing with a bare IndexError on .iloc[0].
        if teamA_rows.empty or teamB_rows.empty:
            raise ValueError(
                f"No 2025 stats row found for {teamA!r} or {teamB!r}; check the 'School' spelling."
            )
        teamA_stats = teamA_rows.iloc[0]
        teamB_stats = teamB_rows.iloc[0]

        row_dict = {
            "teamA": teamA,
            "teamB": teamB,
            "seedA": seedA,
            "seedB": seedB,
            "Seed_diff": int(seedA) - int(seedB),
        }

        # Each feature is teamA's stat minus teamB's stat.
        for col in stats_columns:
            row_dict[f"{col}_diff"] = teamA_stats[col] - teamB_stats[col]

        matchup_rows.append(row_dict)

    matchupStats_df = pd.DataFrame(matchup_rows)

    # Predict the result of each matchup: label 1 means teamA wins, 0 means teamB wins.
    X_matchup = matchupStats_df[bestModel2_inputFeatures]
    predictions = bestModel2_pipeline.predict(X_matchup)

    unexpected_labels = set(predictions.tolist()) - {0, 1}
    if unexpected_labels:
        raise ValueError(f"Expected only 0/1 predictions, got {sorted(unexpected_labels)}")

    teamA_wins = pd.Series(predictions, index=matchupStats_df.index) == 1

    matchupStats_df["Predicted Winner"] = matchupStats_df["teamA"].where(teamA_wins, matchupStats_df["teamB"])
    matchupStats_df["Predicted Winner Seed"] = matchupStats_df["seedA"].where(teamA_wins, matchupStats_df["seedB"])

    # An upset is a win by the team with the numerically larger seed.
    loser_seed = matchupStats_df["seedB"].where(teamA_wins, matchupStats_df["seedA"])
    matchupStats_df["Upset"] = (
        matchupStats_df["Predicted Winner Seed"] > loser_seed
    ).map({True: "Yes", False: "No"})

    matchupPredictions_df = matchupStats_df[["teamA", "teamB", "Predicted Winner", "Predicted Winner Seed", "Upset"]]
    bestModel2_predictedBracket.append(matchupPredictions_df)

    # Pair consecutive winners (0 vs 1, 2 vs 3, ...) to form the next round.
    # An unpaired trailing winner (the champion) produces no further matchup.
    winners = matchupPredictions_df["Predicted Winner"].tolist()
    winner_seeds = matchupPredictions_df["Predicted Winner Seed"].tolist()
    paired_count = 2 * (len(winners) // 2)

    next_matchUps = {
        "teamA": winners[0:paired_count:2],
        "teamB": winners[1:paired_count:2],
        "teamASeed": winner_seeds[0:paired_count:2],
        "teamBSeed": winner_seeds[1:paired_count:2],
    }

    nextMatchUps_df = pd.DataFrame(next_matchUps)
    matchups.append(nextMatchUps_df)

# Round name -> predictions frame for that round.
bestModel2_Bracket = dict(zip(bracket_round_names, bestModel2_predictedBracket))

In [18]:
# Kevin code
# Predicting this year's March Madness bracket using best model 1

# A list containing data frames for the predictions of each round of march madness
# Index 0 starts as the first round of the bracket (There are 6 rounds with the final one being the championship)
bestModel1_predictedBracket = []

# Names of the six rounds, in play order; also fixes how many rounds are simulated.
bracket_round_names = ["First Round", "Second Round", "Sweet 16",
                       "Elite Eight", "Final Four", "Championship"]

# The team matchups for each round
matchups = [df_bestModel1_FirstRound]

# round_idx (not `round`) so the builtin round() is not shadowed for later cells.
for round_idx in range(len(bracket_round_names)):
    round_matchups = matchups[round_idx]
    matchup_rows = []

    for _, matchup_info in round_matchups.iterrows():
        teamA = matchup_info["teamA"]
        teamB = matchup_info["teamB"]
        seedA = matchup_info["teamASeed"]
        seedB = matchup_info["teamBSeed"]

        teamA_rows = df_march2025_bestModel1.loc[df_march2025_bestModel1["School"] == teamA]
        teamB_rows = df_march2025_bestModel1.loc[df_march2025_bestModel1["School"] == teamB]
        # Name the offending team instead of failing with a bare IndexError on .iloc[0].
        if teamA_rows.empty or teamB_rows.empty:
            raise ValueError(
                f"No 2025 stats row found for {teamA!r} or {teamB!r}; check the 'School' spelling."
            )
        teamA_stats = teamA_rows.iloc[0]
        teamB_stats = teamB_rows.iloc[0]

        row_dict = {
            "teamA": teamA,
            "teamB": teamB,
            "seedA": seedA,
            "seedB": seedB,
            "Seed_diff": int(seedA) - int(seedB),
        }

        # Each feature is teamA's stat minus teamB's stat.
        for col in stats_columns:
            row_dict[f"{col}_diff"] = teamA_stats[col] - teamB_stats[col]

        matchup_rows.append(row_dict)

    matchupStats_df = pd.DataFrame(matchup_rows)

    # Predict the result of each matchup: label 1 means teamA wins, 0 means teamB wins.
    X_matchup = matchupStats_df[bestModel1_inputFeatures]
    predictions = bestModel1_pipeline.predict(X_matchup)

    unexpected_labels = set(predictions.tolist()) - {0, 1}
    if unexpected_labels:
        raise ValueError(f"Expected only 0/1 predictions, got {sorted(unexpected_labels)}")

    teamA_wins = pd.Series(predictions, index=matchupStats_df.index) == 1

    matchupStats_df["Predicted Winner"] = matchupStats_df["teamA"].where(teamA_wins, matchupStats_df["teamB"])
    matchupStats_df["Predicted Winner Seed"] = matchupStats_df["seedA"].where(teamA_wins, matchupStats_df["seedB"])

    # An upset is a win by the team with the numerically larger seed.
    loser_seed = matchupStats_df["seedB"].where(teamA_wins, matchupStats_df["seedA"])
    matchupStats_df["Upset"] = (
        matchupStats_df["Predicted Winner Seed"] > loser_seed
    ).map({True: "Yes", False: "No"})

    matchupPredictions_df = matchupStats_df[["teamA", "teamB", "Predicted Winner", "Predicted Winner Seed", "Upset"]]
    bestModel1_predictedBracket.append(matchupPredictions_df)

    # Pair consecutive winners (0 vs 1, 2 vs 3, ...) to form the next round.
    # An unpaired trailing winner (the champion) produces no further matchup.
    winners = matchupPredictions_df["Predicted Winner"].tolist()
    winner_seeds = matchupPredictions_df["Predicted Winner Seed"].tolist()
    paired_count = 2 * (len(winners) // 2)

    next_matchUps = {
        "teamA": winners[0:paired_count:2],
        "teamB": winners[1:paired_count:2],
        "teamASeed": winner_seeds[0:paired_count:2],
        "teamBSeed": winner_seeds[1:paired_count:2],
    }

    nextMatchUps_df = pd.DataFrame(next_matchUps)
    matchups.append(nextMatchUps_df)

# Round name -> predictions frame for that round.
bestModel1_Bracket = dict(zip(bracket_round_names, bestModel1_predictedBracket))

In [27]:
# Display best model 1's predictions for one round of the bracket.
# Only the last expression of a cell is rendered, so uncomment one of the
# lines below to view that round instead of the First Round.
bestModel1_Bracket["First Round"]
# bestModel1_Bracket["Second Round"]
# bestModel1_Bracket["Sweet 16"]
# bestModel1_Bracket["Elite Eight"]
# bestModel1_Bracket["Final Four"]
# bestModel1_Bracket["Championship"]

Unnamed: 0,teamA,teamB,Predicted Winner,Predicted Winner Seed,Upset
0,Auburn,Alabama St.,Auburn,1,No
1,Louisville,Creighton,Creighton,9,Yes
2,Michigan,UC San Diego,Michigan,5,No
3,Texas A&M,Yale,Yale,13,Yes
4,Ole Miss,North Carolina,North Carolina,11,Yes
5,Iowa St.,Lipscomb,Iowa St.,3,No
6,Marquette,New Mexico,New Mexico,10,Yes
7,Michigan St.,Bryant,Michigan St.,2,No
8,Florida,Norfolk St.,Florida,1,No
9,UConn,Oklahoma,UConn,8,No


In [28]:
# Display best model 2's predictions for one round of the bracket.
# Only the last expression of a cell is rendered, so uncomment one of the
# lines below to view that round instead of the First Round.
bestModel2_Bracket["First Round"]
# bestModel2_Bracket["Second Round"]
# bestModel2_Bracket["Sweet 16"]
# bestModel2_Bracket["Elite Eight"]
# bestModel2_Bracket["Final Four"]
# bestModel2_Bracket["Championship"]

Unnamed: 0,teamA,teamB,Predicted Winner,Predicted Winner Seed,Upset
0,Auburn,Alabama St.,Auburn,1,No
1,Louisville,Creighton,Louisville,8,No
2,Michigan,UC San Diego,Michigan,5,No
3,Texas A&M,Yale,Texas A&M,4,No
4,Ole Miss,North Carolina,North Carolina,11,Yes
5,Iowa St.,Lipscomb,Iowa St.,3,No
6,Marquette,New Mexico,Marquette,7,No
7,Michigan St.,Bryant,Michigan St.,2,No
8,Florida,Norfolk St.,Florida,1,No
9,UConn,Oklahoma,UConn,8,No
