# Logistic Regression is the Way to Go

After some deliberation, we chose basic Logisitc Regression as our model of choice.

## Data Preparation and Feature Selection

This part of the code was copied directly from the previous step.

In [1]:
import pandas as pd
import numpy as np
import collections

In [2]:
# Regular Season Results since 1985 (Only includes who won and the points)
reg_season_compact_pd = pd.read_csv('Data/MRegularSeasonCompactResults.csv') \

# Regular Season Results since 2003 but includes useful stats like Rebounds, Assists, etc.
reg_season_detailed_pd = pd.read_csv('Data/MRegularSeasonDetailedResults.csv')

# List of teams who are/was in Division I along with their ID
teams_pd = pd.read_csv('Data/MTeams.csv')

# Like the first two DataFrames but for the tournament
tourney_compact_pd = pd.read_csv('Data/MNCAATourneyCompactResults.csv')
tourney_detailed_pd = pd.read_csv('Data/MNCAATourneyDetailedResults.csv')

# The Conference Tourney Detailed Results since 2003
conference_tourney_results_pd = pd.read_csv('Data/MConferenceTourneyGames.csv')

# List of Teams along with their Conferences and ID per Year
conferences_pd = pd.read_csv('Data/MTeamConferences.csv')

# Seeds 
seeds_pd = pd.read_csv('Data/MNCAATourneySeeds.csv')

# List of Conference Tourney Winners since 2001
l = []
for i in range(len(conference_tourney_results_pd) - 1):
    if conference_tourney_results_pd.iloc[i, 1] != conference_tourney_results_pd.iloc[i + 1, 1]:
        season = conference_tourney_results_pd.iloc[i, 0]
        conference = conference_tourney_results_pd.iloc[i, 1]
        winner = conference_tourney_results_pd.iloc[i, 3]
        l.append({'Season': season, 'Conference': conference, 'Winner': winner})
        
conference_tourney_winners_pd = pd.DataFrame(l)

We will unfortunately not use a lot of the features presented last time, but we will use the following:

- Regular Season Wins
- Points per game season average
- Points per game allowed season average
- Whether or not in Power 6 conference (ACC, Big Ten, Big 12, SEC, Pac 12, Big East) - Binary label
- Number of 3's per game
- Turnovers per game average
- Assists per game average
- Conference Tournament Championship - binary label
- Tournament Seed
- Rebounds per game average
- Steals per game average
- Number of NCAA appearances since 1985

In [3]:
# Get ID given Name
def getTeamID(name):
    return teams_pd[teams_pd['TeamName'] == name].values[0][0]

# Get Name given ID
def getTeamName(team_id):
    return teams_pd[teams_pd['TeamID'] == team_id].values[0][1]

print("ID for Baylor is", getTeamID("Baylor"))
print("The team with ID 1124 is", getTeamName(1124))

ID for Baylor is 1124
The team with ID 1124 is Baylor


In [4]:
# How many wins did a Team win in a given Season
def getRegSeasonWins(team_id, year):
    c1 = reg_season_compact_pd['WTeamID'] == team_id
    c2 = reg_season_compact_pd['Season'] == year
    return len(reg_season_compact_pd[c1 & c2])

# What was a team's Points per Game in a given Season
def getPPG(team_id, year):
    ppg = 0
    c1 = reg_season_compact_pd['WTeamID'] == team_id
    c2 = reg_season_compact_pd['Season'] == year
    c3 = reg_season_compact_pd['LTeamID'] == team_id
    gamesWon = reg_season_compact_pd[c1 & c2]
    ppg = gamesWon['WScore'].sum()
    gamesLost = reg_season_compact_pd[c2 & c3]
    ppg += gamesLost['LScore'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    ppg /= total_games
    return round(ppg, 2)

# In a given season, how many points did a given team give up per game
def getOPPG(team_id, year):
    oppg = 0
    c1 = reg_season_compact_pd['WTeamID'] == team_id
    c2 = reg_season_compact_pd['Season'] == year
    c3 = reg_season_compact_pd['LTeamID'] == team_id
    gamesWon = reg_season_compact_pd[c1 & c2]
    oppg = gamesWon['LScore'].sum()
    gamesLost = reg_season_compact_pd[c2 & c3]
    oppg += gamesLost['WScore'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    oppg /= total_games
    return round(oppg, 2)

baylor_id = getTeamID("Baylor")
print("In 2022, Baylor won", getRegSeasonWins(baylor_id, 2022), "games.")
print("They averaged", getPPG(baylor_id, 2022), "points per game.")
print("While holding opponents to", getOPPG(baylor_id, 2022), "points per game.")

In 2022, Baylor won 26 games.
They averaged 76.5 points per game.
While holding opponents to 63.62 points per game.


In [5]:
# Hardcoded the divisions as of 2022
ACC = [1181, 1314, 1323, 1274, 1448, 1438, 1439, 1199, 1393, 1155, 1257, 1130, 1338, 1210, 1301]
Big12 = [1242, 1124, 1403, 1400, 1395, 1329, 1235, 1328, 1243, 1452]
BigEast = [1437, 1344, 1163, 1166, 1371, 1266, 1462, 1385, 1177, 1139, 1207]
Big10 = [1458, 1228, 1345, 1234, 1326, 1353, 1276, 1277, 1321, 1231, 1268, 1336, 1278, 1304]
Pac12 = [1112, 1113, 1417, 1425, 1160, 1450, 1449, 1332, 1333, 1143, 1390, 1428]
SEC = [1120, 1397, 1246, 1116, 1401, 1261, 1196, 1376, 1104, 1280, 1435, 1281, 1279, 1208]

# Quick function to check if a team's ID is in these lists
def getPower6(team_id):
    if team_id in ACC or team_id in Big12 or team_id in BigEast or team_id in Big10 or team_id in Pac12 or team_id in SEC:
        return 1
    else:
        return 0

if getPower6(getTeamID("BYU")):
    print("BYU is in a Power 6 conference.")
else:
    print("BYU is NOT in a Power 6 conference.")
    
if getPower6(getTeamID("Duke")):
    print("Duke is in a Power 6 conference.")
else:
    print("Duke is NOT in a Power 6 conference.")

BYU is NOT in a Power 6 conference.
Duke is in a Power 6 conference.


In [6]:
# How many three's did a team make per game in a given season
def get3PT(team_id, year):
    if year < 2003:
        return 0
    threes = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    threes = gamesWon['WFGM3'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    threes += gamesLost['LFGM3'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    threes /= total_games
    return round(threes, 2)

# How many turnovers did a team make per game in a given season
def getTO(team_id, year):
    if year < 2003:
        return 0
    to = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    to = gamesWon['WTO'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    to += gamesLost['LTO'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    to /= total_games
    return round(to, 2)

ETSU_id = getTeamID("ETSU")
print("In 2009, ETSU averaged", get3PT(ETSU_id, 2009), "threes per game.")
print("While committing", getTO(ETSU_id, 2009), "turnovers per game.")

In 2009, ETSU averaged 5.94 threes per game.
While committing 14.0 turnovers per game.


In [7]:
# How many Assists did a team make per game
def getAST(team_id, year):
    if year < 2003:
        return 0
    ast = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    ast = gamesWon['WAst'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    ast += gamesLost['LAst'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    ast /= total_games
    return round(ast, 2)

# Determine if a team was the conference champion in their division in a given year
def getTourneyConferenceChampion(team_id, year):
    if year < 2001:
        return 0
    conf = getConference(team_id, year)
    c1 = conference_tourney_winners_pd['Season'] == year
    c2 = conference_tourney_winners_pd['Conference'] == conf
    if len(conference_tourney_winners_pd[c1 & c2]) == 0:
        return 0
    if team_id == conference_tourney_winners_pd[c1 & c2]['Winner'].values[0]:
        return 1
    else:
        return 0

In [8]:
# Easy way to get a team's conference in a given year
def getConference(team_id, year):
    c1 = conferences_pd['TeamID'] == team_id
    c2 = conferences_pd['Season'] == year
    c3 = conferences_pd[c1 & c2]
    if len(c3) == 0:
        return conferences_pd[c1].values[0][2]
    return c3['ConfAbbrev'].values[0]

# Get the seed of the team in a given year
def getSeed(team_id, year):
    c1 = seeds_pd['TeamID'] == team_id
    c2 = seeds_pd['Season'] == year
    if len(seeds_pd[c1 & c2]) == 0:
        return 0
    return int(seeds_pd[c1 & c2]['Seed'].values[0][1:3])

# Get rebounds per game in a given year
def getRPG(team_id, year):
    if year < 2003:
        return 0
    reb = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    reb = gamesWon['WOR'].sum()
    reb += gamesWon['WDR'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    reb += gamesLost['LOR'].sum()
    reb += gamesLost['LDR'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    reb /= total_games
    return round(reb, 2)

In [9]:
# Steals per game
def getSTL(team_id, year):
    if year < 2003:
        return 0
    stl = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    stl = gamesWon['WStl'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    stl += gamesLost['LStl'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    stl /= total_games
    return round(stl, 2)

# How many times did a team appear in the tournament as of 2022
def getNumOfAppearances(team_id):
    return len(seeds_pd[seeds_pd['TeamID'] == team_id])

# Helper function for below
def getHomeStat(row):
    if (row == 'H'):
        home = 1
    if (row == 'A'):
        home = -1
    if (row == 'N'):
        home = 0
    return home

In [10]:
# Build the vector
def getSeasonData(team_id, year):
    # Check first if the team was Division 1 at the time
    c1 = teams_pd[teams_pd['TeamID'] == team_id]['FirstD1Season'].values[0] <= year
    c2 = teams_pd[teams_pd['TeamID'] == team_id]['LastD1Season'].values[0] >= year
    if ~c1 or ~c2:
        return []
    return [getRegSeasonWins(team_id, year),
            getPPG(team_id, year),
            getOPPG(team_id, year),
            getPower6(team_id),
            get3PT(team_id, year),
            getTO(team_id, year),
            getAST(team_id, year),
            getTourneyConferenceChampion(team_id, year),
            getSeed(team_id, year),
            getRPG(team_id, year),
            getSTL(team_id, year),
            getNumOfAppearances(team_id)]

# Build vectors for every team in a given season
def createSeasonDict(year):
    seasonDictionary = collections.defaultdict(list)
    for team in teams_pd['TeamName'].tolist():
        team_id = teams_pd[teams_pd['TeamName'] == team].values[0][0]
        team_vector = getSeasonData(team_id, year)
        seasonDictionary[team_id] = team_vector
    return seasonDictionary

# Below is the season data for Toledo in 2013
getSeasonData(1405, 2013)

[15, 68.86, 68.25, 0, 5.93, 12.0, 13.04, 0, 0, 32.82, 7.04, 0]

In [11]:
# Basically run the function directly above this one on a set of years
def createTrainingSet(years):
    totalNumGames = 0
    for year in years:
        season = reg_season_compact_pd[reg_season_compact_pd['Season'] == year]
        totalNumGames += len(season.index)
        tourney = tourney_compact_pd[tourney_compact_pd['Season'] == year]
        totalNumGames += len(tourney.index)
    numFeatures = len(getSeasonData(1181,2012)) #Just choosing a random team and seeing the dimensionality of the vector
    xTrain = np.zeros(( totalNumGames, numFeatures + 1))
    yTrain = np.zeros(( totalNumGames ))
    indexCounter = 0
    for year in years:
        team_vectors = createSeasonDict(year)
        season = reg_season_compact_pd[reg_season_compact_pd['Season'] == year]
        numGamesInSeason = len(season.index)
        tourney = tourney_compact_pd[tourney_compact_pd['Season'] == year]
        numGamesInSeason += len(tourney.index)
        xTrainSeason = np.zeros(( numGamesInSeason, numFeatures + 1))
        yTrainSeason = np.zeros(( numGamesInSeason ))
        counter = 0
        for index, row in season.iterrows():
            w_team = row['WTeamID']
            w_vector = team_vectors[w_team]
            l_team = row['LTeamID']
            l_vector = team_vectors[l_team]
            diff = [a - b for a, b in zip(w_vector, l_vector)]
            home = getHomeStat(row['WLoc'])
            if (counter % 2 == 0):
                diff.append(home) 
                xTrainSeason[counter] = diff
                yTrainSeason[counter] = 1
            else:
                diff.append(-home)
                xTrainSeason[counter] = [ -p for p in diff]
                yTrainSeason[counter] = 0
            counter += 1
        for index, row in tourney.iterrows():
            w_team = row['WTeamID']
            w_vector = team_vectors[w_team]
            l_team = row['LTeamID']
            l_vector = team_vectors[l_team]
            diff = [a - b for a, b in zip(w_vector, l_vector)]
            home = 0 #All tournament games are neutral
            if (counter % 2 == 0):
                diff.append(home) 
                xTrainSeason[counter] = diff
                yTrainSeason[counter] = 1
            else:
                diff.append(-home)
                xTrainSeason[counter] = [ -p for p in diff]
                yTrainSeason[counter] = 0
            counter += 1
        xTrain[indexCounter:numGamesInSeason+indexCounter] = xTrainSeason
        yTrain[indexCounter:numGamesInSeason+indexCounter] = yTrainSeason
        indexCounter += numGamesInSeason
    return xTrain, yTrain

In [12]:
x_data, y_data = createTrainingSet([2022])
np.save('x_dataset', x_data)
np.save('y_dataset', y_data)

In [13]:
xTrain = np.load('x_dataset.npy')
yTrain = np.load('y_dataset.npy')
xTrain.shape

(5412, 13)

## Setting Up the Predictions for 2022

The ways we are going to "score" this model is by first creating a DataFrame that contains the first round games. We will order them in a way such that the second round matchups can be easily constructed by the model. We will use the seeds below to properly format the results DataFrame. I will not go into detail as to why we should order this way, but just know that this will prevent impossible matchups at any point in the tournament.

In [125]:
tourney_2022 = tourney_compact_pd[tourney_compact_pd["Season"] == 2022]
seeds_2022 = seeds_pd[seeds_pd["Season"] == 2022]
seeds_2022

Unnamed: 0,Season,Seed,TeamID
2354,2022,W01,1124
2355,2022,W02,1246
2356,2022,W03,1345
2357,2022,W04,1417
2358,2022,W05,1388
...,...,...,...
2417,2022,Z13,1151
2418,2022,Z14,1255
2419,2022,Z15,1174
2420,2022,Z16a,1136


First we need to know which 'region' faces who (i.e. does W face X, Y, or Z?). We can do that by looking at the final 4 matchups.

In [126]:
id1 = tourney_2022.iloc[-2, 2]
id2 = tourney_2022.iloc[-2, 4]
print(seeds_2022[seeds_2022["TeamID"] == id1])
print(seeds_2022[seeds_2022["TeamID"] == id2])

      Season Seed  TeamID
2361    2022  W08    1314
      Season Seed  TeamID
2372    2022  X02    1181


So W faces X, so we must make sure that after the W02 and W15 matchup comes the X01 and X16 matchup.

Second we need to deal with the pesky play in game (i.e. those seeds Z16a and Z16b). We can make a list below that contains the play-in winners and add the appropiate team to the matchup.

In [127]:
playin = []
for i in range(4):
    playin.append(tourney_2022.iloc[i, 2])
    
playin

[1231, 1411, 1323, 1460]

Now we construct the DataFrame.

In [128]:
# Helper function
def swap_it(m, x1, x2):
    m[x1], m[x2] = m[x2], m[x1]
    return m

In [129]:
matchups = []
k = 0

for i in range(8):
    if len(seeds_2022.iloc[16 - i, 1]) == 4:
        if seeds_2022.iloc[16 - i, 2] not in playin:
            k = 1
    matchups.append({'Round': 1, 'Team1ID': seeds_2022.iloc[i, 2], 'Team2ID': seeds_2022.iloc[16 - i - k, 2]})

k = 0

for i in range(17, 25):
    if len(seeds_2022.iloc[50 - i, 1]) == 4:
        if seeds_2022.iloc[50 - i, 2] not in playin:
            k = 1
    matchups.append({'Round': 1, 'Team1ID': seeds_2022.iloc[i, 2], 'Team2ID': seeds_2022.iloc[50 - i - k, 2]})
    
k = 0

for i in range(34, 42):
    if len(seeds_2022.iloc[84 - i, 1]) == 4:
        if seeds_2022.iloc[84 - i, 2] not in playin:
            k = 1
    matchups.append({'Round': 1, 'Team1ID': seeds_2022.iloc[i, 2], 'Team2ID': seeds_2022.iloc[84 - i - k, 2]})

k = 0

for i in range(51, 59):
    if len(seeds_2022.iloc[118 - i, 1]) == 4:
        if seeds_2022.iloc[118 - i, 2] not in playin:
            k = 1
    matchups.append({'Round': 1, 'Team1ID': seeds_2022.iloc[i, 2], 'Team2ID': seeds_2022.iloc[118 - i - k, 2]})

for i in range(0, 4):
    matchups = swap_it(matchups, 8 * i + 1, 8 * i + 7)
    matchups = swap_it(matchups, 8 * i + 2, 8 * i + 4)
    
matchups

[{'Round': 1, 'Team1ID': 1124, 'Team2ID': 1313},
 {'Round': 1, 'Team1ID': 1314, 'Team2ID': 1266},
 {'Round': 1, 'Team1ID': 1388, 'Team2ID': 1231},
 {'Round': 1, 'Team1ID': 1417, 'Team2ID': 1103},
 {'Round': 1, 'Team1ID': 1345, 'Team2ID': 1463},
 {'Round': 1, 'Team1ID': 1400, 'Team2ID': 1439},
 {'Round': 1, 'Team1ID': 1293, 'Team2ID': 1362},
 {'Round': 1, 'Team1ID': 1246, 'Team2ID': 1389},
 {'Round': 1, 'Team1ID': 1211, 'Team2ID': 1209},
 {'Round': 1, 'Team1ID': 1129, 'Team2ID': 1272},
 {'Round': 1, 'Team1ID': 1163, 'Team2ID': 1308},
 {'Round': 1, 'Team1ID': 1116, 'Team2ID': 1436},
 {'Round': 1, 'Team1ID': 1403, 'Team2ID': 1286},
 {'Round': 1, 'Team1ID': 1104, 'Team2ID': 1323},
 {'Round': 1, 'Team1ID': 1277, 'Team2ID': 1172},
 {'Round': 1, 'Team1ID': 1181, 'Team2ID': 1168},
 {'Round': 1, 'Team1ID': 1242, 'Team2ID': 1411},
 {'Round': 1, 'Team1ID': 1361, 'Team2ID': 1166},
 {'Round': 1, 'Team1ID': 1234, 'Team2ID': 1350},
 {'Round': 1, 'Team1ID': 1344, 'Team2ID': 1355},
 {'Round': 1, 'Team1

Now let us start fitting our Logisitc Regression model. This time, we will refer to our test set as the 2022 tournament, so we will rename our 'X_test' and 'y_test' to 'X_val' and 'y_val'.

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_val, y_train, y_val = train_test_split(xTrain, yTrain, test_size = .3, random_state = 1239)

lr = LogisticRegression()
lr = lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
accuracy_score(y_val, y_pred)

0.7389162561576355

Let us bring back our function that will give a difference vector.

In [131]:
# Functions to predict probability that Team 1 wins
def predictOutcome(team_data1, team_data2):
    diff = [a - b for a, b in zip(team_data1, team_data2)]
    diff.append(0)
    return lr.predict([diff])

# Will the model predict St. Peter's will win? (St. Peter's ID = 1389, Kentucky's ID = 1246)
predictOutcome(getSeasonData(1389, 2022), getSeasonData(1246, 2022))[0] == 1

False

If `predictOutcome(getSeasonData(team1, year = 2022), getSeasonData(team2, year = 2022))[0] == 1`, then we would predict Team 1 will advance. Otherwise, it will be Team 2. So now let us predict the whole tournament.

In [132]:
# These variables will be used to increment the round appropiately
r = 2
threshold = 32

# store the odd team
odd_team = 0

for i in range(63):
    # First get the IDs
    team1 = matchups[i]['Team1ID']
    team2 = matchups[i]['Team2ID']
    
    # Then get their data
    t1_data = getSeasonData(team1, 2022)
    t2_data = getSeasonData(team2, 2022)
    
    # Get 0 or 1 value by running the above function
    prob = predictOutcome(t1_data, t2_data)
    
    # Predict team
    if prob: 
        matchups[i]['Predicted_Winner'] = team1
    else:
        matchups[i]['Predicted_Winner'] = team2
        
    # Add a new row to the matchups once two games are complete until we have 63 games
    if len(matchups) < 64:
        if i % 2 == 1:
            matchups.append({'Round': r, 'Team1ID': odd_team, 'Team2ID': matchups[i]['Predicted_Winner']})
            odd_team = 0
        else:
            odd_team = team1
            
    if i == threshold:
        threshold += (32 / 2**(r - 1))
        r += 1
        
matchups

[{'Round': 1, 'Team1ID': 1124, 'Team2ID': 1313, 'Predicted_Winner': 1124},
 {'Round': 1, 'Team1ID': 1314, 'Team2ID': 1266, 'Predicted_Winner': 1314},
 {'Round': 1, 'Team1ID': 1388, 'Team2ID': 1231, 'Predicted_Winner': 1231},
 {'Round': 1, 'Team1ID': 1417, 'Team2ID': 1103, 'Predicted_Winner': 1417},
 {'Round': 1, 'Team1ID': 1345, 'Team2ID': 1463, 'Predicted_Winner': 1345},
 {'Round': 1, 'Team1ID': 1400, 'Team2ID': 1439, 'Predicted_Winner': 1400},
 {'Round': 1, 'Team1ID': 1293, 'Team2ID': 1362, 'Predicted_Winner': 1293},
 {'Round': 1, 'Team1ID': 1246, 'Team2ID': 1389, 'Predicted_Winner': 1246},
 {'Round': 1, 'Team1ID': 1211, 'Team2ID': 1209, 'Predicted_Winner': 1211},
 {'Round': 1, 'Team1ID': 1129, 'Team2ID': 1272, 'Predicted_Winner': 1129},
 {'Round': 1, 'Team1ID': 1163, 'Team2ID': 1308, 'Predicted_Winner': 1163},
 {'Round': 1, 'Team1ID': 1116, 'Team2ID': 1436, 'Predicted_Winner': 1116},
 {'Round': 1, 'Team1ID': 1403, 'Team2ID': 1286, 'Predicted_Winner': 1403},
 {'Round': 1, 'Team1ID': 

Next we are going to make a list of lists containing the winners per round in the test dataset.

In [133]:
tourney_2022 = tourney_compact_pd[tourney_compact_pd["Season"] == 2022]
actual_winners_2022 = []
r1 = []
r2 = []
r3 = []
r4 = []
r5 = []
r6 = []

# Make Round 1 list
for i in range(4, 36):
    r1.append(tourney_2022.iloc[i, 2])
    
# Round 2
for i in range(36, 52):
    r2.append(tourney_2022.iloc[i, 2])   
    
# Round 3
for i in range(52, 60):
    r3.append(tourney_2022.iloc[i, 2]) 
    
# Round 4
for i in range(60, 64):
    r4.append(tourney_2022.iloc[i, 2]) 
    
# Round 5
for i in range(64, 66):
    r5.append(tourney_2022.iloc[i, 2]) 
    
# Round 6
r6.append(tourney_2022.iloc[66, 2]) 
    
actual_winners_2022.append(r1)
actual_winners_2022.append(r2)
actual_winners_2022.append(r3)
actual_winners_2022.append(r4)
actual_winners_2022.append(r5)
actual_winners_2022.append(r6)

So now we are going to check if each projected winner was in their respective list.

In [134]:
for i in range(len(matchups)):
    predicted_winner = matchups[i]['Predicted_Winner']
    r = matchups[i]['Round']
    if predicted_winner in actual_winners_2022[r - 1]:
        matchups[i]['Correct'] = 1
    else:
        matchups[i]['Correct'] = 0

Finally, let us look at how many correct predictions we made in predicting March Madness 2022.

In [149]:
prediction_2022_pd = pd.DataFrame(matchups)

correct = prediction_2022_pd[prediction_2022_pd['Correct'] == 1]
incorrect = prediction_2022_pd[prediction_2022_pd['Correct'] == 0]

print(len(correct))
prediction_2022_pd

39


Unnamed: 0,Round,Team1ID,Team2ID,Predicted_Winner,Correct
0,1,1124,1313,1124,1
1,1,1314,1266,1314,1
2,1,1388,1231,1231,0
3,1,1417,1103,1417,1
4,1,1345,1463,1345,1
...,...,...,...,...,...
58,4,1242,1120,1242,1
59,4,1112,1437,1112,0
60,5,1124,1181,1181,0
61,5,1242,1112,1112,0


So 39 correct predictions out of 63. Let us see the round-by-round breakdown.

In [153]:
correct_per_round = []
for i in range(1, 7):
    c1 = prediction_2022_pd['Round'] == i
    cor = prediction_2022_pd['Correct'] == 1
    inc = prediction_2022_pd['Correct'] == 0
    correct_per_round.append((i, len(prediction_2022_pd[c1 & cor])))
    
for i in range(len(correct_per_round)):
    print("In round %d, we had %d correct" % (correct_per_round[i][0], correct_per_round[i][1]))

In round 1, we had 24 correct
In round 2, we had 9 correct
In round 3, we had 4 correct
In round 4, we had 2 correct
In round 5, we had 0 correct
In round 6, we had 0 correct


In short, we predicted 2 Final Four teams in 2022. Very impressive. With a 61.9% accuracy along with our Final Four prediction, this makes Logistic Regression a great model to move forward.