# The Deep Learning Approach

Although I did not use Deep Learning for this project because we do not have a lot of data (I'm talking at least a million), but here is an approach on what it will look like.

First we will load the data and stuff.

In [1]:
import pandas as pd
import numpy as np
import collections

In [2]:
# Regular Season Results since 1985 (Only includes who won and the points)
reg_season_compact_pd = pd.read_csv('Data/MRegularSeasonCompactResults.csv') \

# Regular Season Results since 2003 but includes useful stats like Rebounds, Assists, etc.
reg_season_detailed_pd = pd.read_csv('Data/MRegularSeasonDetailedResults.csv')

# List of teams who are/was in Division I along with their ID
teams_pd = pd.read_csv('Data/MTeams.csv')

# Like the first two DataFrames but for the tournament
tourney_compact_pd = pd.read_csv('Data/MNCAATourneyCompactResults.csv')
tourney_detailed_pd = pd.read_csv('Data/MNCAATourneyDetailedResults.csv')

# The Conference Tourney Detailed Results since 2003
conference_tourney_results_pd = pd.read_csv('Data/MConferenceTourneyGames.csv')

# List of Teams along with their Conferences and ID per Year
conferences_pd = pd.read_csv('Data/MTeamConferences.csv')

# Seeds 
seeds_pd = pd.read_csv('Data/MNCAATourneySeeds.csv')

# List of Conference Tourney Winners since 2001
l = []
for i in range(len(conference_tourney_results_pd) - 1):
    if conference_tourney_results_pd.iloc[i, 1] != conference_tourney_results_pd.iloc[i + 1, 1]:
        season = conference_tourney_results_pd.iloc[i, 0]
        conference = conference_tourney_results_pd.iloc[i, 1]
        winner = conference_tourney_results_pd.iloc[i, 3]
        l.append({'Season': season, 'Conference': conference, 'Winner': winner})
        
conference_tourney_winners_pd = pd.DataFrame(l)

In [3]:
# Get ID given Name
def getTeamID(name):
    return teams_pd[teams_pd['TeamName'] == name].values[0][0]

# Get Name given ID
def getTeamName(team_id):
    return teams_pd[teams_pd['TeamID'] == team_id].values[0][1]

print("ID for Baylor is", getTeamID("Baylor"))
print("The team with ID 1124 is", getTeamName(1124))

ID for Baylor is 1124
The team with ID 1124 is Baylor


In [4]:
# How many wins did a Team win in a given Season
def getRegSeasonWins(team_id, year):
    c1 = reg_season_compact_pd['WTeamID'] == team_id
    c2 = reg_season_compact_pd['Season'] == year
    return len(reg_season_compact_pd[c1 & c2])

# What was a team's Points per Game in a given Season
def getPPG(team_id, year):
    ppg = 0
    c1 = reg_season_compact_pd['WTeamID'] == team_id
    c2 = reg_season_compact_pd['Season'] == year
    c3 = reg_season_compact_pd['LTeamID'] == team_id
    gamesWon = reg_season_compact_pd[c1 & c2]
    ppg = gamesWon['WScore'].sum()
    gamesLost = reg_season_compact_pd[c2 & c3]
    ppg += gamesLost['LScore'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    ppg /= total_games
    return round(ppg, 2)

# In a given season, how many points did a given team give up per game
def getOPPG(team_id, year):
    oppg = 0
    c1 = reg_season_compact_pd['WTeamID'] == team_id
    c2 = reg_season_compact_pd['Season'] == year
    c3 = reg_season_compact_pd['LTeamID'] == team_id
    gamesWon = reg_season_compact_pd[c1 & c2]
    oppg = gamesWon['LScore'].sum()
    gamesLost = reg_season_compact_pd[c2 & c3]
    oppg += gamesLost['WScore'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    oppg /= total_games
    return round(oppg, 2)

baylor_id = getTeamID("Baylor")
print("In 2022, Baylor won", getRegSeasonWins(baylor_id, 2022), "games.")
print("They averaged", getPPG(baylor_id, 2022), "points per game.")
print("While holding opponents to", getOPPG(baylor_id, 2022), "points per game.")

In 2022, Baylor won 26 games.
They averaged 76.5 points per game.
While holding opponents to 63.62 points per game.


In [5]:
# Hardcoded the divisions as of 2022
ACC = [1181, 1314, 1323, 1274, 1448, 1438, 1439, 1199, 1393, 1155, 1257, 1130, 1338, 1210, 1301]
Big12 = [1242, 1124, 1403, 1400, 1395, 1329, 1235, 1328, 1243, 1452]
BigEast = [1437, 1344, 1163, 1166, 1371, 1266, 1462, 1385, 1177, 1139, 1207]
Big10 = [1458, 1228, 1345, 1234, 1326, 1353, 1276, 1277, 1321, 1231, 1268, 1336, 1278, 1304]
Pac12 = [1112, 1113, 1417, 1425, 1160, 1450, 1449, 1332, 1333, 1143, 1390, 1428]
SEC = [1120, 1397, 1246, 1116, 1401, 1261, 1196, 1376, 1104, 1280, 1435, 1281, 1279, 1208]

# Quick function to check if a team's ID is in these lists
def getPower6(team_id):
    if team_id in ACC or team_id in Big12 or team_id in BigEast or team_id in Big10 or team_id in Pac12 or team_id in SEC:
        return 1
    else:
        return 0

if getPower6(getTeamID("BYU")):
    print("BYU is in a Power 6 conference.")
else:
    print("BYU is NOT in a Power 6 conference.")
    
if getPower6(getTeamID("Duke")):
    print("Duke is in a Power 6 conference.")
else:
    print("Duke is NOT in a Power 6 conference.")

BYU is NOT in a Power 6 conference.
Duke is in a Power 6 conference.


In [6]:
# How many three's did a team make per game in a given season
def get3PT(team_id, year):
    if year < 2003:
        return 0
    threes = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    threes = gamesWon['WFGM3'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    threes += gamesLost['LFGM3'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    threes /= total_games
    return round(threes, 2)

# How many turnovers did a team make per game in a given season
def getTO(team_id, year):
    if year < 2003:
        return 0
    to = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    to = gamesWon['WTO'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    to += gamesLost['LTO'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    to /= total_games
    return round(to, 2)

ETSU_id = getTeamID("ETSU")
print("In 2009, ETSU averaged", get3PT(ETSU_id, 2009), "threes per game.")
print("While committing", getTO(ETSU_id, 2009), "turnovers per game.")

In 2009, ETSU averaged 5.94 threes per game.
While committing 14.0 turnovers per game.


In [7]:
# How many Assists did a team make per game
def getAST(team_id, year):
    if year < 2003:
        return 0
    ast = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    ast = gamesWon['WAst'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    ast += gamesLost['LAst'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    ast /= total_games
    return round(ast, 2)

# Determine if a team was the conference champion in their division in a given year
def getTourneyConferenceChampion(team_id, year):
    if year < 2001:
        return 0
    conf = getConference(team_id, year)
    c1 = conference_tourney_winners_pd['Season'] == year
    c2 = conference_tourney_winners_pd['Conference'] == conf
    if len(conference_tourney_winners_pd[c1 & c2]) == 0:
        return 0
    if team_id == conference_tourney_winners_pd[c1 & c2]['Winner'].values[0]:
        return 1
    else:
        return 0

In [8]:
# Easy way to get a team's conference in a given year
def getConference(team_id, year):
    c1 = conferences_pd['TeamID'] == team_id
    c2 = conferences_pd['Season'] == year
    c3 = conferences_pd[c1 & c2]
    if len(c3) == 0:
        return conferences_pd[c1].values[0][2]
    return c3['ConfAbbrev'].values[0]

# Get the seed of the team in a given year
def getSeed(team_id, year):
    c1 = seeds_pd['TeamID'] == team_id
    c2 = seeds_pd['Season'] == year
    if len(seeds_pd[c1 & c2]) == 0:
        return 0
    return int(seeds_pd[c1 & c2]['Seed'].values[0][1:3])

# Get rebounds per game in a given year
def getRPG(team_id, year):
    if year < 2003:
        return 0
    reb = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    reb = gamesWon['WOR'].sum()
    reb += gamesWon['WDR'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    reb += gamesLost['LOR'].sum()
    reb += gamesLost['LDR'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    reb /= total_games
    return round(reb, 2)

In [9]:
# Steals per game
def getSTL(team_id, year):
    if year < 2003:
        return 0
    stl = 0
    c1 = reg_season_detailed_pd['WTeamID'] == team_id
    c2 = reg_season_detailed_pd['Season'] == year
    c3 = reg_season_detailed_pd['LTeamID'] == team_id
    gamesWon = reg_season_detailed_pd[c1 & c2]
    stl = gamesWon['WStl'].sum()
    gamesLost = reg_season_detailed_pd[c2 & c3]
    stl += gamesLost['LStl'].sum()
    total_games = len(gamesWon) + len(gamesLost)
    stl /= total_games
    return round(stl, 2)

# How many times did a team appear in the tournament as of 2022
def getNumOfAppearances(team_id):
    return len(seeds_pd[seeds_pd['TeamID'] == team_id])

# Helper function for below
def getHomeStat(row):
    if (row == 'H'):
        home = 1
    if (row == 'A'):
        home = -1
    if (row == 'N'):
        home = 0
    return home

In [10]:
# Build the vector
def getSeasonData(team_id, year):
    # Check first if the team was Division 1 at the time
    c1 = teams_pd[teams_pd['TeamID'] == team_id]['FirstD1Season'].values[0] <= year
    c2 = teams_pd[teams_pd['TeamID'] == team_id]['LastD1Season'].values[0] >= year
    if ~c1 or ~c2:
        return []
    return [getRegSeasonWins(team_id, year),
            getPPG(team_id, year),
            getOPPG(team_id, year),
            getPower6(team_id),
            get3PT(team_id, year),
            getTO(team_id, year),
            getAST(team_id, year),
            getTourneyConferenceChampion(team_id, year),
            getSeed(team_id, year),
            getRPG(team_id, year),
            getSTL(team_id, year),
            getNumOfAppearances(team_id)]

# Build vectors for every team in a given season
def createSeasonDict(year):
    seasonDictionary = collections.defaultdict(list)
    for team in teams_pd['TeamName'].tolist():
        team_id = teams_pd[teams_pd['TeamName'] == team].values[0][0]
        team_vector = getSeasonData(team_id, year)
        seasonDictionary[team_id] = team_vector
    return seasonDictionary

# Below is the season data for Toledo in 2013
getSeasonData(1405, 2013)

[15, 68.86, 68.25, 0, 5.93, 12.0, 13.04, 0, 0, 32.82, 7.04, 0]

In [11]:
# Basically run the function directly above this one on a set of years
def createTrainingSet(years):
    totalNumGames = 0
    for year in years:
        season = reg_season_compact_pd[reg_season_compact_pd['Season'] == year]
        totalNumGames += len(season.index)
        tourney = tourney_compact_pd[tourney_compact_pd['Season'] == year]
        totalNumGames += len(tourney.index)
    numFeatures = len(getSeasonData(1181,2012)) #Just choosing a random team and seeing the dimensionality of the vector
    xTrain = np.zeros(( totalNumGames, numFeatures + 1))
    yTrain = np.zeros(( totalNumGames ))
    indexCounter = 0
    for year in years:
        team_vectors = createSeasonDict(year)
        season = reg_season_compact_pd[reg_season_compact_pd['Season'] == year]
        numGamesInSeason = len(season.index)
        tourney = tourney_compact_pd[tourney_compact_pd['Season'] == year]
        numGamesInSeason += len(tourney.index)
        xTrainSeason = np.zeros(( numGamesInSeason, numFeatures + 1))
        yTrainSeason = np.zeros(( numGamesInSeason ))
        counter = 0
        for index, row in season.iterrows():
            w_team = row['WTeamID']
            w_vector = team_vectors[w_team]
            l_team = row['LTeamID']
            l_vector = team_vectors[l_team]
            diff = [a - b for a, b in zip(w_vector, l_vector)]
            home = getHomeStat(row['WLoc'])
            if (counter % 2 == 0):
                diff.append(home) 
                xTrainSeason[counter] = diff
                yTrainSeason[counter] = 1
            else:
                diff.append(-home)
                xTrainSeason[counter] = [ -p for p in diff]
                yTrainSeason[counter] = 0
            counter += 1
        for index, row in tourney.iterrows():
            w_team = row['WTeamID']
            w_vector = team_vectors[w_team]
            l_team = row['LTeamID']
            l_vector = team_vectors[l_team]
            diff = [a - b for a, b in zip(w_vector, l_vector)]
            home = 0 #All tournament games are neutral
            if (counter % 2 == 0):
                diff.append(home) 
                xTrainSeason[counter] = diff
                yTrainSeason[counter] = 1
            else:
                diff.append(-home)
                xTrainSeason[counter] = [ -p for p in diff]
                yTrainSeason[counter] = 0
            counter += 1
        xTrain[indexCounter:numGamesInSeason+indexCounter] = xTrainSeason
        yTrain[indexCounter:numGamesInSeason+indexCounter] = yTrainSeason
        indexCounter += numGamesInSeason
    return xTrain, yTrain

In [15]:
x_data, y_data = createTrainingSet([i for i in range(1985, 2022)])
np.save('x_dataset', x_data)
np.save('y_dataset', y_data)

  ppg /= total_games
  oppg /= total_games
  threes /= total_games
  to /= total_games
  reb /= total_games


In [16]:
xTrain = np.load('x_dataset.npy')
yTrain = np.load('y_dataset.npy')
xTrain.shape

(173052, 13)

We will use a simple DNN model to determine if using every regular season game to predict the 2022 March Madness is viable.

In [17]:
import tensorflow as tf

2023-07-19 22:18:31.562534: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [101]:
# Set up the layers
dnn = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = (13, )),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')])

In [102]:
# Compile
dnn.compile(optimizer = 'adam',
            loss = tf.keras.losses.BinaryCrossentropy(from_logits = True),
            metrics = ['accuracy'])

In [119]:
# Train it
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(xTrain, yTrain, test_size = .3, random_state = 254)

dnn.fit(X_train, y_train, epochs = 38)

Epoch 1/38
Epoch 2/38
Epoch 3/38
Epoch 4/38
Epoch 5/38
Epoch 6/38
Epoch 7/38
Epoch 8/38
Epoch 9/38
Epoch 10/38
Epoch 11/38
Epoch 12/38
Epoch 13/38
Epoch 14/38
Epoch 15/38
Epoch 16/38
Epoch 17/38
Epoch 18/38
Epoch 19/38
Epoch 20/38
Epoch 21/38
Epoch 22/38
Epoch 23/38
Epoch 24/38
Epoch 25/38
Epoch 26/38
Epoch 27/38
Epoch 28/38
Epoch 29/38
Epoch 30/38
Epoch 31/38
Epoch 32/38
Epoch 33/38
Epoch 34/38
Epoch 35/38
Epoch 36/38
Epoch 37/38
Epoch 38/38


<keras.src.callbacks.History at 0x7fd7dd4958d0>

In [120]:
# Accuracy on Validation set
val_loss, val_acc = dnn.evaluate(X_val, y_val, verbose = 4)
val_acc

0.7336400747299194

In [136]:
# Accuracy on Test set
c1 = tourney_detailed_pd['DayNum'] == 136
c2 = tourney_detailed_pd['DayNum'] == 137
c3 = tourney_detailed_pd['Season'] == 2022
first_round_2022_pd = tourney_detailed_pd[(c1 | c2) & c3]

d = {'WTeamID': first_round_2022_pd['WTeamID'].tolist(),
     'LTeamID': first_round_2022_pd['LTeamID'].tolist(),
     'WTeamWin': 0
    }

first_round_2022_prediction_pd = pd.DataFrame(d)
first_round_2022_prediction_pd.head()

def predictOutcome(w_data, l_data):
    diff = [a - b for a, b in zip(w_data, l_data)]
    diff.append(0)
    prediction = dnn.predict([diff])
    if prediction < .5:
        return 0
    else:
        return 1
    
X_test = getSeasonData
results = []

for i in range(32):
    w_team = first_round_2022_prediction_pd.iloc[i, 0]
    l_team = first_round_2022_prediction_pd.iloc[i, 1]
    w_data = getSeasonData(w_team, 2022)
    l_data = getSeasonData(l_team, 2022)
    results.append(predictOutcome(w_data, l_data)) 



In [139]:
results.count(1) / 32

0.625

With only a 62.5% accuracy score in Round 1, DNN's are not it.