## NCAA March Madness Prediction (Pt. 2)

In [19]:
import numpy as np
import pandas as pd
import random
import time
from colorama import Fore

import warnings
warnings.filterwarnings("ignore")

#### Part 1: Importing Teams + Games Data

In [20]:
cleaned_teams = pd.read_csv("data/cleaned_teams_1718.csv")
game_data = pd.read_csv("data/games.csv", dtype={'w_l': str}) 

#### Part 2: Cleaning Game Data (Extract 2017-2018 Data)

In [21]:
# Extracting games between 2017-2018 Season, Regular Season, before NCAA Tournament
start_date = "2017-11-10"
end_date = "2018-03-11"
games_1718 = game_data[(game_data["date"] >= start_date) & (game_data["date"] <= end_date)]

# March Madness Tournament Games
games_1718_tourney = game_data[(game_data["date"] > end_date)]

games_1718.reset_index(inplace=True)

# Dropping all columns except for teams and outcome
# This data would not be known before the game began, so it is not relevant
games_1718.drop(games_1718.columns.difference(["team_name", "opp_name", "w_l"]), axis=1, inplace=True)
print(games_1718.shape)
games_1718.head(3)

(11274, 3)


Unnamed: 0,team_name,opp_name,w_l
0,Abilene Christian Wildcats,Hillsdale,W
1,Abilene Christian Wildcats,Arkansas State,L
2,Abilene Christian Wildcats,Howard Payne,W


**Side Note**: Code for creating a map for school names (Purpose: bridge differences between datasets)

In [22]:
team_name_map = game_data[["team_name"]].drop_duplicates("team_name")
team_name_map.reset_index(inplace=True, drop=True)
# Removing Mascot from team name map
team_name_map["Name 2"] = team_name_map.apply(lambda x: " ".join(x["team_name"].split()[:-1]), axis=1)
team_name_map.columns = ["Name 1", "Name 2"]
# team_name_map.to_csv("data/team_name_map.csv", index=False) # Commented out to prevent overwriting

team_name_map = pd.read_csv("data/team_name_map.csv")
team_name_map.head(3)

Unnamed: 0,Name 1,Name 2
0,Abilene Christian Wildcats,Abilene Christian
1,Air Force Falcons,Air Force
2,Akron Zips,Akron


#### Part 3: Create Feature Vector based on teams

In [23]:
# NOTE: NO LONGER NECESSARY
# Sampling originally for debugging purposes, trying the below cells on a smaller subset of the table

# Taking Random Sample of Games Data
sample_size = 0.01 * int(games_1718.shape[0])
print("Sample Size:", sample_size, "games")
sample = games_1718.sample(n=sample_size, axis=0, )
sample.reset_index(drop=True, inplace=True)
sample.head()

Sample Size: 11274 games


Unnamed: 0,team_name,opp_name,w_l
0,North Dakota State Bison,Stephen F. Austin,L
1,Florida Gators,Mississippi State,W
2,Wichita State Shockers,Temple,L
3,Mount St. Mary's Mountaineers,Robert Morris,W
4,Virginia Tech Hokies,Pitt,W


In [29]:
# Function for retrieving corresponding team names
def map_team_name(name):
    if name in team_name_map["Name 1"].values:
        return team_name_map[team_name_map["Name 1"] == name]["Name 2"].values[0]
    return name

# Define new feature vector data frames
feature_cols = cleaned_teams.columns.difference(["School"])
games_1718_x = pd.DataFrame(columns=feature_cols)
games_1718_y = pd.DataFrame(columns=["w_l"])

# Create vector quantifying difference between team's stats
second_call = False
def create_vector(row):
    global games_1718_x
    global games_1718_y
    global second_call
    
    # Perform string matching to compensate for 
    home_team = map_team_name(row["team_name"])
    away_team = map_team_name(row["opp_name"])
    
    home = cleaned_teams[cleaned_teams["School"] == home_team].drop("School", axis=1)
    away = cleaned_teams[cleaned_teams["School"] == away_team].drop("School", axis=1)
    diff_vector = home.values - away.values
    new_row = pd.DataFrame(data=diff_vector, columns=feature_cols)
    
#     print(home_team, "vs.", away_team)
    # print(diff_vector.shape)
    # print("Difference: ", new_row)
    
    games_1718_x = games_1718_x.append(new_row)
    games_1718_y = games_1718_y.append(row[["w_l"]])
    
#     if not second_call:
#         games_1718_x = games_1718_x.append(new_row)
#         games_1718_y = games_1718_y.append(row[["w_l"]])
#         second_call = True
#     else:
#         second_call = False

start_time = time.perf_counter()

games_1718.apply(create_vector, axis=1) # Replace w/ "sample" for debugging purposes

end_time = time.perf_counter()
print(f"Time to compute results: {Fore.BLUE} {end_time - start_time :.5f} {Fore.RESET} seconds")

print(games_1718_x.shape)
games_1718_x.head()

Time to compute results: [34m 63.96568 [39m seconds
(9435, 65)


Unnamed: 0,3P,3PA,3PAr,AST,AST%,BLK,BLK%,FG,FGA,FT,...,TOV%,TRB,TRB%,TS%,Tm.,W,W.1,W.2,W.3,eFG%
0,5.0,-5.0,0.51,-3.8,2.0,-2.0,2.0,-1.0,2.0,-4.0,...,0.027,-0.045,-1.1,-3.6,0.9,-1.8,-0.059,2.7,-2.3,0.052
0,-7.0,6.0,-7.04,-3.02,-2.0,6.0,-4.0,4.0,-4.0,2.0,...,0.009,-0.004,2.5,-1.3,-0.1,-4.2,-0.012,1.7,4.5,0.039
0,-2.0,0.0,-3.46,-0.14,-2.0,2.0,-5.0,2.0,2.0,-2.0,...,0.015,0.0,-1.2,-1.3,0.0,-1.2,-0.007,3.5,1.9,0.063
0,0.0,0.0,-2.76,-3.94,1.0,-1.0,0.0,0.0,0.0,0.0,...,0.01,-0.016,1.5,-3.1,0.4,-1.5,-0.026,4.2,2.3,0.071
0,4.0,-3.0,-4.83,-8.54,2.0,-2.0,0.0,-1.0,3.0,-1.0,...,-0.057,-0.036,-1.0,-9.1,0.6,-3.1,-0.05,1.2,0.6,0.064


In [27]:
games_1718_y.head()

Unnamed: 0,w_l
0,W
1,L
2,W
3,L
4,W


In [38]:
# Appending win loss values to training table.
combined = games_1718_x.copy()
combined.reset_index(drop=True, inplace=True)
combined['w_l'] = games_1718_y['w_l']
print(combined.shape)
combined.head()

(9435, 66)


Unnamed: 0,3P,3PA,3PAr,AST,AST%,BLK,BLK%,FG,FGA,FT,...,TRB,TRB%,TS%,Tm.,W,W.1,W.2,W.3,eFG%,w_l
0,5.0,-5.0,0.51,-3.8,2.0,-2.0,2.0,-1.0,2.0,-4.0,...,-0.045,-1.1,-3.6,0.9,-1.8,-0.059,2.7,-2.3,0.052,W
1,-7.0,6.0,-7.04,-3.02,-2.0,6.0,-4.0,4.0,-4.0,2.0,...,-0.004,2.5,-1.3,-0.1,-4.2,-0.012,1.7,4.5,0.039,L
2,-2.0,0.0,-3.46,-0.14,-2.0,2.0,-5.0,2.0,2.0,-2.0,...,0.0,-1.2,-1.3,0.0,-1.2,-0.007,3.5,1.9,0.063,W
3,0.0,0.0,-2.76,-3.94,1.0,-1.0,0.0,0.0,0.0,0.0,...,-0.016,1.5,-3.1,0.4,-1.5,-0.026,4.2,2.3,0.071,L
4,4.0,-3.0,-4.83,-8.54,2.0,-2.0,0.0,-1.0,3.0,-1.0,...,-0.036,-1.0,-9.1,0.6,-3.1,-0.05,1.2,0.6,0.064,W


#### Part 4: Exporting Data to be used for Training / Testing in Section 3

In [39]:
combined.to_csv("data/train_data.csv", index=False)
print(combined.columns)

Index(['3P', '3PA', '3PAr', 'AST', 'AST%', 'BLK', 'BLK%', 'FG', 'FGA', 'FT',
       'FT/FGA', 'FTA', 'FTr', 'L', 'L.1', 'L.2', 'L.3', 'MP', 'ORB', 'ORB%',
       'ORtg', 'Opp.', 'Opp_3P', 'Opp_3PA', 'Opp_3PAr', 'Opp_AST', 'Opp_AST%',
       'Opp_BLK', 'Opp_BLK%', 'Opp_FG', 'Opp_FGA', 'Opp_FT', 'Opp_FT/FGA',
       'Opp_FTA', 'Opp_FTr', 'Opp_ORB', 'Opp_ORB%', 'Opp_ORtg', 'Opp_PF',
       'Opp_Pace', 'Opp_STL', 'Opp_STL%', 'Opp_TOV', 'Opp_TOV%', 'Opp_TRB',
       'Opp_TRB%', 'Opp_TS%', 'Opp_eFG%', 'PF', 'Pace', 'SOS', 'SRS', 'STL',
       'STL%', 'TOV', 'TOV%', 'TRB', 'TRB%', 'TS%', 'Tm.', 'W', 'W.1', 'W.2',
       'W.3', 'eFG%', 'w_l'],
      dtype='object')
