## NCAA March Madness Prediction (Pt. 2)

In [302]:
import numpy as np
import pandas as pd
import random
import time
from colorama import Fore

import warnings
warnings.filterwarnings("ignore")

#### Part 1: Importing Teams + Games Data

In [303]:
cleaned_teams = pd.read_csv("data/cleaned_teams_1718.csv")
game_data = pd.read_csv("data/games.csv", dtype={'w_l': str})

#### Part 2: Cleaning Game Data (Extract 2017-2018 Data)

In [304]:
# Extracting games between 2017-2018 Season, Regular Season, before NCAA Tournament
start_date = "2017-11-10"
end_date = "2018-03-11"
games_1718 = game_data[(game_data["date"] >= start_date) & (game_data["date"] <= end_date)]

# March Madness Tournament Games
games_1718_tourney = game_data[(game_data["date"] > end_date)]

games_1718.reset_index(inplace=True)

# Dropping all columns except for teams and outcome
# This data would not be known before the game began, so it is not relevant
games_1718.drop(games_1718.columns.difference(["team_name", "opp_name", "w_l"]), axis=1, inplace=True)
print(games_1718.shape)
games_1718.head(3)

(11274, 3)


Unnamed: 0,team_name,opp_name,w_l
0,Abilene Christian Wildcats,Hillsdale,W
1,Abilene Christian Wildcats,Arkansas State,L
2,Abilene Christian Wildcats,Howard Payne,W


**Side Note**: Code for creating a map for school names (Purpose: bridge differences between datasets)

In [305]:
team_name_map = game_data[["team_name"]].drop_duplicates("team_name")
team_name_map.reset_index(inplace=True, drop=True)
# Removing Mascot from team name map
team_name_map["Name 2"] = team_name_map.apply(lambda x: " ".join(x["team_name"].split()[:-1]), axis=1)
team_name_map.columns = ["Name 1", "Name 2"]
# team_name_map.to_csv("data/team_name_map.csv", index=False) # Commented out to prevent overwriting

team_name_map = pd.read_csv("data/team_name_map.csv")
team_name_map.head(3)

Unnamed: 0,Name 1,Name 2
0,Abilene Christian Wildcats,Abilene Christian
1,Air Force Falcons,Air Force
2,Akron Zips,Akron


#### Part 3: Create Feature Vector based on teams

In [308]:
# Taking Random Sample of Games Data
sample_size = 15 # int(0.001*games_1718.shape[0])
print("Sample Size:", sample_size, "games")
sample = games_1718.sample(n=sample_size, axis=0, )
sample.reset_index(drop=True, inplace=True)
sample.head()

Sample Size: 15 games


Unnamed: 0,team_name,opp_name,w_l
0,Fairfield Stags,Wagner,L
1,Illinois State Redbirds,Loyola (IL),L
2,St. Francis (NY) Terriers,Denver,L
3,Kansas Jayhawks,Kansas State,W
4,Texas-El Paso Miners,South Carolina,L


In [309]:
# Function for retrieving corresponding team names
def map_team_name(name):
    if name in team_name_map["Name 1"].values:
        return team_name_map[team_name_map["Name 1"] == name]["Name 2"].values[0]
    return name

# Define new feature vector data frames
feature_cols = cleaned_teams.columns.difference(["School"])
games_1718_x = pd.DataFrame(columns=feature_cols)
games_1718_y = pd.DataFrame(columns=["w_l"])

# Create vector quantifying difference between team's stats
second_call = False
def create_vector(row):
    global games_1718_x
    global games_1718_y
    global second_call
    
    # Perform string matching to compensate for 
    home_team = map_team_name(row["team_name"])
    away_team = map_team_name(row["opp_name"])
    
    home = cleaned_teams[cleaned_teams["School"] == home_team].drop("School", axis=1)
    away = cleaned_teams[cleaned_teams["School"] == away_team].drop("School", axis=1)
    diff_vector = home.values - away.values
    new_row = pd.DataFrame(data=diff_vector, columns=feature_cols)
    
    print(home_team, "vs.", away_team)
    # print(diff_vector.shape)
    # print("Difference: ", new_row)
    
    games_1718_x = games_1718_x.append(new_row)
    games_1718_y = games_1718_y.append(row[["w_l"]])
    
#     if not second_call:
#         games_1718_x = games_1718_x.append(new_row)
#         games_1718_y = games_1718_y.append(row[["w_l"]])
#         second_call = True
#     else:
#         second_call = False

start_time = time.perf_counter()

sample.apply(create_vector, axis=1)

end_time = time.perf_counter()
print(f"Time to compute results: {Fore.BLUE} {end_time - start_time :.5f} {Fore.RESET} seconds")

print(games_1718_x.shape)
games_1718_x.head()

Fairfield vs. Wagner
Illinois State vs. Loyola (IL)
St. Francis (NY) vs. Denver
Kansas vs. Kansas State
Texas-El Paso vs. South Carolina
Mercer vs. ETSU
Harvard vs. UMass
Oklahoma State vs. Wichita State
Rutgers vs. Michigan State
Grand Canyon vs. Louisville
Northwestern vs. Ohio State
Green Bay vs. Eastern Illinois
North Carolina-Asheville vs. Austin Peay
Virginia Military Institute vs. Southern Wesleyan
James Madison vs. Drexel
Time to compute results: [34m 0.07561 [39m seconds
(12, 65)


Unnamed: 0,3P,3PA,3PAr,AST,AST%,BLK,BLK%,FG,FGA,FT,...,TOV%,TRB,TRB%,TS%,Tm.,W,W.1,W.2,W.3,eFG%
0,-6.0,6.0,-1.79,3.22,-5.0,5.0,-6.0,2.0,-4.0,1.0,...,-0.011,0.035,5.4,-4.4,0.1,-2.3,0.048,-1.4,-1.2,-0.057
0,-14.0,9.0,-9.3,1.86,-5.0,5.0,-3.0,2.0,-6.0,6.0,...,0.026,0.025,3.7,6.7,-0.9,0.7,0.014,-1.3,5.7,0.095
0,-2.0,3.0,-7.03,-4.78,2.0,2.0,0.0,-1.0,0.0,4.0,...,-0.114,0.021,7.4,-1.1,-1.3,1.3,0.018,3.8,6.6,0.06
0,6.0,-4.0,8.55,3.13,3.0,-3.0,0.0,0.0,2.0,-2.0,...,-0.008,-0.014,-2.9,-6.0,-0.6,2.8,0.0,-3.0,-0.8,-0.094
0,-6.0,4.0,-14.75,-9.43,-1.0,1.0,-1.0,3.0,-2.0,0.0,...,0.04,-0.003,5.8,1.7,-2.6,-9.4,0.015,-3.1,-0.5,-0.076
