## NCAA March Madness Prediction (Pt. 1)

In [11]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

#### Part 1: Data Retrieval
Purpose: Given raw data for all Division I basketball schools from the 2017-2018 season, retrieve all NCAA March Madness participants.

In [14]:
# Read the four raw school-feature CSVs of the season1718 folder.
# The unpacking order matches the path order below.
basic, opponents, advanced, opp_adv = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/season1718/basic_school.csv",
        "data/season1718/basic_opps.csv",
        "data/season1718/adv_school.csv",
        "data/season1718/adv_opps.csv",
    )
)

# Quick look at the size and the first rows of the basic table
print(basic.shape)
basic.head(3)

(351, 34)


Unnamed: 0,Rk,School,G,W,L,W-L%,SRS,SOS,W.1,L.1,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,Abilene Christian,32,16,16,0.5,-9.14,-6.82,8,10,...,406,579,0.701,305,1093,482,260,128,461,671
1,2,Air Force,31,12,19,0.387,-4.31,1.72,6,12,...,408,556,0.734,302,1014,445,201,78,391,557
2,3,Akron,32,14,18,0.438,-6.82,-1.92,6,12,...,405,582,0.696,300,1071,418,189,81,440,637


In [13]:
# NOTE: Do not run this block to generate data for all teams

# Keep only the rows whose School value contains the "NCAA" marker
# (i.e. the March Madness Tourney participants)
basic = basic.loc[basic["School"].str.contains("NCAA")]
opponents = opponents.loc[opponents["School"].str.contains("NCAA")]
advanced = advanced.loc[advanced["School"].str.contains("NCAA")]
opp_adv = opp_adv.loc[opp_adv["School"].str.contains("NCAA")]

# Sanity check 1: each table holds exactly the 68 competitors
assert len(basic) == 68
assert len(opponents) == 68
assert len(advanced) == 68
assert len(opp_adv) == 68

# Sanity check 2: all four tables list the same schools
opponent_schools = sorted(opponents["School"].values)
assert sorted(basic["School"].values) == opponent_schools
assert opponent_schools == sorted(advanced["School"].values)
assert opponent_schools == sorted(opp_adv["School"].values)

In [15]:
# Remove "NCAA" String from School Names
# *WARNING* - Do not re-run the previous block after this one: the "NCAA" marker will be gone, so its filter would eliminate every row
def remove_NCAA_str(row):
    """Return the row's "School" value without the "NCAA" marker.

    Every occurrence of the substring "NCAA" is removed, then leading and
    trailing whitespace is trimmed.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a string under the key "School".

    Returns
    -------
    str
        The cleaned school name.
    """
    school_name = row["School"]
    without_marker = school_name.replace("NCAA", "")
    return without_marker.strip()

# Strip the "NCAA" marker from the School column of every table.
# Vectorised string operations replace the row-wise
# `apply(remove_NCAA_str, axis=1)`: the result is the same for string names,
# but a missing School value stays missing instead of raising AttributeError.
for table in (basic, opponents, advanced, opp_adv):
    table["School"] = (
        table["School"]
        .str.replace("NCAA", "", regex=False)  # literal match, not a regex
        .str.strip()
    )

#### Data Cleaning
Purpose: Remove unnecessary columns from each table before merging.

In [16]:
# Basic Table:
# Dropped: [Rk, G, W-L%, Unnamed: 16, FG%, FT%, 3P%]
# Reminders: SRS - Simple Rating System, SOS - Strength of Schedule
# W,L.1 - Conference, W,L.2 - Home, W,L.3 - Away
# Tm./Opp. - Points for/against
redundant = ['Rk', 'G', 'W-L%', 'Unnamed: 16', 'FG%', 'FT%', '3P%']

# Drop only the columns that are still present so the cell can be re-run.
# Current pandas raises KeyError (not ValueError) for missing labels, so an
# `except ValueError` guard around the drop would not catch the re-run case.
still_present = [col for col in redundant if col in basic.columns]
if still_present:
    basic = basic.drop(columns=still_present)
else:
    print("Columns Already Removed from Basic Table")

basic = basic.reset_index(drop=True)
basic.head()

Unnamed: 0,School,W,L,SRS,SOS,W.1,L.1,W.2,L.2,W.3,...,3PA,FT,FTA,ORB,TRB,AST,STL,BLK,TOV,PF
0,Abilene Christian,16,16,-9.14,-6.82,8,10,9,6,6,...,656,406,579,305,1093,482,260,128,461,671
1,Air Force,12,19,-4.31,1.72,6,12,9,7,3,...,755,408,556,302,1014,445,201,78,391,557
2,Akron,14,18,-6.82,-1.92,6,12,12,4,2,...,851,405,582,300,1071,418,189,81,440,637
3,Alabama A&M,3,28,-23.97,-8.04,3,15,2,9,1,...,601,345,533,328,1046,340,123,49,514,506
4,Alabama-Birmingham,20,13,4.9,-0.65,10,8,14,3,5,...,646,423,564,332,1255,560,178,127,437,490


In [17]:
# Opponents Table
keep = ["School", "FG", "FGA", "3P", "3PA", "FT", "FTA", "ORB", "TRB", "AST", "STL", "BLK", "TOV", "PF"]

# Select the wanted columns by name instead of dropping the complement.
# This fixes the column order to match `keep` and fails loudly with a KeyError
# if an expected column is missing -- e.g. when this cell is re-run after the
# columns were renamed with the "Opp_" prefix -- rather than silently dropping
# every stat column.
opponents = opponents.loc[:, keep].reset_index(drop=True)
opponents.head()

Unnamed: 0,School,FG,FGA,3P,3PA,FT,FTA,ORB,TRB,AST,STL,BLK,TOV,PF
0,Abilene Christian,770,1759,214,657,525,743,313,1107,370,205,91,512,556
1,Air Force,781,1686,288,726,394,553,274,1069,447,174,106,436,582
2,Akron,828,1797,257,720,498,689,298,1106,423,202,114,410,554
3,Alabama A&M,881,1844,228,595,377,549,340,1126,460,237,151,353,528
4,Alabama-Birmingham,827,1952,292,853,357,513,301,1035,440,232,100,414,555


In [18]:
# Advanced Statistics Table
# Current Basic columns plus the `redundant` ones; assuming the Basic cleaning
# cell has run, this is the Basic table's original column set.
original_index = basic.columns.append(pd.Index(redundant))

# Columns the Advanced table shares with that set are duplicates of Basic data.
# "School" is the merge key, so it is kept.
common_cols = original_index.intersection(advanced.columns)
to_remove = common_cols.drop(["School"])

# `to_remove` is derived from advanced.columns, so every label exists and the
# drop cannot fail on a missing column; on a re-run it is simply empty.
advanced = advanced.drop(columns=to_remove).reset_index(drop=True)
advanced.head()

Unnamed: 0,School,Pace,ORtg,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA
0,Abilene Christian,71.6,102.2,0.309,0.35,0.549,49.7,55.4,11.3,11.6,0.521,17.7,27.8,0.217
1,Air Force,67.7,100.8,0.318,0.431,0.527,48.7,60.7,9.5,8.1,0.49,16.3,27.5,0.233
2,Akron,69.1,102.6,0.319,0.467,0.547,49.2,52.7,8.4,7.5,0.518,17.3,27.1,0.222
3,Alabama A&M,68.3,88.1,0.314,0.354,0.48,48.2,50.5,5.8,3.9,0.45,20.9,29.4,0.203
4,Alabama-Birmingham,69.5,109.8,0.291,0.334,0.575,54.8,59.3,7.7,11.6,0.545,16.5,31.1,0.218


In [19]:
# Advanced Statistics (Opponents) Table
# Columns shared with `original_index` are removed, except the "School" key.
common_cols = original_index.intersection(opp_adv.columns)
to_remove = common_cols.drop(["School"])

# `to_remove` is derived from opp_adv.columns, so every label exists and the
# drop cannot fail on a missing column; on a re-run it is simply empty.
opp_adv = opp_adv.drop(columns=to_remove).reset_index(drop=True)
opp_adv.head()

Unnamed: 0,School,Pace,ORtg,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA
0,Abilene Christian,71.6,98.7,0.422,0.374,0.54,50.3,48.1,8.9,7.5,0.499,19.5,28.4,0.298
1,Air Force,67.7,106.5,0.328,0.431,0.576,51.3,57.2,8.3,10.6,0.549,18.3,27.8,0.234
2,Akron,69.1,107.7,0.383,0.401,0.567,50.8,51.1,9.0,11.7,0.532,16.2,27.9,0.277
3,Alabama A&M,68.3,111.3,0.298,0.323,0.562,51.8,52.2,11.1,13.8,0.54,14.4,32.1,0.204
4,Alabama-Birmingham,69.5,99.7,0.263,0.437,0.524,45.2,53.2,10.0,7.7,0.498,15.9,24.6,0.183


#### Merge Tables
Purpose: Construct an extensive feature vector of each school

In [20]:
# Combine features for each school from 4 separate DF's into one dataframe

# Prefix the opponents' basic stats with "Opp_". Renaming by column name
# (instead of assigning a positional list) cannot mislabel a column if the
# order differs from `keep`, and is a no-op when the cell is re-run.
opp_renames = {col: "Opp_" + col for col in keep[1:]}
opponents = opponents.rename(columns=opp_renames)
assert set(opponents.columns) == set([keep[0]] + list(opp_renames.values())), \
    "Unexpected columns in Opponents table"
print("Opponent Columns Renamed: ", opponents.columns.tolist())
combined = basic.merge(opponents, how="inner", on="School")
# assert combined.shape[0] == 68 # Comment out for non-tourney participants
assert combined.shape[1] == (basic.shape[1]+opponents.shape[1])-1 # -1 for "School"

# Same prefixing for the opponents' advanced stats. "School" and columns that
# already carry the prefix are left alone, so re-running is safe.
opp_adv = opp_adv.rename(
    columns=lambda col: col if col == "School" or col.startswith("Opp_") else "Opp_" + col
)
print("Advanced (Opponents) Columns Renamed: ", opp_adv.columns.tolist())
combined2 = advanced.merge(opp_adv, how="inner", on="School")
# assert combined2.shape[0] == 68 # Comment out for non-tourney participants
assert combined2.shape[1] == (advanced.shape[1]+opp_adv.shape[1])-1

# Ensure only overlapping column is school.
# Compared as plain lists: `Index == Index` is element-wise and raises a
# length-mismatch error (not an AssertionError) when the overlap is not
# exactly one column.
shared_cols = combined2.columns.intersection(combined.columns).tolist()
assert shared_cols == ["School"], shared_cols

combined3 = combined.merge(combined2, how="inner", on="School")
# assert combined3.shape[0] == 68 # Comment out for non-tourney participants
assert combined3.shape[1] == (combined.shape[1]+combined2.shape[1])-1

combined3.head()

Opponent Columns Renamed:  ['School', 'Opp_FG', 'Opp_FGA', 'Opp_3P', 'Opp_3PA', 'Opp_FT', 'Opp_FTA', 'Opp_ORB', 'Opp_TRB', 'Opp_AST', 'Opp_STL', 'Opp_BLK', 'Opp_TOV', 'Opp_PF']
Advanced (Opponents) Columns Renamed:  ['School', 'Opp_Pace', 'Opp_ORtg', 'Opp_FTr', 'Opp_3PAr', 'Opp_TS%', 'Opp_TRB%', 'Opp_AST%', 'Opp_STL%', 'Opp_BLK%', 'Opp_eFG%', 'Opp_TOV%', 'Opp_ORB%', 'Opp_FT/FGA']


Unnamed: 0,School,W,L,SRS,SOS,W.1,L.1,W.2,L.2,W.3,...,Opp_3PAr,Opp_TS%,Opp_TRB%,Opp_AST%,Opp_STL%,Opp_BLK%,Opp_eFG%,Opp_TOV%,Opp_ORB%,Opp_FT/FGA
0,Abilene Christian,16,16,-9.14,-6.82,8,10,9,6,6,...,0.374,0.54,50.3,48.1,8.9,7.5,0.499,19.5,28.4,0.298
1,Air Force,12,19,-4.31,1.72,6,12,9,7,3,...,0.431,0.576,51.3,57.2,8.3,10.6,0.549,18.3,27.8,0.234
2,Akron,14,18,-6.82,-1.92,6,12,12,4,2,...,0.401,0.567,50.8,51.1,9.0,11.7,0.532,16.2,27.9,0.277
3,Alabama A&M,3,28,-23.97,-8.04,3,15,2,9,1,...,0.323,0.562,51.8,52.2,11.1,13.8,0.54,14.4,32.1,0.204
4,Alabama-Birmingham,20,13,4.9,-0.65,10,8,14,3,5,...,0.437,0.524,45.2,53.2,10.0,7.7,0.498,15.9,24.6,0.183


#### Saving Results
Purpose: Write the merged dataframe to a CSV file to be used in the next notebook for exploratory data analysis (EDA).
Splitting the work across notebooks means the cleaning code doesn't have to be re-run every time.

In [21]:
# Write the merged feature table to disk without the row index,
# then show its columns and first rows as a final check.
output_path = "data/cleaned_teams_1718.csv"
combined3.to_csv(output_path, index=False)
print(combined3.columns)
combined3.head()

Index(['School', 'W', 'L', 'SRS', 'SOS', 'W.1', 'L.1', 'W.2', 'L.2', 'W.3',
       'L.3', 'Tm.', 'Opp.', 'MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
       'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'Opp_FG', 'Opp_FGA',
       'Opp_3P', 'Opp_3PA', 'Opp_FT', 'Opp_FTA', 'Opp_ORB', 'Opp_TRB',
       'Opp_AST', 'Opp_STL', 'Opp_BLK', 'Opp_TOV', 'Opp_PF', 'Pace', 'ORtg',
       'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'eFG%', 'TOV%',
       'ORB%', 'FT/FGA', 'Opp_Pace', 'Opp_ORtg', 'Opp_FTr', 'Opp_3PAr',
       'Opp_TS%', 'Opp_TRB%', 'Opp_AST%', 'Opp_STL%', 'Opp_BLK%', 'Opp_eFG%',
       'Opp_TOV%', 'Opp_ORB%', 'Opp_FT/FGA'],
      dtype='object')


Unnamed: 0,School,W,L,SRS,SOS,W.1,L.1,W.2,L.2,W.3,...,Opp_3PAr,Opp_TS%,Opp_TRB%,Opp_AST%,Opp_STL%,Opp_BLK%,Opp_eFG%,Opp_TOV%,Opp_ORB%,Opp_FT/FGA
0,Abilene Christian,16,16,-9.14,-6.82,8,10,9,6,6,...,0.374,0.54,50.3,48.1,8.9,7.5,0.499,19.5,28.4,0.298
1,Air Force,12,19,-4.31,1.72,6,12,9,7,3,...,0.431,0.576,51.3,57.2,8.3,10.6,0.549,18.3,27.8,0.234
2,Akron,14,18,-6.82,-1.92,6,12,12,4,2,...,0.401,0.567,50.8,51.1,9.0,11.7,0.532,16.2,27.9,0.277
3,Alabama A&M,3,28,-23.97,-8.04,3,15,2,9,1,...,0.323,0.562,51.8,52.2,11.1,13.8,0.54,14.4,32.1,0.204
4,Alabama-Birmingham,20,13,4.9,-0.65,10,8,14,3,5,...,0.437,0.524,45.2,53.2,10.0,7.7,0.498,15.9,24.6,0.183
