In [473]:
import numpy as np
import pandas as pd
import re
import pytorch as torch

POWER_CONF = ['SEC', 'B10', 'B12', 'P12', 'ACC']
NUM_FEATURES = 12

# Data Preprocessing

In [449]:
reg_season_res_df = pd.read_csv('data/RegularSeasonCompactResults.csv', index_col=["Season", "Daynum", "Wteam", "Lteam"]).loc[2002:]
tournament_res_df = pd.read_csv('data/TourneyCompactResults.csv', index_col=["Season", "Daynum", "Wteam", "Lteam"]).loc[2002:]
id_to_team = pd.read_csv('data/Teams.csv', index_col='Team_Id').to_dict()['Team_Name']
team_to_id = dict((v,k) for k,v in id_to_team.items())

In [450]:
results_df = pd.concat([reg_season_res_df, tournament_res_df])

In [451]:
def load_df(year):
    df = pd.read_csv('data/{}.csv'.format(year))
    df['year'] = [year] * df.shape[0]
    return df

def replace(team):
    df.replace(to_replace=team, value=team.replace('.', ''), inplace=True)
    df.replace(to_replace=team, value=team.replace('Cal St', 'CS'), inplace=True)
    df.replace(to_replace=team, value=team.replace('Eastern ', 'E '), inplace=True)
    df.replace(to_replace=team, value=team.replace('Western ', 'W '), inplace=True)
    df.replace(to_replace=team, value=team.replace('Northern ', 'N '), inplace=True)
    df.replace(to_replace=team, value=team.replace('Southern ', 'S '), inplace=True)
    df.replace(to_replace=team, value=team.replace('Saint ', 'St '), inplace=True)
    
df = pd.concat([load_df(year) for year in range(2002, 2018)])
df.drop(['13', '14', '15', '16', '17', '18', '19', '20'], 1, inplace=True)
_ = [replace(team) for team in set(df['Team_9'])]

In [452]:
df.replace(to_replace="St Mary's", value="St Mary's CA", inplace=True)
df.replace(to_replace="Mount St Mary's", value="Mt St Mary's", inplace=True)
df.replace(to_replace="Grambling St", value="Grambling", inplace=True)
df.replace(to_replace="VCU", value="VA Commonwealth", inplace=True)
df.replace(to_replace='Northwestern St', value='Northwestern LA', inplace=True)
df.replace(to_replace='Middle Tennessee', value='MTSU', inplace=True)
df.replace(to_replace='Cal Poly', value='Cal Poly SLO', inplace=True)
df.replace(to_replace='Texas Southern', value='TX Southern', inplace=True)
df.replace(to_replace='Mississippi Valley St', value='MS Valley St', inplace=True)
df.replace(to_replace='UMass Lowell', value='MA Lowell', inplace=True)
df.replace(to_replace='Loyola Chicago', value='Loyola-Chicago', inplace=True)
df.replace(to_replace='Coastal Carolina', value='Coastal Car', inplace=True)
df.replace(to_replace='American', value='American Univ', inplace=True)
df.replace(to_replace='Bethune Cookman', value='Bethune-Cookman', inplace=True)
df.replace(to_replace='LIU Brooklyn', value='Brooklyn', inplace=True)
df.replace(to_replace='The Citadel', value='Citadel', inplace=True)
df.replace(to_replace='Cal St Fullerton', value='CS Fullerton', inplace=True)
df.replace(to_replace='Cal St Bakersfield', value='CS Bakersfield', inplace=True)
df.replace(to_replace='Cal St Northridge', value='CS Northridge', inplace=True)
df.replace(to_replace='W Kentucky', value='WKU', inplace=True)
df.replace(to_replace='Arkansas Pine Bluff', value='Ark Pine Bluff', inplace=True)
df.replace(to_replace='Southern', value='Southern Univ', inplace=True)
df.replace(to_replace='Southeast Missouri St', value='SE Missouri St', inplace=True)
df.replace(to_replace='Sacramento St', value='CS Sacramento', inplace=True)
df.replace(to_replace='Green Bay', value='WI Green Bay', inplace=True)
df.replace(to_replace='Boston University', value='Boston Univ', inplace=True)
df.replace(to_replace='North Carolina St', value='NC State', inplace=True)
df.replace(to_replace='South Carolina St', value='South Carolina', inplace=True)
df.replace(to_replace='North Carolina Central', value='NC Central', inplace=True)
df.replace(to_replace='S Utah', value='Southern Utah', inplace=True)
df.replace(to_replace='Houston Baptist', value='Houston Bap', inplace=True)
df.replace(to_replace='North Dakota St', value='N Dakota St', inplace=True)
df.replace(to_replace='Monmouth', value='Monmouth NJ', inplace=True)
df.replace(to_replace='East Tennessee St', value='ETSU', inplace=True)
df.replace(to_replace='Florida Gulf Coast', value='FL Gulf Coast', inplace=True)
df.replace(to_replace='Florida Atlantic', value='FL Atlantic', inplace=True)
df.replace(to_replace='Little Rock', value='Ark Little Rock', inplace=True)
df.replace(to_replace='North Carolina A&T', value='NC A&T', inplace=True)
df.replace(to_replace='N Iowa', value='Northern Iowa', inplace=True)
df.replace(to_replace='Kennesaw St', value='Kennesaw', inplace=True)
df.replace(to_replace='South Dakota St', value='S Dakota St', inplace=True)
df.replace(to_replace='Albany', value='Albany NY', inplace=True)
df.replace(to_replace='FIU', value='Florida Intl', inplace=True)
df.replace(to_replace='Central Michigan', value='C Michigan', inplace=True)
df.replace(to_replace='Prairie View A&M', value='Prairie View', inplace=True)
df.replace(to_replace='N Arizona', value='Northern Arizona', inplace=True)
df.replace(to_replace='Illinois Chicago', value='IL Chicago', inplace=True)
df.replace(to_replace='Stephen F Austin', value='SF Austin', inplace=True)
df.replace(to_replace='SIU Edwardsville', value='Edwardsville', inplace=True)
df.replace(to_replace='Tennessee Martin', value='TN Martin', inplace=True)
df.replace(to_replace='Georgia Southern', value='Ga Southern', inplace=True)
df.replace(to_replace='Charleston Southern', value='Charleston So', inplace=True)
df.replace(to_replace='Central Arkansas', value='Cent Arkansas', inplace=True)
df.replace(to_replace='Milwaukee', value='WI Milwaukee', inplace=True)
df.replace(to_replace='Central Connecticut', value='Central Conn', inplace=True)
df.replace(to_replace='Abilene Christian', value='Abilene Chr', inplace=True)
df.replace(to_replace='USC Upstate', value='SC Upstate', inplace=True)
df.replace(to_replace="St Joseph's", value="St Joseph's PA", inplace=True)
df.replace(to_replace='George Washington', value='G Washington', inplace=True)
df.replace(to_replace='UC Santa Barbara', value='Santa Barbara', inplace=True)
df.replace(to_replace='College of Charleston', value='Col Charleston', inplace=True)
df.replace(to_replace='Maryland E Shore', value='MD E Shore', inplace=True)
df.replace(to_replace='Fairleigh Dickinson', value='F Dickinson', inplace=True)
df.replace(to_replace='Kent St', value='Kent', inplace=True)
df.replace(to_replace='Southeastern Louisiana', value='SE Louisiana', inplace=True)
df.replace(to_replace='Loyola Marymount', value='Loy Marymount', inplace=True)
df.replace(to_replace='UT Rio Grande Valley', value='UTRGV', inplace=True)
df.replace(to_replace='Nebraska Omaha', value='NE Omaha', inplace=True)
df.replace(to_replace='Louisiana Monroe', value='ULM', inplace=True)
df.replace(to_replace='Texas A&M Corpus Chris', value='TAM C. Christi', inplace=True)
df.replace(to_replace='S Miss', value='Southern Miss', inplace=True)
df.replace(to_replace='UTSA', value='UT San Antonio', inplace=True)
df.replace(to_replace='UMKC', value='Missouri KC', inplace=True)
df.replace(to_replace='Louisiana Lafayette', value='ULL', inplace=True)
df.replace(to_replace='Fort Wayne', value='IPFW', inplace=True)

In [None]:
_ = [df.replace(to_replace=t, value=team_to_id[t], inplace=True) for t in set(df['Team_9'])]

In [None]:
def extract_win_percent(record):
    record = re.split('[-]', record)
    return float(record[0]) / (float(record[0]) + float(record[1]))
    
df['W-L_9'] = df['W-L_9'].apply(extract_win_percent)

In [471]:
def set_power_conf(conf):
    return 1 if conf in POWER_CONF else 0

df['Conf_9'] = df['Conf_9'].apply(set_power_conf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [460]:
df.set_index(['year', 'Team_9'], inplace=True)
df = df[~df.index.duplicated(keep='first')]

In [494]:
X, y = np.zeros(NUM_FEATURES), np.zeros(1)

for i, (index, game) in enumerate(results_df.iterrows()):
    year, _, w_team, l_team = index
    ranked_teams = df.loc[year].index
    if w_team not in ranked_teams or l_team not in ranked_teams: continue
    w_stats = df.loc[year, w_team]
    l_stats = df.loc[year, l_team]
    
    # 0 if right team wins, 1 if left team wins
    if i % 2 == 0: X_i, y_i= l_stats - w_stats, 0
    else: X_i, y_i = w_stats - l_stats, 1
        
    X = np.vstack((X, X_i))
    y = np.vstack((X, y_i))
    if i % 10000 == 0: print('Iteration {}'.format(i))

Iteration 0
Iteration 1000
Iteration 2000
Iteration 3000
Iteration 4000
Iteration 5000
Iteration 6000
Iteration 7000
Iteration 8000
Iteration 9000
Iteration 10000
Iteration 11000
Iteration 12000
Iteration 13000
Iteration 14000
Iteration 15000
Iteration 16000
Iteration 17000
Iteration 19000
Iteration 20000
Iteration 21000
Iteration 22000
Iteration 23000
Iteration 24000
Iteration 25000
Iteration 26000
Iteration 27000
Iteration 28000
Iteration 29000
Iteration 30000
Iteration 31000
Iteration 32000
Iteration 33000
Iteration 34000
Iteration 35000
Iteration 36000
Iteration 37000
Iteration 38000
Iteration 39000
Iteration 40000
Iteration 41000
Iteration 42000
Iteration 43000
Iteration 45000
Iteration 46000
Iteration 47000
Iteration 48000
Iteration 49000
Iteration 50000
Iteration 51000
Iteration 52000
Iteration 53000
Iteration 54000
Iteration 55000
Iteration 56000
Iteration 57000
Iteration 58000
Iteration 59000
Iteration 60000
Iteration 61000
Iteration 62000
Iteration 63000
Iteration 64000
Itera

# Exploratory Analysis

In [513]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

model = LogisticRegression()
X_shuf, y_shuf = shuffle(X_train[:50000], y_train[:50000, 0])
model.fit(X_shuf, y_shuf)
print("Train Accuracy: {0}".format(model.score(X_shuf, y_shuf)))
print("Test Accuracy: {0}".format(model.score(X_train[-10000:], y_train[-10000:])))

Train Accuracy: 0.61552
Test Accuracy: 0.6436


# Model Architecture

ImportError: No module named 'pytorch'