In [1]:
import pandas as pd
import numpy as np
import random
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV



In [2]:
# Load the regular-season detailed results into `bigdata`. Later cells read
# Season, Wteam, Lteam, Wloc and the W*/L* box-score columns from this frame
# (presumably one row per game with winner 'W' and loser 'L' columns; verify
# against the data dictionary). The path is relative to the working directory.
bigdata = pd.read_csv("data/RegularSeasonDetailedResults.csv")

Get rate statistics

In [3]:
# Rate statistics derived from the raw box-score columns of `bigdata`.
#
# The end of this cell stores offensive rating under 'Wor'/'Lor', the same
# names the raw offensive-rebound counts are read from. Copy the raw counts to
# 'Woreb'/'Loreb' the first time the cell runs and compute from those copies,
# so re-running the cell does not feed ratings back in as rebound counts.
for side in ('W', 'L'):
    if side + 'oreb' not in bigdata.columns:
        bigdata[side + 'oreb'] = bigdata[side + 'or']

# Score Dif
bigdata['dif'] = bigdata.Wscore - bigdata.Lscore
# Pace: (field-goal attempts - offensive rebounds + turnovers
# + 0.475 * free-throw attempts), summed over both teams and halved.
bigdata['pace'] = (bigdata.Wfga + bigdata.Lfga - bigdata.Woreb - bigdata.Loreb
                   + bigdata.Wto + bigdata.Lto
                   + 0.475 * (bigdata.Wfta + bigdata.Lfta)) / 2

# Same formulas for the winner ('W') and the loser ('L'); `opp` is the other
# side, needed only for the rebounding percentage.
for team, opp in (('W', 'L'), ('L', 'W')):
    oreb = bigdata[team + 'oreb']
    # Offensive (and by the converse, defensive) rebounding percentage
    bigdata[team + 'orp'] = oreb / (bigdata[opp + 'dr'] + oreb)
    # Turnover percentage
    bigdata[team + 'top'] = bigdata[team + 'to'] / bigdata.pace
    # Assist percentage
    bigdata[team + 'astp'] = bigdata[team + 'ast'] / bigdata[team + 'fgm']
    # Shooting percentage
    bigdata[team + 'fgp'] = bigdata[team + 'fgm'] / bigdata[team + 'fga']
    # 3 pt percentage (NaN when no 3-pointers were attempted)
    bigdata[team + 'fg3p'] = bigdata[team + 'fgm3'] / bigdata[team + 'fga3']
    # Free throw shooting percentage (perhaps not needed?)
    bigdata[team + 'ftp'] = bigdata[team + 'ftm'] / bigdata[team + 'fta']
    # Steal percentage
    bigdata[team + 'stlp'] = bigdata[team + 'stl'] / bigdata.pace
    # Block percentage
    bigdata[team + 'blkp'] = bigdata[team + 'blk'] / bigdata.pace
    # 3 Point Rate
    bigdata[team + '3pr'] = bigdata[team + 'fga3'] / bigdata[team + 'fga']
    # Free throw Rate
    bigdata[team + 'ftr'] = bigdata[team + 'fta'] / bigdata[team + 'fga']
    # Offensive and Defensive overall (score per unit of pace). This replaces
    # the raw offensive-rebound column of the same name; the raw counts stay
    # available under '<side>oreb'.
    bigdata[team + 'or'] = bigdata[team + 'score'] / bigdata.pace

In [28]:
# Stat name stems to fit per team. Each stem is looked up as the 'W<stem>' and
# 'L<stem>' columns of the game table, and by construction each one yields both
# an offensive ('<stem>_o') and a defensive ('<stem>_d') value per team.
STATS_TO_GET = [
    'or', # offensive rating (score / pace)
    'orp', # offensive rebounding percentage
    'astp', # assist percentage
    'fgp', # field goal shooting percentage
    'fg3p', # 3 pt field goal shooting percentage
    'ftp', # free throw percentage
    'stlp', # steal percentage
    'blkp', # block percentage
    '3pr', # 3 point rate
    'ftr', # free throw rate
    'top' # turnover percentage
]

In [4]:
def seperate_off_and_def(series):
    """Split a labelled series into side-by-side offensive/defensive columns.

    Labels ending in '_o' are offensive entries and labels ending in '_d' are
    defensive entries. Only the trailing suffix is matched and removed, so a
    label that merely contains '_o' or '_d' somewhere else is neither
    misclassified nor mangled. Labels with neither suffix are left out of the
    split result.

    Parameters
    ----------
    series : pandas.Series
        Values indexed by labels such as '<team>_o' / '<team>_d'. ``series.name``
        is used as the stem of the output column names.

    Returns
    -------
    pandas.DataFrame
        ``series.to_frame()`` unchanged when no label ends in '_o'. Otherwise a
        frame indexed by the labels with their suffix removed, with one column
        '<name>_o' and one column '<name>_d'.
    """
    off_suffix, def_suffix = '_o', '_d'
    off_index = [x for x in series.index if str(x).endswith(off_suffix)]
    def_index = [x for x in series.index if str(x).endswith(def_suffix)]
    if len(off_index) == 0:
        # Nothing to split (e.g. a series indexed by bare team ids).
        return series.to_frame()
    off_stat = series.loc[off_index]
    def_stat = series.loc[def_index]
    # Strip only the trailing suffix so both halves share the same labels and
    # line up row by row in the concat below.
    off_stat.index = [str(x)[:-len(off_suffix)] for x in off_index]
    def_stat.index = [str(x)[:-len(def_suffix)] for x in def_index]
    return pd.concat([off_stat.to_frame().add_suffix(off_suffix),
                      def_stat.to_frame().add_suffix(def_suffix)], axis=1)

In [8]:
def get_stat_series(xdf, ydf, name, alphas=None):
    """Fit a cross-validated ridge regression and return its coefficients.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Design matrix; its column labels become the labels of the coefficients.
    ydf : pandas.Series
        Target values sharing ``xdf``'s index. Rows where the target is null
        are dropped from both ``xdf`` and ``ydf`` before fitting.
    name : str
        Name given to the coefficient series (and so to the output columns).
    alphas : sequence of float, optional
        Regularisation strengths handed to ``RidgeCV``. Defaults to the grid
        0.05, 0.10, ..., 4.95.

    Returns
    -------
    pandas.DataFrame
        The fitted coefficients passed through ``seperate_off_and_def``.
    """
    if alphas is None:
        alphas = [x / 20.0 for x in range(1, 100)]
    # Drop by index label so the same rows leave both the design matrix and
    # the target.
    isnull_index = ydf[ydf.isnull()].index
    ydf = ydf.drop(isnull_index)
    xdf = xdf.drop(isnull_index)
    model = RidgeCV(alphas=alphas)
    model.fit(xdf, ydf)
    # Column labels may be ints (bare team ids); the splitter expects strings.
    index = [str(x) for x in xdf.columns]
    return seperate_off_and_def(pd.Series(model.coef_, index=index, name=name))

In [45]:
def get_all_stats_for_a_year(df, year):
    """Fit per-team offensive/defensive ratings for every stat in one season.

    Each game contributes two rows to the design matrix: one from the winner's
    side (+1 on the winner's '_o' column, -1 on the loser's '_d' column) and
    one from the loser's side (+1 on the loser's '_o' column, -1 on the
    winner's '_d' column), plus a home indicator for the team whose stat is the
    target of that row. 'pace' is fitted separately on a matrix that flags both
    teams of a game, since both teams share one pace value.

    Parameters
    ----------
    df : pandas.DataFrame
        Game table with Season, Wteam, Lteam, Wloc, 'pace' and the
        'W<stat>' / 'L<stat>' columns for every stat in ``STATS_TO_GET``.
    year : int
        Season to fit.

    Returns
    -------
    pandas.DataFrame
        One row per team with columns 'team', 'pace', '<stat>_o' / '<stat>_d'
        for each stat, and 'year'.

    Raises
    ------
    ValueError
        If ``df`` has no games for ``year``.
    """
    data_train = df[df.Season == year]
    if data_train.empty:
        raise ValueError('no games found for season {}'.format(year))

    # One indicator column per team id. Cast to int so the sums and sign flips
    # below are plain integer arithmetic whatever dtype get_dummies returns.
    Wteam_dummies = pd.get_dummies(data_train.Wteam).astype(int)
    Lteam_dummies = pd.get_dummies(data_train.Lteam).astype(int)
    # A team that never won (or never lost) is missing from one side; give both
    # sides the same set of team columns.
    teams = Wteam_dummies.columns.union(Lteam_dummies.columns)
    Wteam_dummies = Wteam_dummies.reindex(columns=teams, fill_value=0)
    Lteam_dummies = Lteam_dummies.reindex(columns=teams, fill_value=0)

    # Design matrix for pace: both teams of a game are flagged.
    abs_df = (Wteam_dummies + Lteam_dummies).reset_index(drop=True)

    # Winner-side rows, then loser-side rows.
    first_df = pd.concat([Wteam_dummies.add_suffix('_o'),
                          -1 * Lteam_dummies.add_suffix('_d')], axis=1).reset_index(drop=True)
    second_df = pd.concat([-1 * Wteam_dummies.add_suffix('_d'),
                           Lteam_dummies.add_suffix('_o')], axis=1).reset_index(drop=True)

    # Home indicator for the team whose stat is the target of each row. For the
    # loser-side rows that is the loser, who is at home when the winner is
    # away. NOTE(review): assumes Wloc is the winner's location coded 'H'
    # (home) / 'A' (away), anything else neutral -- confirm against the data
    # dictionary.
    winner_home = (data_train['Wloc'] == 'H').astype(int)
    loser_home = (data_train['Wloc'] == 'A').astype(int)
    home_vector = pd.concat([winner_home, loser_home]).reset_index(drop=True)

    stacked = pd.concat([first_df, second_df]).reset_index(drop=True)
    xdf = pd.concat([stacked, home_vector], axis=1).fillna(0)

    list_of_stat_rankings = [
        get_stat_series(abs_df, data_train['pace'].reset_index(drop=True), 'pace')
    ]
    for stat in STATS_TO_GET:
        # Targets are stacked in the same winner-then-loser order as the rows.
        ydf = pd.concat([data_train['W{}'.format(stat)],
                         data_train['L{}'.format(stat)]]).reset_index(drop=True)
        list_of_stat_rankings.append(get_stat_series(xdf, ydf, stat))

    all_stats = pd.concat(list_of_stat_rankings, axis=1)
    all_stats.index.name = 'team'
    return all_stats.reset_index().assign(year=year)

In [46]:

# Fit one frame of team ratings per season. Iterate over the seasons actually
# present in the data rather than every integer between the first and last, so
# a gap in the data does not trigger a fit on an empty season.
stat_database = []
for year in sorted(bigdata.Season.unique()):
    # Parenthesised so the line parses on both Python 2 and Python 3.
    print('Cacluating stats for year {}'.format(year))
    stat_database.append(get_all_stats_for_a_year(bigdata, year))

Cacluating stats for year 2003
Cacluating stats for year 2004
Cacluating stats for year 2005
Cacluating stats for year 2006
Cacluating stats for year 2007
Cacluating stats for year 2008
Cacluating stats for year 2009
Cacluating stats for year 2010
Cacluating stats for year 2011
Cacluating stats for year 2012
Cacluating stats for year 2013
Cacluating stats for year 2014
Cacluating stats for year 2015
Cacluating stats for year 2016
Cacluating stats for year 2017


In [47]:
# Stack the per-season frames into one table. Each per-season frame carries its
# own 0..n-1 index and the index is not reset here, so index labels repeat
# across seasons; use the 'team' and 'year' columns to identify a row.
all_stats = pd.concat(stat_database)

In [48]:
# Sanity check of (rows, columns). Expected columns: 'team' + 'pace' +
# an '_o' and a '_d' column for each of the 11 stats + 'year' = 25.
all_stats.shape

(5130, 25)

In [50]:
# Write the combined table to 'all_stats.mp' in the working directory.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0, so this cell only runs on an older pandas -- confirm the
# pinned environment, and agree a replacement format with whatever reads this
# file before changing it.
all_stats.to_msgpack('all_stats.mp')