In [7]:
import pandas as pd
import numpy as np
import random
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge

In [2]:
# Load the per-game regular-season box scores (one row per game, winner and
# loser columns prefixed with W / L).
bigdata = pd.read_csv(filepath_or_buffer="RegularSeasonDetailedResults.csv")

In [3]:
# Preview the first five rows of the freshly loaded results.
bigdata.head(n=5)

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


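Compute rate statistics (per possession or per attempt) for each game. Rates are generally more informative than raw counts because they do not depend on how many possessions a game happened to have.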

In [30]:
# Derived per-game rate statistics, added to `bigdata` as new columns.
#
# NOTE: the last step of this cell reuses the column names `Wor` / `Lor` for
# points per possession, replacing the raw offensive-rebound counts that were
# loaded from the CSV (get_rankings reads them under the stat name 'or').
# To keep the cell safe to re-run, the raw counts are stashed in
# `Wor_cnt` / `Lor_cnt` the first time the cell executes, and every formula
# that needs offensive rebounds reads the stashed copy. Without the stash, a
# second run would compute pace and rebounding percentage from
# points-per-possession values instead of rebound counts.
if 'Wor_cnt' not in bigdata.columns:
    bigdata['Wor_cnt'] = bigdata.Wor
    bigdata['Lor_cnt'] = bigdata.Lor

# Score Dif
bigdata['dif'] = bigdata.Wscore - bigdata.Lscore
# Pace: average of the two teams' estimated possessions
# (FGA - offensive rebounds + turnovers + 0.475 * FTA).
bigdata['pace'] = (
    bigdata.Wfga + bigdata.Lfga
    - bigdata.Wor_cnt - bigdata.Lor_cnt
    + bigdata.Wto + bigdata.Lto
    + 0.475 * (bigdata.Wfta + bigdata.Lfta)
) / 2
# Offensive (and by the converse, defensive) rebounding percentage
bigdata['Worp'] = bigdata.Wor_cnt / (bigdata.Ldr + bigdata.Wor_cnt)
bigdata['Lorp'] = bigdata.Lor_cnt / (bigdata.Wdr + bigdata.Lor_cnt)
# Turnover percentage
bigdata['Wtop'] = bigdata.Wto / bigdata.pace
bigdata['Ltop'] = bigdata.Lto / bigdata.pace
# Assist percentage
bigdata['Wastp'] = bigdata.Wast / bigdata.Wfgm
bigdata['Lastp'] = bigdata.Last / bigdata.Lfgm
# Shooting percentage
bigdata['Wfgp'] = bigdata.Wfgm / bigdata.Wfga
bigdata['Lfgp'] = bigdata.Lfgm / bigdata.Lfga
# 3 pt percentage (NaN when a team attempted no threes)
bigdata['Wfg3p'] = bigdata.Wfgm3 / bigdata.Wfga3
bigdata['Lfg3p'] = bigdata.Lfgm3 / bigdata.Lfga3
# Free throw shooting percentage (perhaps not needed?)
# (NaN when a team attempted no free throws)
bigdata['Wftp'] = bigdata.Wftm / bigdata.Wfta
bigdata['Lftp'] = bigdata.Lftm / bigdata.Lfta
# Steal percentage
bigdata['Wstlp'] = bigdata.Wstl / bigdata.pace
bigdata['Lstlp'] = bigdata.Lstl / bigdata.pace
# Block percentage
bigdata['Wblkp'] = bigdata.Wblk / bigdata.pace
bigdata['Lblkp'] = bigdata.Lblk / bigdata.pace
# 3 Point Rate
bigdata['W3pr'] = bigdata.Wfga3 / bigdata.Wfga
bigdata['L3pr'] = bigdata.Lfga3 / bigdata.Lfga
# Free throw Rate
bigdata['Wftr'] = bigdata.Wfta / bigdata.Wfga
bigdata['Lftr'] = bigdata.Lfta / bigdata.Lfga
# Offensive and Defensive overall: points scored per unit of pace.
# These intentionally overwrite `Wor` / `Lor` (see the note at the top of the
# cell); the raw rebound counts remain available as `Wor_cnt` / `Lor_cnt`.
bigdata['Wor'] = bigdata.Wscore / bigdata.pace
bigdata['Lor'] = bigdata.Lscore / bigdata.pace

In [5]:
# Preview the first five rows again, now including the derived rate columns.
bigdata.head(n=5)

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Wftp,Lftp,Wstlp,Lstlp,Wblkp,Lblkp,W3pr,L3pr,Wftr,Lftr
0,2003,10,1104,68,1328,62,N,0,27,58,...,0.611111,0.727273,0.095238,0.122449,0.013605,0.027211,0.241379,0.188679,0.310345,0.415094
1,2003,10,1272,70,1393,63,N,0,26,62,...,0.526316,0.45,0.058171,0.116342,0.058171,0.087257,0.322581,0.358209,0.306452,0.298507
2,2003,11,1266,73,1437,61,N,0,24,58,...,0.586207,0.608696,0.077101,0.03084,0.03084,0.077101,0.310345,0.356164,0.5,0.315068
3,2003,11,1296,56,1457,50,N,0,18,38,...,0.548387,0.533333,0.239623,0.068464,0.034232,0.051348,0.236842,0.44898,0.815789,0.306122
4,2003,11,1400,77,1208,71,N,0,30,61,...,0.846154,0.62963,0.0625,0.109375,0.0625,0.015625,0.229508,0.258065,0.213115,0.435484


We are doing ridge regression here (lambda = 1) to determine ranking of teams in various categories. This is a relatively common approach, used in NBA RPM and RAPM. 

In [22]:
def append_rate_stats(stat,data_train,team_ridge):
    """Fit an offence/defence ridge model for one rate stat and append the results.

    Every game contributes two observations: the winner's value of the stat
    (column ``"W" + stat``) and the loser's value (column ``"L" + stat``).
    Each observation has +1 in the acting team's offensive column, -1 in the
    opposing team's defensive column, and a home-court column that is +1 when
    the acting team is at home, -1 when it is away and 0 on a neutral court.
    Missing (NaN) stat values are treated as 0, as before.

    Parameters
    ----------
    stat : str
        Stat suffix, e.g. ``'fgp'``; ``data_train`` must contain the columns
        ``'W' + stat`` and ``'L' + stat``.
    data_train : pandas.DataFrame
        Games to fit on; must have ``Wteam``, ``Lteam`` and ``Wloc`` columns.
    team_ridge : pandas.DataFrame
        Per-team table indexed by team id. It is modified in place: columns
        ``'o' + stat`` and ``'d' + stat`` are added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``team_ridge`` object, for call chaining.
    """
    teams = list(set(list(data_train.Wteam) + list(data_train.Lteam)))
    n_teams = len(teams)
    n_games = len(data_train)

    # Column position of every team inside the offensive block; the defensive
    # block uses the same order, shifted by n_teams.
    col_of = dict((team, pos) for pos, team in enumerate(teams))
    w_cols = np.array([col_of[t] for t in data_train.Wteam], dtype=int)
    l_cols = np.array([col_of[t] for t in data_train.Lteam], dtype=int)
    game = np.arange(n_games)

    # Rows [0, n_games) are the winners' observations, rows [n_games, 2*n_games)
    # the losers'. The matrix is filled with index arrays instead of chained
    # `df[col][row] = value` writes, which pandas does not guarantee to hit the
    # original frame.
    X = np.zeros((2 * n_games, 2 * n_teams + 1))
    X[game, w_cols] = 1                       # winner on offence
    X[n_games + game, l_cols] = 1             # loser on offence
    X[n_games + game, n_teams + w_cols] = -1  # winner defending the loser
    X[game, n_teams + l_cols] = -1            # loser defending the winner

    # Home-court indicator from the winner's point of view; mirrored for the loser.
    winner_home = (data_train.Wloc == "H").values
    winner_away = (data_train.Wloc == "A").values
    ha = np.where(winner_home, 1.0, np.where(winner_away, -1.0, 0.0))
    X[:n_games, -1] = ha
    X[n_games:, -1] = -ha

    y = np.concatenate([
        data_train["W" + stat].values,
        data_train["L" + stat].values,
    ]).astype(float)
    y[np.isnan(y)] = 0  # e.g. a percentage whose denominator was 0
    y = y.reshape(-1, 1)  # 2-D target so that coef_ stays 2-D, as before

    model = Ridge(alpha=1)
    model.fit(X, y)
    coef = model.coef_[0]

    # Align by team id rather than by position, so the result does not depend
    # on team_ridge having been built with the same set-iteration order.
    offence = pd.Series(coef[:n_teams], index=teams)
    defence = pd.Series(coef[n_teams:2 * n_teams], index=teams)
    team_ridge["o" + stat] = offence.reindex(team_ridge.index).values
    team_ridge["d" + stat] = defence.reindex(team_ridge.index).values
    return team_ridge

In [27]:
def _fit_signed_team_ridge(data_train, teams, target):
    """Ridge-fit `target` on a winner(+1)/loser(-1) team design plus home court.

    Returns one coefficient per entry of `teams`, in the same order; the
    trailing home-court coefficient is dropped.
    """
    n_games = len(data_train)
    col_of = dict((team, pos) for pos, team in enumerate(teams))
    w_cols = np.array([col_of[t] for t in data_train.Wteam], dtype=int)
    l_cols = np.array([col_of[t] for t in data_train.Lteam], dtype=int)
    game = np.arange(n_games)

    # Built with index arrays rather than chained `df[col][row] = value`
    # writes; a plain array also avoids mixing int and str column labels.
    X = np.zeros((n_games, len(teams) + 1))
    X[game, w_cols] = 1
    X[game, l_cols] = -1
    winner_home = (data_train.Wloc == "H").values
    winner_away = (data_train.Wloc == "A").values
    X[:, -1] = np.where(winner_home, 1.0, np.where(winner_away, -1.0, 0.0))

    y = np.asarray(target, dtype=float).reshape(-1, 1)  # 2-D so coef_ is 2-D
    model = Ridge(alpha=1)
    model.fit(X, y)
    return model.coef_[0][:-1]


def get_rankings(year):
    """Build the per-team ridge ratings table for one season.

    Reads the module-level `bigdata` frame, keeps the games of `year`, and
    returns a DataFrame indexed by team id with the columns:

    * ``overall`` - ridge coefficients for score margin divided by pace;
    * ``pace``    - ridge coefficients for game pace;
    * ``o<stat>`` / ``d<stat>`` for each rate stat, added by
      `append_rate_stats`.

    The name of each rate stat is printed as it is processed.
    """
    data_train = bigdata[bigdata.Season==year]
    teams = list(set(list(data_train.Wteam) + list(data_train.Lteam)))

    margin_per_pace = data_train['dif'].astype(float) / data_train['pace']
    overall = _fit_signed_team_ridge(data_train, teams, margin_per_pace)
    # NOTE(review): pace is regressed on the same winner(+1)/loser(-1) design
    # as the score margin. A pace model would more usually give both teams +1;
    # kept as-is to preserve existing results - confirm this is intended.
    pace = _fit_signed_team_ridge(data_train, teams, data_train['pace'])

    team_ridge = pd.DataFrame(
        {'overall': overall, 'pace': pace},
        index=teams,
        columns=['overall', 'pace'],
    )

    for stat in ['or','orp','astp','fgp','fg3p','ftp','stlp','blkp','3pr','ftr']:
        print(stat)  # progress indicator; same output on Python 2 and 3
        team_ridge = append_rate_stats(stat,data_train,team_ridge)
    return team_ridge

In [31]:
# Ridge ratings for every team that played in the 2014 season.
teams_2014 = get_rankings(year=2014)

or
orp
astp
fgp
fg3p
ftp
stlp
blkp
3pr
ftr


In [35]:
# Top five teams by the 'dor' coefficient, highest first.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
teams_2014.sort_values('dor', ascending=False).head()

Unnamed: 0,overall,pace,oor,dor,oorp,dorp,oastp,dastp,ofgp,dfgp,...,oftp,dftp,ostlp,dstlp,oblkp,dblkp,o3pr,d3pr,oftr,dftr
1112,0.160798,-1.345149,0.099359,0.183329,0.008495,0.008918,0.076447,0.058798,0.048862,0.083488,...,-0.021868,0.004059,0.006075,0.017715,0.013931,0.012341,-0.061144,0.047372,0.046161,0.087136
1257,0.194379,2.644455,0.120464,0.164235,0.004189,0.000769,0.013897,0.011332,0.042135,0.054182,...,-0.04151,0.006534,0.045681,0.027138,0.012683,0.020213,0.019227,0.003767,0.002611,0.02814
1361,0.079398,-0.73225,0.025314,0.152372,0.003397,0.005145,-0.112027,0.06855,0.005519,0.048445,...,-0.045151,0.03696,0.023091,0.011285,0.02167,-0.007307,-0.07585,0.028264,0.080511,0.097804
1326,0.126447,-1.040683,0.058737,0.150384,0.000495,0.004871,-0.014398,0.083641,0.026645,0.049534,...,-0.004586,-0.006327,0.025477,0.028732,0.012593,0.008934,-0.000209,0.014914,0.038657,0.103241
1438,0.126743,-5.233,0.076425,0.150255,0.00747,0.006076,0.049627,0.019085,0.033734,0.070423,...,-0.027101,0.042979,1.5e-05,0.006772,0.013997,0.004701,-0.042936,-0.021613,0.035869,0.086493


# Comments
1. How do these results look? Initial impression - very good. They track KenPom statistics, widely considered the cream of the crop for NCAA statistics, very closely for nearly all statistics

2. Can they be improved? Yes.

3. Find the lambda value for the ridge regression (`alpha` in scikit-learn's `Ridge`) that optimizes accuracy, either on regular season games (via cross-validation) or on March Madness games.

4. Perhaps give more weight to games that occurred more recently, were against tougher opponents, or were played away (the `sample_weight` argument of `Ridge.fit`).

5. How do we use these? Random forest on all of them. 
6. Can we train on regular season data or do we have to only train on March Madness data? Good question, I think it is probably okay to train on regular season data but we may want to look into this.
7. Would other data be useful? Potentially add in the seed they are for the tournament and their AP poll rankings from preseason to week prior to tournament. If we used seed that might restrict our training data to only tournament games?