In [181]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import time

# machine learning
from sklearn import linear_model, ensemble
from sklearn import model_selection

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

data_dir = '../input/NCAA/'
df_seeds = pd.read_csv(data_dir + 'Stage2/NCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'Stage2/NCAATourneyDetailedResults.csv')
df_slots = pd.read_csv(data_dir + 'Stage2/NCAATourneySlots.csv')
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [182]:
def seed_to_int(seed):
    s_int = int(seed[1:3])
    return s_int
df_seeds['seed_int'] = df_seeds.Seed.apply(seed_to_int)
df_seeds.drop(labels=['Seed'], inplace=True, axis=1) 
df_seeds.rename(columns={'seed_int':'Seed'},inplace=True)
df_seeds.head()

Unnamed: 0,Season,TeamID,Seed
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [183]:
df_tour = df_tour.loc[:,['Season','WTeamID','LTeamID']]
df_tour.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,2003,1421,1411
1,2003,1112,1436
2,2003,1113,1272
3,2003,1141,1166
4,2003,1143,1301


In [184]:
df_regular_detail = pd.read_csv(data_dir + 'Stage2/RegularSeasonDetailedResults.csv')
df_regular_detail.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [185]:
wPos = df_regular_detail.apply(lambda row: 0.96*(row.WFGA + row.WTO + 0.44*row.WFTA - row.WOR), axis=1)
lPos = df_regular_detail.apply(lambda row: 0.96*(row.LFGA + row.LTO + 0.44*row.LFTA - row.LOR), axis=1)
df_regular_detail['Pos'] = (wPos+lPos)/2
#Offensive efficiency (OffRtg) = 100 x (Points / Possessions)
df_regular_detail['WOffRtg'] = df_regular_detail.apply(lambda row: 100 * (row.WScore / row.Pos), axis=1)
df_regular_detail['LOffRtg'] = df_regular_detail.apply(lambda row: 100 * (row.LScore / row.Pos), axis=1)
#Defensive efficiency (DefRtg) = 100 x (Opponent points / Opponent possessions)
df_regular_detail['WDefRtg'] = df_regular_detail.LOffRtg
df_regular_detail['LDefRtg'] = df_regular_detail.WOffRtg
#Net Rating = Off.Rtg - Def.Rtg
df_regular_detail['WNetRtg'] = df_regular_detail.apply(lambda row:(row.WOffRtg - row.WDefRtg), axis=1)
df_regular_detail['LNetRtg'] = df_regular_detail.apply(lambda row:(row.LOffRtg - row.LDefRtg), axis=1)
#The Shooting Percentage : Measure of Shooting Efficiency (FGA/FGA3, FTA)
df_regular_detail['WTSP'] = df_regular_detail.apply(lambda row: 100 * row.WScore / (2 * (row.WFGA + 0.44 * row.WFTA)), axis=1)
df_regular_detail['LTSP'] = df_regular_detail.apply(lambda row: 100 * row.LScore / (2 * (row.LFGA + 0.44 * row.LFTA)), axis=1)
#PIE Player Impact Estimate
wtmp = df_regular_detail.apply(lambda row: row.WScore + row.WFGM + row.WFTM - row.WFGA - row.WFTA + row.WDR + 0.5*row.WOR + row.WAst +row.WStl + 0.5*row.WBlk - row.WPF - row.WTO, axis=1)
ltmp = df_regular_detail.apply(lambda row: row.LScore + row.LFGM + row.LFTM - row.LFGA - row.LFTA + row.LDR + 0.5*row.LOR + row.LAst +row.LStl + 0.5*row.LBlk - row.LPF - row.LTO, axis=1) 
df_regular_detail['WPIE'] = wtmp/(wtmp + ltmp)
df_regular_detail['LPIE'] = ltmp/(wtmp + ltmp)
df_regular_detail['WNetScore'] = df_regular_detail['WScore'] - df_regular_detail['LScore'] 
df_regular_detail['LNetScore'] = df_regular_detail['LScore'] - df_regular_detail['WScore'] 
df_regular_detail.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WDefRtg,LDefRtg,WNetRtg,LNetRtg,WTSP,LTSP,WPIE,LPIE,WNetScore,LNetScore
0,2003,10,1104,68,1328,62,N,0,27,58,...,88.71337,97.298535,8.585165,-8.585165,51.57767,49.457562,0.532847,0.467153,6,-6
1,2003,10,1272,70,1393,63,N,0,26,62,...,96.393948,107.104387,10.710439,-10.710439,49.744173,41.556728,0.602339,0.397661,7,-7
2,2003,11,1266,73,1437,61,N,0,24,58,...,99.37702,118.926598,19.549578,-19.549578,51.582815,36.693936,0.736434,0.263566,12,-12
3,2003,11,1296,56,1457,50,N,0,18,38,...,90.391068,101.237996,10.846928,-10.846928,54.221534,44.964029,0.754717,0.245283,6,-6
4,2003,11,1400,77,1208,71,N,0,30,61,...,116.837809,126.711427,9.873618,-9.873618,57.703837,48.050893,0.570732,0.429268,6,-6


In [186]:
df_regular_composite = pd.DataFrame()

df_regular_composite['WINS'] = df_regular_detail['WTeamID'].groupby([df_regular_detail['Season'], df_regular_detail['WTeamID']]).count()
df_regular_composite['LOSSES'] = df_regular_detail['LTeamID'].groupby([df_regular_detail['Season'], df_regular_detail['LTeamID']]).count()
df_regular_composite['WINPCT'] = df_regular_composite['WINS'] / (df_regular_composite['WINS'] + df_regular_composite['LOSSES'])

df_regular_composite['WNetRtg'] = df_regular_detail['WNetRtg'].groupby([df_regular_detail['Season'], df_regular_detail['WTeamID']]).mean()
df_regular_composite['WTSP'] = df_regular_detail['WTSP'].groupby([df_regular_detail['Season'], df_regular_detail['WTeamID']]).mean()
df_regular_composite['WPIE'] = df_regular_detail['WPIE'].groupby([df_regular_detail['Season'], df_regular_detail['WTeamID']]).mean()
df_regular_composite['WNetScore'] = df_regular_detail['WNetScore'].groupby([df_regular_detail['Season'], df_regular_detail['WTeamID']]).mean()

df_regular_composite['LNetRtg'] = df_regular_detail['LNetRtg'].groupby([df_regular_detail['Season'], df_regular_detail['LTeamID']]).mean()
df_regular_composite['LTSP'] = df_regular_detail['LTSP'].groupby([df_regular_detail['Season'], df_regular_detail['LTeamID']]).mean()
df_regular_composite['LPIE'] = df_regular_detail['LPIE'].groupby([df_regular_detail['Season'], df_regular_detail['LTeamID']]).mean()
df_regular_composite['LNetScore'] = df_regular_detail['LNetScore'].groupby([df_regular_detail['Season'], df_regular_detail['LTeamID']]).mean()

df_regular_composite['NetRtg'] = df_regular_composite['WNetRtg'] * df_regular_composite['WINPCT'] + df_regular_composite['LNetRtg'] * (1 - df_regular_composite['WINPCT'])
df_regular_composite['TSP'] = df_regular_composite['WTSP'] * df_regular_composite['WINPCT'] + df_regular_composite['LTSP'] * (1 - df_regular_composite['WINPCT'])
df_regular_composite['PIE'] = df_regular_composite['WPIE'] * df_regular_composite['WINPCT'] + df_regular_composite['LPIE'] * (1 - df_regular_composite['WINPCT'])
df_regular_composite['NetScore'] = df_regular_composite['WNetScore'] * df_regular_composite['WINPCT'] + df_regular_composite['LNetScore'] * (1 - df_regular_composite['WINPCT'])

df_regular_composite.reset_index(inplace = True)
df_regular_composite.rename(columns={'WTeamID':'TeamID'},inplace=True)
df_regular_composite.head()

Unnamed: 0,Season,TeamID,WINS,LOSSES,WINPCT,WNetRtg,WTSP,WPIE,WNetScore,LNetRtg,LTSP,LPIE,LNetScore,NetRtg,TSP,PIE,NetScore
0,2003,1102,12,16.0,0.428571,29.449144,70.272383,0.755263,15.583333,-21.706689,53.38913,0.2886,-11.25,0.217239,60.62481,0.488599,0.25
1,2003,1103,13,14.0,0.481481,13.409115,63.82437,0.615446,9.384615,-11.894413,53.71252,0.41154,-7.5,0.288767,58.581188,0.509717,0.62963
2,2003,1104,17,11.0,0.607143,20.219957,54.914508,0.65782,13.176471,-15.851573,47.93588,0.349042,-9.454545,6.048999,52.172904,0.536514,4.285714
3,2003,1105,7,19.0,0.269231,17.536064,54.164526,0.641277,13.0,-15.926198,49.059444,0.331704,-11.473684,-6.917127,50.433889,0.41505,-4.884615
4,2003,1106,13,15.0,0.464286,15.847415,54.410152,0.741382,10.384615,-14.605172,47.961365,0.342262,-9.266667,-0.466471,50.955444,0.527568,-0.142857


In [187]:
df_regular_composite[df_regular_composite['LOSSES'].isnull()] #undefeated teams

Unnamed: 0,Season,TeamID,WINS,LOSSES,WINPCT,WNetRtg,WTSP,WPIE,WNetScore,LNetRtg,LTSP,LPIE,LNetScore,NetRtg,TSP,PIE,NetScore
4064,2014,1455,33,,,24.822144,57.571979,0.697612,15.060606,,,,,,,,
4211,2015,1246,34,,,33.957413,56.915615,0.809886,20.941176,,,,,,,,


In [188]:
df_regular_composite['LOSSES'].fillna(0,inplace=True)
df_regular_composite['WINPCT'].fillna(1,inplace=True)
df_regular_composite['NetRtg'].fillna(df_regular_composite['WNetRtg'],inplace=True)
df_regular_composite['TSP'].fillna(df_regular_composite['WTSP'],inplace=True)
df_regular_composite['PIE'].fillna(df_regular_composite['WPIE'],inplace=True)
df_regular_composite['NetScore'].fillna(df_regular_composite['WNetScore'],inplace=True)
df_regular_composite.loc[[4064,4211],:]#check replacement

Unnamed: 0,Season,TeamID,WINS,LOSSES,WINPCT,WNetRtg,WTSP,WPIE,WNetScore,LNetRtg,LTSP,LPIE,LNetScore,NetRtg,TSP,PIE,NetScore
4064,2014,1455,33,0.0,1.0,24.822144,57.571979,0.697612,15.060606,,,,,24.822144,57.571979,0.697612,15.060606
4211,2015,1246,34,0.0,1.0,33.957413,56.915615,0.809886,20.941176,,,,,33.957413,56.915615,0.809886,20.941176


In [189]:
df_regular_composite.drop(['WINS','WNetRtg','WTSP','WPIE','WNetScore'], axis=1, inplace=True)
df_regular_composite.drop(['LOSSES','LNetRtg','LTSP','LPIE','LNetScore'], axis=1, inplace=True)
#df_regular_composite = pd.merge(left=df_regular_composite,right=df_seeds,how='left',on=['Season','TeamID'])
df_regular_composite.head()

Unnamed: 0,Season,TeamID,WINPCT,NetRtg,TSP,PIE,NetScore
0,2003,1102,0.428571,0.217239,60.62481,0.488599,0.25
1,2003,1103,0.481481,0.288767,58.581188,0.509717,0.62963
2,2003,1104,0.607143,6.048999,52.172904,0.536514,4.285714
3,2003,1105,0.269231,-6.917127,50.433889,0.41505,-4.884615
4,2003,1106,0.464286,-0.466471,50.955444,0.527568,-0.142857


In [190]:
df_tour_win = pd.merge(left=df_tour, right=df_regular_composite ,how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
df_tour_win.drop(labels=['TeamID'], inplace=True, axis=1)
df_tour_win = pd.merge(left=df_tour_win, right=df_seeds ,how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
df_tour_win.drop(labels=['TeamID'], inplace=True, axis=1)
df_tour_win.head()

Unnamed: 0,Season,WTeamID,LTeamID,WINPCT,NetRtg,TSP,PIE,NetScore,Seed
0,2003,1421,1411,0.448276,-10.273157,54.182467,0.440672,-7.241379,16
1,2003,1112,1436,0.892857,20.281151,55.733373,0.654693,14.964286,1
2,2003,1113,1272,0.62069,10.463441,55.735448,0.58516,6.793103,10
3,2003,1141,1166,0.793103,8.771971,62.403685,0.569291,6.103448,11
4,2003,1143,1301,0.724138,6.939357,55.645587,0.55613,4.724138,8


In [191]:
df_tour_loss = pd.merge(left=df_tour, right=df_regular_composite ,how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
df_tour_loss.drop(labels=['TeamID'], inplace=True, axis=1)
df_tour_loss = pd.merge(left=df_tour_loss, right=df_seeds ,how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
df_tour_loss.drop(labels=['TeamID'], inplace=True, axis=1)
df_tour_loss.head()

Unnamed: 0,Season,WTeamID,LTeamID,WINPCT,NetRtg,TSP,PIE,NetScore,Seed
0,2003,1421,1411,0.6,2.371259,53.98189,0.547692,1.966667,16
1,2003,1112,1436,0.655172,7.739819,52.83826,0.561326,4.655172,16
2,2003,1113,1272,0.793103,13.024197,53.449587,0.597778,8.689655,7
3,2003,1141,1166,0.878788,22.124537,59.690051,0.644084,14.909091,6
4,2003,1143,1301,0.6,6.744849,58.210991,0.541945,4.4,9


In [192]:
df_tour_win_diff = df_tour_win.iloc[:, 3:] - df_tour_loss.iloc[:, 3:]
df_tour_win_diff['Result'] = 1
df_tour_win_diff.head()

Unnamed: 0,WINPCT,NetRtg,TSP,PIE,NetScore,Seed,Result
0,-0.151724,-12.644416,0.200577,-0.107021,-9.208046,0,1
1,0.237685,12.541332,2.895113,0.093367,10.309113,-15,1
2,-0.172414,-2.560755,2.285861,-0.012618,-1.896552,3,1
3,-0.085684,-13.352566,2.713634,-0.074793,-8.805643,5,1
4,0.124138,0.194509,-2.565404,0.014185,0.324138,-1,1


In [193]:
df_tour_loss_diff = df_tour_loss.iloc[:, 3:] - df_tour_win.iloc[:, 3:]
df_tour_loss_diff['Result'] = 0
df_tour_loss_diff.head()

Unnamed: 0,WINPCT,NetRtg,TSP,PIE,NetScore,Seed,Result
0,0.151724,12.644416,-0.200577,0.107021,9.208046,0,0
1,-0.237685,-12.541332,-2.895113,-0.093367,-10.309113,15,0
2,0.172414,2.560755,-2.285861,0.012618,1.896552,-3,0
3,0.085684,13.352566,-2.713634,0.074793,8.805643,-5,0
4,-0.124138,-0.194509,2.565404,-0.014185,-0.324138,1,0


In [194]:
df_predictions = pd.concat((df_tour_win_diff, df_tour_loss_diff), axis=0)
df_predictions.head()

Unnamed: 0,WINPCT,NetRtg,TSP,PIE,NetScore,Seed,Result
0,-0.151724,-12.644416,0.200577,-0.107021,-9.208046,0,1
1,0.237685,12.541332,2.895113,0.093367,10.309113,-15,1
2,-0.172414,-2.560755,2.285861,-0.012618,-1.896552,3,1
3,-0.085684,-13.352566,2.713634,-0.074793,-8.805643,5,1
4,0.124138,0.194509,-2.565404,0.014185,0.324138,-1,1


In [195]:
X_train1 = df_predictions.drop(['Result','NetRtg','TSP','PIE','NetScore','WINPCT'],axis=1)
y_train1 = df_predictions["Result"]
X_train1, y_train1 = shuffle(X_train1, y_train1)

X_train2 = df_predictions.drop(['Result','Seed','NetScore','WINPCT','TSP'],axis=1)
y_train2 = df_predictions["Result"]
X_train2, y_train2 = shuffle(X_train2, y_train2)

In [196]:
from sklearn.feature_selection import SelectPercentile, f_classif

selector = SelectPercentile(f_classif, percentile=100)
selector.fit(X_train1, y_train1)
p_scores = (selector.pvalues_) 
F_scores = (selector.scores_)

df_significance = pd.DataFrame({"Feature":X_train1.columns, "p_value":p_scores , "F_score":F_scores})

df_significance

Unnamed: 0,F_score,Feature,p_value
0,640.166283,Seed,1.8590269999999998e-122


In [197]:
selector = SelectPercentile(f_classif, percentile=100)
selector.fit(X_train2, y_train2)
p_scores = (selector.pvalues_) 
F_scores = (selector.scores_)

df_significance = pd.DataFrame({"Feature":X_train2.columns, "p_value":p_scores , "F_score":F_scores})

df_significance

Unnamed: 0,F_score,Feature,p_value
0,438.356483,NetRtg,5.239679e-88
1,346.504343,PIE,2.409322e-71


In [198]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

vote_est1 = [
    ('pipe1', Pipeline([('std1', StandardScaler()),('lr1', linear_model.LogisticRegression())]))
]

grid_param1 = [   
            [{
            #LogisticRegression
            'lr1__C': np.logspace(start=-5, stop=3, num=9)
            }] 
        ]

vote_est2 = [
    ('pipe2', Pipeline([('std2', StandardScaler()),('lr2', linear_model.LogisticRegression())]))
]

grid_param2 = [   
            [{
            #LogisticRegression
            'lr2__C': np.logspace(start=-5, stop=3, num=9)
            }] 
        ]

start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter

for clf1, param1 in zip (vote_est1, grid_param1): #https://docs.python.org/3/library/functions.html#zip
    start = time.perf_counter()        
    best_search1 = model_selection.GridSearchCV(estimator = clf1[1], param_grid = param1, scoring = 'neg_log_loss', refit=True)
    best_search1.fit(X_train1, y_train1)
    run = time.perf_counter() - start

    best_param = best_search1.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf1[1].__class__.__name__, best_param, run))
    clf1[1].set_params(**best_param) 
    print('Best log_loss: {:.4}'.format(best_search1.best_score_))
print('-'*10)

The best parameter for Pipeline is {'lr1__C': 1.0} with a runtime of 0.33 seconds.
Best log_loss: -0.5561
----------


In [199]:
start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter

for clf2, param2 in zip (vote_est2, grid_param2): #https://docs.python.org/3/library/functions.html#zip
    start = time.perf_counter()        
    best_search2 = model_selection.GridSearchCV(estimator = clf2[1], param_grid = param2, scoring = 'neg_log_loss', refit=True)
    best_search2.fit(X_train2, y_train2)
    run = time.perf_counter() - start

    best_param = best_search2.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf2[1].__class__.__name__, best_param, run))
    clf2[1].set_params(**best_param) 
    print('Best log_loss: {:.4}'.format(best_search2.best_score_))
print('-'*10)

The best parameter for Pipeline is {'lr2__C': 1.0} with a runtime of 0.32 seconds.
Best log_loss: -0.5918
----------


In [200]:
grid_search1 = linear_model.LogisticRegression(C=1)
scaler1 = StandardScaler()
X_train1[['Seed']] = scaler1.fit_transform(X_train1[['Seed']])
grid_search1.fit(X_train1, y_train1)

grid_search2 = linear_model.LogisticRegression(C=10)
scaler2 = StandardScaler()
X_train2[['NetRtg','PIE']] = scaler2.fit_transform(X_train2[['NetRtg','PIE']])#maintain order of data
grid_search2.fit(X_train2, y_train2)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [201]:
from trueskill import TrueSkill, Rating, rate_1vs1

ts = TrueSkill(draw_probability=0.01) # 0.01 is arbitary small number
beta = 25 / 6  # default value

def win_probability(p1, p2):
    delta_mu = p1.mu - p2.mu
    sum_sigma = p1.sigma * p1.sigma + p2.sigma * p2.sigma
    denom = np.sqrt(2 * (beta * beta) + sum_sigma)
    return ts.cdf(delta_mu / denom)
    
submit = pd.read_csv('../input/NCAA/SampleSubmissionStage1.csv')
submit[['Season', 'Team1', 'Team2']] = submit.apply(lambda r:pd.Series([int(t) for t in r.ID.split('_')]), axis=1)

df_tour = pd.read_csv('../input/NCAA/RegularSeasonCompactResults.csv')
teamIds = np.unique(np.concatenate([df_tour.WTeamID.values, df_tour.LTeamID.values]))
ratings = { tid:ts.Rating() for tid in teamIds }

def feed_season_results(season):
    print("season = {}".format(season))
    df1 = df_tour[df_tour.Season == season]
    for r in df1.itertuples():
        ratings[r.WTeamID], ratings[r.LTeamID] = rate_1vs1(ratings[r.WTeamID], ratings[r.LTeamID])

def update_pred(season):
    beta = np.std([r.mu for r in ratings.values()]) 
    print("beta = {}".format(beta))
    submit.loc[submit.Season==season, 'Pred'] = submit[submit.Season==season].apply(lambda r:win_probability(ratings[r.Team1], ratings[r.Team2]), axis=1)

for season in sorted(df_tour.Season.unique()):
    feed_season_results(season)    
    
#for season in sorted(df_tour.Season.unique())[:-4]: # exclude last 4 years
#    feed_season_results(season)

#update_pred(2014)
#feed_season_results(2014)
#update_pred(2015)
#feed_season_results(2015)
#update_pred(2016)
#feed_season_results(2016)
#update_pred(2017)

submit.drop(['Season', 'Team1', 'Team2'], axis=1, inplace=True)

season = 1985
season = 1986
season = 1987
season = 1988
season = 1989
season = 1990
season = 1991
season = 1992
season = 1993
season = 1994
season = 1995
season = 1996
season = 1997
season = 1998
season = 1999
season = 2000
season = 2001
season = 2002
season = 2003
season = 2004
season = 2005
season = 2006
season = 2007
season = 2008
season = 2009
season = 2010
season = 2011
season = 2012
season = 2013
beta = 5.173598443091854
season = 2014
beta = 5.22032415936154
season = 2015
beta = 5.175775424826819
season = 2016
beta = 5.158745585314825


In [202]:
df_regular_composite.head()

Unnamed: 0,Season,TeamID,WINPCT,NetRtg,TSP,PIE,NetScore
0,2003,1102,0.428571,0.217239,60.62481,0.488599,0.25
1,2003,1103,0.481481,0.288767,58.581188,0.509717,0.62963
2,2003,1104,0.607143,6.048999,52.172904,0.536514,4.285714
3,2003,1105,0.269231,-6.917127,50.433889,0.41505,-4.884615
4,2003,1106,0.464286,-0.466471,50.955444,0.527568,-0.142857


In [203]:
df_sample_sub = pd.read_csv(data_dir + 'Stage2/SampleSubmissionStage1.csv')

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`."""
    return (int(x) for x in ID.split('_'))

X_test1 = pd.DataFrame()
X_test2 = pd.DataFrame()
for ii, row in df_sample_sub.iterrows():
    year, t1, t2 = get_year_t1_t2(row.ID)
    t1_seed = df_seeds[(df_seeds.TeamID == t1) & (df_seeds.Season == year)].Seed.values[0]
    t2_seed = df_seeds[(df_seeds.TeamID == t2) & (df_seeds.Season == year)].Seed.values[0]
    diff_seed = t1_seed - t2_seed
    #t1_winpct = df_regular_composite[(df_regular_composite.TeamID == t1) & (df_regular_composite.Season == year)].WINPCT.values[0]
    #t2_winpct = df_regular_composite[(df_regular_composite.TeamID == t2) & (df_regular_composite.Season == year)].WINPCT.values[0]
    #diff_winpct = t1_winpct - t2_winpct
    #t1_netscore = df_regular_composite[(df_regular_composite.TeamID == t1) & (df_regular_composite.Season == year)].NetScore.values[0]
    #t2_netscore = df_regular_composite[(df_regular_composite.TeamID == t2) & (df_regular_composite.Season == year)].NetScore.values[0]
    #diff_netscore = t1_netscore - t2_netscore
    t1_netrtg = df_regular_composite[(df_regular_composite.TeamID == t1) & (df_regular_composite.Season == year)].NetRtg.values[0]
    t2_netrtg = df_regular_composite[(df_regular_composite.TeamID == t2) & (df_regular_composite.Season == year)].NetRtg.values[0]
    diff_netrtg = t1_netrtg - t2_netrtg
    t1_pie = df_regular_composite[(df_regular_composite.TeamID == t1) & (df_regular_composite.Season == year)].PIE.values[0]
    t2_pie = df_regular_composite[(df_regular_composite.TeamID == t2) & (df_regular_composite.Season == year)].PIE.values[0]
    diff_pie = t1_pie - t2_pie
    #t1_tsp = df_regular_composite[(df_regular_composite.TeamID == t1) & (df_regular_composite.Season == year)].TSP.values[0]
    #t2_tsp = df_regular_composite[(df_regular_composite.TeamID == t2) & (df_regular_composite.Season == year)].TSP.values[0]
    #diff_tsp = t1_tsp - t2_tsp        
    X_test1 = X_test1.append({'Seed':diff_seed},ignore_index=True)    
    X_test2 = X_test2.append({'NetRtg':diff_netrtg,'PIE':diff_pie},ignore_index=True)    

X_test1.head()

Unnamed: 0,Seed
0,1.0
1,15.0
2,6.0
3,10.0
4,6.0


In [204]:
X_test1[['Seed']] = scaler1.transform(X_test1[['Seed']])
X_test2[['NetRtg','PIE']] = scaler2.transform(X_test2[['NetRtg','PIE']])

In [205]:
preds1 = grid_search1.predict_proba(X_test1)[:,1]
#clipped_preds = np.clip(preds, 0.05, 0.95)
preds2 = grid_search2.predict_proba(X_test2)[:,1]
preds3 = submit.Pred.values
preds4 = (preds1 + preds2 + preds3)/3
df_sample_sub.Pred = preds4
df_sample_sub.to_csv('../working/NCAA_short4.csv',index=False)

In [206]:
df_sample_sub.Pred = preds1
df_sample_sub.to_csv('../working/NCAA_short1.csv',index=False)

df_sample_sub.Pred = preds2
df_sample_sub.to_csv('../working/NCAA_short2.csv',index=False)

df_sample_sub.Pred = preds3
df_sample_sub.to_csv('../working/NCAA_short3.csv',index=False)