In [236]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import time

# machine learning
from sklearn import linear_model, ensemble
from sklearn import model_selection

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

data_dir = '../input/NCAA/'
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'NCAATourneyCompactResults.csv')
df_slots = pd.read_csv(data_dir + 'NCAATourneySlots.csv')
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [237]:
df_tour.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [238]:
def seed_to_int(seed):
    s_int = int(seed[1:3])
    return s_int
df_seeds['seed_int'] = df_seeds.Seed.apply(seed_to_int)
#df_seeds.drop(labels=['Seed'], inplace=True, axis=1)
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID,seed_int
0,1985,W01,1207,1
1,1985,W02,1210,2
2,1985,W03,1228,3
3,1985,W04,1260,4
4,1985,W05,1374,5


In [239]:
df_tour.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
df_tour.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


In [240]:
df_regular_detail = pd.read_csv(data_dir + 'RegularSeasonDetailedResults.csv')
df_regular_detail.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [241]:
wPos = df_regular_detail.apply(lambda row: 0.96*(row.WFGA + row.WTO + 0.44*row.WFTA - row.WOR), axis=1)
lPos = df_regular_detail.apply(lambda row: 0.96*(row.LFGA + row.LTO + 0.44*row.LFTA - row.LOR), axis=1)
df_regular_detail['Pos'] = (wPos+lPos)/2
#Offensive efficiency (OffRtg) = 100 x (Points / Possessions)
df_regular_detail['WOffRtg'] = df_regular_detail.apply(lambda row: 100 * (row.WScore / row.Pos), axis=1)
df_regular_detail['LOffRtg'] = df_regular_detail.apply(lambda row: 100 * (row.LScore / row.Pos), axis=1)
#Defensive efficiency (DefRtg) = 100 x (Opponent points / Opponent possessions)
df_regular_detail['WDefRtg'] = df_regular_detail.LOffRtg
df_regular_detail['LDefRtg'] = df_regular_detail.WOffRtg
#Net Rating = Off.Rtg - Def.Rtg
df_regular_detail['WNetRtg'] = df_regular_detail.apply(lambda row:(row.WOffRtg - row.WDefRtg), axis=1)
df_regular_detail['LNetRtg'] = df_regular_detail.apply(lambda row:(row.LOffRtg - row.LDefRtg), axis=1)
#The Shooting Percentage : Measure of Shooting Efficiency (FGA/FGA3, FTA)
df_regular_detail['WTSP'] = df_regular_detail.apply(lambda row: 100 * row.WScore / (2 * (row.WFGA + 0.44 * row.WFTA)), axis=1)
df_regular_detail['LTSP'] = df_regular_detail.apply(lambda row: 100 * row.LScore / (2 * (row.LFGA + 0.44 * row.LFTA)), axis=1)
#PIE Player Impact Estimate
wtmp = df_regular_detail.apply(lambda row: row.WScore + row.WFGM + row.WFTM - row.WFGA - row.WFTA + row.WDR + 0.5*row.WOR + row.WAst +row.WStl + 0.5*row.WBlk - row.WPF - row.WTO, axis=1)
ltmp = df_regular_detail.apply(lambda row: row.LScore + row.LFGM + row.LFTM - row.LFGA - row.LFTA + row.LDR + 0.5*row.LOR + row.LAst +row.LStl + 0.5*row.LBlk - row.LPF - row.LTO, axis=1) 
df_regular_detail['WPIE'] = wtmp/(wtmp + ltmp)
df_regular_detail['LPIE'] = ltmp/(wtmp + ltmp)
df_regular_detail.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WOffRtg,LOffRtg,WDefRtg,LDefRtg,WNetRtg,LNetRtg,WTSP,LTSP,WPIE,LPIE
0,2003,10,1104,68,1328,62,N,0,27,58,...,97.298535,88.71337,88.71337,97.298535,8.585165,-8.585165,51.57767,49.457562,0.532847,0.467153
1,2003,10,1272,70,1393,63,N,0,26,62,...,107.104387,96.393948,96.393948,107.104387,10.710439,-10.710439,49.744173,41.556728,0.602339,0.397661
2,2003,11,1266,73,1437,61,N,0,24,58,...,118.926598,99.37702,99.37702,118.926598,19.549578,-19.549578,51.582815,36.693936,0.736434,0.263566
3,2003,11,1296,56,1457,50,N,0,18,38,...,101.237996,90.391068,90.391068,101.237996,10.846928,-10.846928,54.221534,44.964029,0.754717,0.245283
4,2003,11,1400,77,1208,71,N,0,30,61,...,126.711427,116.837809,116.837809,126.711427,9.873618,-9.873618,57.703837,48.050893,0.570732,0.429268


In [242]:
df_regular = df_regular_detail.drop(['DayNum','WLoc', 'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA','WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA','LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO','LStl', 'LBlk', 'LPF', 'Pos', 'WOffRtg', 'LOffRtg', 'WDefRtg','LDefRtg'],axis=1)
df_regular.head()

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore,WNetRtg,LNetRtg,WTSP,LTSP,WPIE,LPIE
0,2003,1104,68,1328,62,8.585165,-8.585165,51.57767,49.457562,0.532847,0.467153
1,2003,1272,70,1393,63,10.710439,-10.710439,49.744173,41.556728,0.602339,0.397661
2,2003,1266,73,1437,61,19.549578,-19.549578,51.582815,36.693936,0.736434,0.263566
3,2003,1296,56,1457,50,10.846928,-10.846928,54.221534,44.964029,0.754717,0.245283
4,2003,1400,77,1208,71,9.873618,-9.873618,57.703837,48.050893,0.570732,0.429268


In [243]:
df_regular_win = df_regular.groupby(['Season','WTeamID'])[["WScore","LScore"]].agg(['sum']).reset_index(level=['Season', 'WTeamID']).rename(columns={'WTeamID':'TeamID', 'WScore':'home_score1', 'LScore':'away_score1'})

In [244]:
df_regular_loss = df_regular.groupby(['Season','LTeamID'])[["WScore","LScore"]].agg(['sum']).reset_index(level=['Season', 'LTeamID']).rename(columns={'LTeamID':'TeamID', 'LScore':'home_score2', 'WScore':'away_score2'})
df_regular_loss.head()

Unnamed: 0_level_0,Season,TeamID,away_score2,home_score2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum
0,2003,1102,958,778
1,2003,1103,1091,986
2,2003,1104,774,670
3,2003,1105,1528,1310
4,2003,1106,1032,893


In [245]:
df_regular_full = pd.merge(left=df_regular_win, right=df_regular_loss, how='left', on=['Season', 'TeamID'])
df_regular_full.head()

Unnamed: 0_level_0,Season,TeamID,home_score1,away_score1,away_score2,home_score2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum,sum,sum
0,2003,1102,825,638,958.0,778.0
1,2003,1103,1141,1019,1091.0,986.0
2,2003,1104,1270,1046,774.0,670.0
3,2003,1105,556,465,1528.0,1310.0
4,2003,1106,888,753,1032.0,893.0


In [246]:
df_regular_full.columns = df_regular_full.columns.droplevel(1)
df_regular_full[df_regular_full['away_score2'].apply(np.isnan)]

Unnamed: 0,Season,TeamID,home_score1,away_score1,away_score2,home_score2
4064,2014,1455,2473,1976,,
4211,2015,1246,2547,1835,,


In [247]:
df_regular_full['home_score2'].fillna(0, inplace=True)
df_regular_full['away_score2'].fillna(0, inplace=True)
df_regular_full['net_score'] = df_regular_full['home_score1'] + df_regular_full['home_score2'] - df_regular_full['away_score1'] - df_regular_full['away_score2']
df_regular_full.head()

Unnamed: 0,Season,TeamID,home_score1,away_score1,away_score2,home_score2,net_score
0,2003,1102,825,638,958.0,778.0,7.0
1,2003,1103,1141,1019,1091.0,986.0,17.0
2,2003,1104,1270,1046,774.0,670.0,120.0
3,2003,1105,556,465,1528.0,1310.0,-127.0
4,2003,1106,888,753,1032.0,893.0,-4.0


In [248]:
df_regular_full[df_regular_full['net_score'].apply(np.isnan)]

Unnamed: 0,Season,TeamID,home_score1,away_score1,away_score2,home_score2,net_score


In [249]:
df_regular_full['band_net'] = pd.qcut(df_regular_full['net_score'],8)
df_regular_full[['band_net', 'home_score1']].groupby(['band_net'], as_index=False).mean()

Unnamed: 0,band_net,home_score1
0,"[-750, -222.5]",442.290172
1,"(-222.5, -135]",673.619632
2,"(-135, -68]",852.791536
3,"(-68, -6]",1013.783912
4,"(-6, 57]",1169.513261
5,"(57, 133]",1352.580595
6,"(133, 234]",1562.683307
7,"(234, 773]",1912.827856


In [250]:
df_regular_full['band1'] = pd.qcut(df_regular_full['home_score1'],8)
df_regular_full[['band1', 'home_score2']].groupby(['band1'], as_index=False).mean()

Unnamed: 0,band1,home_score2
0,"[53, 552]",1373.059006
1,"(552, 764]",1208.995327
2,"(764, 920]",1095.010989
3,"(920, 1084]",1007.193448
4,"(1084, 1265.5]",900.635368
5,"(1265.5, 1459]",800.294393
6,"(1459, 1716.5]",671.169014
7,"(1716.5, 2858]",469.170047


In [251]:
df_regular_full['band2'] = pd.qcut(df_regular_full['home_score2'], 8)
df_regular_full[['band2', 'home_score1']].groupby(['band2'], as_index=False).mean()

Unnamed: 0,band2,home_score1
0,"[0, 554]",1863.406832
1,"(554, 717]",1545.41784
2,"(717, 841]",1319.839813
3,"(841, 941]",1148.855573
4,"(941, 1049]",1008.797508
5,"(1049, 1174]",882.41784
6,"(1174, 1324]",710.325545
7,"(1324, 1986]",488.084507


In [252]:
df_regular_full['band3'] = pd.qcut(df_regular_full['away_score1'], 8)
df_regular_full[['band3', 'away_score2']].groupby(['band3'], as_index=False).mean()

Unnamed: 0,band3,away_score2
0,"[43, 484]",1703.71938
1,"(484, 659]",1457.948195
2,"(659, 794]",1303.826625
3,"(794, 927]",1192.135008
4,"(927, 1068.5]",1053.65047
5,"(1068.5, 1216]",924.031201
6,"(1216, 1418]",772.021841
7,"(1418, 2302]",544.020313


In [253]:
df_regular_full['band4'] = pd.qcut(df_regular_full['away_score2'], 8)
df_regular_full[['band4', 'away_score1']].groupby(['band4'], as_index=False).mean()

Unnamed: 0,band4,away_score1
0,"[0, 633]",1507.359253
1,"(633, 832]",1282.771875
2,"(832, 979]",1115.174727
3,"(979, 1110]",980.781395
4,"(1110, 1246.5]",866.695584
5,"(1246.5, 1400]",763.698289
6,"(1400, 1605.5]",610.426332
7,"(1605.5, 2421]",409.24025


In [254]:
df_regular_full.head(10)

Unnamed: 0,Season,TeamID,home_score1,away_score1,away_score2,home_score2,net_score,band_net,band1,band2,band3,band4
0,2003,1102,825,638,958.0,778.0,7.0,"(-6, 57]","(764, 920]","(717, 841]","(484, 659]","(832, 979]"
1,2003,1103,1141,1019,1091.0,986.0,17.0,"(-6, 57]","(1084, 1265.5]","(941, 1049]","(927, 1068.5]","(979, 1110]"
2,2003,1104,1270,1046,774.0,670.0,120.0,"(57, 133]","(1265.5, 1459]","(554, 717]","(927, 1068.5]","(633, 832]"
3,2003,1105,556,465,1528.0,1310.0,-127.0,"(-135, -68]","(552, 764]","(1174, 1324]","[43, 484]","(1400, 1605.5]"
4,2003,1106,888,753,1032.0,893.0,-4.0,"(-6, 57]","(764, 920]","(841, 941]","(659, 794]","(979, 1110]"
5,2003,1107,492,459,1661.0,1354.0,-274.0,"[-750, -222.5]","[53, 552]","(1324, 1986]","[43, 484]","(1605.5, 2421]"
6,2003,1108,1050,920,1493.0,1230.0,-133.0,"(-135, -68]","(920, 1084]","(1174, 1324]","(794, 927]","(1400, 1605.5]"
7,2003,1110,1147,914,943.0,844.0,134.0,"(133, 234]","(1084, 1265.5]","(841, 941]","(794, 927]","(832, 979]"
8,2003,1111,1410,1209,916.0,755.0,40.0,"(-6, 57]","(1265.5, 1459]","(717, 841]","(1068.5, 1216]","(832, 979]"
9,2003,1112,2155,1723,244.0,231.0,419.0,"(234, 773]","(1716.5, 2858]","[0, 554]","(1418, 2302]","[0, 633]"


In [255]:
df_regular_full.loc[ df_regular_full['net_score'] <= -219, 'net_score'] = -219
df_regular_full.loc[(df_regular_full['net_score'] > -219) & (df_regular_full['net_score'] <= -133), 'net_score'] = -133
df_regular_full.loc[(df_regular_full['net_score'] > -133) & (df_regular_full['net_score'] <= -67), 'net_score'] = -67 
df_regular_full.loc[(df_regular_full['net_score'] > -67) & (df_regular_full['net_score'] <= -3), 'net_score'] = -3
df_regular_full.loc[(df_regular_full['net_score'] > -3) & (df_regular_full['net_score'] <= 58), 'net_score']   = 58
df_regular_full.loc[(df_regular_full['net_score'] > 58) & (df_regular_full['net_score'] <= 130), 'net_score'] = 130
df_regular_full.loc[(df_regular_full['net_score'] > 130) & (df_regular_full['net_score'] <= 227), 'net_score']   = 227
df_regular_full.loc[ df_regular_full['net_score'] > 227, 'net_score'] = 7
df_regular_full['net_score'] = df_regular_full['net_score'].map({-219:0,-133:1,-67:2,-3:3,58:4,130:5,227:6,7:7})
df_regular_full['net_score'] = df_regular_full['net_score'].astype(int)

df_regular_full.loc[ df_regular_full['home_score1'] <= 528, 'home_score1'] = 0
df_regular_full.loc[(df_regular_full['home_score1'] > 528) & (df_regular_full['home_score1'] <= 733), 'home_score1'] = 1
df_regular_full.loc[(df_regular_full['home_score1'] > 733) & (df_regular_full['home_score1'] <= 903), 'home_score1']   = 2
df_regular_full.loc[(df_regular_full['home_score1'] > 903) & (df_regular_full['home_score1'] <= 1065), 'home_score1'] = 3
df_regular_full.loc[(df_regular_full['home_score1'] > 1065) & (df_regular_full['home_score1'] <= 1237), 'home_score1']   = 4
df_regular_full.loc[(df_regular_full['home_score1'] > 1237) & (df_regular_full['home_score1'] <= 1432), 'home_score1'] = 5
df_regular_full.loc[(df_regular_full['home_score1'] > 1432) & (df_regular_full['home_score1'] <= 1685), 'home_score1']   = 6
df_regular_full.loc[ df_regular_full['home_score1'] > 1685, 'home_score1'] = 7
df_regular_full['home_score1'] = df_regular_full['home_score1'].astype(int)

df_regular_full.loc[ df_regular_full['home_score2'] <= 539, 'home_score2'] = 0
df_regular_full.loc[(df_regular_full['home_score2'] > 539) & (df_regular_full['home_score2'] <= 697), 'home_score2'] = 1
df_regular_full.loc[(df_regular_full['home_score2'] > 697) & (df_regular_full['home_score2'] <= 817), 'home_score2']   = 2
df_regular_full.loc[(df_regular_full['home_score2'] > 817) & (df_regular_full['home_score2'] <= 921), 'home_score2'] = 3
df_regular_full.loc[(df_regular_full['home_score2'] > 921) & (df_regular_full['home_score2'] <= 1029), 'home_score2']   = 4
df_regular_full.loc[(df_regular_full['home_score2'] > 1029) & (df_regular_full['home_score2'] <= 1153), 'home_score2'] = 5
df_regular_full.loc[(df_regular_full['home_score2'] > 1153) & (df_regular_full['home_score2'] <= 1300), 'home_score2']   = 6
df_regular_full.loc[ df_regular_full['home_score2'] > 1300, 'home_score2'] = 7
df_regular_full['home_score2'] = df_regular_full['home_score2'].astype(int)

df_regular_full.loc[ df_regular_full['away_score1'] <= 464, 'away_score1'] = 0
df_regular_full.loc[(df_regular_full['away_score1'] > 464) & (df_regular_full['away_score1'] <= 638.5), 'away_score1'] = 1
df_regular_full.loc[(df_regular_full['away_score1'] > 638.5) & (df_regular_full['away_score1'] <= 777), 'away_score1']   = 2
df_regular_full.loc[(df_regular_full['away_score1'] > 777) & (df_regular_full['away_score1'] <= 906), 'away_score1'] = 3
df_regular_full.loc[(df_regular_full['away_score1'] > 906) & (df_regular_full['away_score1'] <= 1044), 'away_score1']   = 4
df_regular_full.loc[(df_regular_full['away_score1'] > 1044) & (df_regular_full['away_score1'] <= 1196), 'away_score1'] = 5
df_regular_full.loc[(df_regular_full['away_score1'] > 1196) & (df_regular_full['away_score1'] <= 1394), 'away_score1']   = 6
df_regular_full.loc[ df_regular_full['away_score1'] > 1394, 'away_score1'] = 7
df_regular_full['away_score1'] = df_regular_full['away_score1'].astype(int)

df_regular_full.loc[ df_regular_full['away_score2'] <= 613, 'away_score2'] = 0
df_regular_full.loc[(df_regular_full['away_score2'] > 613) & (df_regular_full['away_score2'] <= 805), 'away_score2'] = 1
df_regular_full.loc[(df_regular_full['away_score2'] > 805) & (df_regular_full['away_score2'] <= 951.75), 'away_score2']   = 2
df_regular_full.loc[(df_regular_full['away_score2'] > 951.75) & (df_regular_full['away_score2'] <= 1082), 'away_score2'] = 3
df_regular_full.loc[(df_regular_full['away_score2'] > 1082) & (df_regular_full['away_score2'] <= 1221), 'away_score2']   = 4
df_regular_full.loc[(df_regular_full['away_score2'] > 1221) & (df_regular_full['away_score2'] <= 1376), 'away_score2'] = 5
df_regular_full.loc[(df_regular_full['away_score2'] > 1376) & (df_regular_full['away_score2'] <= 1575.75), 'away_score2']   = 6
df_regular_full.loc[ df_regular_full['away_score2'] > 1575.75, 'away_score2'] = 7
df_regular_full['away_score2'] = df_regular_full['away_score2'].astype(int)

df_regular_full.drop(labels=['band1','band2','band3','band4','band_net'], inplace=True, axis=1)

df_regular_full.head(10)

Unnamed: 0,Season,TeamID,home_score1,away_score1,away_score2,home_score2,net_score
0,2003,1102,2,1,3,2,4
1,2003,1103,4,4,4,4,4
2,2003,1104,5,5,1,1,5
3,2003,1105,1,1,6,7,2
4,2003,1106,2,2,3,3,3
5,2003,1107,0,0,7,7,0
6,2003,1108,3,4,6,6,1
7,2003,1110,4,4,2,3,6
8,2003,1111,5,6,2,2,4
9,2003,1112,7,7,0,0,7


In [256]:
df_regular1 = df_regular[['Season','WTeamID','WNetRtg','WTSP','WPIE']].rename(columns={'WTeamID':'TeamID','WNetRtg':'NetRtg','WTSP':'TSP','WPIE':'PIE'})
df_regular2 = df_regular[['Season','LTeamID','LNetRtg','LTSP','LPIE']].rename(columns={'LTeamID':'TeamID','LNetRtg':'NetRtg','LTSP':'TSP','LPIE':'PIE'})
df_regular3 = pd.concat([df_regular1,df_regular2])
df_regular_season = df_regular3.groupby(['Season','TeamID'])[["NetRtg","TSP","PIE"]].mean().reset_index(level=['Season', 'TeamID'])
df_regular_season.head()

Unnamed: 0,Season,TeamID,NetRtg,TSP,PIE
0,2003,1102,0.217239,60.62481,0.488599
1,2003,1103,0.288767,58.581188,0.509717
2,2003,1104,6.048999,52.172904,0.536514
3,2003,1105,-6.917127,50.433889,0.41505
4,2003,1106,-0.466471,50.955444,0.527568


In [257]:
df_regular_season['band1'] = pd.qcut(df_regular_season['NetRtg'], 8)
df_regular_season[['band1', 'NetRtg']].groupby(['band1'], as_index=False).mean()

Unnamed: 0,band1,NetRtg
0,"[-42.698, -12.0676]",-17.213742
1,"(-12.0676, -7.362]",-9.515821
2,"(-7.362, -3.683]",-5.46843
3,"(-3.683, -0.334]",-2.011042
4,"(-0.334, 2.933]",1.264149
5,"(2.933, 6.722]",4.758782
6,"(6.722, 11.56]",8.963242
7,"(11.56, 34.704]",16.555097


In [258]:
df_regular_season['band2'] = pd.qcut(df_regular_season['PIE'], 8)
df_regular_season[['band2', 'PIE']].groupby(['band2'], as_index=False).mean()

Unnamed: 0,band2,PIE
0,"[0.124, 0.404]",0.360541
1,"(0.404, 0.444]",0.424655
2,"(0.444, 0.473]",0.458899
3,"(0.473, 0.499]",0.486135
4,"(0.499, 0.525]",0.512361
5,"(0.525, 0.553]",0.539077
6,"(0.553, 0.591]",0.570363
7,"(0.591, 0.81]",0.627415


In [259]:
df_regular_season['band3'] = pd.qcut(df_regular_season['TSP'], 8)
df_regular_season[['band3', 'TSP']].groupby(['band3'], as_index=False).mean()

Unnamed: 0,band3,TSP
0,"[42.467, 50.139]",48.432917
1,"(50.139, 51.566]",50.885898
2,"(51.566, 52.672]",52.11897
3,"(52.672, 53.643]",53.143308
4,"(53.643, 54.593]",54.109149
5,"(54.593, 55.629]",55.11109
6,"(55.629, 57.107]",56.304997
7,"(57.107, 63.597]",58.574369


In [260]:
df_regular_season.loc[ df_regular_season['NetRtg'] <= -12.0676, 'NetRtg'] = -13
df_regular_season.loc[(df_regular_season['NetRtg'] > -12.0676) & (df_regular_season['NetRtg'] <= -7.362), 'NetRtg'] = -8
df_regular_season.loc[(df_regular_season['NetRtg'] > -7.362) & (df_regular_season['NetRtg'] <= -3.683), 'NetRtg'] = -4 
df_regular_season.loc[(df_regular_season['NetRtg'] > -3.683) & (df_regular_season['NetRtg'] <= -0.334), 'NetRtg'] = -1
df_regular_season.loc[(df_regular_season['NetRtg'] > -0.334) & (df_regular_season['NetRtg'] <= 2.933), 'NetRtg']   = 2
df_regular_season.loc[(df_regular_season['NetRtg'] > 2.933) & (df_regular_season['NetRtg'] <= 6.722), 'NetRtg'] = 6
df_regular_season.loc[(df_regular_season['NetRtg'] > 6.722) & (df_regular_season['NetRtg'] <= 11.56), 'NetRtg']   = 11
df_regular_season.loc[ df_regular_season['NetRtg'] > 11.56, 'NetRtg'] = 12
df_regular_season['NetRtg'] = df_regular_season['NetRtg'].map({-13:0,-8:1,-4:2,-1:3,2:4,6:5,11:6,12:7})
df_regular_season['NetRtg'] = df_regular_season['NetRtg'].astype(int)

df_regular_season.loc[ df_regular_season['PIE'] <= 0.404, 'PIE'] = -7
df_regular_season.loc[(df_regular_season['PIE'] > 0.404) & (df_regular_season['PIE'] <= 0.444), 'PIE'] = -6
df_regular_season.loc[(df_regular_season['PIE'] > 0.444) & (df_regular_season['PIE'] <= 0.473), 'PIE'] = -5 
df_regular_season.loc[(df_regular_season['PIE'] > 0.473) & (df_regular_season['PIE'] <= 0.499), 'PIE'] = -4
df_regular_season.loc[(df_regular_season['PIE'] > 0.499) & (df_regular_season['PIE'] <= 0.525), 'PIE']   = -3
df_regular_season.loc[(df_regular_season['PIE'] > 0.525) & (df_regular_season['PIE'] <= 0.553), 'PIE'] = -2
df_regular_season.loc[(df_regular_season['PIE'] > 0.553) & (df_regular_season['PIE'] <= 0.591), 'PIE']   = -1
df_regular_season.loc[ df_regular_season['PIE'] > 0.591, 'PIE'] = 0
df_regular_season['PIE'] = df_regular_season['PIE'].map({-7:0,-6:1,-5:2,-4:3,-3:4,-2:5,-1:6,0:7})
df_regular_season['PIE'] = df_regular_season['PIE'].astype(int)

df_regular_season.loc[ df_regular_season['TSP'] <= 50.139, 'TSP'] = 0
df_regular_season.loc[(df_regular_season['TSP'] > 50.139) & (df_regular_season['TSP'] <= 51.566), 'TSP'] = 1
df_regular_season.loc[(df_regular_season['TSP'] > 51.566) & (df_regular_season['TSP'] <= 52.672), 'TSP'] = 2 
df_regular_season.loc[(df_regular_season['TSP'] > 52.672) & (df_regular_season['TSP'] <= 53.643), 'TSP'] = 3
df_regular_season.loc[(df_regular_season['TSP'] > 53.643) & (df_regular_season['TSP'] <= 54.593), 'TSP']   = 4
df_regular_season.loc[(df_regular_season['TSP'] > 54.593) & (df_regular_season['TSP'] <= 55.629), 'TSP'] = 5
df_regular_season.loc[(df_regular_season['TSP'] > 55.629) & (df_regular_season['TSP'] <= 57.107), 'TSP']   = 6
df_regular_season.loc[ df_regular_season['TSP'] > 57.107, 'TSP'] = 7
df_regular_season['TSP'] = df_regular_season['TSP'].astype(int)

df_regular_season.drop(labels=['band1','band2','band3'], inplace=True, axis=1)
df_regular_season.head()

Unnamed: 0,Season,TeamID,NetRtg,TSP,PIE
0,2003,1102,4,7,3
1,2003,1103,4,7,4
2,2003,1104,5,2,5
3,2003,1105,2,1,1
4,2003,1106,3,1,5


In [261]:
df_seeds_win = df_seeds.rename(columns={'TeamID':'WTeamID', 'seed_int':'WSeed'})
df_seeds_loss = df_seeds.rename(columns={'TeamID':'LTeamID', 'seed_int':'LSeed'})
df_regular_season_win = df_regular_season.rename(columns={'TeamID':'WTeamID', 'NetRtg':'WNetRtg','TSP':'WTSP','PIE':'WPIE'})
df_regular_season_loss = df_regular_season.rename(columns={'TeamID':'LTeamID', 'NetRtg':'LNetRtg','TSP':'LTSP','PIE':'LPIE'})
df_concat1 = pd.merge(left=df_tour, right=df_seeds_win, how='left', on=['Season', 'WTeamID'])
df_concat2_1 = pd.merge(left=df_concat1, right=df_seeds_loss, on=['Season', 'LTeamID'])
df_concat2_2 = pd.merge(left=df_concat2_1, right=df_regular_season_win, on=['Season', 'WTeamID'])
df_concat2_3 = pd.merge(left=df_concat2_2, right=df_regular_season_loss, on=['Season', 'LTeamID'])
df_concat2 = df_concat2_3
df_concat2['SeedDiff'] = df_concat2.WSeed - df_concat2.LSeed
df_concat2['NetRtgDiff'] = df_concat2.WNetRtg - df_concat2.LNetRtg
df_concat2['TSPDiff'] = df_concat2.WTSP - df_concat2.LTSP
df_concat2['PIEDiff'] = df_concat2.WPIE - df_concat2.LPIE

df_regular_full_win = df_regular_full.rename(columns={'TeamID':'WTeamID', 'net_score':'Wnet_score', 'home_score1':'Whome_score1', 'home_score2':'Whome_score2', 'away_score1':'Waway_score1', 'away_score2':'Waway_score2'})
df_regular_full_loss = df_regular_full.rename(columns={'TeamID':'LTeamID', 'net_score':'Lnet_score', 'home_score1':'Lhome_score1', 'home_score2':'Lhome_score2', 'away_score1':'Laway_score1', 'away_score2':'Laway_score2'})
df_concat3 = pd.merge(left=df_concat2, right=df_regular_full_win, on=['Season', 'WTeamID'])
df_concat4 = pd.merge(left=df_concat3, right=df_regular_full_loss, on=['Season', 'LTeamID'])
df_concat4['net_scoreDiff'] = df_concat4.Wnet_score - df_concat4.Lnet_score
df_concat4['home_score1Diff'] = df_concat4.Whome_score1 - df_concat4.Lhome_score1
df_concat4['home_score2Diff'] = df_concat4.Whome_score2 - df_concat4.Lhome_score2
df_concat4['away_score1Diff'] = df_concat4.Waway_score1 - df_concat4.Laway_score1
df_concat4['away_score2Diff'] = df_concat4.Waway_score2 - df_concat4.Laway_score2

def getRound(x):
    t1_seed = df_seeds[(df_seeds.Season == x['Season']) & (df_seeds.TeamID == x['WTeamID'])].Seed.values[0]
    t2_seed = df_seeds[(df_seeds.Season == x['Season']) & (df_seeds.TeamID == x['LTeamID'])].Seed.values[0]
    t1_seed = t1_seed[0:3]#a/b seeds in 2001
    t2_seed = t2_seed[0:3]#a/b seeds in 2001
    found = False
    roundCount = 0
    #print("{} {} {}".format(x['Season'],x['WTeamID'],x['LTeamID']))
    while(found == False):
        roundCount+=1
        slot1 = df_slots[(df_slots.Season == x['Season']) & ((df_slots.StrongSeed == t1_seed) | (df_slots.WeakSeed == t1_seed))].Slot.values[0]
        slot2 = df_slots[(df_slots.Season == x['Season']) & ((df_slots.StrongSeed == t2_seed) | (df_slots.WeakSeed == t2_seed))].Slot.values[0]
        t1_seed = slot1
        t2_seed = slot2
        if(slot1 == slot2):
            found = True
            #print("Found: " + slot1)
    return roundCount

#df_concat4['round'] = df_concat4.apply(getRound,axis=1)

df_concat4.head()

Unnamed: 0,Season,WTeamID,LTeamID,Seed_x,WSeed,Seed_y,LSeed,WNetRtg,WTSP,WPIE,...,Lhome_score1,Laway_score1,Laway_score2,Lhome_score2,Lnet_score,net_scoreDiff,home_score1Diff,home_score2Diff,away_score1Diff,away_score2Diff
0,2003,1421,1411,X16b,16,X16a,16,1,4,1,...,5,6,2,2,5,-4,-2,3,-2,3
1,2003,1112,1436,Z01,1,Z16,16,7,6,7,...,5,5,1,1,6,1,2,-1,2,-1
2,2003,1112,1211,Z01,1,Z09,9,7,6,7,...,7,7,1,1,7,0,0,-1,0,-1
3,2003,1112,1323,Z01,1,Z05,5,7,6,7,...,7,7,1,1,7,0,0,-1,0,-1
4,2003,1113,1272,Z10,10,Z07,7,6,6,6,...,7,7,0,0,7,-1,-1,2,-2,2


In [262]:
df_wins = pd.DataFrame()
df_wins['SeedDiff'] = df_concat4['SeedDiff']
df_wins['NetRtgDiff'] = df_concat4['NetRtgDiff']
df_wins['TSPDiff'] = df_concat4['TSPDiff']
df_wins['PIEDiff'] = df_concat4['PIEDiff']
df_wins['net_scoreDiff'] = df_concat4['net_scoreDiff'].astype(int)
df_wins['home_score1Diff'] = df_concat4['home_score1Diff']
df_wins['home_score2Diff'] = df_concat4['home_score2Diff']
df_wins['away_score1Diff'] = df_concat4['away_score1Diff']
df_wins['away_score2Diff'] = df_concat4['away_score2Diff']
#df_wins['round'] = df_concat4['round']
#df_wins['diff_seed_round'] = df_concat4['SeedDiff']/df_concat4['round']
df_wins['Result'] = 1

df_losses = pd.DataFrame()
df_losses['SeedDiff'] = -df_concat4['SeedDiff']
df_losses['NetRtgDiff'] = -df_concat4['NetRtgDiff']
df_losses['TSPDiff'] = -df_concat4['TSPDiff']
df_losses['PIEDiff'] = -df_concat4['PIEDiff']
df_losses['net_scoreDiff'] = -df_concat4['net_scoreDiff'].astype(int)
df_losses['home_score1Diff'] = -df_concat4['home_score1Diff']
df_losses['home_score2Diff'] = -df_concat4['home_score2Diff']
df_losses['away_score1Diff'] = -df_concat4['away_score1Diff']
df_losses['away_score2Diff'] = -df_concat4['away_score2Diff']
#df_losses['round'] = df_concat4['round']
#df_losses['diff_seed_round'] = -df_concat4['SeedDiff']/df_concat4['round']
df_losses['Result'] = 0

df_predictions = pd.concat((df_wins, df_losses))
df_predictions.head()

Unnamed: 0,SeedDiff,NetRtgDiff,TSPDiff,PIEDiff,net_scoreDiff,home_score1Diff,home_score2Diff,away_score1Diff,away_score2Diff,Result
0,0,-3,0,-4,-4,-2,3,-2,3,1
1,-15,1,3,1,1,2,-1,2,-1,1
2,-8,0,-1,0,0,0,-1,0,-1,1
3,-4,0,0,0,0,0,-1,0,-1,1
4,3,-1,3,-1,-1,-1,2,-2,2,1


In [263]:
X_train1 = df_predictions.drop(['Result','NetRtgDiff','TSPDiff','PIEDiff','net_scoreDiff'],axis=1)
y_train1 = df_predictions["Result"]
X_train1, y_train1 = shuffle(X_train1, y_train1)

X_train2 = df_predictions.drop(['Result','SeedDiff','home_score1Diff','home_score2Diff','away_score1Diff','away_score2Diff','net_scoreDiff'],axis=1)
y_train2 = df_predictions["Result"]
X_train2, y_train2 = shuffle(X_train2, y_train2)

<h1>
Regular season features extracted
</h1>
<table align='left'>
    <tr>
        <th>Term</th><th>Description</th>
    </tr>
    <tr>
        <td>home_score1</td><td>Total points scored in winning cause</td>
    </tr>
    <tr>
        <td>aaway_score1</td><td>Total points conceded in winning cause</td>
    </tr>    
    <tr>
        <td>home_score2</td><td>Total points scored in losing cause</td>
    </tr>
    <tr>
        <td>away_score2</td><td>Total points conceded in losing cause</td>
    </tr>
    <tr>
        <td>net_score</td><td>Total net points</td>
    </tr>    
    <tr>
        <td>home_score1Diff</td><td>home_score1['winning team'] - home_score1['losing team']</td>
    </tr>
    <tr>
        <td>home_score2Diff</td><td>home_score2['winning team'] - home_score2['losing team']</td>
    </tr>
    <tr>
        <td>away_score1Diff</td><td>away_score1['winning team'] - away_score1['losing team']</td>
    </tr>
    <tr>
        <td>away_score2Diff</td><td>away_score2['winning team'] - away_score2['losing team']</td>
    </tr>
    <tr>
        <td>net_scoreDiff</td><td>net_score['winning team'] - net_score['losing team']</td>
    </tr>
    <tr>
        <td>round</td><td>Championship round number</td>
    </tr>
        <tr>
        <td>diff_seed_round</td><td>SeedDiff/round</td>
    </tr>
</table>

In [264]:
vote_est1 = [
    ('lr', linear_model.LogisticRegression())
]

grid_param1 = [   
            [{
            #LogisticRegression
            'C': np.logspace(start=-5, stop=3, num=9)
            }] 
        ]

vote_est2 = [
    ('lr', linear_model.LogisticRegression())
]

grid_param2 = [   
            [{
            #LogisticRegression
            'C': np.logspace(start=-5, stop=3, num=9)
            }] 
        ]

start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter

for clf1, param1 in zip (vote_est1, grid_param1): #https://docs.python.org/3/library/functions.html#zip
    start = time.perf_counter()        
    best_search1 = model_selection.GridSearchCV(estimator = clf1[1], param_grid = param1, scoring = 'neg_log_loss', refit=True)
    best_search1.fit(X_train1, y_train1)
    run = time.perf_counter() - start

    best_param = best_search1.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf1[1].__class__.__name__, best_param, run))
    clf1[1].set_params(**best_param) 
    print('Best log_loss: {:.4}'.format(best_search1.best_score_))
print('-'*10)

The best parameter for LogisticRegression is {'C': 0.001} with a runtime of 0.34 seconds.
Best log_loss: -0.5573
----------


In [265]:
start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter

for clf2, param2 in zip (vote_est2, grid_param2): #https://docs.python.org/3/library/functions.html#zip
    start = time.perf_counter()        
    best_search2 = model_selection.GridSearchCV(estimator = clf2[1], param_grid = param2, scoring = 'neg_log_loss', refit=True)
    best_search2.fit(X_train2, y_train2)
    run = time.perf_counter() - start

    best_param = best_search2.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf2[1].__class__.__name__, best_param, run))
    clf2[1].set_params(**best_param) 
    print('Best log_loss: {:.4}'.format(best_search2.best_score_))
print('-'*10)

The best parameter for LogisticRegression is {'C': 0.1} with a runtime of 0.28 seconds.
Best log_loss: -0.6193
----------


In [266]:
grid_hard1 = ensemble.VotingClassifier(estimators=vote_est1)
grid_search1 = model_selection.GridSearchCV(estimator = grid_hard1, param_grid = {'voting':['soft']}, scoring = 'neg_log_loss', refit=True)
grid_search1.fit(X_train1, y_train1)
print('Best log_loss: {:.4} with params{}'.format(grid_search1.best_score_,grid_search1.best_params_))

Best log_loss: -0.5573 with params{'voting': 'soft'}


In [267]:
grid_hard2 = ensemble.VotingClassifier(estimators=vote_est2)
grid_search2 = model_selection.GridSearchCV(estimator = grid_hard2, param_grid = {'voting':['soft']}, scoring = 'neg_log_loss', refit=True)
grid_search2.fit(X_train2, y_train2)
print('Best log_loss: {:.4} with params{}'.format(grid_search2.best_score_,grid_search2.best_params_))

Best log_loss: -0.6193 with params{'voting': 'soft'}


In [268]:
X_train3 = pd.DataFrame()
X_train3['pred1'] = grid_search1.predict_proba(X_train1)[:,1]
X_train3['pred2'] = grid_search2.predict_proba(X_train2)[:,1]
y_train3 = df_predictions['Result']

vote_est3 = [
    ('lr', linear_model.LogisticRegression())
]

grid_param3 = [   
            [{
            #LogisticRegression
            'C': np.logspace(start=-5, stop=3, num=9)
            }] 
        ]

start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter

for clf3, param3 in zip (vote_est3, grid_param3): #https://docs.python.org/3/library/functions.html#zip
    start = time.perf_counter()        
    best_search3 = model_selection.GridSearchCV(estimator = clf3[1], param_grid = param3, scoring = 'neg_log_loss', refit=True)
    best_search3.fit(X_train3, y_train3)
    run = time.perf_counter() - start

    best_param = best_search3.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf3[1].__class__.__name__, best_param, run))
    clf3[1].set_params(**best_param) 
    print('Best log_loss: {:.4}'.format(best_search3.best_score_))
print('-'*10)

The best parameter for LogisticRegression is {'C': 1e-05} with a runtime of 0.26 seconds.
Best log_loss: -0.6931
----------


In [269]:
grid_hard3 = ensemble.VotingClassifier(estimators=vote_est3)
grid_search3 = model_selection.GridSearchCV(estimator = grid_hard3, param_grid = {'voting':['soft']}, scoring = 'neg_log_loss', refit=True)
grid_search3.fit(X_train3, y_train3)
print('Best log_loss: {:.4} with params{}'.format(grid_search3.best_score_,grid_search3.best_params_))

Best log_loss: -0.6931 with params{'voting': 'soft'}


In [270]:
df_sample_sub1 = pd.read_csv(data_dir + 'SampleSubmissionStage1.csv')
df_sample_sub2 = pd.read_csv(data_dir + 'SampleSubmissionStage1.csv')
df_sample_sub3 = pd.read_csv(data_dir + 'SampleSubmissionStage1.csv')
n_test_games = len(df_sample_sub1)

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`."""
    return (int(x) for x in ID.split('_'))

df_sample_sub1.head()

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.5
1,2014_1107_1112,0.5
2,2014_1107_1113,0.5
3,2014_1107_1124,0.5
4,2014_1107_1140,0.5


In [274]:
X_test1 = pd.DataFrame()
X_test2 = pd.DataFrame()
for ii, row in df_sample_sub1.iterrows():
    year, t1, t2 = get_year_t1_t2(row.ID)
    #slot = getRound({'Season':year,'WTeamID':t1,'LTeamID':t2}) 
    t1_seed = df_seeds[(df_seeds.TeamID == t1) & (df_seeds.Season == year)].seed_int.values[0]
    t2_seed = df_seeds[(df_seeds.TeamID == t2) & (df_seeds.Season == year)].seed_int.values[0]
    diff_seed = t1_seed - t2_seed
    t1_netrtg = df_regular_season[(df_regular_season.TeamID == t1) & (df_regular_season.Season == year)].NetRtg.values[0]
    t2_netrtg = df_regular_season[(df_regular_season.TeamID == t2) & (df_regular_season.Season == year)].NetRtg.values[0]
    diff_netrtg = t1_netrtg - t2_netrtg
    t1_tsp = df_regular_season[(df_regular_season.TeamID == t1) & (df_regular_season.Season == year)].TSP.values[0]
    t2_tsp = df_regular_season[(df_regular_season.TeamID == t2) & (df_regular_season.Season == year)].TSP.values[0]
    diff_tsp = t1_tsp - t2_tsp    
    t1_pie = df_regular_season[(df_regular_season.TeamID == t1) & (df_regular_season.Season == year)].PIE.values[0]
    t2_pie = df_regular_season[(df_regular_season.TeamID == t2) & (df_regular_season.Season == year)].PIE.values[0]
    diff_pie = t1_pie - t2_pie
    #diff_seed_round = diff_seed/slot
    t1_net_score = df_regular_full[(df_regular_full.TeamID == t1) & (df_regular_full.Season == year)].net_score.values[0]
    t2_net_score = df_regular_full[(df_regular_full.TeamID == t2) & (df_regular_full.Season == year)].net_score.values[0]
    diff_net_score = t1_net_score - t2_net_score    
    t1_home_score1 = df_regular_full[(df_regular_full.TeamID == t1) & (df_regular_full.Season == year)].home_score1.values[0]
    t2_home_score1 = df_regular_full[(df_regular_full.TeamID == t2) & (df_regular_full.Season == year)].home_score1.values[0]
    diff_home_score1 = t1_home_score1 - t2_home_score1
    t1_home_score2 = df_regular_full[(df_regular_full.TeamID == t1) & (df_regular_full.Season == year)].home_score2.values[0]
    t2_home_score2 = df_regular_full[(df_regular_full.TeamID == t2) & (df_regular_full.Season == year)].home_score2.values[0]
    diff_home_score2 = t1_home_score2 - t2_home_score2
    t1_away_score1 = df_regular_full[(df_regular_full.TeamID == t1) & (df_regular_full.Season == year)].away_score1.values[0]
    t2_away_score1 = df_regular_full[(df_regular_full.TeamID == t2) & (df_regular_full.Season == year)].away_score1.values[0]
    diff_away_score1 = t1_away_score1 - t2_away_score1
    t1_away_score2 = df_regular_full[(df_regular_full.TeamID == t1) & (df_regular_full.Season == year)].away_score2.values[0]
    t2_away_score2 = df_regular_full[(df_regular_full.TeamID == t2) & (df_regular_full.Season == year)].away_score2.values[0]
    diff_away_score2 = t1_away_score2 - t2_away_score2    
    #X_test = X_test.append({'NetRtgDiff':diff_netrtg,'TSPDiff':diff_tsp,'PIEDiff':diff_pie,'net_scoreDiff':diff_net_score,'SeedDiff':diff_seed,'home_score1Diff':diff_home_score1,'home_score2Diff':diff_home_score2,'away_score1Diff':diff_away_score1,'away_score2Diff':diff_away_score2},ignore_index=True) 
    X_test1 = X_test1.append({'SeedDiff':diff_seed,'home_score1Diff':diff_home_score1,'home_score2Diff':diff_home_score2,'away_score1Diff':diff_away_score1,'away_score2Diff':diff_away_score2},ignore_index=True)    
    X_test2 = X_test2.append({'NetRtgDiff':diff_netrtg,'TSPDiff':diff_tsp,'PIEDiff':diff_pie},ignore_index=True)    

X_test1.head()

Unnamed: 0,SeedDiff,away_score1Diff,away_score2Diff,home_score1Diff,home_score2Diff
0,1.0,0.0,1.0,0.0,1.0
1,15.0,-2.0,2.0,-2.0,2.0
2,6.0,-1.0,0.0,-1.0,0.0
3,10.0,-2.0,0.0,-2.0,0.0
4,6.0,-2.0,-1.0,-2.0,-1.0


In [277]:
preds1 = grid_search1.predict_proba(X_test1)[:,1]
#clipped_preds = np.clip(preds, 0.05, 0.95)
df_sample_sub1.Pred = preds1
preds2 = grid_search2.predict_proba(X_test2)[:,1]
df_sample_sub2.Pred = preds2
preds3 = (preds1 + preds2)/2
df_sample_sub3.Pred = preds3
df_sample_sub1.head()

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.454919
1,2014_1107_1112,0.082743
2,2014_1107_1113,0.279686
3,2014_1107_1124,0.169656
4,2014_1107_1140,0.278624


In [278]:
#df_sample_sub1.to_csv('../working/NCAA_points.csv', index=False)
#df_sample_sub2.to_csv('../working/NCAA_stats.csv', index=False)
df_sample_sub3.to_csv('../working/NCAA_comb4.csv', index=False)