In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import neighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
%matplotlib inline

In [2]:
# load the data: read the regular-season detailed results CSV (path is relative
# to the notebook's working directory) into `df`, which the cells below build on

df = pd.read_csv('RegularSeasonDetailedResults.csv')

In [3]:
# list each column's dtype and non-null count for the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82041 entries, 0 to 82040
Data columns (total 34 columns):
Season     82041 non-null int64
DayNum     82041 non-null int64
WTeamID    82041 non-null int64
WScore     82041 non-null int64
LTeamID    82041 non-null int64
LScore     82041 non-null int64
WLoc       82041 non-null object
NumOT      82041 non-null int64
WFGM       82041 non-null int64
WFGA       82041 non-null int64
WFGM3      82041 non-null int64
WFGA3      82041 non-null int64
WFTM       82041 non-null int64
WFTA       82041 non-null int64
WOR        82041 non-null int64
WDR        82041 non-null int64
WAst       82041 non-null int64
WTO        82041 non-null int64
WStl       82041 non-null int64
WBlk       82041 non-null int64
WPF        82041 non-null int64
LFGM       82041 non-null int64
LFGA       82041 non-null int64
LFGM3      82041 non-null int64
LFGA3      82041 non-null int64
LFTM       82041 non-null int64
LFTA       82041 non-null int64
LOR        82041 non-null 

In [4]:
# preview the first rows of the raw game data
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [5]:
# create training set from this data
# (starts as an empty frame; its columns are populated from `df` below)

training_set = pd.DataFrame()

In [6]:
# Per-game differentials, winner minus loser, one column per box-score stat.
# Every row here is seen from the winner's side, so the label is always 1.
training_set = training_set.assign(
    net_fgm=df['WFGM'].sub(df['LFGM']),
    net_fga=df['WFGA'].sub(df['LFGA']),
    net_fgm3=df['WFGM3'].sub(df['LFGM3']),
    net_fga3=df['WFGA3'].sub(df['LFGA3']),
    net_ftm=df['WFTM'].sub(df['LFTM']),
    net_fta=df['WFTA'].sub(df['LFTA']),
    net_or=df['WOR'].sub(df['LOR']),
    net_dr=df['WDR'].sub(df['LDR']),
    # total rebounds = offensive + defensive, winner minus loser
    net_tr=df['WOR'].add(df['WDR']).sub(df['LOR']).sub(df['LDR']),
    net_ast=df['WAst'].sub(df['LAst']),
    net_to=df['WTO'].sub(df['LTO']),
    net_stl=df['WStl'].sub(df['LStl']),
    net_blk=df['WBlk'].sub(df['LBlk']),
    net_pf=df['WPF'].sub(df['LPF']),
    win=1,
)

In [7]:
# preview the winner-side differentials (win == 1 on every row)
training_set.head()

Unnamed: 0,net_fgm,net_fga,net_fgm3,net_fga3,net_ftm,net_fta,net_or,net_dr,net_tr,net_ast,net_to,net_stl,net_blk,net_pf,win
0,5,5,1,4,-5,-4,4,2,6,5,5,-2,-1,2,1
1,2,-5,2,-4,1,-1,-5,3,-2,9,1,-4,-2,2,1
2,2,-15,5,-8,3,6,-14,4,-10,6,-2,3,-3,2,1
3,0,-11,-3,-13,9,16,-11,-1,-12,2,-7,10,-1,-5,1
4,6,-1,0,-2,-6,-14,-4,7,3,0,4,-3,3,6,1


In [8]:
# Mirror every game from the loser's side: flip the sign of each differential
# and label the mirrored rows 0 (the sign-flipped 'win' column is overwritten).
inverse_df = training_set.mul(-1).assign(win=0)

In [9]:
# preview the loser-side rows: same magnitudes as training_set, opposite signs, win == 0
inverse_df.head()

Unnamed: 0,net_fgm,net_fga,net_fgm3,net_fga3,net_ftm,net_fta,net_or,net_dr,net_tr,net_ast,net_to,net_stl,net_blk,net_pf,win
0,-5,-5,-1,-4,5,4,-4,-2,-6,-5,-5,2,1,-2,0
1,-2,5,-2,4,-1,1,5,-3,2,-9,-1,4,2,-2,0
2,-2,15,-5,8,-3,-6,14,-4,10,-6,2,-3,3,-2,0
3,0,11,3,13,-9,-16,11,1,12,-2,7,-10,1,5,0
4,-6,1,0,2,6,14,4,-7,-3,0,-4,3,-3,-6,0


In [10]:
# this is the final version of the training set
# X_train = all columns except 'win'
# Y_train = win column
#
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. Row labels are kept as-is (no
# ignore_index), so each original label appears twice: once for the
# winner-side row and once for the mirrored loser-side row.

final_df = pd.concat([training_set, inverse_df])

In [11]:
# confirm the stacked frame has twice the rows of the raw data and no nulls
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 164082 entries, 0 to 82040
Data columns (total 15 columns):
net_fgm     164082 non-null int64
net_fga     164082 non-null int64
net_fgm3    164082 non-null int64
net_fga3    164082 non-null int64
net_ftm     164082 non-null int64
net_fta     164082 non-null int64
net_or      164082 non-null int64
net_dr      164082 non-null int64
net_tr      164082 non-null int64
net_ast     164082 non-null int64
net_to      164082 non-null int64
net_stl     164082 non-null int64
net_blk     164082 non-null int64
net_pf      164082 non-null int64
win         164082 non-null int64
dtypes: int64(15)
memory usage: 20.0 MB


In [12]:
# test to make sure a column sum in this dataframe is 0 as it should be:
# every differential is present once as-is and once negated, so they cancel

final_df['net_fgm'].sum()

0

Next, create the test set. This competition will be graded solely on the 2019 NCAA tournament, so the test set will be built only from games played in the 2018 season (in collegiate sports, performance can change drastically from year to year because of constant roster changes).

The objective here is to find each team's average in each metric provided in the training set (average net_fgm, average net_fga, etc.). Then, for any given matchup of team A vs. team B, compare their averages in each metric: the differences between those averages form the feature row for that matchup in the test set.

In [13]:
# keep only the games whose Season value is 2018 (original row labels preserved)
df_2018 = df[df['Season'].eq(2018)]

In [14]:
# check how many 2018 games were selected and that no column has nulls
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5405 entries, 76636 to 82040
Data columns (total 34 columns):
Season     5405 non-null int64
DayNum     5405 non-null int64
WTeamID    5405 non-null int64
WScore     5405 non-null int64
LTeamID    5405 non-null int64
LScore     5405 non-null int64
WLoc       5405 non-null object
NumOT      5405 non-null int64
WFGM       5405 non-null int64
WFGA       5405 non-null int64
WFGM3      5405 non-null int64
WFGA3      5405 non-null int64
WFTM       5405 non-null int64
WFTA       5405 non-null int64
WOR        5405 non-null int64
WDR        5405 non-null int64
WAst       5405 non-null int64
WTO        5405 non-null int64
WStl       5405 non-null int64
WBlk       5405 non-null int64
WPF        5405 non-null int64
LFGM       5405 non-null int64
LFGA       5405 non-null int64
LFGM3      5405 non-null int64
LFGA3      5405 non-null int64
LFTM       5405 non-null int64
LFTA       5405 non-null int64
LOR        5405 non-null int64
LDR        5405 non

In [15]:
# Stack the winner-id and loser-id columns into one Series of team ids.
# Despite the name, this still contains repeats; de-duplication happens when
# .unique() is called on it. pd.concat replaces Series.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
total_unique_teams = pd.concat([df_2018['WTeamID'], df_2018['LTeamID']])

In [16]:
# distinct team ids, in order of first appearance (winner column first, then loser column)
a = total_unique_teams.unique()
print(a)

[1104 1107 1112 1113 1116 1120 1124 1127 1130 1132 1139 1153 1155 1158
 1160 1161 1163 1166 1172 1173 1174 1181 1184 1191 1203 1204 1206 1208
 1211 1213 1216 1218 1219 1220 1221 1222 1227 1228 1232 1234 1242 1243
 1246 1249 1253 1260 1261 1266 1267 1268 1269 1274 1275 1277 1278 1279
 1280 1281 1283 1284 1286 1297 1298 1301 1305 1310 1311 1314 1315 1318
 1321 1324 1326 1329 1330 1332 1333 1336 1344 1345 1348 1351 1355 1356
 1371 1374 1376 1378 1385 1387 1390 1393 1395 1397 1400 1403 1408 1412
 1415 1416 1425 1428 1433 1435 1437 1438 1439 1449 1451 1455 1458 1461
 1462 1103 1133 1138 1140 1182 1193 1195 1245 1247 1250 1276 1304 1323
 1325 1340 1346 1364 1372 1388 1401 1405 1417 1424 1426 1447 1102 1129
 1143 1187 1207 1217 1231 1252 1257 1328 1353 1360 1391 1407 1414 1450
 1117 1125 1135 1165 1177 1190 1196 1200 1201 1240 1256 1285 1288 1292
 1320 1359 1362 1377 1386 1409 1410 1413 1429 1454 1162 1186 1199 1202
 1209 1251 1272 1273 1296 1307 1312 1430 1463 1236 1258 1262 1264 1299
 1335 

In [17]:
# start making the test set!
# (empty frame for now; columns are added in the following cells)

test_df = pd.DataFrame()

In [18]:
# one row per distinct team id collected in `a`
test_df['TeamID'] = a

In [19]:
# confirm the number of teams and that TeamID has no nulls
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 1 columns):
TeamID    351 non-null int64
dtypes: int64(1)
memory usage: 2.9 KB


In [20]:
# empty frame that will hold the 2018 per-game net stats (populated below)
df_2018_net = pd.DataFrame()

In [21]:
# 2018 per-game differentials from the winner's side: each row is keyed by the
# winning team's id and holds winner-minus-loser values for every stat.
df_2018_net = df_2018_net.assign(
    teamid=df_2018['WTeamID'],
    net_fgm=df_2018['WFGM'].sub(df_2018['LFGM']),
    net_fga=df_2018['WFGA'].sub(df_2018['LFGA']),
    net_fgm3=df_2018['WFGM3'].sub(df_2018['LFGM3']),
    net_fga3=df_2018['WFGA3'].sub(df_2018['LFGA3']),
    net_ftm=df_2018['WFTM'].sub(df_2018['LFTM']),
    net_fta=df_2018['WFTA'].sub(df_2018['LFTA']),
    net_or=df_2018['WOR'].sub(df_2018['LOR']),
    net_dr=df_2018['WDR'].sub(df_2018['LDR']),
    # total rebounds = offensive + defensive, winner minus loser
    net_tr=df_2018['WOR'].add(df_2018['WDR']).sub(df_2018['LOR']).sub(df_2018['LDR']),
    net_ast=df_2018['WAst'].sub(df_2018['LAst']),
    net_to=df_2018['WTO'].sub(df_2018['LTO']),
    net_stl=df_2018['WStl'].sub(df_2018['LStl']),
    net_blk=df_2018['WBlk'].sub(df_2018['LBlk']),
    net_pf=df_2018['WPF'].sub(df_2018['LPF']),
)

In [22]:
# Loser's side of each 2018 game: flip the sign of every differential, then
# replace the (also sign-flipped) id column with the losing team's id.
df_2018_inverse = df_2018_net.mul(-1).assign(teamid=df_2018['LTeamID'])

In [23]:
# Stack winner-side and loser-side rows so every team has one row per game it
# played in 2018. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; row labels are kept, so each game's
# label appears twice (once per participating team).
aggregate_2018_df = pd.concat([df_2018_net, df_2018_inverse])

In [24]:
# preview the winner-side 2018 rows
df_2018_net.head()

Unnamed: 0,teamid,net_fgm,net_fga,net_fgm3,net_fga3,net_ftm,net_fta,net_or,net_dr,net_tr,net_ast,net_to,net_stl,net_blk,net_pf
76636,1104,3,-3,9,10,-3,-8,-6,3,-3,9,-1,1,4,8
76637,1107,-2,3,0,-7,6,5,12,8,20,-8,6,-5,-3,-5
76638,1112,11,-1,1,0,11,2,0,20,20,10,-2,3,5,-1
76639,1113,8,6,0,6,4,4,4,8,12,5,-3,-3,3,-2
76640,1116,16,2,1,-7,6,7,-8,11,3,16,-8,1,5,-2


In [25]:
# preview the loser-side 2018 rows: same row labels as df_2018_net, opposite signs, loser ids
df_2018_inverse.head()

Unnamed: 0,teamid,net_fgm,net_fga,net_fgm3,net_fga3,net_ftm,net_fta,net_or,net_dr,net_tr,net_ast,net_to,net_stl,net_blk,net_pf
76636,1272,-3,3,-9,-10,3,8,6,-3,3,-9,1,-1,-4,-8
76637,1233,2,-3,0,7,-6,-5,-12,-8,-20,8,-6,5,3,5
76638,1319,-11,1,-1,0,-11,-2,0,-20,-20,-10,2,-3,-5,1
76639,1226,-8,-6,0,-6,-4,-4,-4,-8,-12,-5,3,3,-3,2
76640,1359,-16,-2,-1,7,-6,-7,8,-11,-3,-16,8,-1,-5,2


In [26]:
# fill in test_df with the averages of each net stat for each team...how to do this efficiently?