In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.naive_bayes import BernoulliNB
%matplotlib inline

In [2]:
# Read the regular-season detailed results CSV into `df`; every later cell
# derives its features from this frame.

df = pd.read_csv(filepath_or_buffer='RegularSeasonDetailedResults.csv')

In [3]:
# Preview the first five rows of the raw results.
df.head(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [4]:
# create training set from this data
# Start from an empty frame; the following cell fills it column by column
# with per-game differentials computed from `df`.

training_set = pd.DataFrame()

In [5]:
# Per-game box-score differentials (winner's value minus loser's value).
# Each stat suffix XXX maps df['WXXX'] - df['LXXX'] onto column 'net_xxx'.
for stat in ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
             'Ast', 'TO', 'Stl', 'Blk', 'PF']:
    training_set['net_' + stat.lower()] = df['W' + stat] - df['L' + stat]
    if stat == 'DR':
        # Total rebounds (offensive + defensive) sit right after net_dr so the
        # column order stays the same.
        training_set['net_tr'] = df['WOR'] + df['WDR'] - df['LOR'] - df['LDR']

# Every row so far is written from the winner's side, so the label is 1.
training_set['win'] = 1

In [6]:
# Mirror every game from the loser's side: flip the sign of each differential
# (this also turns 'win' into -1), then overwrite the label with 0.
inverse_df = training_set * -1
inverse_df['win'] = 0

In [7]:
# this is the final version of the training set
# x_train = all columns except 'win'
# y_train = win column

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (winner rows first, then loser rows, original index labels
# kept) and also works on older pandas versions.
final_df = pd.concat([training_set, inverse_df])

x_train = final_df.drop(columns='win')
y_train = final_df['win']

Next, create the test set. This competition will be graded solely on the 2019 NCAA tournament, so the data used to build the test set comes only from games in the 2018 season (rows with `Season == 2018`); in collegiate sports, year-over-year performance can change drastically because of constant roster turnover.

The objective here is to find each team's average in each metric used in the training set (average net_fgm, average net_fga, etc.). Then, for any given matchup of team A vs. team B, compare their averages in each metric: the differences between these averages form the rows that the models will predict on, i.e. the test set.

In [8]:
# Keep only the games whose Season value is 2018.
is_2018_game = df['Season'] == 2018
df_2018 = df.loc[is_2018_game]

In [9]:
# Empty frame that the next cell fills with the 2018 per-game differentials.
df_2018_net = pd.DataFrame()

In [10]:
# Winner-side differentials for each 2018 game, keyed by the winning team's id.
df_2018_net['teamid'] = df_2018['WTeamID']

# Each stat suffix XXX maps df_2018['WXXX'] - df_2018['LXXX'] onto 'net_xxx'.
for stat in ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
             'Ast', 'TO', 'Stl', 'Blk', 'PF']:
    df_2018_net['net_' + stat.lower()] = df_2018['W' + stat] - df_2018['L' + stat]
    if stat == 'DR':
        # Total rebounds go right after net_dr to keep the column order.
        df_2018_net['net_tr'] = (df_2018['WOR'] + df_2018['WDR']
                                 - df_2018['LOR'] - df_2018['LDR'])

In [11]:
# Loser-side view of the same games: flip the sign of every column (the
# negated teamid is meaningless), then replace teamid with the losing team's id.
df_2018_inverse = df_2018_net * -1
df_2018_inverse['teamid'] = df_2018['LTeamID']

In [12]:
# One row per (team, game): winner-side rows followed by loser-side rows.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked
# frame and also works on older pandas versions.
aggregate_2018_df = pd.concat([df_2018_net, df_2018_inverse])

In [13]:
# Columns to average per team. They must be passed to the groupby as a list:
# the bare comma-separated form builds a tuple key, which current pandas
# rejects.
team_average_columns = ['net_fgm', 'net_fga', 'net_fgm3', 'net_fga3', 'net_ftm', 'net_fta', 'net_or',
                        'net_dr', 'net_tr', 'net_ast', 'net_to', 'net_stl', 'net_blk', 'net_pf']

# Mean differential per team over all of its rows; the result is indexed by teamid.
test_data = aggregate_2018_df.groupby('teamid')[team_average_columns].mean()

In [14]:
# Preview the first five per-team averages.
test_data.head(5)

Unnamed: 0_level_0,net_fgm,net_fga,net_fgm3,net_fga3,net_ftm,net_fta,net_or,net_dr,net_tr,net_ast,net_to,net_stl,net_blk,net_pf
teamid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1101,0.740741,2.185185,-0.148148,0.222222,-3.481481,-4.777778,-0.888889,-1.814815,-2.703704,1.851852,-1.074074,0.62963,0.703704,3.37037
1102,-2.482759,1.655172,-1.344828,1.068966,0.275862,-0.206897,0.655172,-3.344828,-2.689655,-0.689655,-1.206897,0.862069,-0.965517,-0.586207
1103,-1.451613,1.193548,1.290323,4.032258,-3.290323,-3.870968,0.258065,-1.935484,-1.677419,-0.419355,0.83871,-0.419355,-1.258065,2.645161
1104,1.117647,-3.352941,-0.558824,-2.058824,0.705882,1.323529,-1.558824,1.764706,0.205882,0.911765,0.970588,-0.176471,1.705882,-0.970588
1105,-6.709677,-4.741935,-1.483871,0.193548,-1.032258,-0.516129,-0.258065,-2.322581,-2.580645,-3.870968,5.193548,-3.677419,-3.290323,-0.741935


In [15]:
# Sample submission file; its 'ID' column lists the matchups to predict.
submission_df = pd.read_csv(filepath_or_buffer='SampleSubmissionStage2.csv')

In [16]:
# create final dataframe to submit into the models for prediction

# Characters 5-8 and 10-13 of each ID hold the two team ids (kept as strings).
matchup_ids = submission_df['ID']
test_df = pd.DataFrame()
test_df['team1'] = matchup_ids.str.slice(5, 9)
test_df['team2'] = matchup_ids.str.slice(10, 14)

In [17]:
# Build the model input: for every matchup, team1's season-average
# differentials minus team2's, one column per metric.
#
# Two fixes relative to the row-by-row version:
#   * `test_copy = test_df` only created a second name for the same frame, so
#     test_df was silently modified as well; test_copy is now a separate frame.
#   * the iterrows loop did 14 scalar .loc writes per matchup; the lookup is
#     now done for all rows at once.
net_columns = ['net_fgm', 'net_fga', 'net_fgm3', 'net_fga3', 'net_ftm', 'net_fta', 'net_or',
               'net_dr', 'net_tr', 'net_ast', 'net_to', 'net_stl', 'net_blk', 'net_pf']

# Team ids are stored as strings in test_df but test_data is indexed by int.
team1_ids = test_df['team1'].astype(int).values
team2_ids = test_df['team2'].astype(int).values

# .loc with a list of labels raises KeyError if a team has no row in
# test_data, matching the failure mode of the original scalar lookups.
team1_averages = test_data.loc[team1_ids, net_columns].values
team2_averages = test_data.loc[team2_ids, net_columns].values

matchup_nets = pd.DataFrame(team1_averages - team2_averages,
                            columns=net_columns, index=test_df.index)
test_copy = pd.concat([test_df, matchup_nets], axis=1)

In [18]:
# Preview the first five matchup feature rows.
test_copy.head(5)

Unnamed: 0,team1,team2,net_fgm,net_fga,net_fgm3,net_fga3,net_ftm,net_fta,net_or,net_dr,net_tr,net_ast,net_to,net_stl,net_blk,net_pf
0,1101,1113,-0.74313,2.249701,-0.857826,-0.261649,-7.997611,-10.519713,0.433692,-1.782557,-1.348865,2.819594,3.151732,-0.563919,0.639188,6.144564
1,1101,1120,-1.821759,-1.814815,-1.835648,-3.621528,-6.731481,-7.246528,-2.763889,-3.283565,-6.047454,0.226852,1.925926,-0.65162,-1.421296,4.15162
2,1101,1124,-1.968937,2.507766,1.174432,5.222222,-4.51374,-5.584229,-1.792115,-6.363202,-8.155317,1.948626,-2.783751,0.726404,0.76822,3.20908
3,1101,1125,-1.228956,4.912458,-4.693603,-9.626263,-3.420875,-4.656566,0.444444,-5.117845,-4.673401,-2.6633,-1.922559,1.872054,0.946128,4.612795
4,1101,1133,-0.353009,2.153935,0.726852,4.159722,-3.887731,-5.965278,-1.420139,-3.221065,-4.641204,1.726852,-0.886574,0.47338,-0.640046,3.49537


In [19]:
# Bernoulli naive Bayes on the differential features.
bnb = BernoulliNB()

bnb.fit(x_train, y_train)

# Write the probability of win == 1 instead of a hard 0/1 label, so 'Pred'
# carries a probability like the clipped regression outputs do.
win_col = list(bnb.classes_).index(1)
bnb_test = bnb.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]

# Copy the template so this submission owns its 'Pred' column; a plain
# assignment would make every *_submission name point at the same frame.
bnb_submission = submission_df.copy()
bnb_submission['Pred'] = bnb_test
bnb_submission.to_csv('bnb_submission.csv', index=False)

print(bnb_submission.loc[bnb_submission['ID'] == '2019_1153_1277'])

In [20]:
# Ordinary least squares on the 0/1 win label.
lrm = LinearRegression()

lrm.fit(x_train, y_train)
lrm_test = lrm.predict(test_copy.drop(columns=['team1', 'team2']))
# The linear output is unbounded; clip it into [0, 1].
lrm_bounded = np.clip(lrm_test, 0, 1)
# Copy the template so submission_df is not modified through an alias.
lrm_submission = submission_df.copy()
lrm_submission['Pred'] = lrm_bounded
lrm_submission.to_csv('lrm_submission.csv', index=False)

In [21]:
# Candidate penalties: 10**p for every integer p from -10 through 39.
alpha_exponents = np.arange(-10, 40, 1)
alphas = [np.power(10.0, exponent) for exponent in alpha_exponents]

# Ridge regression; the penalty is chosen by 5-fold cross-validation.
rcv = RidgeCV(alphas=alphas, cv=5)
rcv.fit(x_train, y_train)

rcv_best_alpha = rcv.alpha_
rcv_train_r2 = rcv.score(x_train, y_train)
print("Best alpha value is: {}".format(rcv_best_alpha))
print("R-squared of the model in training set is: {}".format(rcv_train_r2))

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha value is: 10000.0
R-squared of the model in training set is: 0.6796807800696064


In [22]:
# Ridge predictions for every matchup, clipped into [0, 1].
rcv_test = rcv.predict(test_copy.drop(columns=['team1', 'team2']))
rcv_bounded = np.clip(rcv_test, 0, 1)
# Copy the template so submission_df is not modified through an alias.
rcv_submission = submission_df.copy()
rcv_submission['Pred'] = rcv_bounded
rcv_submission.to_csv('rcv_submission.csv', index=False)

In [23]:
# Lasso regression over the same `alphas` grid, 5-fold cross-validated.
lcv = LassoCV(alphas=alphas, cv=5)
lcv.fit(x_train, y_train)

lcv_best_alpha = lcv.alpha_
lcv_train_r2 = lcv.score(x_train, y_train)
print("Best alpha value is: {}".format(lcv_best_alpha))
print("R-squared of the model in training set is: {}".format(lcv_train_r2))

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.6797338459115958


  positive)


In [24]:
# Lasso predictions for every matchup, clipped into [0, 1].
lcv_test = lcv.predict(test_copy.drop(columns=['team1', 'team2']))
lcv_bounded = np.clip(lcv_test, 0, 1)
# Copy the template so submission_df is not modified through an alias.
lcv_submission = submission_df.copy()
lcv_submission['Pred'] = lcv_bounded
lcv_submission.to_csv('lcv_submission.csv', index=False)

In [25]:
# Elastic net over the same `alphas` grid, 5-fold cross-validated.
encv = ElasticNetCV(alphas=alphas, cv=5)
encv.fit(x_train, y_train)

encv_best_alpha = encv.alpha_
encv_train_r2 = encv.score(x_train, y_train)
print("Best alpha value is: {}".format(encv_best_alpha))
print("R-squared of the model in training set is: {}".format(encv_train_r2))

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.6797338459115957


  positive)


In [26]:
# Elastic-net predictions for every matchup, clipped into [0, 1].
encv_test = encv.predict(test_copy.drop(columns=['team1', 'team2']))
encv_bounded = np.clip(encv_test, 0, 1)
# Copy the template so submission_df is not modified through an alias.
encv_submission = submission_df.copy()
encv_submission['Pred'] = encv_bounded
encv_submission.to_csv('encv_submission.csv', index=False)

In [27]:
# knn classifier, n=1
knc1 = KNeighborsClassifier(n_neighbors=1)
knc1.fit(x_train, y_train)
# Write the probability of win == 1 rather than the predicted label. With a
# single neighbour this is still 0.0 or 1.0, but it keeps 'Pred' consistent
# with the other classifier cells.
win_col = list(knc1.classes_).index(1)
knc1_test = knc1.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc1_submission = submission_df.copy()
knc1_submission['Pred'] = knc1_test
knc1_submission.to_csv('knc1_submission.csv', index=False)

In [28]:
# knn classifier, n=5
knc5 = KNeighborsClassifier(n_neighbors=5)
knc5.fit(x_train, y_train)
# Write the probability of win == 1 (share of neighbours labelled 1) rather
# than a hard 0/1 label.
win_col = list(knc5.classes_).index(1)
knc5_test = knc5.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc5_submission = submission_df.copy()
knc5_submission['Pred'] = knc5_test
knc5_submission.to_csv('knc5_submission.csv', index=False)

In [29]:
# knn classifier, n=10
knc10 = KNeighborsClassifier(n_neighbors=10)
knc10.fit(x_train, y_train)
# Write the probability of win == 1 (share of neighbours labelled 1) rather
# than a hard 0/1 label.
win_col = list(knc10.classes_).index(1)
knc10_test = knc10.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc10_submission = submission_df.copy()
knc10_submission['Pred'] = knc10_test
knc10_submission.to_csv('knc10_submission.csv', index=False)

In [30]:
# knn classifier, n=20
knc20 = KNeighborsClassifier(n_neighbors=20)
knc20.fit(x_train, y_train)
# Write the probability of win == 1 (share of neighbours labelled 1) rather
# than a hard 0/1 label.
win_col = list(knc20.classes_).index(1)
knc20_test = knc20.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc20_submission = submission_df.copy()
knc20_submission['Pred'] = knc20_test
knc20_submission.to_csv('knc20_submission.csv', index=False)

In [31]:
# weighted knn classifier, n=1
knc1w = KNeighborsClassifier(n_neighbors=1, weights='distance')
knc1w.fit(x_train, y_train)
# Write the probability of win == 1 rather than the predicted label. With a
# single neighbour this is still 0.0 or 1.0, but it keeps 'Pred' consistent
# with the other classifier cells.
win_col = list(knc1w.classes_).index(1)
knc1w_test = knc1w.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc1w_submission = submission_df.copy()
knc1w_submission['Pred'] = knc1w_test
knc1w_submission.to_csv('knc1w_submission.csv', index=False)

In [32]:
# weighted knn classifier, n=5
knc5w = KNeighborsClassifier(n_neighbors=5, weights='distance')
knc5w.fit(x_train, y_train)
# Write the distance-weighted probability of win == 1 rather than a hard
# 0/1 label.
win_col = list(knc5w.classes_).index(1)
knc5w_test = knc5w.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc5w_submission = submission_df.copy()
knc5w_submission['Pred'] = knc5w_test
knc5w_submission.to_csv('knc5w_submission.csv', index=False)

In [33]:
# weighted knn classifier, n=10
knc10w = KNeighborsClassifier(n_neighbors=10, weights='distance')
knc10w.fit(x_train, y_train)
# Write the distance-weighted probability of win == 1 rather than a hard
# 0/1 label.
win_col = list(knc10w.classes_).index(1)
knc10w_test = knc10w.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc10w_submission = submission_df.copy()
knc10w_submission['Pred'] = knc10w_test
knc10w_submission.to_csv('knc10w_submission.csv', index=False)

In [34]:
# weighted knn classifier, n=20
knc20w = KNeighborsClassifier(n_neighbors=20, weights='distance')
knc20w.fit(x_train, y_train)
# Write the distance-weighted probability of win == 1 rather than a hard
# 0/1 label.
win_col = list(knc20w.classes_).index(1)
knc20w_test = knc20w.predict_proba(test_copy.drop(columns=['team1', 'team2']))[:, win_col]
# Copy the template so submission_df is not modified through an alias.
knc20w_submission = submission_df.copy()
knc20w_submission['Pred'] = knc20w_test
knc20w_submission.to_csv('knc20w_submission.csv', index=False)

In [35]:
# knn regressor, n=1
# The prediction is the mean 0/1 'win' label of the nearest training row(s).
knr1 = neighbors.KNeighborsRegressor(n_neighbors=1)
knr1.fit(x_train, y_train)
knr1_test = knr1.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr1_submission = submission_df.copy()
knr1_submission['Pred'] = knr1_test
knr1_submission.to_csv('knr1_submission.csv', index=False)

In [36]:
# knn regressor, n=5
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr5 = neighbors.KNeighborsRegressor(n_neighbors=5)
knr5.fit(x_train, y_train)
knr5_test = knr5.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr5_submission = submission_df.copy()
knr5_submission['Pred'] = knr5_test
knr5_submission.to_csv('knr5_submission.csv', index=False)

In [37]:
# knn regressor, n=10
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr10 = neighbors.KNeighborsRegressor(n_neighbors=10)
knr10.fit(x_train, y_train)
knr10_test = knr10.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr10_submission = submission_df.copy()
knr10_submission['Pred'] = knr10_test
knr10_submission.to_csv('knr10_submission.csv', index=False)

In [38]:
# knn regressor, n=20
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr20 = neighbors.KNeighborsRegressor(n_neighbors=20)
knr20.fit(x_train, y_train)
knr20_test = knr20.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr20_submission = submission_df.copy()
knr20_submission['Pred'] = knr20_test
knr20_submission.to_csv('knr20_submission.csv', index=False)

In [39]:
# knn regressor, n=50
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr50 = neighbors.KNeighborsRegressor(n_neighbors=50)
knr50.fit(x_train, y_train)
knr50_test = knr50.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr50_submission = submission_df.copy()
knr50_submission['Pred'] = knr50_test
knr50_submission.to_csv('knr50_submission.csv', index=False)

In [40]:
# knn regressor, n=100
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr100 = neighbors.KNeighborsRegressor(n_neighbors=100)
knr100.fit(x_train, y_train)
knr100_test = knr100.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr100_submission = submission_df.copy()
knr100_submission['Pred'] = knr100_test
knr100_submission.to_csv('knr100_submission.csv', index=False)

In [41]:
# knn regressor, n=250
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr250 = neighbors.KNeighborsRegressor(n_neighbors=250)
knr250.fit(x_train, y_train)
knr250_test = knr250.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr250_submission = submission_df.copy()
knr250_submission['Pred'] = knr250_test
knr250_submission.to_csv('knr250_submission.csv', index=False)

In [42]:
# knn regressor, n=500
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr500 = neighbors.KNeighborsRegressor(n_neighbors=500)
knr500.fit(x_train, y_train)
knr500_test = knr500.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr500_submission = submission_df.copy()
knr500_submission['Pred'] = knr500_test
knr500_submission.to_csv('knr500_submission.csv', index=False)

In [43]:
# knn regressor, n=1000
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr1000 = neighbors.KNeighborsRegressor(n_neighbors=1000)
knr1000.fit(x_train, y_train)
knr1000_test = knr1000.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr1000_submission = submission_df.copy()
knr1000_submission['Pred'] = knr1000_test
knr1000_submission.to_csv('knr1000_submission.csv', index=False)

In [44]:
# knn regressor, n=2500
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr2500 = neighbors.KNeighborsRegressor(n_neighbors=2500)
knr2500.fit(x_train, y_train)
knr2500_test = knr2500.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr2500_submission = submission_df.copy()
knr2500_submission['Pred'] = knr2500_test
knr2500_submission.to_csv('knr2500_submission.csv', index=False)

In [45]:
# knn regressor, n=5000
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr5000 = neighbors.KNeighborsRegressor(n_neighbors=5000)
knr5000.fit(x_train, y_train)
knr5000_test = knr5000.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr5000_submission = submission_df.copy()
knr5000_submission['Pred'] = knr5000_test
knr5000_submission.to_csv('knr5000_submission.csv', index=False)

In [46]:
# knn regressor, n=10000
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr10000 = neighbors.KNeighborsRegressor(n_neighbors=10000)
knr10000.fit(x_train, y_train)
knr10000_test = knr10000.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr10000_submission = submission_df.copy()
knr10000_submission['Pred'] = knr10000_test
knr10000_submission.to_csv('knr10000_submission.csv', index=False)

In [47]:
# knn regressor, n=25000
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr25000 = neighbors.KNeighborsRegressor(n_neighbors=25000)
knr25000.fit(x_train, y_train)
knr25000_test = knr25000.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr25000_submission = submission_df.copy()
knr25000_submission['Pred'] = knr25000_test
knr25000_submission.to_csv('knr25000_submission.csv', index=False)

In [48]:
# knn regressor, n=50000
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr50000 = neighbors.KNeighborsRegressor(n_neighbors=50000)
knr50000.fit(x_train, y_train)
knr50000_test = knr50000.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr50000_submission = submission_df.copy()
knr50000_submission['Pred'] = knr50000_test
knr50000_submission.to_csv('knr50000_submission.csv', index=False)

In [49]:
# knn regressor, n=100000
# The prediction is the mean 0/1 'win' label of the nearest training rows.
knr100000 = neighbors.KNeighborsRegressor(n_neighbors=100000)
knr100000.fit(x_train, y_train)
knr100000_test = knr100000.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr100000_submission = submission_df.copy()
knr100000_submission['Pred'] = knr100000_test
knr100000_submission.to_csv('knr100000_submission.csv', index=False)

In [50]:
# weighted knn regressor, n=1
# The prediction is the distance-weighted mean 0/1 'win' label of the nearest
# training row(s).
knr1w = neighbors.KNeighborsRegressor(n_neighbors=1, weights='distance')
knr1w.fit(x_train, y_train)
knr1w_test = knr1w.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr1w_submission = submission_df.copy()
knr1w_submission['Pred'] = knr1w_test
knr1w_submission.to_csv('knr1w_submission.csv', index=False)

In [51]:
# weighted knn regressor, n=5
# The prediction is the distance-weighted mean 0/1 'win' label of the nearest
# training rows.
knr5w = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knr5w.fit(x_train, y_train)
knr5w_test = knr5w.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr5w_submission = submission_df.copy()
knr5w_submission['Pred'] = knr5w_test
knr5w_submission.to_csv('knr5w_submission.csv', index=False)

In [52]:
# weighted knn regressor, n=10
# The prediction is the distance-weighted mean 0/1 'win' label of the nearest
# training rows.
knr10w = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
knr10w.fit(x_train, y_train)
knr10w_test = knr10w.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr10w_submission = submission_df.copy()
knr10w_submission['Pred'] = knr10w_test
knr10w_submission.to_csv('knr10w_submission.csv', index=False)

In [53]:
# weighted knn regressor, n=20
# The prediction is the distance-weighted mean 0/1 'win' label of the nearest
# training rows.
knr20w = neighbors.KNeighborsRegressor(n_neighbors=20, weights='distance')
knr20w.fit(x_train, y_train)
knr20w_test = knr20w.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template so submission_df is not modified through an alias.
knr20w_submission = submission_df.copy()
knr20w_submission['Pred'] = knr20w_test
knr20w_submission.to_csv('knr20w_submission.csv', index=False)