In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import neighbors, tree, ensemble
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.naive_bayes import BernoulliNB
%matplotlib inline

In [2]:
# Read the detailed regular-season results CSV. Later cells rely on its
# paired W*/L* stat columns plus 'Season', 'WTeamID' and 'LTeamID'.

df = pd.read_csv(filepath_or_buffer='RegularSeasonDetailedResults.csv')

In [3]:
# Training features: for every row of `df`, each 'net_*' column is the
# W-side stat minus the L-side stat. Rows are written from the W side,
# so each one is labelled win = 1.

training_set = pd.DataFrame({
    'net_' + stat.lower(): df['W' + stat] - df['L' + stat]
    for stat in ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
                 'Ast', 'TO', 'Stl', 'Blk', 'PF')
})

# Net total rebounds (offensive + defensive) sits between net_dr and net_ast,
# i.e. at column position 8, to keep the established column order.
training_set.insert(
    8, 'net_tr', df['WOR'] + df['WDR'] - df['LOR'] - df['LDR'])

training_set['win'] = 1

In [4]:
# Mirror image of the training rows: flip the sign of every net stat so the
# row reads from the L side, and label it win = 0.
inverse_df = training_set.mul(-1).assign(win=0)

In [5]:
# this is the final version of the training set
# x_train = all columns except 'win'
# y_train = win column
#
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, concat keeps each frame's original row
# labels, so every label appears twice (once per perspective).

final_df = pd.concat([training_set, inverse_df])

x_train = final_df.drop(columns='win')
y_train = final_df['win']

In [6]:
# Keep only the rows whose 'Season' value is 2018.
df_2018 = df[df['Season'].eq(2018)]

In [7]:
# Per-game net stats for 2018, seen from the W side: 'teamid' is the W team
# and each 'net_*' column is the W-side stat minus the L-side stat.
df_2018_net = pd.DataFrame({
    'teamid': df_2018['WTeamID'],
    **{
        'net_' + stat.lower(): df_2018['W' + stat] - df_2018['L' + stat]
        for stat in ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
                     'Ast', 'TO', 'Stl', 'Blk', 'PF')
    },
})

# Net total rebounds goes right after net_dr (column position 9, counting
# 'teamid' as position 0) to keep the established column order.
df_2018_net.insert(
    9, 'net_tr',
    df_2018['WOR'] + df_2018['WDR'] - df_2018['LOR'] - df_2018['LDR'])

In [8]:
# Same games from the L side: negate every net stat, then overwrite the
# (also negated, meaningless) 'teamid' column with the L team's id.
df_2018_inverse = df_2018_net.mul(-1).assign(teamid=df_2018['LTeamID'])

In [9]:
# Stack both perspectives so each 2018 game contributes one row per team.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# row labels are kept as-is, exactly as append did.
aggregate_2018_df = pd.concat([df_2018_net, df_2018_inverse])

In [10]:
# Season-average net stats per team, indexed by 'teamid'.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names was deprecated and raises in pandas 2.0.
test_data = aggregate_2018_df.groupby('teamid')[[
    'net_fgm', 'net_fga', 'net_fgm3', 'net_fga3', 'net_ftm', 'net_fta',
    'net_or', 'net_dr', 'net_tr', 'net_ast', 'net_to', 'net_stl', 'net_blk',
    'net_pf',
]].mean()

In [11]:
# Read the sample submission file; later cells use its 'ID' column and write
# a 'Pred' column for each model.
submission_df = pd.read_csv(filepath_or_buffer='SampleSubmissionStage2.csv')

In [12]:
# Matchup table for prediction: pull the two team ids out of each submission
# 'ID' by fixed character position (characters 5-8 and 10-13).
# NOTE(review): assumes IDs look like 'SSSS_AAAA_BBBB' with 4-digit parts - confirm.
# The ids stay as strings here.

test_df = pd.DataFrame({
    'team1': submission_df['ID'].str.slice(5, 9),
    'team2': submission_df['ID'].str.slice(10, 14),
})

In [13]:
# Build the model input for every matchup: team1's season-average net stats
# minus team2's, one column per feature, alongside the team1/team2 ids.
#
# Changes from the earlier row-by-row version:
#   * `test_copy` is now a genuinely new frame. `test_copy = test_df` only
#     aliased `test_df`, so filling in the features silently mutated it.
#   * The iterrows loop with 14 scalar .loc writes per row is replaced by two
#     vectorized lookups into `test_data`; the resulting values are the same.
net_columns = ['net_fgm', 'net_fga', 'net_fgm3', 'net_fga3', 'net_ftm',
               'net_fta', 'net_or', 'net_dr', 'net_tr', 'net_ast', 'net_to',
               'net_stl', 'net_blk', 'net_pf']

# `test_data` is indexed by integer team id; the ids in `test_df` are strings.
team1_ids = test_df['team1'].astype(int).to_numpy()
team2_ids = test_df['team2'].astype(int).to_numpy()

# .loc raises KeyError for an id that is missing from `test_data`, so an
# unknown team still fails loudly rather than producing NaN features.
matchup_diffs = pd.DataFrame(
    test_data.loc[team1_ids, net_columns].to_numpy()
    - test_data.loc[team2_ids, net_columns].to_numpy(),
    index=test_df.index,
    columns=net_columns,
)

test_copy = pd.concat([test_df[['team1', 'team2']], matchup_diffs], axis=1)

In [14]:
# gradient boosting classifier, 2 deep trees, 500 iterations
#
# * `loss` is left at its default: the 'deviance' spelling was removed in
#   scikit-learn 1.3, and the default is the same loss under every version.
# * `random_state` is fixed so a Restart & Run All reproduces the same model.
# * 'Pred' is written as the predicted probability of class 1, matching the
#   regressor cells, which submit values clipped to [0, 1]. The hard 0/1
#   labels are still kept in `gbc2d500_test`.
# * The submission frame is a copy, so `submission_df` and the other models'
#   submission frames are not overwritten.

gbc2d500 = GradientBoostingClassifier(n_estimators=500,
                                      max_depth=2,
                                      random_state=0)
gbc2d500.fit(x_train, y_train)
gbc2d500_features = test_copy.drop(columns=['team1', 'team2'])
gbc2d500_test = gbc2d500.predict(gbc2d500_features)
# classes_ is sorted ([0, 1]), so column 1 is P(win == 1).
gbc2d500_proba = gbc2d500.predict_proba(gbc2d500_features)[:, 1]
gbc2d500_submission = submission_df.copy()
gbc2d500_submission['Pred'] = gbc2d500_proba
gbc2d500_submission.to_csv('gbc2d500_submission.csv', index=False)

In [15]:
# gradient boosting classifier, 2 deep trees, 100 iterations
#
# * `loss` is left at its default: the 'deviance' spelling was removed in
#   scikit-learn 1.3, and the default is the same loss under every version.
# * `random_state` is fixed so a Restart & Run All reproduces the same model.
# * 'Pred' is written as the predicted probability of class 1, matching the
#   regressor cells, which submit values clipped to [0, 1]. The hard 0/1
#   labels are still kept in `gbc2d100_test`.
# * The submission frame is a copy, so `submission_df` and the other models'
#   submission frames are not overwritten.

gbc2d100 = GradientBoostingClassifier(n_estimators=100,
                                      max_depth=2,
                                      random_state=0)
gbc2d100.fit(x_train, y_train)
gbc2d100_features = test_copy.drop(columns=['team1', 'team2'])
gbc2d100_test = gbc2d100.predict(gbc2d100_features)
# classes_ is sorted ([0, 1]), so column 1 is P(win == 1).
gbc2d100_proba = gbc2d100.predict_proba(gbc2d100_features)[:, 1]
gbc2d100_submission = submission_df.copy()
gbc2d100_submission['Pred'] = gbc2d100_proba
gbc2d100_submission.to_csv('gbc2d100_submission.csv', index=False)

In [16]:
# gradient boosting classifier, 3 deep trees, 500 iterations
#
# * `loss` is left at its default: the 'deviance' spelling was removed in
#   scikit-learn 1.3, and the default is the same loss under every version.
# * `random_state` is fixed so a Restart & Run All reproduces the same model.
# * 'Pred' is written as the predicted probability of class 1, matching the
#   regressor cells, which submit values clipped to [0, 1]. The hard 0/1
#   labels are still kept in `gbc3d500_test`.
# * The submission frame is a copy, so `submission_df` and the other models'
#   submission frames are not overwritten.

gbc3d500 = GradientBoostingClassifier(n_estimators=500,
                                      max_depth=3,
                                      random_state=0)
gbc3d500.fit(x_train, y_train)
gbc3d500_features = test_copy.drop(columns=['team1', 'team2'])
gbc3d500_test = gbc3d500.predict(gbc3d500_features)
# classes_ is sorted ([0, 1]), so column 1 is P(win == 1).
gbc3d500_proba = gbc3d500.predict_proba(gbc3d500_features)[:, 1]
gbc3d500_submission = submission_df.copy()
gbc3d500_submission['Pred'] = gbc3d500_proba
gbc3d500_submission.to_csv('gbc3d500_submission.csv', index=False)

In [17]:
# gradient boosting classifier, 3 deep trees, 100 iterations
#
# * `loss` is left at its default: the 'deviance' spelling was removed in
#   scikit-learn 1.3, and the default is the same loss under every version.
# * `random_state` is fixed so a Restart & Run All reproduces the same model.
# * 'Pred' is written as the predicted probability of class 1, matching the
#   regressor cells, which submit values clipped to [0, 1]. The hard 0/1
#   labels are still kept in `gbc3d100_test`.
# * The submission frame is a copy, so `submission_df` and the other models'
#   submission frames are not overwritten.

gbc3d100 = GradientBoostingClassifier(n_estimators=100,
                                      max_depth=3,
                                      random_state=0)
gbc3d100.fit(x_train, y_train)
gbc3d100_features = test_copy.drop(columns=['team1', 'team2'])
gbc3d100_test = gbc3d100.predict(gbc3d100_features)
# classes_ is sorted ([0, 1]), so column 1 is P(win == 1).
gbc3d100_proba = gbc3d100.predict_proba(gbc3d100_features)[:, 1]
gbc3d100_submission = submission_df.copy()
gbc3d100_submission['Pred'] = gbc3d100_proba
gbc3d100_submission.to_csv('gbc3d100_submission.csv', index=False)

In [21]:
# gradient boosting regressor, 2 deep trees, 100 iterations
#
# The regressor is fit on the 0/1 'win' label; its raw output is not bounded,
# so it is clipped to [0, 1] before being written as 'Pred'.
# `random_state` is fixed so a Restart & Run All reproduces the same model.
# The submission frame is a copy, so `submission_df` and the other models'
# submission frames are not overwritten.

gbr2d100 = GradientBoostingRegressor(n_estimators=100,
                                     max_depth=2,
                                     random_state=0)
gbr2d100.fit(x_train, y_train)
gbr2d100_test = gbr2d100.predict(test_copy.drop(columns=['team1', 'team2']))
gbr2d100_bounded = np.clip(gbr2d100_test, 0, 1)
gbr2d100_submission = submission_df.copy()
gbr2d100_submission['Pred'] = gbr2d100_bounded
gbr2d100_submission.to_csv('gbr2d100_submission.csv', index=False)

In [22]:
# gradient boosting regressor, 2 deep trees, 500 iterations
#
# The regressor is fit on the 0/1 'win' label; its raw output is not bounded,
# so it is clipped to [0, 1] before being written as 'Pred'.
# `random_state` is fixed so a Restart & Run All reproduces the same model.
# The submission frame is a copy, so `submission_df` and the other models'
# submission frames are not overwritten.

gbr2d500 = GradientBoostingRegressor(n_estimators=500,
                                     max_depth=2,
                                     random_state=0)
gbr2d500.fit(x_train, y_train)
gbr2d500_test = gbr2d500.predict(test_copy.drop(columns=['team1', 'team2']))
gbr2d500_bounded = np.clip(gbr2d500_test, 0, 1)
gbr2d500_submission = submission_df.copy()
gbr2d500_submission['Pred'] = gbr2d500_bounded
gbr2d500_submission.to_csv('gbr2d500_submission.csv', index=False)

In [23]:
# gradient boosting regressor, 3 deep trees, 100 iterations
#
# The regressor is fit on the 0/1 'win' label; its raw output is not bounded,
# so it is clipped to [0, 1] before being written as 'Pred'.
# `random_state` is fixed so a Restart & Run All reproduces the same model.
# The submission frame is a copy, so `submission_df` and the other models'
# submission frames are not overwritten.

gbr3d100 = GradientBoostingRegressor(n_estimators=100,
                                     max_depth=3,
                                     random_state=0)
gbr3d100.fit(x_train, y_train)
gbr3d100_test = gbr3d100.predict(test_copy.drop(columns=['team1', 'team2']))
gbr3d100_bounded = np.clip(gbr3d100_test, 0, 1)
gbr3d100_submission = submission_df.copy()
gbr3d100_submission['Pred'] = gbr3d100_bounded
gbr3d100_submission.to_csv('gbr3d100_submission.csv', index=False)

In [24]:
# gradient boosting regressor, 3 deep trees, 500 iterations
#
# The regressor is fit on the 0/1 'win' label; its raw output is not bounded,
# so it is clipped to [0, 1] before being written as 'Pred'.
# `random_state` is fixed so a Restart & Run All reproduces the same model.
# The submission frame is a copy, so `submission_df` and the other models'
# submission frames are not overwritten.

gbr3d500 = GradientBoostingRegressor(n_estimators=500,
                                     max_depth=3,
                                     random_state=0)
gbr3d500.fit(x_train, y_train)
gbr3d500_test = gbr3d500.predict(test_copy.drop(columns=['team1', 'team2']))
gbr3d500_bounded = np.clip(gbr3d500_test, 0, 1)
gbr3d500_submission = submission_df.copy()
gbr3d500_submission['Pred'] = gbr3d500_bounded
gbr3d500_submission.to_csv('gbr3d500_submission.csv', index=False)