In [33]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import neighbors, tree, ensemble
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.naive_bayes import BernoulliNB
%matplotlib inline

In [2]:
# Load the regular-season detailed results CSV into `df`.

df = pd.read_csv(filepath_or_buffer='RegularSeasonDetailedResults.csv')

In [3]:
# Training features: for every row of `df`, the W* column minus the matching
# L* column. Every row in this frame is labelled win = 1.

training_set = pd.DataFrame({
    'net_fgm': df['WFGM'] - df['LFGM'],
    'net_fga': df['WFGA'] - df['LFGA'],
    'net_fgm3': df['WFGM3'] - df['LFGM3'],
    'net_fga3': df['WFGA3'] - df['LFGA3'],
    'net_ftm': df['WFTM'] - df['LFTM'],
    'net_fta': df['WFTA'] - df['LFTA'],
    'net_or': df['WOR'] - df['LOR'],
    'net_dr': df['WDR'] - df['LDR'],
    # net_tr combines the OR and DR columns of each side
    'net_tr': df['WOR'] + df['WDR'] - df['LOR'] - df['LDR'],
    'net_ast': df['WAst'] - df['LAst'],
    'net_to': df['WTO'] - df['LTO'],
    'net_stl': df['WStl'] - df['LStl'],
    'net_blk': df['WBlk'] - df['LBlk'],
    'net_pf': df['WPF'] - df['LPF'],
}).assign(win=1)

In [4]:
# Mirror image of the training rows: every column negated, then the label
# column overwritten with win = 0.
inverse_df = (-training_set).assign(win=0)

In [5]:
# this is the final version of the training set:
# the win = 1 rows (training_set) stacked on top of the win = 0 rows (inverse_df)
# x_train = all columns except 'win'
# y_train = win column

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same stacked
# frame. Row labels are kept as-is, so each original label appears twice.
final_df = pd.concat([training_set, inverse_df])

x_train = final_df.drop(columns='win')
y_train = final_df['win']

In [6]:
# Keep only the rows of `df` whose Season equals 2018.
df_2018 = df[df['Season'].eq(2018)]

In [7]:
# 2018 rows from the W-team's side: the W team id plus each W* column minus the
# matching L* column.
df_2018_net = pd.DataFrame({
    'teamid': df_2018['WTeamID'],
    'net_fgm': df_2018['WFGM'] - df_2018['LFGM'],
    'net_fga': df_2018['WFGA'] - df_2018['LFGA'],
    'net_fgm3': df_2018['WFGM3'] - df_2018['LFGM3'],
    'net_fga3': df_2018['WFGA3'] - df_2018['LFGA3'],
    'net_ftm': df_2018['WFTM'] - df_2018['LFTM'],
    'net_fta': df_2018['WFTA'] - df_2018['LFTA'],
    'net_or': df_2018['WOR'] - df_2018['LOR'],
    'net_dr': df_2018['WDR'] - df_2018['LDR'],
    # net_tr combines the OR and DR columns of each side
    'net_tr': df_2018['WOR'] + df_2018['WDR'] - df_2018['LOR'] - df_2018['LDR'],
    'net_ast': df_2018['WAst'] - df_2018['LAst'],
    'net_to': df_2018['WTO'] - df_2018['LTO'],
    'net_stl': df_2018['WStl'] - df_2018['LStl'],
    'net_blk': df_2018['WBlk'] - df_2018['LBlk'],
    'net_pf': df_2018['WPF'] - df_2018['LPF'],
})

In [8]:
# Same 2018 rows from the L-team's side: negate every column, then replace the
# (now negated) teamid with the L team id.
df_2018_inverse = (-df_2018_net).assign(teamid=df_2018['LTeamID'])

In [9]:
# Stack the W-side and L-side 2018 rows into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# row labels are kept unchanged, exactly as append did.
aggregate_2018_df = pd.concat([df_2018_net, df_2018_inverse])

In [10]:
# Per-team mean of every net stat over the stacked 2018 rows; the result is
# indexed by teamid.
# The columns are selected with a list: tuple-style selection
# (groupby(...)['a', 'b']) was deprecated and raises in pandas 2.0.
test_data = aggregate_2018_df.groupby('teamid')[[
    'net_fgm', 'net_fga', 'net_fgm3', 'net_fga3', 'net_ftm', 'net_fta', 'net_or',
    'net_dr', 'net_tr', 'net_ast', 'net_to', 'net_stl', 'net_blk', 'net_pf',
]].mean()

In [11]:
# Load the sample submission file; its ID column is parsed into team ids below.
submission_df = pd.read_csv(filepath_or_buffer='SampleSubmissionStage2.csv')

In [12]:
# Start the prediction frame: pull the two team ids out of each submission ID
# by fixed character position (5-8 and 10-13). The values stay as strings.

test_df = pd.DataFrame({
    'team1': submission_df['ID'].str.slice(5, 9),
    'team2': submission_df['ID'].str.slice(10, 14),
})

In [13]:
# Matchup features for every submission row: team1's per-team mean net stats
# minus team2's, looked up in test_data (which is keyed by integer team id).

feature_cols = ['net_fgm', 'net_fga', 'net_fgm3', 'net_fga3', 'net_ftm', 'net_fta', 'net_or',
                'net_dr', 'net_tr', 'net_ast', 'net_to', 'net_stl', 'net_blk', 'net_pf']

# Work on a real copy: `test_copy = test_df` only created a second name for the
# same frame, so test_df was silently modified as well.
test_copy = test_df.copy()

# team1/team2 hold strings, so cast to int before the lookup. A team id missing
# from test_data raises KeyError, as the row-by-row lookup did.
team1_stats = test_data.loc[test_copy['team1'].astype(int), feature_cols]
team2_stats = test_data.loc[test_copy['team2'].astype(int), feature_cols]

# Vectorised replacement for the iterrows loop (14 scalar .loc writes per row).
# .values drops the team-id index so the subtraction is positional:
# row i of team1_stats minus row i of team2_stats.
net_diff = team1_stats.values - team2_stats.values
for position, col in enumerate(feature_cols):
    test_copy[col] = net_diff[:, position]

In [14]:
# decision tree classifier, 1 feature, 5 deep

dt1f5d = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=5,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dt1f5d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
dt1f5d_test = dt1f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dt1f5d_submission = submission_df.copy()
dt1f5d_submission['Pred'] = dt1f5d_test
dt1f5d_submission.to_csv('dt1f5d_submission.csv', index=False)

In [15]:
# decision tree classifier, 1 feature, 10 deep

dt1f10d = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=10,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dt1f10d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
dt1f10d_test = dt1f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dt1f10d_submission = submission_df.copy()
dt1f10d_submission['Pred'] = dt1f10d_test
dt1f10d_submission.to_csv('dt1f10d_submission.csv', index=False)

In [16]:
# decision tree classifier, 1 feature, 25 deep

dt1f25d = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=25,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dt1f25d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
dt1f25d_test = dt1f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dt1f25d_submission = submission_df.copy()
dt1f25d_submission['Pred'] = dt1f25d_test
dt1f25d_submission.to_csv('dt1f25d_submission.csv', index=False)

In [17]:
# decision tree classifier, 5 features, 5 deep

dt5f5d = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=5,
    max_depth=5,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dt5f5d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
dt5f5d_test = dt5f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dt5f5d_submission = submission_df.copy()
dt5f5d_submission['Pred'] = dt5f5d_test
dt5f5d_submission.to_csv('dt5f5d_submission.csv', index=False)

In [18]:
# decision tree classifier, 5 features, 10 deep

dt5f10d = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=5,
    max_depth=10,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dt5f10d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
dt5f10d_test = dt5f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dt5f10d_submission = submission_df.copy()
dt5f10d_submission['Pred'] = dt5f10d_test
dt5f10d_submission.to_csv('dt5f10d_submission.csv', index=False)

In [19]:
# decision tree classifier, 5 features, 25 deep

dt5f25d = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=5,
    max_depth=25,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dt5f25d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
dt5f25d_test = dt5f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dt5f25d_submission = submission_df.copy()
dt5f25d_submission['Pred'] = dt5f25d_test
dt5f25d_submission.to_csv('dt5f25d_submission.csv', index=False)

In [20]:
# decision tree regressor, 1 feature, 5 deep

dtr1f5d = DecisionTreeRegressor(
    max_features=1,
    max_depth=5,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dtr1f5d.fit(x_train, y_train)
dtr1f5d_test = dtr1f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dtr1f5d_submission = submission_df.copy()
dtr1f5d_submission['Pred'] = dtr1f5d_test
dtr1f5d_submission.to_csv('dtr1f5d_submission.csv', index=False)

In [21]:
# decision tree regressor, 1 feature, 10 deep

dtr1f10d = DecisionTreeRegressor(
    max_features=1,
    max_depth=10,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dtr1f10d.fit(x_train, y_train)
dtr1f10d_test = dtr1f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dtr1f10d_submission = submission_df.copy()
dtr1f10d_submission['Pred'] = dtr1f10d_test
dtr1f10d_submission.to_csv('dtr1f10d_submission.csv', index=False)

In [22]:
# decision tree regressor, 1 feature, 25 deep

dtr1f25d = DecisionTreeRegressor(
    max_features=1,
    max_depth=25,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dtr1f25d.fit(x_train, y_train)
dtr1f25d_test = dtr1f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dtr1f25d_submission = submission_df.copy()
dtr1f25d_submission['Pred'] = dtr1f25d_test
dtr1f25d_submission.to_csv('dtr1f25d_submission.csv', index=False)

In [23]:
# decision tree regressor, 5 features, 5 deep

dtr5f5d = DecisionTreeRegressor(
    max_features=5,
    max_depth=5,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dtr5f5d.fit(x_train, y_train)
dtr5f5d_test = dtr5f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dtr5f5d_submission = submission_df.copy()
dtr5f5d_submission['Pred'] = dtr5f5d_test
dtr5f5d_submission.to_csv('dtr5f5d_submission.csv', index=False)

In [24]:
# decision tree regressor, 5 features, 10 deep

dtr5f10d = DecisionTreeRegressor(
    max_features=5,
    max_depth=10,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dtr5f10d.fit(x_train, y_train)
dtr5f10d_test = dtr5f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dtr5f10d_submission = submission_df.copy()
dtr5f10d_submission['Pred'] = dtr5f10d_test
dtr5f10d_submission.to_csv('dtr5f10d_submission.csv', index=False)

In [25]:
# decision tree regressor, 5 features, 25 deep

dtr5f25d = DecisionTreeRegressor(
    max_features=5,
    max_depth=25,
    random_state=0,  # candidate features are drawn at random per split; pin the seed so reruns match
)
dtr5f25d.fit(x_train, y_train)
dtr5f25d_test = dtr5f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
dtr5f25d_submission = submission_df.copy()
dtr5f25d_submission['Pred'] = dtr5f25d_test
dtr5f25d_submission.to_csv('dtr5f25d_submission.csv', index=False)

In [27]:
# random forest classifier, 1 feature, 5 deep

rfc1f5d = RandomForestClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=5,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfc1f5d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
rfc1f5d_test = rfc1f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfc1f5d_submission = submission_df.copy()
rfc1f5d_submission['Pred'] = rfc1f5d_test
rfc1f5d_submission.to_csv('rfc1f5d_submission.csv', index=False)



In [28]:
# random forest classifier, 1 feature, 10 deep

rfc1f10d = RandomForestClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=10,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfc1f10d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
rfc1f10d_test = rfc1f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfc1f10d_submission = submission_df.copy()
rfc1f10d_submission['Pred'] = rfc1f10d_test
rfc1f10d_submission.to_csv('rfc1f10d_submission.csv', index=False)



In [29]:
# random forest classifier, 1 feature, 25 deep

rfc1f25d = RandomForestClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=25,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfc1f25d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
rfc1f25d_test = rfc1f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfc1f25d_submission = submission_df.copy()
rfc1f25d_submission['Pred'] = rfc1f25d_test
rfc1f25d_submission.to_csv('rfc1f25d_submission.csv', index=False)



In [30]:
# random forest classifier, 5 features, 5 deep

rfc5f5d = RandomForestClassifier(
    criterion='entropy',
    max_features=5,
    max_depth=5,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfc5f5d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
rfc5f5d_test = rfc5f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfc5f5d_submission = submission_df.copy()
rfc5f5d_submission['Pred'] = rfc5f5d_test
rfc5f5d_submission.to_csv('rfc5f5d_submission.csv', index=False)



In [31]:
# random forest classifier, 5 features, 10 deep

rfc5f10d = RandomForestClassifier(
    criterion='entropy',
    max_features=5,
    max_depth=10,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfc5f10d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
rfc5f10d_test = rfc5f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfc5f10d_submission = submission_df.copy()
rfc5f10d_submission['Pred'] = rfc5f10d_test
rfc5f10d_submission.to_csv('rfc5f10d_submission.csv', index=False)



In [32]:
# random forest classifier, 5 features, 25 deep

rfc5f25d = RandomForestClassifier(
    criterion='entropy',
    max_features=5,
    max_depth=25,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfc5f25d.fit(x_train, y_train)
# NOTE(review): predict() returns hard 0/1 labels. If `Pred` is meant to be a
# win probability, predict_proba(...)[:, 1] is probably what is wanted -- confirm.
rfc5f25d_test = rfc5f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfc5f25d_submission = submission_df.copy()
rfc5f25d_submission['Pred'] = rfc5f25d_test
rfc5f25d_submission.to_csv('rfc5f25d_submission.csv', index=False)



In [34]:
# random forest regressor, 1 feature, 5 deep

rfr1f5d = RandomForestRegressor(
    max_features=1,
    max_depth=5,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfr1f5d.fit(x_train, y_train)
rfr1f5d_test = rfr1f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfr1f5d_submission = submission_df.copy()
rfr1f5d_submission['Pred'] = rfr1f5d_test
rfr1f5d_submission.to_csv('rfr1f5d_submission.csv', index=False)



In [35]:
# random forest regressor, 1 feature, 10 deep

rfr1f10d = RandomForestRegressor(
    max_features=1,
    max_depth=10,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfr1f10d.fit(x_train, y_train)
rfr1f10d_test = rfr1f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfr1f10d_submission = submission_df.copy()
rfr1f10d_submission['Pred'] = rfr1f10d_test
rfr1f10d_submission.to_csv('rfr1f10d_submission.csv', index=False)



In [36]:
# random forest regressor, 1 feature, 25 deep

rfr1f25d = RandomForestRegressor(
    max_features=1,
    max_depth=25,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfr1f25d.fit(x_train, y_train)
rfr1f25d_test = rfr1f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfr1f25d_submission = submission_df.copy()
rfr1f25d_submission['Pred'] = rfr1f25d_test
rfr1f25d_submission.to_csv('rfr1f25d_submission.csv', index=False)



In [37]:
# random forest regressor, 5 features, 5 deep

rfr5f5d = RandomForestRegressor(
    max_features=5,
    max_depth=5,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfr5f5d.fit(x_train, y_train)
rfr5f5d_test = rfr5f5d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfr5f5d_submission = submission_df.copy()
rfr5f5d_submission['Pred'] = rfr5f5d_test
rfr5f5d_submission.to_csv('rfr5f5d_submission.csv', index=False)



In [38]:
# random forest regressor, 5 features, 10 deep

rfr5f10d = RandomForestRegressor(
    max_features=5,
    max_depth=10,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfr5f10d.fit(x_train, y_train)
rfr5f10d_test = rfr5f10d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfr5f10d_submission = submission_df.copy()
rfr5f10d_submission['Pred'] = rfr5f10d_test
rfr5f10d_submission.to_csv('rfr5f10d_submission.csv', index=False)



In [39]:
# random forest regressor, 5 features, 25 deep

rfr5f25d = RandomForestRegressor(
    max_features=5,
    max_depth=25,
    random_state=0,  # the forest is randomised; pin the seed so reruns match
)
rfr5f25d.fit(x_train, y_train)
rfr5f25d_test = rfr5f25d.predict(test_copy.drop(columns=['team1', 'team2']))
# Copy the template: plain assignment would alias submission_df, so every model
# cell would overwrite `Pred` on the same shared frame.
rfr5f25d_submission = submission_df.copy()
rfr5f25d_submission['Pred'] = rfr5f25d_test
rfr5f25d_submission.to_csv('rfr5f25d_submission.csv', index=False)

