In [27]:
import pandas as pd
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
from collections import defaultdict
import time

In [28]:
teamPath = '/Users/justinholmes/Desktop/Sports_Model/Data/teamData.csv'
playerPath = '/Users/justinholmes/Desktop/Sports_Model/Data/playerData.csv'
outcomePath = '/Users/justinholmes/Desktop/Sports_Model/Data/outcomeData.csv'
featurePath = '/Users/justinholmes/Desktop/Sports_Model/Data/featureData.csv'

In [29]:
teamData = pd.read_csv(teamPath)
playerData = pd.read_csv(playerPath, engine='python')
outcomeData = pd.read_csv(outcomePath)
featureData = pd.read_csv(featurePath)

### Format all data

In [30]:
# drop unwanted columns
outcomeData = outcomeData.drop(["Start Time", "Box Score", "Notes"], axis=1)

# change OT to integer value for number of OT rounds
outcomeData.loc[(outcomeData["OT?"].isna()),'OT?']= 0
outcomeData.loc[(outcomeData["OT?"] == "OT"),'OT?']= 1
outcomeData.loc[(outcomeData["OT?"] == "2OT"),'OT?']= 2
outcomeData.loc[(outcomeData["OT?"] == "3OT"),'OT?']= 3
outcomeData.loc[(outcomeData["OT?"] == "4OT"),'OT?']= 4

In [31]:
def redefineTeamNames(dataframe):
    dataframe = dataframe.replace('Seattle SuperSonics', 'Oklahoma City Thunder', regex=True)
    
    dataframe = dataframe.replace('New Orleans/Oklahoma City Hornets', 'New Orleans Pelicans', regex=True)

    dataframe = dataframe.replace('New Orleans Hornets', 'New Orleans Pelicans', regex=True)
    
    dataframe = dataframe.replace('Charlotte Bobcats', 'Charlotte Hornets', regex=True)
    
    dataframe = dataframe.replace('New Jersey Nets', 'Brooklyn Nets', regex=True)
    
#     dataframe = dataframe.replace('*', '', regex=True)

    return dataframe

outcomeData = redefineTeamNames(outcomeData)
teamData = redefineTeamNames(teamData)


In [32]:
outcomeData["Final Home Spread"] = (outcomeData["VisitorPTS"] - outcomeData["HomePTS"]).astype(float)

In [33]:
outcomeData.head()

Unnamed: 0,Year,Month,Visitor,VisitorPTS,Home,HomePTS,OT?,Attend,Total Points,Winner,Final Home Spread
0,2005,november,Houston Rockets,79,Detroit Pistons,87,0,22076,166,Detroit Pistons,-8.0
1,2005,november,Sacramento Kings,98,Dallas Mavericks,107,0,20041,205,Dallas Mavericks,-9.0
2,2005,november,Denver Nuggets,78,Los Angeles Lakers,89,0,18997,167,Los Angeles Lakers,-11.0
3,2005,november,Indiana Pacers,109,Cleveland Cavaliers,104,2,19730,213,Indiana Pacers,5.0
4,2005,november,Milwaukee Bucks,92,Orlando Magic,93,0,15138,185,Orlando Magic,-1.0


In [34]:
teamData.head()

Unnamed: 0,Team,Year,G_PG,MP_PG,FG_PG,FGA_PG,FG%_PG,3P_PG,3PA_PG,3P%_PG,2P_PG,2PA_PG,2P%_PG,FT_PG,FTA_PG,FT%_PG,ORB_PG,DRB_PG,TRB_PG,AST_PG,STL_PG,BLK_PG,TOV_PG,PF_PG,PTS_PG,G_PP,MP_PP,FG_PP,FGA_PP,FG%_PP,3P_PP,3PA_PP,3P%_PP,2P_PP,2PA_PP,2P%_PP,FT_PP,FTA_PP,FT%_PP,ORB_PP,DRB_PP,TRB_PP,AST_PP,STL_PP,BLK_PP,TOV_PP,PF_PP,PTS_PP,G_OppPG,MP_OppPG,FG_OppPG,FGA_OppPG,FG%_OppPG,3P_OppPG,3PA_OppPG,3P%_OppPG,2P_OppPG,2PA_OppPG,2P%_OppPG,FT_OppPG,FTA_OppPG,FT%_OppPG,ORB_OppPG,DRB_OppPG,TRB_OppPG,AST_OppPG,STL_OppPG,BLK_OppPG,TOV_OppPG,PF_OppPG,PTS_OppPG,G_OppPP,MP_OppPP,FG_OppPP,FGA_OppPP,FG%_OppPP,3P_OppPP,3PA_OppPP,3P%_OppPP,2P_OppPP,2PA_OppPP,2P%_OppPP,FT_OppPP,FTA_OppPP,FT%_OppPP,ORB_OppPP,DRB_OppPP,TRB_OppPP,AST_OppPP,STL_OppPP,BLK_OppPP,TOV_OppPP,PF_OppPP,PTS_OppPP,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
0,Phoenix Suns*,2005,82,241.2,40.9,85.6,0.477,9.7,24.7,0.393,31.2,60.9,0.512,19.0,25.4,0.748,11.8,32.3,44.1,23.5,7.0,5.5,13.7,19.1,110.4,82,19780,42.4,88.8,0.477,10.1,25.6,0.393,32.3,63.1,0.512,19.7,26.3,0.748,12.2,33.5,45.8,24.4,7.2,5.7,14.2,19.8,114.5,82,241.2,40.6,91.3,0.445,6.0,18.0,0.335,34.6,73.3,0.472,16.1,21.6,0.744,15.0,31.1,46.1,21.2,7.6,4.2,13.8,21.3,103.3,82,19780,42.1,94.7,0.445,6.2,18.7,0.335,35.8,76.0,0.472,16.7,22.5,0.744,15.6,32.3,47.9,22.0,7.9,4.3,14.3,22.1,107.1,25.2,62,20,59,23,7.12,-0.04,7.08,114.5,107.1,7.4,95.9,0.296,0.289,0.571,0.534,12.4,27.5,0.222,0.478,12.0,68.3,0.176,America West Arena,726066,17709
1,Sacramento Kings*,2005,82,242.1,39.1,85.1,0.459,6.4,17.0,0.374,32.7,68.1,0.48,19.2,24.4,0.787,12.5,29.9,42.4,24.5,8.2,3.9,13.1,20.5,103.7,82,19855,41.6,90.7,0.459,6.8,18.1,0.374,34.8,72.5,0.48,20.5,26.0,0.787,13.3,31.9,45.2,26.0,8.8,4.1,13.9,21.9,110.5,82,242.1,38.9,84.6,0.459,5.7,16.1,0.357,33.1,68.6,0.483,18.1,24.5,0.738,13.4,30.9,44.3,21.5,7.6,4.5,14.7,21.5,101.6,82,19855,41.4,90.2,0.459,6.1,17.1,0.357,35.3,73.1,0.483,19.3,26.1,0.738,14.3,32.9,47.2,22.9,8.1,4.7,15.6,22.9,108.2,27.7,50,32,47,35,2.16,0.4,2.56,110.5,108.2,2.3,93.0,0.287,0.2,0.541,0.496,12.0,28.8,0.226,0.493,13.3,69.0,0.214,ARCO Arena (II),709997,17317
2,Dallas Mavericks*,2005,82,240.6,37.3,81.6,0.457,5.6,15.5,0.364,31.6,66.1,0.479,22.3,28.2,0.789,12.1,30.9,42.9,19.6,8.6,5.6,13.4,22.3,102.5,82,19730,40.1,87.8,0.457,6.1,16.7,0.364,34.0,71.1,0.479,24.0,30.4,0.789,13.0,33.2,46.2,21.1,9.3,6.0,14.5,24.0,110.3,82,240.6,35.9,81.9,0.438,5.5,16.7,0.33,30.4,65.2,0.466,19.5,25.9,0.754,13.1,30.1,43.2,20.9,7.1,4.9,15.6,23.2,96.8,82,19730,38.6,88.1,0.438,5.9,17.9,0.33,32.7,70.1,0.466,21.0,27.9,0.754,14.1,32.4,46.4,22.5,7.7,5.2,16.8,24.9,104.1,27.7,58,24,57,25,5.74,0.11,5.86,110.3,104.1,6.2,92.7,0.346,0.19,0.545,0.492,12.5,28.6,0.273,0.472,14.3,70.2,0.239,American Airlines Center,822533,20062
3,Miami Heat*,2005,82,243.7,37.8,77.7,0.486,5.8,15.4,0.377,32.0,62.3,0.513,20.2,30.1,0.672,10.8,32.2,43.0,21.8,6.4,5.8,13.7,22.1,101.5,82,19980,41.0,84.3,0.486,6.3,16.7,0.377,34.7,67.6,0.513,21.9,32.6,0.672,11.7,34.9,46.7,23.7,7.0,6.3,14.9,24.0,110.2,82,243.7,34.9,81.8,0.427,5.4,15.5,0.348,29.6,66.3,0.446,19.8,26.0,0.76,11.2,29.2,40.5,19.8,7.5,3.2,13.2,24.6,95.0,82,19980,37.9,88.7,0.427,5.8,16.8,0.348,32.1,71.9,0.446,21.4,28.2,0.76,12.2,31.7,43.9,21.4,8.1,3.5,14.4,26.7,103.1,28.0,59,23,59,23,6.52,-0.76,5.77,110.2,103.1,7.1,90.8,0.387,0.198,0.559,0.524,13.1,27.0,0.26,0.46,12.4,74.1,0.242,AmericanAirlines Arena,815143,19882
4,Boston Celtics*,2005,82,242.4,37.1,79.4,0.468,5.3,15.3,0.349,31.8,64.1,0.496,21.6,28.3,0.764,11.1,29.7,40.8,22.1,8.1,5.2,15.8,24.4,101.3,82,19880,39.4,84.3,0.468,5.7,16.2,0.349,33.8,68.1,0.496,23.0,30.1,0.764,11.8,31.6,43.3,23.4,8.6,5.5,16.8,25.9,107.5,82,242.4,36.1,81.3,0.444,6.0,16.7,0.356,30.1,64.6,0.467,22.3,29.5,0.753,12.7,29.6,42.3,22.7,8.7,4.9,15.6,23.6,100.4,82,19880,38.3,86.2,0.444,6.3,17.7,0.356,32.0,68.5,0.467,23.6,31.4,0.753,13.4,31.4,44.9,24.1,9.2,5.2,16.5,25.1,106.6,27.1,45,37,43,39,0.87,-0.52,0.35,107.5,106.6,0.9,93.3,0.357,0.192,0.551,0.501,14.7,27.2,0.273,0.481,14.2,70.1,0.274,FleetCenter,656081,16002


In [35]:
# # generate column names
# teamDataColumn = teamData.drop(['Team', "Year", "Arena", "Attend.", "Attend./G", "G_PG", "MP_PG", 'G_OppPG', 'G_OppPP', 'MP_OppPP'], axis=1).columns
# # testTeamDataColumn = teamData.drop(teamData.columns.difference(["FGA_PG","3PA_PG","3P_PG"]), axis=1).columns
# # outcomeData = outcomeData.loc[outcomeData['Year'] == 2020]
# outcomeData

# def differentialVariableGenerator(outcomeData, teamData, teamDataColumn):
                                  
#     global teamData_global
#     global counter 
#     global differentialFeatureData
    
#     differentialFeatureData = pd.DataFrame(index=range(outcomeData.shape[0]), columns = teamDataColumn)
#     print(outcomeData.shape[0])
#     teamData_global = teamData
#     totalCount = outcomeData.shape[0]
#     counter = 0
#     overallTime = time.time()
    
#     def calcDifferences(row):
                                  
#         global teamData_global
#         global counter
                                  
#         counter = counter + 1;
#         start = time.time()
#         year = row['Year']
#         visitor = row['Visitor']
#         home = row['Home']
                                  
#         def columnCalc(col):
                                  
#             variable = col.name
#             year_subset = teamData_global.loc[(teamData_global.Year == year)]
#             visitorValue = year_subset.loc[year_subset.Team.astype(str).str.contains(visitor)][variable]
#             homeValue = year_subset.loc[year_subset.Team.astype(str).str.contains(home)][variable]
#             difference = homeValue.iloc[0] - visitorValue.iloc[0]
#             differentialFeatureData[variable][counter-1] = difference
#             #print(homeValue, visitorValue, difference)
                                  
#         teamData_global[teamDataColumn].apply(columnCalc)
# #         print(visitor, home, time.time() - start,time.time() - overallTime)
#         print("{count}/{total}".format(count = counter, total = totalCount))
                                  
#     outcomeData.apply(calcDifferences, axis=1)
# #     print("Total time {time} seconds".format(time = time.time() - overallTime))
#     return differentialFeatureData

# testOutcomeData = differentialVariableGenerator(outcomeData, teamData, teamDataColumn)

# testOutcomeData.to_csv('/Users/justinholmes/Desktop/Sports_Model/Data/featureData.csv', index=False)

In [36]:
outcomeData = pd.concat([outcomeData, featureData], axis=1, sort=False)

In [37]:
outcomeData.head()

Unnamed: 0,Year,Month,Visitor,VisitorPTS,Home,HomePTS,OT?,Attend,Total Points,Winner,Final Home Spread,FG_PG,FGA_PG,FG%_PG,3P_PG,3PA_PG,3P%_PG,2P_PG,2PA_PG,2P%_PG,FT_PG,FTA_PG,FT%_PG,ORB_PG,DRB_PG,TRB_PG,AST_PG,STL_PG,BLK_PG,TOV_PG,PF_PG,PTS_PG,G_PP,MP_PP,FG_PP,FGA_PP,FG%_PP,3P_PP,3PA_PP,3P%_PP,2P_PP,2PA_PP,2P%_PP,FT_PP,FTA_PP,FT%_PP,ORB_PP,DRB_PP,TRB_PP,AST_PP,STL_PP,BLK_PP,TOV_PP,PF_PP,PTS_PP,G_OppPG,MP_OppPG,FG_OppPG,FGA_OppPG,FG%_OppPG,3P_OppPG,3PA_OppPG,3P%_OppPG,2P_OppPG,2PA_OppPG,2P%_OppPG,FT_OppPG,FTA_OppPG,FT%_OppPG,ORB_OppPG,DRB_OppPG,TRB_OppPG,AST_OppPG,STL_OppPG,BLK_OppPG,TOV_OppPG,PF_OppPG,PTS_OppPG,G_OppPP,MP_OppPP,FG_OppPP,FGA_OppPP,FG%_OppPP,3P_OppPP,3PA_OppPP,3P%_OppPP,2P_OppPP,2PA_OppPP,2P%_OppPP,FT_OppPP,FTA_OppPP,FT%_OppPP,ORB_OppPP,DRB_OppPP,TRB_OppPP,AST_OppPP,STL_OppPP,BLK_OppPP,TOV_OppPP,PF_OppPP,PTS_OppPP,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1
0,2005,november,Houston Rockets,79,Detroit Pistons,87,0,22076,166,Detroit Pistons,-8.0,0.1,0.0,0.001,-2.3,-5.7,-0.019,2.3,5.8,-0.005,0.5,2.0,-0.042,2.2,-1.1,1.0,0.7,0.1,1.5,0.0,-2.0,-1.8,0,100,0.5,1.2,0.001,-2.5,-6.2,-0.019,3.1,7.4,-0.005,0.8,2.7,-0.042,2.6,-0.8,1.8,1.1,0.2,1.8,0.1,-2.0,-0.6,0,1.3,0.8,0.9,0.007,-0.5,-1.6,0.0,1.4,2.4,0.005,-2.8,-4.0,0.009,0.4,-1.5,-1.1,-1.0,0.0,0.4,0.5,1.0,-1.5,0,100,1.5,2.1,0.007,-0.5,-1.6,0.0,1.9,3.7,0.005,-2.9,-4.1,0.009,0.7,-1.3,-0.6,-0.8,0.1,0.5,0.9,1.4,-0.5,-1.8,3,-3,0,0,-0.17,-0.78,-0.96,-0.6,-0.5,-0.1,-1.6,0.026,-0.073,-0.015,-0.014,-0.2,4.9,0.005,0.002,0.6,-1.5,-0.038
1,2005,november,Sacramento Kings,98,Dallas Mavericks,107,0,20041,205,Dallas Mavericks,-9.0,-1.8,-3.5,-0.002,-0.8,-1.5,-0.01,-1.1,-2.0,-0.001,3.1,3.8,0.002,-0.4,1.0,0.5,-4.9,0.4,1.7,0.3,1.8,-1.2,0,-125,-1.5,-2.9,-0.002,-0.7,-1.4,-0.01,-0.8,-1.4,-0.001,3.5,4.4,0.002,-0.3,1.3,1.0,-4.9,0.5,1.9,0.6,2.1,-0.2,0,-1.5,-3.0,-2.7,-0.021,-0.2,0.6,-0.027,-2.7,-3.4,-0.017,1.4,1.4,0.016,-0.3,-0.8,-1.1,-0.6,-0.5,0.4,0.9,1.7,-4.8,0,-125,-2.8,-2.1,-0.021,-0.2,0.8,-0.027,-2.6,-3.0,-0.017,1.7,1.8,0.016,-0.2,-0.5,-0.8,-0.4,-0.4,0.5,1.2,2.0,-4.1,0.0,8,-8,10,-10,3.58,-0.29,3.3,-0.2,-4.1,3.9,-0.3,0.059,-0.01,0.004,-0.004,0.5,-0.2,0.047,-0.021,1.0,1.2,0.025
2,2005,november,Denver Nuggets,78,Los Angeles Lakers,89,0,18997,167,Los Angeles Lakers,-11.0,-1.7,0.1,-0.022,4.0,10.6,0.015,-5.6,-10.5,-0.012,-1.2,-2.1,0.014,0.7,0.4,1.2,-3.5,-2.9,-1.8,-0.6,-1.0,-0.8,0,25,-0.8,2.7,-0.022,4.4,12.0,0.015,-5.2,-9.4,-0.012,-0.7,-1.5,0.014,1.1,1.4,2.6,-3.1,-2.9,-1.8,-0.2,-0.4,2.1,0,0.3,2.3,3.9,0.006,1.1,2.2,0.016,1.3,1.7,0.007,-1.5,-3.2,0.034,0.6,0.1,0.7,2.7,-0.2,-0.5,-4.6,-1.8,4.2,0,25,3.7,6.7,0.006,1.3,2.9,0.016,2.3,3.8,0.007,-1.1,-2.6,0.034,1.0,1.1,2.0,3.6,0.1,-0.3,-4.5,-1.3,7.5,-0.7,-15,15,-14,14,-4.98,0.44,-4.55,2.1,7.5,-5.4,-2.7,-0.028,0.131,0.0,0.002,-0.4,1.1,-0.016,0.011,-4.1,-0.6,-0.029
3,2005,november,Indiana Pacers,109,Cleveland Cavaliers,104,2,19730,213,Indiana Pacers,5.0,4.0,6.3,0.015,-2.9,-8.2,-0.012,6.9,14.5,0.002,-1.4,-0.4,-0.04,3.0,-0.8,2.2,4.4,0.5,1.2,-0.4,-0.4,3.5,0,-50,3.4,4.8,0.015,-3.5,-9.6,-0.012,6.8,14.4,0.002,-2.2,-1.2,-0.04,3.0,-1.7,1.3,4.3,0.3,1.3,-0.8,-1.2,1.1,0,-0.6,1.7,2.0,0.012,0.3,0.6,0.011,1.4,1.3,0.012,-0.5,-0.9,0.007,-0.5,-1.2,-1.5,2.4,0.0,0.2,0.9,-1.4,3.5,0,-50,0.9,-0.1,0.012,0.3,0.3,0.011,0.7,-0.4,0.012,-1.0,-1.7,0.007,-0.8,-2.1,-3.0,2.1,-0.2,0.1,0.5,-2.2,1.1,-1.8,-2,2,0,0,0.03,-0.25,-0.22,1.1,1.1,0.0,2.5,-0.033,-0.12,-0.016,-0.006,-1.1,6.1,-0.039,0.013,0.5,0.2,-0.013
4,2005,november,Milwaukee Bucks,92,Orlando Magic,93,0,15138,185,Orlando Magic,-1.0,0.9,1.3,0.004,-0.1,-0.1,-0.002,0.9,1.3,0.004,0.6,1.3,-0.013,0.7,1.7,2.4,-1.7,1.2,1.9,2.3,1.7,2.3,0,-75,-0.6,-2.0,0.004,-0.2,-0.6,-0.002,-0.4,-1.5,0.004,-0.2,0.2,-0.013,0.1,0.6,0.8,-2.6,1.0,1.8,1.9,0.9,-1.7,0,-0.9,-0.2,1.8,-0.013,-0.2,-0.8,0.0,0.1,2.5,-0.017,2.2,3.4,-0.017,1.1,0.2,1.3,-2.5,1.0,-0.2,1.5,-0.4,1.6,0,-75,-1.8,-1.5,-0.013,-0.5,-1.4,0.0,-1.3,0.0,-0.017,1.6,2.6,-0.017,0.7,-1.1,-0.3,-3.6,0.8,-0.3,1.0,-1.4,-2.5,-0.4,6,-6,3,-3,0.75,-0.17,0.57,-1.7,-2.5,0.8,3.9,0.011,-0.004,0.002,0.003,1.6,0.9,0.004,-0.015,0.8,-0.7,0.022


## Functions for additional calculations

In [38]:
def homeWin(dataframe):
    if dataframe['Winner'] == dataframe['Home']:
        return True
    else:
        return False

outcomeData['HomeWin'] = outcomeData.apply(homeWin, axis=1)

#### Encode team names

In [39]:
encoding = LabelEncoder()
encoding.fit(outcomeData['Home'].values)
outcomeData['Home'] = encoding.transform(outcomeData['Home'].values)
outcomeData['Visitor'] = encoding.transform(outcomeData['Visitor'].values)

In [40]:
outcomeData.head()

Unnamed: 0,Year,Month,Visitor,VisitorPTS,Home,HomePTS,OT?,Attend,Total Points,Winner,Final Home Spread,FG_PG,FGA_PG,FG%_PG,3P_PG,3PA_PG,3P%_PG,2P_PG,2PA_PG,2P%_PG,FT_PG,FTA_PG,FT%_PG,ORB_PG,DRB_PG,TRB_PG,AST_PG,STL_PG,BLK_PG,TOV_PG,PF_PG,PTS_PG,G_PP,MP_PP,FG_PP,FGA_PP,FG%_PP,3P_PP,3PA_PP,3P%_PP,2P_PP,2PA_PP,2P%_PP,FT_PP,FTA_PP,FT%_PP,ORB_PP,DRB_PP,TRB_PP,AST_PP,STL_PP,BLK_PP,TOV_PP,PF_PP,PTS_PP,G_OppPG,MP_OppPG,FG_OppPG,FGA_OppPG,FG%_OppPG,3P_OppPG,3PA_OppPG,3P%_OppPG,2P_OppPG,2PA_OppPG,2P%_OppPG,FT_OppPG,FTA_OppPG,FT%_OppPG,ORB_OppPG,DRB_OppPG,TRB_OppPG,AST_OppPG,STL_OppPG,BLK_OppPG,TOV_OppPG,PF_OppPG,PTS_OppPG,G_OppPP,MP_OppPP,FG_OppPP,FGA_OppPP,FG%_OppPP,3P_OppPP,3PA_OppPP,3P%_OppPP,2P_OppPP,2PA_OppPP,2P%_OppPP,FT_OppPP,FTA_OppPP,FT%_OppPP,ORB_OppPP,DRB_OppPP,TRB_OppPP,AST_OppPP,STL_OppPP,BLK_OppPP,TOV_OppPP,PF_OppPP,PTS_OppPP,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,HomeWin
0,2005,november,10,79,8,87,0,22076,166,Detroit Pistons,-8.0,0.1,0.0,0.001,-2.3,-5.7,-0.019,2.3,5.8,-0.005,0.5,2.0,-0.042,2.2,-1.1,1.0,0.7,0.1,1.5,0.0,-2.0,-1.8,0,100,0.5,1.2,0.001,-2.5,-6.2,-0.019,3.1,7.4,-0.005,0.8,2.7,-0.042,2.6,-0.8,1.8,1.1,0.2,1.8,0.1,-2.0,-0.6,0,1.3,0.8,0.9,0.007,-0.5,-1.6,0.0,1.4,2.4,0.005,-2.8,-4.0,0.009,0.4,-1.5,-1.1,-1.0,0.0,0.4,0.5,1.0,-1.5,0,100,1.5,2.1,0.007,-0.5,-1.6,0.0,1.9,3.7,0.005,-2.9,-4.1,0.009,0.7,-1.3,-0.6,-0.8,0.1,0.5,0.9,1.4,-0.5,-1.8,3,-3,0,0,-0.17,-0.78,-0.96,-0.6,-0.5,-0.1,-1.6,0.026,-0.073,-0.015,-0.014,-0.2,4.9,0.005,0.002,0.6,-1.5,-0.038,True
1,2005,november,25,98,6,107,0,20041,205,Dallas Mavericks,-9.0,-1.8,-3.5,-0.002,-0.8,-1.5,-0.01,-1.1,-2.0,-0.001,3.1,3.8,0.002,-0.4,1.0,0.5,-4.9,0.4,1.7,0.3,1.8,-1.2,0,-125,-1.5,-2.9,-0.002,-0.7,-1.4,-0.01,-0.8,-1.4,-0.001,3.5,4.4,0.002,-0.3,1.3,1.0,-4.9,0.5,1.9,0.6,2.1,-0.2,0,-1.5,-3.0,-2.7,-0.021,-0.2,0.6,-0.027,-2.7,-3.4,-0.017,1.4,1.4,0.016,-0.3,-0.8,-1.1,-0.6,-0.5,0.4,0.9,1.7,-4.8,0,-125,-2.8,-2.1,-0.021,-0.2,0.8,-0.027,-2.6,-3.0,-0.017,1.7,1.8,0.016,-0.2,-0.5,-0.8,-0.4,-0.4,0.5,1.2,2.0,-4.1,0.0,8,-8,10,-10,3.58,-0.29,3.3,-0.2,-4.1,3.9,-0.3,0.059,-0.01,0.004,-0.004,0.5,-0.2,0.047,-0.021,1.0,1.2,0.025,True
2,2005,november,7,78,13,89,0,18997,167,Los Angeles Lakers,-11.0,-1.7,0.1,-0.022,4.0,10.6,0.015,-5.6,-10.5,-0.012,-1.2,-2.1,0.014,0.7,0.4,1.2,-3.5,-2.9,-1.8,-0.6,-1.0,-0.8,0,25,-0.8,2.7,-0.022,4.4,12.0,0.015,-5.2,-9.4,-0.012,-0.7,-1.5,0.014,1.1,1.4,2.6,-3.1,-2.9,-1.8,-0.2,-0.4,2.1,0,0.3,2.3,3.9,0.006,1.1,2.2,0.016,1.3,1.7,0.007,-1.5,-3.2,0.034,0.6,0.1,0.7,2.7,-0.2,-0.5,-4.6,-1.8,4.2,0,25,3.7,6.7,0.006,1.3,2.9,0.016,2.3,3.8,0.007,-1.1,-2.6,0.034,1.0,1.1,2.0,3.6,0.1,-0.3,-4.5,-1.3,7.5,-0.7,-15,15,-14,14,-4.98,0.44,-4.55,2.1,7.5,-5.4,-2.7,-0.028,0.131,0.0,0.002,-0.4,1.1,-0.016,0.011,-4.1,-0.6,-0.029,True
3,2005,november,11,109,5,104,2,19730,213,Indiana Pacers,5.0,4.0,6.3,0.015,-2.9,-8.2,-0.012,6.9,14.5,0.002,-1.4,-0.4,-0.04,3.0,-0.8,2.2,4.4,0.5,1.2,-0.4,-0.4,3.5,0,-50,3.4,4.8,0.015,-3.5,-9.6,-0.012,6.8,14.4,0.002,-2.2,-1.2,-0.04,3.0,-1.7,1.3,4.3,0.3,1.3,-0.8,-1.2,1.1,0,-0.6,1.7,2.0,0.012,0.3,0.6,0.011,1.4,1.3,0.012,-0.5,-0.9,0.007,-0.5,-1.2,-1.5,2.4,0.0,0.2,0.9,-1.4,3.5,0,-50,0.9,-0.1,0.012,0.3,0.3,0.011,0.7,-0.4,0.012,-1.0,-1.7,0.007,-0.8,-2.1,-3.0,2.1,-0.2,0.1,0.5,-2.2,1.1,-1.8,-2,2,0,0,0.03,-0.25,-0.22,1.1,1.1,0.0,2.5,-0.033,-0.12,-0.016,-0.006,-1.1,6.1,-0.039,0.013,0.5,0.2,-0.013,False
4,2005,november,16,92,21,93,0,15138,185,Orlando Magic,-1.0,0.9,1.3,0.004,-0.1,-0.1,-0.002,0.9,1.3,0.004,0.6,1.3,-0.013,0.7,1.7,2.4,-1.7,1.2,1.9,2.3,1.7,2.3,0,-75,-0.6,-2.0,0.004,-0.2,-0.6,-0.002,-0.4,-1.5,0.004,-0.2,0.2,-0.013,0.1,0.6,0.8,-2.6,1.0,1.8,1.9,0.9,-1.7,0,-0.9,-0.2,1.8,-0.013,-0.2,-0.8,0.0,0.1,2.5,-0.017,2.2,3.4,-0.017,1.1,0.2,1.3,-2.5,1.0,-0.2,1.5,-0.4,1.6,0,-75,-1.8,-1.5,-0.013,-0.5,-1.4,0.0,-1.3,0.0,-0.017,1.6,2.6,-0.017,0.7,-1.1,-0.3,-3.6,0.8,-0.3,1.0,-1.4,-2.5,-0.4,6,-6,3,-3,0.75,-0.17,0.57,-1.7,-2.5,0.8,3.9,0.011,-0.004,0.002,0.003,1.6,0.9,0.004,-0.015,0.8,-0.7,0.022,True


### Calculating a basis level by always predicting the home team will win

In [48]:
# target column (in this case, whether or not the home team will win)
y = outcomeData["HomeWin"].to_numpy()

# columns that will be used to make the prediction
X = outcomeData[["Visitor", "Home"]]

# set up scorer for testing accuracy
scorer = make_scorer(f1_score, pos_label = None, average = "weighted")

In [49]:
# Set up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

#### Accuracy of only guessing home team

In [50]:
basis_level = np.count_nonzero(y_train)/len(y_train)
basis_level

0.5951550743255644

#### F1 score of only guessing home team

In [51]:
y_pred = [1] * len(y_train)
f1_score(y_train, y_pred, pos_label = None, average = "weighted")

0.4441067432208479

### Testing with additional variables

In [41]:
pd.set_option('display.max_columns', None)
outcomeData.head()

Unnamed: 0,Year,Month,Visitor,VisitorPTS,Home,HomePTS,OT?,Attend,Total Points,Winner,Final Home Spread,FG_PG,FGA_PG,FG%_PG,3P_PG,3PA_PG,3P%_PG,2P_PG,2PA_PG,2P%_PG,FT_PG,FTA_PG,FT%_PG,ORB_PG,DRB_PG,TRB_PG,AST_PG,STL_PG,BLK_PG,TOV_PG,PF_PG,PTS_PG,G_PP,MP_PP,FG_PP,FGA_PP,FG%_PP,3P_PP,3PA_PP,3P%_PP,2P_PP,2PA_PP,2P%_PP,FT_PP,FTA_PP,FT%_PP,ORB_PP,DRB_PP,TRB_PP,AST_PP,STL_PP,BLK_PP,TOV_PP,PF_PP,PTS_PP,G_OppPG,MP_OppPG,FG_OppPG,FGA_OppPG,FG%_OppPG,3P_OppPG,3PA_OppPG,3P%_OppPG,2P_OppPG,2PA_OppPG,2P%_OppPG,FT_OppPG,FTA_OppPG,FT%_OppPG,ORB_OppPG,DRB_OppPG,TRB_OppPG,AST_OppPG,STL_OppPG,BLK_OppPG,TOV_OppPG,PF_OppPG,PTS_OppPG,G_OppPP,MP_OppPP,FG_OppPP,FGA_OppPP,FG%_OppPP,3P_OppPP,3PA_OppPP,3P%_OppPP,2P_OppPP,2PA_OppPP,2P%_OppPP,FT_OppPP,FTA_OppPP,FT%_OppPP,ORB_OppPP,DRB_OppPP,TRB_OppPP,AST_OppPP,STL_OppPP,BLK_OppPP,TOV_OppPP,PF_OppPP,PTS_OppPP,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,HomeWin
0,2005,november,10,79,8,87,0,22076,166,Detroit Pistons,-8.0,0.1,0.0,0.001,-2.3,-5.7,-0.019,2.3,5.8,-0.005,0.5,2.0,-0.042,2.2,-1.1,1.0,0.7,0.1,1.5,0.0,-2.0,-1.8,0,100,0.5,1.2,0.001,-2.5,-6.2,-0.019,3.1,7.4,-0.005,0.8,2.7,-0.042,2.6,-0.8,1.8,1.1,0.2,1.8,0.1,-2.0,-0.6,0,1.3,0.8,0.9,0.007,-0.5,-1.6,0.0,1.4,2.4,0.005,-2.8,-4.0,0.009,0.4,-1.5,-1.1,-1.0,0.0,0.4,0.5,1.0,-1.5,0,100,1.5,2.1,0.007,-0.5,-1.6,0.0,1.9,3.7,0.005,-2.9,-4.1,0.009,0.7,-1.3,-0.6,-0.8,0.1,0.5,0.9,1.4,-0.5,-1.8,3,-3,0,0,-0.17,-0.78,-0.96,-0.6,-0.5,-0.1,-1.6,0.026,-0.073,-0.015,-0.014,-0.2,4.9,0.005,0.002,0.6,-1.5,-0.038,True
1,2005,november,25,98,6,107,0,20041,205,Dallas Mavericks,-9.0,-1.8,-3.5,-0.002,-0.8,-1.5,-0.01,-1.1,-2.0,-0.001,3.1,3.8,0.002,-0.4,1.0,0.5,-4.9,0.4,1.7,0.3,1.8,-1.2,0,-125,-1.5,-2.9,-0.002,-0.7,-1.4,-0.01,-0.8,-1.4,-0.001,3.5,4.4,0.002,-0.3,1.3,1.0,-4.9,0.5,1.9,0.6,2.1,-0.2,0,-1.5,-3.0,-2.7,-0.021,-0.2,0.6,-0.027,-2.7,-3.4,-0.017,1.4,1.4,0.016,-0.3,-0.8,-1.1,-0.6,-0.5,0.4,0.9,1.7,-4.8,0,-125,-2.8,-2.1,-0.021,-0.2,0.8,-0.027,-2.6,-3.0,-0.017,1.7,1.8,0.016,-0.2,-0.5,-0.8,-0.4,-0.4,0.5,1.2,2.0,-4.1,0.0,8,-8,10,-10,3.58,-0.29,3.3,-0.2,-4.1,3.9,-0.3,0.059,-0.01,0.004,-0.004,0.5,-0.2,0.047,-0.021,1.0,1.2,0.025,True
2,2005,november,7,78,13,89,0,18997,167,Los Angeles Lakers,-11.0,-1.7,0.1,-0.022,4.0,10.6,0.015,-5.6,-10.5,-0.012,-1.2,-2.1,0.014,0.7,0.4,1.2,-3.5,-2.9,-1.8,-0.6,-1.0,-0.8,0,25,-0.8,2.7,-0.022,4.4,12.0,0.015,-5.2,-9.4,-0.012,-0.7,-1.5,0.014,1.1,1.4,2.6,-3.1,-2.9,-1.8,-0.2,-0.4,2.1,0,0.3,2.3,3.9,0.006,1.1,2.2,0.016,1.3,1.7,0.007,-1.5,-3.2,0.034,0.6,0.1,0.7,2.7,-0.2,-0.5,-4.6,-1.8,4.2,0,25,3.7,6.7,0.006,1.3,2.9,0.016,2.3,3.8,0.007,-1.1,-2.6,0.034,1.0,1.1,2.0,3.6,0.1,-0.3,-4.5,-1.3,7.5,-0.7,-15,15,-14,14,-4.98,0.44,-4.55,2.1,7.5,-5.4,-2.7,-0.028,0.131,0.0,0.002,-0.4,1.1,-0.016,0.011,-4.1,-0.6,-0.029,True
3,2005,november,11,109,5,104,2,19730,213,Indiana Pacers,5.0,4.0,6.3,0.015,-2.9,-8.2,-0.012,6.9,14.5,0.002,-1.4,-0.4,-0.04,3.0,-0.8,2.2,4.4,0.5,1.2,-0.4,-0.4,3.5,0,-50,3.4,4.8,0.015,-3.5,-9.6,-0.012,6.8,14.4,0.002,-2.2,-1.2,-0.04,3.0,-1.7,1.3,4.3,0.3,1.3,-0.8,-1.2,1.1,0,-0.6,1.7,2.0,0.012,0.3,0.6,0.011,1.4,1.3,0.012,-0.5,-0.9,0.007,-0.5,-1.2,-1.5,2.4,0.0,0.2,0.9,-1.4,3.5,0,-50,0.9,-0.1,0.012,0.3,0.3,0.011,0.7,-0.4,0.012,-1.0,-1.7,0.007,-0.8,-2.1,-3.0,2.1,-0.2,0.1,0.5,-2.2,1.1,-1.8,-2,2,0,0,0.03,-0.25,-0.22,1.1,1.1,0.0,2.5,-0.033,-0.12,-0.016,-0.006,-1.1,6.1,-0.039,0.013,0.5,0.2,-0.013,False
4,2005,november,16,92,21,93,0,15138,185,Orlando Magic,-1.0,0.9,1.3,0.004,-0.1,-0.1,-0.002,0.9,1.3,0.004,0.6,1.3,-0.013,0.7,1.7,2.4,-1.7,1.2,1.9,2.3,1.7,2.3,0,-75,-0.6,-2.0,0.004,-0.2,-0.6,-0.002,-0.4,-1.5,0.004,-0.2,0.2,-0.013,0.1,0.6,0.8,-2.6,1.0,1.8,1.9,0.9,-1.7,0,-0.9,-0.2,1.8,-0.013,-0.2,-0.8,0.0,0.1,2.5,-0.017,2.2,3.4,-0.017,1.1,0.2,1.3,-2.5,1.0,-0.2,1.5,-0.4,1.6,0,-75,-1.8,-1.5,-0.013,-0.5,-1.4,0.0,-1.3,0.0,-0.017,1.6,2.6,-0.017,0.7,-1.1,-0.3,-3.6,0.8,-0.3,1.0,-1.4,-2.5,-0.4,6,-6,3,-3,0.75,-0.17,0.57,-1.7,-2.5,0.8,3.9,0.011,-0.004,0.002,0.003,1.6,0.9,0.004,-0.015,0.8,-0.7,0.022,True


In [42]:
outcomeData = outcomeData.drop(['Year', 'Month', 'Attend', 'Winner', 'VisitorPTS', 'HomePTS', 'Total Points',
                 'G_OppPG', 'G_OppPP', 'MP_OppPP', 'Final Home Spread' ], axis=1)
outcomeData.head()

Unnamed: 0,Visitor,Home,OT?,FG_PG,FGA_PG,FG%_PG,3P_PG,3PA_PG,3P%_PG,2P_PG,2PA_PG,2P%_PG,FT_PG,FTA_PG,FT%_PG,ORB_PG,DRB_PG,TRB_PG,AST_PG,STL_PG,BLK_PG,TOV_PG,PF_PG,PTS_PG,G_PP,MP_PP,FG_PP,FGA_PP,FG%_PP,3P_PP,3PA_PP,3P%_PP,2P_PP,2PA_PP,2P%_PP,FT_PP,FTA_PP,FT%_PP,ORB_PP,DRB_PP,TRB_PP,AST_PP,STL_PP,BLK_PP,TOV_PP,PF_PP,PTS_PP,MP_OppPG,FG_OppPG,FGA_OppPG,FG%_OppPG,3P_OppPG,3PA_OppPG,3P%_OppPG,2P_OppPG,2PA_OppPG,2P%_OppPG,FT_OppPG,FTA_OppPG,FT%_OppPG,ORB_OppPG,DRB_OppPG,TRB_OppPG,AST_OppPG,STL_OppPG,BLK_OppPG,TOV_OppPG,PF_OppPG,PTS_OppPG,FG_OppPP,FGA_OppPP,FG%_OppPP,3P_OppPP,3PA_OppPP,3P%_OppPP,2P_OppPP,2PA_OppPP,2P%_OppPP,FT_OppPP,FTA_OppPP,FT%_OppPP,ORB_OppPP,DRB_OppPP,TRB_OppPP,AST_OppPP,STL_OppPP,BLK_OppPP,TOV_OppPP,PF_OppPP,PTS_OppPP,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,HomeWin
0,10,8,0,0.1,0.0,0.001,-2.3,-5.7,-0.019,2.3,5.8,-0.005,0.5,2.0,-0.042,2.2,-1.1,1.0,0.7,0.1,1.5,0.0,-2.0,-1.8,0,100,0.5,1.2,0.001,-2.5,-6.2,-0.019,3.1,7.4,-0.005,0.8,2.7,-0.042,2.6,-0.8,1.8,1.1,0.2,1.8,0.1,-2.0,-0.6,1.3,0.8,0.9,0.007,-0.5,-1.6,0.0,1.4,2.4,0.005,-2.8,-4.0,0.009,0.4,-1.5,-1.1,-1.0,0.0,0.4,0.5,1.0,-1.5,1.5,2.1,0.007,-0.5,-1.6,0.0,1.9,3.7,0.005,-2.9,-4.1,0.009,0.7,-1.3,-0.6,-0.8,0.1,0.5,0.9,1.4,-0.5,-1.8,3,-3,0,0,-0.17,-0.78,-0.96,-0.6,-0.5,-0.1,-1.6,0.026,-0.073,-0.015,-0.014,-0.2,4.9,0.005,0.002,0.6,-1.5,-0.038,True
1,25,6,0,-1.8,-3.5,-0.002,-0.8,-1.5,-0.01,-1.1,-2.0,-0.001,3.1,3.8,0.002,-0.4,1.0,0.5,-4.9,0.4,1.7,0.3,1.8,-1.2,0,-125,-1.5,-2.9,-0.002,-0.7,-1.4,-0.01,-0.8,-1.4,-0.001,3.5,4.4,0.002,-0.3,1.3,1.0,-4.9,0.5,1.9,0.6,2.1,-0.2,-1.5,-3.0,-2.7,-0.021,-0.2,0.6,-0.027,-2.7,-3.4,-0.017,1.4,1.4,0.016,-0.3,-0.8,-1.1,-0.6,-0.5,0.4,0.9,1.7,-4.8,-2.8,-2.1,-0.021,-0.2,0.8,-0.027,-2.6,-3.0,-0.017,1.7,1.8,0.016,-0.2,-0.5,-0.8,-0.4,-0.4,0.5,1.2,2.0,-4.1,0.0,8,-8,10,-10,3.58,-0.29,3.3,-0.2,-4.1,3.9,-0.3,0.059,-0.01,0.004,-0.004,0.5,-0.2,0.047,-0.021,1.0,1.2,0.025,True
2,7,13,0,-1.7,0.1,-0.022,4.0,10.6,0.015,-5.6,-10.5,-0.012,-1.2,-2.1,0.014,0.7,0.4,1.2,-3.5,-2.9,-1.8,-0.6,-1.0,-0.8,0,25,-0.8,2.7,-0.022,4.4,12.0,0.015,-5.2,-9.4,-0.012,-0.7,-1.5,0.014,1.1,1.4,2.6,-3.1,-2.9,-1.8,-0.2,-0.4,2.1,0.3,2.3,3.9,0.006,1.1,2.2,0.016,1.3,1.7,0.007,-1.5,-3.2,0.034,0.6,0.1,0.7,2.7,-0.2,-0.5,-4.6,-1.8,4.2,3.7,6.7,0.006,1.3,2.9,0.016,2.3,3.8,0.007,-1.1,-2.6,0.034,1.0,1.1,2.0,3.6,0.1,-0.3,-4.5,-1.3,7.5,-0.7,-15,15,-14,14,-4.98,0.44,-4.55,2.1,7.5,-5.4,-2.7,-0.028,0.131,0.0,0.002,-0.4,1.1,-0.016,0.011,-4.1,-0.6,-0.029,True
3,11,5,2,4.0,6.3,0.015,-2.9,-8.2,-0.012,6.9,14.5,0.002,-1.4,-0.4,-0.04,3.0,-0.8,2.2,4.4,0.5,1.2,-0.4,-0.4,3.5,0,-50,3.4,4.8,0.015,-3.5,-9.6,-0.012,6.8,14.4,0.002,-2.2,-1.2,-0.04,3.0,-1.7,1.3,4.3,0.3,1.3,-0.8,-1.2,1.1,-0.6,1.7,2.0,0.012,0.3,0.6,0.011,1.4,1.3,0.012,-0.5,-0.9,0.007,-0.5,-1.2,-1.5,2.4,0.0,0.2,0.9,-1.4,3.5,0.9,-0.1,0.012,0.3,0.3,0.011,0.7,-0.4,0.012,-1.0,-1.7,0.007,-0.8,-2.1,-3.0,2.1,-0.2,0.1,0.5,-2.2,1.1,-1.8,-2,2,0,0,0.03,-0.25,-0.22,1.1,1.1,0.0,2.5,-0.033,-0.12,-0.016,-0.006,-1.1,6.1,-0.039,0.013,0.5,0.2,-0.013,False
4,16,21,0,0.9,1.3,0.004,-0.1,-0.1,-0.002,0.9,1.3,0.004,0.6,1.3,-0.013,0.7,1.7,2.4,-1.7,1.2,1.9,2.3,1.7,2.3,0,-75,-0.6,-2.0,0.004,-0.2,-0.6,-0.002,-0.4,-1.5,0.004,-0.2,0.2,-0.013,0.1,0.6,0.8,-2.6,1.0,1.8,1.9,0.9,-1.7,-0.9,-0.2,1.8,-0.013,-0.2,-0.8,0.0,0.1,2.5,-0.017,2.2,3.4,-0.017,1.1,0.2,1.3,-2.5,1.0,-0.2,1.5,-0.4,1.6,-1.8,-1.5,-0.013,-0.5,-1.4,0.0,-1.3,0.0,-0.017,1.6,2.6,-0.017,0.7,-1.1,-0.3,-3.6,0.8,-0.3,1.0,-1.4,-2.5,-0.4,6,-6,3,-3,0.75,-0.17,0.57,-1.7,-2.5,0.8,3.9,0.011,-0.004,0.002,0.003,1.6,0.9,0.004,-0.015,0.8,-0.7,0.022,True


In [43]:
# target column we want to predict
y = outcomeData["HomeWin"].to_numpy()

# columns that will be used to make the prediction
X = outcomeData.drop(["HomeWin"], axis=1)

# set up scorer for testing accuracy
scorer = make_scorer(f1_score, pos_label = None, average = "weighted")

In [44]:
# Set up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

In [47]:
clf = RandomForestClassifier(random_state = 14)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
#scores = cross_val_score(clf, X_train, y_train, scoring=scorer)

0.6349400538292146

In [46]:
np.mean(scores)

0.6413609871527244