In [89]:
import os
import time
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [90]:
# Folder that holds the three input CSVs. Set the SPORTS_MODEL_DATA_DIR environment
# variable to point somewhere else; when it is unset the original location is used,
# so existing setups keep working unchanged.
DATA_DIR = os.environ.get('SPORTS_MODEL_DATA_DIR', '/Users/justinholmes/Desktop/Sports_Model/Data')

teamPath = os.path.join(DATA_DIR, 'teamData.csv')
playerPath = os.path.join(DATA_DIR, 'playerData.csv')
outcomePath = os.path.join(DATA_DIR, 'outcomeData.csv')

In [91]:
# Load the raw tables. teamData and outcomeData use the default CSV parser.
teamData, outcomeData = (pd.read_csv(csv_path) for csv_path in (teamPath, outcomePath))
# playerData is read with pandas' pure-Python parsing engine.
playerData = pd.read_csv(playerPath, engine='python')

### Format all data

In [92]:
# drop unwanted columns
# errors="ignore" keeps this cell re-runnable: once the columns are gone a second
# run no longer raises KeyError.
outcomeData = outcomeData.drop(columns=["Start Time", "Box Score", "Notes"], errors="ignore")


def _overtime_rounds(marker):
    """Translate one "OT?" cell into the number of overtime periods played.

    NaN (no overtime) -> 0, "OT" -> 1, "<n>OT" -> n for any n. Values that are
    already numeric are passed through, so re-running the cell on converted data
    is harmless.

    Raises:
        ValueError: if the marker is not one of the recognised forms. Failing
            loudly avoids leaving stray strings in an otherwise numeric column.
    """
    if pd.isna(marker):
        return 0
    if isinstance(marker, (int, float, np.integer, np.floating)):
        return int(marker)
    label = str(marker).strip().upper()
    if label.isdigit():
        return int(label)
    if label == "OT":
        return 1
    if label.endswith("OT") and label[:-2].isdigit():
        return int(label[:-2])
    raise ValueError(f"Unrecognised overtime marker: {marker!r}")


# change OT to integer value for number of OT rounds
outcomeData["OT?"] = outcomeData["OT?"].map(_overtime_rounds).astype(int)

In [93]:
def redefineTeamNames(dataframe):
    """Return a new frame in which former franchise names are replaced by current ones.

    Each old name is applied with ``DataFrame.replace(..., regex=True)``, so it is
    matched as a pattern anywhere inside a string cell of any column (a value such
    as 'Charlotte Bobcats*' becomes 'Charlotte Hornets*'). The input frame is not
    modified.
    """
    # (old name, current name) pairs, applied one after another in this order.
    franchise_renames = (
        ('Seattle SuperSonics', 'Oklahoma City Thunder'),
        ('New Orleans/Oklahoma City Hornets', 'New Orleans Pelicans'),
        ('New Orleans Hornets', 'New Orleans Pelicans'),
        ('Charlotte Bobcats', 'Charlotte Hornets'),
        ('New Jersey Nets', 'Brooklyn Nets'),
    )

    renamed = dataframe
    for old_name, current_name in franchise_renames:
        renamed = renamed.replace(old_name, current_name, regex=True)

    # Disabled in the original:
    #     dataframe = dataframe.replace('*', '', regex=True)
    # NOTE(review): a bare '*' is not a valid regular expression; if stripping the
    # asterisk is wanted, the pattern would need escaping (r'\*').

    return renamed

# Bring both tables onto the same, current franchise names.
outcomeData, teamData = (redefineTeamNames(table) for table in (outcomeData, teamData))


In [94]:
# Visitor points minus home points, stored as float: a negative spread means the
# home team outscored the visitor.
outcomeData["Final Home Spread"] = (
    outcomeData["VisitorPTS"].sub(outcomeData["HomePTS"]).astype(float)
)

In [95]:
# Preview the first rows to check the cleaned columns and the new "Final Home Spread".
outcomeData.head()

Unnamed: 0,Year,Month,Visitor,VisitorPTS,Home,HomePTS,OT?,Attend,Total Points,Winner,Final Home Spread
0,2005,november,Houston Rockets,79,Detroit Pistons,87,0,22076,166,Detroit Pistons,-8.0
1,2005,november,Sacramento Kings,98,Dallas Mavericks,107,0,20041,205,Dallas Mavericks,-9.0
2,2005,november,Denver Nuggets,78,Los Angeles Lakers,89,0,18997,167,Los Angeles Lakers,-11.0
3,2005,november,Indiana Pacers,109,Cleveland Cavaliers,104,2,19730,213,Indiana Pacers,5.0
4,2005,november,Milwaukee Bucks,92,Orlando Magic,93,0,15138,185,Orlando Magic,-1.0


In [96]:
# Preview the first rows of teamData after the franchise renaming.
teamData.head()

Unnamed: 0,Team,Year,G_PG,MP_PG,FG_PG,FGA_PG,FG%_PG,3P_PG,3PA_PG,3P%_PG,...,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
0,Phoenix Suns*,2005,82,241.2,40.9,85.6,0.477,9.7,24.7,0.393,...,12.4,27.5,0.222,0.478,12.0,68.3,0.176,America West Arena,726066,17709
1,Sacramento Kings*,2005,82,242.1,39.1,85.1,0.459,6.4,17.0,0.374,...,12.0,28.8,0.226,0.493,13.3,69.0,0.214,ARCO Arena (II),709997,17317
2,Dallas Mavericks*,2005,82,240.6,37.3,81.6,0.457,5.6,15.5,0.364,...,12.5,28.6,0.273,0.472,14.3,70.2,0.239,American Airlines Center,822533,20062
3,Miami Heat*,2005,82,243.7,37.8,77.7,0.486,5.8,15.4,0.377,...,13.1,27.0,0.26,0.46,12.4,74.1,0.242,AmericanAirlines Arena,815143,19882
4,Boston Celtics*,2005,82,242.4,37.1,79.4,0.468,5.3,15.3,0.349,...,14.7,27.2,0.273,0.481,14.2,70.1,0.274,FleetCenter,656081,16002


In [97]:
# Column labels of teamData that remain once the team, year and arena/attendance
# fields are removed.
teamDataColumn = teamData.drop(
    columns=['Team', "Year", "Arena", "Attend.", "Attend./G"]
).columns

In [98]:
# One [diff_name, source_column] pair per remaining teamData column, where
# diff_name is the column label with "_DIFF" appended.
variableNameList = [[stat + "_DIFF", stat] for stat in teamDataColumn]

## Functions for additional calculations

In [None]:
def homeWin(dataframe):
    """Report whether the home team won a single game.

    Despite its name, ``dataframe`` is one game record (for example the row that
    ``DataFrame.apply(..., axis=1)`` passes in); it is indexed by 'Winner' and 'Home'.

    Returns:
        True when the 'Winner' value equals the 'Home' value, otherwise False.
    """
    return True if dataframe['Winner'] == dataframe['Home'] else False

# True where the winner is the home team. Comparing the two columns directly gives
# the same values as applying homeWin row by row, without the per-row Python call,
# and it also yields a proper (empty) boolean column when the frame has no rows.
outcomeData['HomeWin'] = outcomeData['Winner'].eq(outcomeData['Home'])

In [None]:
# Preview the first rows to check the new HomeWin column.
outcomeData.head()

In [None]:
# NOTE(review): disabled draft, kept for reference only. As written it would not
# work row-wise: it ignores its `dataframe` argument and compares whole columns of
# teamData and outcomeData inside `if`, which pandas cannot reduce to a single
# True/False. Looking up each side's PTS_PG by (Year, Team) — e.g. with a merge —
# is presumably what was intended; confirm before reviving.
# def pointDiff(dataframe):
    
#     if teamData['Year'] == outcomeData["Year"] and teamData['Team'] == outcomeData["Home"]:
#         homeScore = teamData['PTS_PG']
#     if teamData['Year'] == outcomeData["Year"] and teamData['Team'] == outcomeData["Visitor"]:
#         awayScore = teamData['PTS_PG']
        
#     pointDiff = homeScore-awayScore
    
#     return pointDiff

In [None]:
# outcomeData['pointDiff'] = outcomeData.apply(lambda x: teamData.loc[x['Year'] == teamData['Year'],
#                                                teamData.loc[x['Home'] == teamData["Team"], x["PTS_PG"] - 
#                                                             teamData.loc[x['Year'] == teamData['Year'],
#                                                teamData.loc[x['Visitor'] == teamData["Team"], x["PTS_PG"]].reset_index(drop=True), axis=1)



In [None]:
# outcomeData['pointDiff'] = outcomeData.apply(pointDiff, axis=1)

#### Encode team names

In [None]:
# Encode team names as integer codes, using one shared encoder for both columns.
# The encoder is fit on the names from Home AND Visitor: fitting on Home alone
# makes transform() raise on any team that only ever appears as a visitor.
# LabelEncoder sorts its classes, so the codes are the same as before whenever
# every team appears at least once in Home.
encoding = LabelEncoder()
encoding.fit(pd.concat([outcomeData['Home'], outcomeData['Visitor']]).values)
outcomeData['Home'] = encoding.transform(outcomeData['Home'].values)
outcomeData['Visitor'] = encoding.transform(outcomeData['Visitor'].values)

In [None]:
# Preview the first rows to confirm Home and Visitor now hold the encoded values.
outcomeData.head()

### Calculating a baseline by always predicting the home team will win

In [None]:
# target column (in this case, whether or not the home team will win)
y = outcomeData["HomeWin"].to_numpy()

# columns that will be used to make the prediction
X = outcomeData[["Visitor", "Home"]]

# set up scorer for testing accuracy
scorer = make_scorer(f1_score, pos_label = None, average = "weighted")

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=100,
)

#### Accuracy of only guessing home team

In [None]:
# Fraction of training games with a non-zero (True) target, i.e. the accuracy
# obtained by always predicting a home win.
basis_level = np.count_nonzero(y_train) / y_train.shape[0]
basis_level

#### F1 score of only guessing home team

In [None]:
# Predict label 1 (home win) for every training game, then score that constant
# prediction with the same weighted F1 used for the models.
y_pred = [1 for _ in y_train]
f1_score(y_train, y_pred, average="weighted", pos_label=None)

### Testing with additional variables

In [None]:
# target column we want to predict
y = outcomeData["Final Home Spread"].to_numpy()

# columns that will be used to make the prediction
X = outcomeData[["Visitor", "Home"]]

# set up scorer for testing accuracy
scorer = make_scorer(f1_score, pos_label = None, average = "weighted")

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=100,
)

In [None]:
# Seeded random forest, fitted once on the full training split.
clf = RandomForestClassifier(random_state=14).fit(X_train, y_train)
#clf.score(X_test, y_test)

# Cross-validated weighted-F1 scores on the training split.
# NOTE(review): cross_val_score fits its own copies of the estimator per fold, so
# the fit above should not influence these scores — confirm against sklearn docs.
scores = cross_val_score(clf, X_train, y_train, scoring=scorer)

In [None]:
# Average of the cross-validation fold scores.
scores.mean()