# NBA Games Predictor - Basic

This notebook takes all the regular season games from the NBA 2016-17 season.  Using a series of regression & supervised learning techniques, it attempts to accurately predict the outcome of basketball games.

In [156]:
import json 
import pandas as pd
import numpy as np 
import geopy.distance

#package for flattening json in pandas df
from pandas.io.json import json_normalize

In [157]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Load data from file system

In [158]:
#load games - json object
with open('../input/games.json') as gamesJsonRaw:
    d = json.load(gamesJsonRaw)

games = json_normalize(d['games'])

In [159]:
games.head(5)

Unnamed: 0,id,location,scoreAway,scoreHome,scoreQuarters,season,teamAwayCode,teamAwayId,teamHomeCode,teamHomeId,time
0,33941,Quicken Loans Arena,88,117,"[{'number': 1, 'scoreHome': 28, 'scoreAway': 1...",2016,NYK,83,CLE,86,2016-10-25T19:30:00.000Z
1,33942,Moda Center,104,113,"[{'number': 1, 'scoreHome': 26, 'scoreAway': 2...",2016,UTA,98,POR,97,2016-10-25T22:00:00.000Z
2,33943,Oracle Arena,129,100,"[{'number': 1, 'scoreHome': 20, 'scoreAway': 3...",2016,SAS,106,GSW,101,2016-10-25T22:30:00.000Z
3,33944,Amway Center,108,96,"[{'number': 1, 'scoreHome': 27, 'scoreAway': 2...",2016,MIA,92,ORL,95,2016-10-26T19:00:00.000Z
4,33945,Bankers Life Fieldhouse,121,130,"[{'number': 1, 'scoreHome': 33, 'scoreAway': 2...",2016,DAL,108,IND,87,2016-10-26T19:00:00.000Z


In [160]:
#load stadiums - json object
with open('../input/stadiums.json') as stadiumsJsonRaw:
    stadiumsJson = json.load(stadiumsJsonRaw)

stadiums = json_normalize(stadiumsJson['stadiums'])

In [161]:
stadiums.head(5)

Unnamed: 0,lat,lng,team
0,33.757183,-84.396278,ATL
1,42.366281,-71.062266,BOS
2,40.68265,-73.974689,BRO
3,35.224519,-80.841053,CHA
4,41.880589,-87.674149,CHI


### Calculate the number of games played for season

In [162]:
# Create a dataframe for the results - same size as dataset
data = pd.DataFrame(index=range(0,len(games)), columns=['gamesPlayedHome', 'gamesPlayedAway'])

# Iterate through every team
for team in games.teamAwayCode.unique():
    gameCount = 0
    homeGameCount = 0;
    
    # Iterate through each game the team is present in.
    for index, game in games[(games['teamAwayCode'] == team) | (games['teamHomeCode'] == team)].iterrows():
        gameCount += 1
        
        # Update game count for team - whether away or home.
        if game.teamAwayCode == team:
            data.loc[index]['gamesPlayedAway'] = gameCount
        else:
            data.loc[index]['gamesPlayedHome'] = gameCount

# Append the results to the dataset
games = pd.merge(games, data, left_index=True, right_index=True)

In [163]:
games[['teamHomeCode', 'teamAwayCode', 'gamesPlayedHome', 'gamesPlayedAway']].sample(5)

Unnamed: 0,teamHomeCode,teamAwayCode,gamesPlayedHome,gamesPlayedAway
505,MIA,DET,35,36
99,CLE,ATL,7,7
847,DET,CHA,58,57
120,PHI,IND,8,9
269,MIN,NYK,18,18


### Calculate the results for the last 5 games for the Home team.

In [164]:
# Create a dataframe for the results - same size as dataset
data = pd.DataFrame(index=range(0,len(games)), columns=['lastGame1WinHome', 'lastGame1AtHomeHome', 'lastGame2WinHome', 'lastGame2AtHomeHome', 'lastGame3WinHome', 'lastGame3AtHomeHome', 'lastGame4WinHome', 'lastGame4AtHomeHome', 'lastGame5WinHome', 'lastGame5AtHomeHome'])

# Iterate through each game where the teams have played at least 5 games.
for index, game in games.iterrows():
    
    # Get the last five games for the team.
    last5games = games[(games['gamesPlayedHome'] < game.gamesPlayedHome) & ((games['teamAwayCode'] == game.teamHomeCode) | (games['teamHomeCode'] == game.teamHomeCode))].sort_values(by='gamesPlayedHome', ascending=False).head(5)

    if len(last5games) == 5:
        lastGame1WinHome = last5games["scoreAway"].iloc[0] < last5games["scoreHome"].iloc[0]
        lastGame2WinHome = last5games["scoreAway"].iloc[1] < last5games["scoreHome"].iloc[1]
        lastGame3WinHome = last5games["scoreAway"].iloc[2] < last5games["scoreHome"].iloc[2]
        lastGame4WinHome = last5games["scoreAway"].iloc[3] < last5games["scoreHome"].iloc[3]
        lastGame5WinHome = last5games["scoreAway"].iloc[4] < last5games["scoreHome"].iloc[4]
        #print(lastGame1WinHome, lastGame2WinHome, lastGame3WinHome, lastGame4WinHome, lastGame5WinHome)
        
        lastGame1AtHomeHome = last5games["teamHomeCode"].iloc[0] == game.teamHomeCode
        lastGame2AtHomeHome = last5games["teamHomeCode"].iloc[1] == game.teamHomeCode
        lastGame3AtHomeHome = last5games["teamHomeCode"].iloc[2] == game.teamHomeCode
        lastGame4AtHomeHome = last5games["teamHomeCode"].iloc[3] == game.teamHomeCode
        lastGame5AtHomeHome = last5games["teamHomeCode"].iloc[4] == game.teamHomeCode
        #print(lastGame1AtHomeHome, lastGame2AtHomeHome, lastGame3AtHomeHome, lastGame4AtHomeHome, lastGame5AtHomeHome)
        
        # Update the row with the history
        data.loc[index] = [lastGame1WinHome, lastGame1AtHomeHome, lastGame2WinHome, lastGame2AtHomeHome, lastGame3WinHome, lastGame3AtHomeHome, lastGame4WinHome, lastGame4AtHomeHome, lastGame5WinHome, lastGame5AtHomeHome]

# Convert types 
data['lastGame1WinHome'] = data['lastGame1WinHome'].astype('bool')
data['lastGame2WinHome'] = data['lastGame2WinHome'].astype('bool')
data['lastGame3WinHome'] = data['lastGame3WinHome'].astype('bool')
data['lastGame4WinHome'] = data['lastGame4WinHome'].astype('bool')
data['lastGame5WinHome'] = data['lastGame5WinHome'].astype('bool')
data['lastGame1AtHomeHome'] = data['lastGame1AtHomeHome'].astype('bool')
data['lastGame2AtHomeHome'] = data['lastGame2AtHomeHome'].astype('bool')
data['lastGame3AtHomeHome'] = data['lastGame3AtHomeHome'].astype('bool')
data['lastGame4AtHomeHome'] = data['lastGame4AtHomeHome'].astype('bool')
data['lastGame5AtHomeHome'] = data['lastGame5AtHomeHome'].astype('bool')

# Add results to the dataset
games = pd.merge(games, data, left_index=True, right_index=True)

In [165]:
games[['teamHomeCode', 'teamAwayCode', 'lastGame1WinHome', 'lastGame1AtHomeHome', 'lastGame2WinHome', 'lastGame2AtHomeHome', 'lastGame3WinHome', 'lastGame3AtHomeHome', 'lastGame4WinHome', 'lastGame4AtHomeHome', 'lastGame5WinHome', 'lastGame5AtHomeHome']].sample(5)

Unnamed: 0,teamHomeCode,teamAwayCode,lastGame1WinHome,lastGame1AtHomeHome,lastGame2WinHome,lastGame2AtHomeHome,lastGame3WinHome,lastGame3AtHomeHome,lastGame4WinHome,lastGame4AtHomeHome,lastGame5WinHome,lastGame5AtHomeHome
260,BRO,LAC,False,True,False,True,True,False,False,True,True,False
523,PHX,MIA,True,False,True,True,True,False,True,False,True,True
606,SAC,OKL,False,True,True,True,False,True,False,True,False,True
691,IND,SAC,True,False,False,True,False,False,True,True,True,False
459,ORL,MEM,True,True,True,False,False,False,False,True,True,True


### Calculate the results for the last 5 games for the Away team.

In [166]:
# Create a dataframe for the results - same size as dataset
data = pd.DataFrame(index=range(0,len(games)), columns=['lastGame1WinAway', 'lastGame1AtHomeAway', 'lastGame2WinAway', 'lastGame2AtHomeAway', 'lastGame3WinAway', 'lastGame3AtHomeAway', 'lastGame4WinAway', 'lastGame4AtHomeAway', 'lastGame5WinAway', 'lastGame5AtHomeAway'])

# Iterate through each game where the teams have played at least 5 games.
for index, game in games.iterrows():
    
    # Get the last five games for the team.
    last5games = games[(games['gamesPlayedAway'] < game.gamesPlayedAway) & ((games['teamAwayCode'] == game.teamAwayCode) | (games['teamHomeCode'] == game.teamAwayCode))].sort_values(by='gamesPlayedAway', ascending=False).head(5)

    if len(last5games) == 5:
        lastGame1WinAway = last5games["scoreAway"].iloc[0] < last5games["scoreHome"].iloc[0]
        lastGame2WinAway = last5games["scoreAway"].iloc[1] < last5games["scoreHome"].iloc[1]
        lastGame3WinAway = last5games["scoreAway"].iloc[2] < last5games["scoreHome"].iloc[2]
        lastGame4WinAway = last5games["scoreAway"].iloc[3] < last5games["scoreHome"].iloc[3]
        lastGame5WinAway = last5games["scoreAway"].iloc[4] < last5games["scoreHome"].iloc[4]
        #print(lastGame1WinAway, lastGame2WinAway, lastGame3WinAway, lastGame4WinAway, lastGame5WinAway)
        
        lastGame1AtHomeAway = last5games["teamAwayCode"].iloc[0] == game.teamAwayCode
        lastGame2AtHomeAway = last5games["teamAwayCode"].iloc[1] == game.teamAwayCode
        lastGame3AtHomeAway = last5games["teamAwayCode"].iloc[2] == game.teamAwayCode
        lastGame4AtHomeAway = last5games["teamAwayCode"].iloc[3] == game.teamAwayCode
        lastGame5AtHomeAway = last5games["teamAwayCode"].iloc[4] == game.teamAwayCode
        #print(lastGame1AtHomeAway, lastGame2AtHomeAway, lastGame3AtHomeAway, lastGame4AtHomeAway, lastGame5AtHomeAway)

        # Update the row with the history
        data.loc[index] = [lastGame1WinAway, lastGame1AtHomeAway, lastGame2WinAway, lastGame2AtHomeAway, lastGame3WinAway, lastGame3AtHomeAway, lastGame4WinAway, lastGame4AtHomeAway, lastGame5WinAway, lastGame5AtHomeAway]

# Convert types         
data['lastGame1WinAway'] = data['lastGame1WinAway'].astype('bool')
data['lastGame2WinAway'] = data['lastGame2WinAway'].astype('bool')
data['lastGame3WinAway'] = data['lastGame3WinAway'].astype('bool')
data['lastGame4WinAway'] = data['lastGame4WinAway'].astype('bool')
data['lastGame5WinAway'] = data['lastGame5WinAway'].astype('bool')
data['lastGame1AtHomeAway'] = data['lastGame1AtHomeAway'].astype('bool')
data['lastGame2AtHomeAway'] = data['lastGame2AtHomeAway'].astype('bool')
data['lastGame3AtHomeAway'] = data['lastGame3AtHomeAway'].astype('bool')
data['lastGame4AtHomeAway'] = data['lastGame4AtHomeAway'].astype('bool')
data['lastGame5AtHomeAway'] = data['lastGame5AtHomeAway'].astype('bool')

# Add results to the dataset
games = pd.merge(games, data, left_index=True, right_index=True)

In [167]:
games[['teamHomeCode', 'teamAwayCode', 'lastGame1WinHome', 'lastGame1AtHomeHome', 'lastGame2WinHome', 'lastGame2AtHomeHome', 'lastGame3WinHome', 'lastGame3AtHomeHome', 'lastGame4WinHome', 'lastGame4AtHomeHome', 'lastGame5WinHome', 'lastGame5AtHomeHome']].sample(5)

Unnamed: 0,teamHomeCode,teamAwayCode,lastGame1WinHome,lastGame1AtHomeHome,lastGame2WinHome,lastGame2AtHomeHome,lastGame3WinHome,lastGame3AtHomeHome,lastGame4WinHome,lastGame4AtHomeHome,lastGame5WinHome,lastGame5AtHomeHome
631,SAC,IND,False,True,False,True,True,True,False,True,False,True
141,MIN,LAL,False,False,False,True,True,False,True,False,False,True
886,DET,POR,False,True,True,True,True,True,False,False,True,False
923,ATL,IND,False,True,True,True,False,False,True,False,False,True
638,CHA,TOR,True,True,True,False,True,False,True,False,True,False


In [168]:
# test entry for stadium distance calc

coords_1 = (37.750267, -122.202853)
coords_2 = (34.04303865743706, -118.26711416244507)

geopy.distance.vincenty(coords_1, coords_2).miles


337.6783477035334

In [169]:
games.dtypes

id                      int64
location               object
scoreAway               int64
scoreHome               int64
scoreQuarters          object
season                  int64
teamAwayCode           object
teamAwayId              int64
teamHomeCode           object
teamHomeId              int64
time                   object
gamesPlayedHome        object
gamesPlayedAway        object
lastGame1WinHome         bool
lastGame1AtHomeHome      bool
lastGame2WinHome         bool
lastGame2AtHomeHome      bool
lastGame3WinHome         bool
lastGame3AtHomeHome      bool
lastGame4WinHome         bool
lastGame4AtHomeHome      bool
lastGame5WinHome         bool
lastGame5AtHomeHome      bool
lastGame1WinAway         bool
lastGame1AtHomeAway      bool
lastGame2WinAway         bool
lastGame2AtHomeAway      bool
lastGame3WinAway         bool
lastGame3AtHomeAway      bool
lastGame4WinAway         bool
lastGame4AtHomeAway      bool
lastGame5WinAway         bool
lastGame5AtHomeAway      bool
dtype: obj

## Data Cleanse
Transform the Time column into a datetime object.

In [170]:
games["time"] = pd.to_datetime(games["time"])

### Convert the home and away teams to one-hot encoding 

In [171]:
homeTeam = pd.get_dummies(games["teamHomeCode"], prefix='team')
awayTeam = pd.get_dummies(games["teamAwayCode"], prefix='team')

homeTeam = homeTeam.replace({0:np.nan})
awayTeam = awayTeam.replace({0:np.nan})

teams = homeTeam.fillna(awayTeam).fillna(0).astype('bool')
# teams[['team_NYK', 'team_CLE', 'team_ATL', 'team_BOS']].head(2)

In [172]:
homeTeam = pd.get_dummies(games["teamHomeCode"], prefix='hometeam').astype('bool')
awayTeam = pd.get_dummies(games["teamAwayCode"], prefix='awayteam').astype('bool')

games = pd.concat([games, teams, homeTeam, awayTeam], axis=1)

In [173]:
games[['teamHomeCode', 'teamAwayCode', 'team_NYK', 'team_CLE', 'hometeam_NYK', 'hometeam_CLE', 'awayteam_NYK', 'awayteam_CLE']].head(2)

Unnamed: 0,teamHomeCode,teamAwayCode,team_NYK,team_CLE,hometeam_NYK,hometeam_CLE,awayteam_NYK,awayteam_CLE
0,CLE,NYK,True,True,False,True,True,False
1,POR,UTA,False,False,False,False,False,False


Drop the id fields for teams.

In [175]:
games = games.drop(["teamAwayId", "teamAwayCode", "teamHomeId", "teamHomeCode"], axis=1)

In [176]:
games = games.drop(["location", "scoreQuarters", "time"], axis=1)

In [15]:
#gamesQuarters = json_normalize(data=d['games'], record_path='scoreQuarters', meta=['id'])
#gamesQuarters.head(10)

In [177]:
games["homeWin"] = games["scoreAway"] > games["scoreHome"]

In [178]:
games = games.drop(["scoreAway", "scoreHome"], axis=1)

In [179]:
games = games[(pd.notnull(games['lastGame1WinHome'])) & (pd.notnull(games['lastGame1AtHomeAway']))]

In [180]:
#games.isnull().any(axis=1)
games.head(3)

Unnamed: 0,id,season,gamesPlayedHome,gamesPlayedAway,lastGame1WinHome,lastGame1AtHomeHome,lastGame2WinHome,lastGame2AtHomeHome,lastGame3WinHome,lastGame3AtHomeHome,...,awayteam_ORL,awayteam_PHI,awayteam_PHX,awayteam_POR,awayteam_SAC,awayteam_SAS,awayteam_TOR,awayteam_UTA,awayteam_WAS,homeWin
0,33941,2016,1,1,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,33942,2016,1,1,True,True,True,True,True,True,...,False,False,False,False,False,False,False,True,False,False
2,33943,2016,1,1,True,True,True,True,True,True,...,False,False,False,False,False,True,False,False,False,True


In [181]:
msk = np.random.rand(len(games)) < 0.8
msk

train = games[msk]
test = games[~msk]

X_train = train.drop("homeWin", axis=1)
Y_train = train["homeWin"]

#
X_test  = test.drop("id", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((987, 114), (987,), (243, 114))

### Logistic Regression

In [182]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

60.490000000000002

In [183]:
coeff_df = pd.DataFrame(games.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
16,lastGame2AtHomeAway,0.115813
2,gamesPlayedAway,0.111254
93,awayteam_HOU,0.096876
110,awayteam_TOR,0.094582
56,hometeam_CHA,0.073564
10,lastGame4AtHomeHome,0.073015
94,awayteam_IND,0.066923
14,lastGame1AtHomeAway,0.049975
73,hometeam_OKL,0.048296
111,awayteam_UTA,0.047682


### Support Vector Machines

In [184]:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

69.810000000000002

### k-Nearest Neighbors

In [185]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

77.510000000000005

### Gaussian Naive Bayes

In [186]:
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

65.049999999999997

### Perceptron

In [187]:
perceptron = Perceptron(max_iter = 5)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

58.049999999999997

### Linear SVC

In [188]:
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

41.950000000000003

### Stochastic Gradient Descent

In [189]:
sgd = SGDClassifier(max_iter = 5)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

41.950000000000003

### Decision Tree

In [190]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

100.0

### Random Forest

In [191]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

100.0

## Model Evaluation Results

In [192]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,100.0
8,Decision Tree,100.0
1,KNN,77.51
0,Support Vector Machines,69.81
4,Naive Bayes,65.05
2,Logistic Regression,60.49
5,Perceptron,58.05
6,Stochastic Gradient Decent,41.95
7,Linear SVC,41.95


In [193]:
len(Y_pred)

243