# NBA Games Predictor - Basic

This notebook takes all the regular season games from the NBA 2016-17 season.  Using a series of regression & supervised learning techniques, it attempts to accurately predict the outcome of basketball games.

In [1]:
import json 
import pandas as pd
import numpy as np 
import geopy.distance

#package for flattening json in pandas df
from pandas.io.json import json_normalize

In [2]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Configuration

In [3]:
use_parsed_data = False
output_parsed_data_filename = '../input/parsed_data.csv'

calc_last_5_games = True
calc_win_loss = True
calc_win_loss_write_all = True
calc_ratings = True
use_team_keys = False
use_team_hot_encoding = False
use_team_home_away_hot_encoding = False

## Load data from file system

In [4]:
#load games - json object
if use_parsed_data == True:
    games = pd.read_csv(output_parsed_data_filename)
    games = games.drop(['Unnamed: 0'], axis=1)
else:
    with open('../input/games.json') as gamesJsonRaw:
        d = json.load(gamesJsonRaw)

    games = json_normalize(d['games'])

In [5]:
games.sample(5)

Unnamed: 0,id,location,oddsBet365Away,oddsBet365Home,scoreAway,scoreHome,scoreQuarters,season,statsAway.Ast,statsAway.Blk,...,statsHome.Reb,statsHome.Stl,statsHome.Tov,statsHome.WinPct,statsHome.Wins,teamAwayCode,teamAwayId,teamHomeCode,teamHomeId,time
2045,34759,Sleep Train Arena,2.4,1.62,99,105,"[{'number': 1, 'scoreHome': 23, 'scoreAway': 2...",2016,23,2,...,54,11,19,1.0,1,NOP,110,SAC,103,2017-02-12T21:00:00.000Z
1920,34634,Wells Fargo Center,1.25,4.1,123,118,"[{'number': 1, 'scoreHome': 28, 'scoreAway': 3...",2016,25,4,...,38,14,22,0.0,0,HOU,109,PHI,85,2017-01-27T20:00:00.000Z
695,31872,American Airlines Center,5.0,1.19,79,91,"[{'number': 1, 'scoreHome': 25, 'scoreAway': 1...",2015,13,2,...,55,6,12,0.0,1,BRO,84,DAL,108,2016-01-29T20:30:00.000Z
166,31342,Time Warner Cable Arena,3.4,1.33,111,116,"[{'number': 1, 'scoreHome': 32, 'scoreAway': 3...",2015,25,4,...,58,5,15,0.0,1,BRO,84,CHA,93,2015-11-18T19:00:00.000Z
984,32161,Sleep Train Arena,1.74,2.15,108,99,"[{'number': 1, 'scoreHome': 18, 'scoreAway': 3...",2015,26,3,...,42,8,10,0.0,0,UTA,98,SAC,103,2016-03-13T18:00:00.000Z


In [6]:
#load stadiums - json object
with open('../input/stadiums.json') as stadiumsJsonRaw:
    stadiumsJson = json.load(stadiumsJsonRaw)

stadiums = json_normalize(stadiumsJson['stadiums'])

In [7]:
stadiums.head(5)

Unnamed: 0,lat,lng,team
0,33.757183,-84.396278,ATL
1,42.366281,-71.062266,BOS
2,40.68265,-73.974689,BRO
3,35.224519,-80.841053,CHA
4,41.880589,-87.674149,CHI


   ## Data Preperation

### Record the names of all stats - to be dropped later.

In [8]:
statsColumns = pd.Series(list(games.columns.values))
statsColumns = statsColumns.loc[statsColumns.str.startswith(('statsAway', 'statsHome'), na=False)]

In [9]:
# Replace any NaN with 0 in the stats
games[statsColumns] = games[statsColumns].fillna(value=0)

### Set Valid Data Types

In [10]:
#games['season'] = games['season'].astype('float')
games['scoreAway'] = games['scoreAway'].astype('uint8')
games['scoreHome'] = games['scoreHome'].astype('uint8')
games['teamAwayId'] = games['teamAwayId'].astype('uint8')
games['teamHomeId'] = games['teamHomeId'].astype('uint8')

games['statsHome.FgAtt'] = games['statsHome.FgAtt'].astype('uint8')
games['statsHome.OffReb'] = games['statsHome.OffReb'].astype('uint8')
games['statsHome.Tov'] = games['statsHome.Tov'].astype('uint8')
games['statsHome.FtAtt'] = games['statsHome.FtAtt'].astype('uint8')
games['statsHome.Pts'] = games['statsHome.Pts'].astype('uint8')

games['statsAway.FgAtt'] = games['statsAway.FgAtt'].astype('uint8')
games['statsAway.OffReb'] = games['statsAway.OffReb'].astype('uint8')
games['statsAway.Tov'] = games['statsAway.Tov'].astype('uint8')
games['statsAway.FtAtt'] = games['statsAway.FtAtt'].astype('uint8')
games['statsAway.Pts'] = games['statsAway.Pts'].astype('uint8')

#scoreQuarters          object
#teamAwayCode           object
#teamHomeCode           object

### Calculate the number of games played for season

In [11]:
if (use_parsed_data == False):

    # Create a dataframe for the results - same size as dataset
    data = pd.DataFrame(index=range(0,len(games)), columns=['gamesPlayedHome', 'gamesPlayedAway'])

    # Iterate through every team
    for team in games.teamAwayCode.unique():
        season = 0

        # Iterate through each game the team is present in.
        for index, game in games[(games['teamAwayCode'] == team) | (games['teamHomeCode'] == team)].sort_values(by='season', ascending=True).iterrows():
            if season != game.season:
                gameCount = 0
                homeGameCount = 0
                season = game.season

            gameCount += 1

            # Update game count for team - whether away or home.
            if game.teamAwayCode == team:
                data.loc[index]['gamesPlayedAway'] = gameCount
            else:
                data.loc[index]['gamesPlayedHome'] = gameCount

    data['gamesPlayedAway'] = data['gamesPlayedAway'].astype('uint8')
    data['gamesPlayedHome'] = data['gamesPlayedHome'].astype('uint8')

    # Append the results to the dataset
    games = pd.merge(games, data, left_index=True, right_index=True)

In [12]:
games[['teamHomeCode', 'teamAwayCode', 'season', 'gamesPlayedHome', 'gamesPlayedAway']].sample(5)

Unnamed: 0,teamHomeCode,teamAwayCode,season,gamesPlayedHome,gamesPlayedAway
1646,CHA,LAL,2016,73,71
2138,MIL,LAC,2016,18,18
601,DET,GSW,2015,43,50
1629,CLE,LAL,2016,77,72
1306,UTA,SAS,2016,53,53


## Calculate the Win/Loss % for the Home team.

In [13]:
def calc_percentage(x, y):
    return 0 if y == 0 else (x / y)

In [14]:
if (use_parsed_data == False) & (calc_win_loss == True):
    
    # Create a dataframe for the results - same size as dataset
    data = pd.DataFrame(
        index=range(0,len(games)), 
        columns=[
            'totalGamesHome', 'totalWinsHome', 'homeGamesHome', 'homeWinsHome', 'awayGamesHome', 'awayWinsHome',
            'percentageTotalWinHome', 'percentageHomeWinHome', 'percentageAwayWinHome',
            'totalGamesAway', 'totalWinsAway', 'homeGamesAway', 'homeWinsAway', 'awayGamesAway', 'awayWinsAway',
            'percentageTotalWinAway', 'percentageHomeWinAway', 'percentageAwayWinAway'
        ]
    )

    # Iterate through each game.
    for index, game in games.iterrows():

        # Home Team
        # ---------
        
        historicGames = games[(games['season'] == game.season) & (games['gamesPlayedHome'] < game.gamesPlayedHome) & ((games['teamHomeCode'] == game.teamHomeCode) | (games['teamAwayCode'] == game.teamHomeCode))].sort_values(by='gamesPlayedHome', ascending=True)
       
        homeGamesHome = len(historicGames[(historicGames['teamHomeCode'] == game['teamHomeCode'])])        
        homeWinsHome = len(historicGames[(historicGames['teamHomeCode'] == game['teamHomeCode']) & (games["scoreAway"] < games["scoreHome"])])
        
        awayGamesHome = len(historicGames[(historicGames['teamAwayCode'] == game['teamHomeCode'])])        
        awayWinsHome = len(historicGames[(historicGames['teamAwayCode'] == game['teamHomeCode']) & (games["scoreAway"] > games["scoreHome"])])

        #totalGames = len(historicGames);
        #totalWins = len(
        #    historicGames[
        #        (historicGames['teamHomeCode'] == game['teamHomeCode']) & (games["scoreAway"] < games["scoreHome"]) |
        #        (historicGames['teamAwayCode'] == game['teamHomeCode']) & (games["scoreAway"] > games["scoreHome"])
        #    ]
        #)
        
        totalGamesHome = homeGamesHome + awayGamesHome
        totalWinsHome = homeWinsHome + awayWinsHome
        
        percentageTotalWinHome = round(calc_percentage(totalWinsHome, totalGamesHome), 3)
        percentageHomeWinHome = round(calc_percentage(homeWinsHome, homeGamesHome), 3)
        percentageAwayWinHome = round(calc_percentage(awayWinsHome, awayGamesHome), 3)
        
        # print('HOME', game['gamesPlayedHome'], 'TG', totalGamesHome, 'W', totalWinsHome, 'HGP', homeGamesHome, 'HW', homeWinsHome, 'AGP', awayGamesHome, 'AW', awayWinsHome)
        # print('HOME', 'W%', percentageTotalWinHome, 'HW%', percentageHomeWinHome, 'AW%', percentageAwayWinHome)
        
        # Away Team
        # ---------
        
        historicGames = games[(games['season'] == game.season) & (games['gamesPlayedAway'] < game.gamesPlayedAway) & ((games['teamAwayCode'] == game.teamAwayCode) | (games['teamHomeCode'] == game.teamAwayCode))].sort_values(by='gamesPlayedAway', ascending=True)
       
        homeGamesAway = len(historicGames[(historicGames['teamHomeCode'] == game['teamAwayCode'])])        
        homeWinsAway = len(historicGames[(historicGames['teamHomeCode'] == game['teamAwayCode']) & (games["scoreAway"] < games["scoreHome"])])
        
        awayGamesAway = len(historicGames[(historicGames['teamAwayCode'] == game['teamAwayCode'])])        
        awayWinsAway = len(historicGames[(historicGames['teamAwayCode'] == game['teamAwayCode']) & (games["scoreAway"] > games["scoreHome"])])

        #totalGames = len(historicGames);
        #totalWins = len(
        #    historicGames[
        #        (historicGames['teamHomeCode'] == game['teamHomeCode']) & (games["scoreAway"] < games["scoreHome"]) |
        #        (historicGames['teamAwayCode'] == game['teamHomeCode']) & (games["scoreAway"] > games["scoreHome"])
        #    ]
        #)
        
        totalGamesAway = homeGamesAway + awayGamesAway
        totalWinsAway = homeWinsAway + awayWinsAway
        
        percentageTotalWinAway = round(calc_percentage(totalWinsAway, totalGamesAway), 3)
        percentageHomeWinAway = round(calc_percentage(homeWinsAway, homeGamesAway), 3)
        percentageAwayWinAway = round(calc_percentage(awayWinsAway, awayGamesAway), 3)
        
        # print('AWAY', game['gamesPlayedAway'], 'TG', totalGamesAway, 'W', totalWinsAway, 'HGP', homeGamesAway, 'HW', homeWinsAway, 'AGP', awayGamesAway, 'AW', awayWinsAway)
        # print('AWAY', 'W%', percentageTotalWinAway, 'HW%', percentageHomeWinAway, 'AW%', percentageAwayWinAway)
        
        # ---------    
            
        # Update the row with the history
        data.loc[index] = [
            totalGamesHome, totalWinsHome, homeGamesHome, homeWinsHome, awayGamesHome, awayWinsHome,
            percentageTotalWinHome, percentageHomeWinHome, percentageAwayWinHome,
            totalGamesAway, totalWinsAway, homeGamesAway, homeWinsAway, awayGamesAway, awayWinsAway,
            percentageTotalWinAway, percentageHomeWinAway, percentageAwayWinAway
        ]

    # Add results to the dataset
    if (calc_win_loss_write_all == True):
        games = pd.merge(games, data, left_index=True, right_index=True)
        games[['teamHomeCode', 'teamAwayCode', 'percentageTotalWinHome', 'percentageHomeWinHome', 'percentageAwayWinHome', 'percentageTotalWinAway', 'percentageHomeWinAway', 'percentageAwayWinAway']].sample(5)
    else:
        games = pd.merge(games, data[['percentageHomeWinHome', 'percentageAwayWinAway']], left_index=True, right_index=True)
        games[['teamHomeCode', 'teamAwayCode', 'percentageHomeWinHome', 'percentageAwayWinAway']].sample(5)



### Calculate the results for the last 5 games for the Home team.

In [15]:
if (use_parsed_data == False) & (calc_last_5_games == True):
    
    # Create a dataframe for the results - same size as dataset
    data = pd.DataFrame(index=range(0,len(games)), columns=['lastGame1WinHome', 'lastGame1AtHomeHome', 'lastGame2WinHome', 'lastGame2AtHomeHome', 'lastGame3WinHome', 'lastGame3AtHomeHome', 'lastGame4WinHome', 'lastGame4AtHomeHome', 'lastGame5WinHome', 'lastGame5AtHomeHome'])

    # Iterate through each game where the teams have played at least 5 games.
    for index, game in games.iterrows():

        # Get the last five games for the team.
        last5games = games[(games['season'] == game.season) & (games['gamesPlayedHome'] < game.gamesPlayedHome) & ((games['teamAwayCode'] == game.teamHomeCode) | (games['teamHomeCode'] == game.teamHomeCode))].sort_values(by='gamesPlayedHome', ascending=False).head(5)

        if len(last5games) == 5:
            lastGame1WinHome = last5games["scoreAway"].iloc[0] < last5games["scoreHome"].iloc[0]
            lastGame2WinHome = last5games["scoreAway"].iloc[1] < last5games["scoreHome"].iloc[1]
            lastGame3WinHome = last5games["scoreAway"].iloc[2] < last5games["scoreHome"].iloc[2]
            lastGame4WinHome = last5games["scoreAway"].iloc[3] < last5games["scoreHome"].iloc[3]
            lastGame5WinHome = last5games["scoreAway"].iloc[4] < last5games["scoreHome"].iloc[4]
            #print(lastGame1WinHome, lastGame2WinHome, lastGame3WinHome, lastGame4WinHome, lastGame5WinHome)

            lastGame1AtHomeHome = last5games["teamHomeCode"].iloc[0] == game.teamHomeCode
            lastGame2AtHomeHome = last5games["teamHomeCode"].iloc[1] == game.teamHomeCode
            lastGame3AtHomeHome = last5games["teamHomeCode"].iloc[2] == game.teamHomeCode
            lastGame4AtHomeHome = last5games["teamHomeCode"].iloc[3] == game.teamHomeCode
            lastGame5AtHomeHome = last5games["teamHomeCode"].iloc[4] == game.teamHomeCode
            #print(lastGame1AtHomeHome, lastGame2AtHomeHome, lastGame3AtHomeHome, lastGame4AtHomeHome, lastGame5AtHomeHome)

            # Update the row with the history
            data.loc[index] = [lastGame1WinHome, lastGame1AtHomeHome, lastGame2WinHome, lastGame2AtHomeHome, lastGame3WinHome, lastGame3AtHomeHome, lastGame4WinHome, lastGame4AtHomeHome, lastGame5WinHome, lastGame5AtHomeHome]

    # Convert types 
    data['lastGame1WinHome'] = data['lastGame1WinHome'].astype('bool')
    data['lastGame2WinHome'] = data['lastGame2WinHome'].astype('bool')
    data['lastGame3WinHome'] = data['lastGame3WinHome'].astype('bool')
    data['lastGame4WinHome'] = data['lastGame4WinHome'].astype('bool')
    data['lastGame5WinHome'] = data['lastGame5WinHome'].astype('bool')
    data['lastGame1AtHomeHome'] = data['lastGame1AtHomeHome'].astype('bool')
    data['lastGame2AtHomeHome'] = data['lastGame2AtHomeHome'].astype('bool')
    data['lastGame3AtHomeHome'] = data['lastGame3AtHomeHome'].astype('bool')
    data['lastGame4AtHomeHome'] = data['lastGame4AtHomeHome'].astype('bool')
    data['lastGame5AtHomeHome'] = data['lastGame5AtHomeHome'].astype('bool')

    # Add results to the dataset
    games = pd.merge(games, data, left_index=True, right_index=True)
    
    games[['teamHomeCode', 'teamAwayCode', 'lastGame1WinHome', 'lastGame1AtHomeHome', 'lastGame2WinHome', 'lastGame2AtHomeHome', 'lastGame3WinHome', 'lastGame3AtHomeHome', 'lastGame4WinHome', 'lastGame4AtHomeHome', 'lastGame5WinHome', 'lastGame5AtHomeHome']].sample(5)

### Calculate the results for the last 5 games for the Away team.

In [16]:
if (use_parsed_data == False) & (calc_last_5_games == True):
    
    # Create a dataframe for the results - same size as dataset
    data = pd.DataFrame(index=range(0,len(games)), columns=['lastGame1WinAway', 'lastGame1AtHomeAway', 'lastGame2WinAway', 'lastGame2AtHomeAway', 'lastGame3WinAway', 'lastGame3AtHomeAway', 'lastGame4WinAway', 'lastGame4AtHomeAway', 'lastGame5WinAway', 'lastGame5AtHomeAway'])

    # Iterate through each game where the teams have played at least 5 games.
    for index, game in games.iterrows():

        # Get the last five games for the team.
        last5games = games[(games['season'] == game.season) & (games['gamesPlayedAway'] < game.gamesPlayedAway) & ((games['teamAwayCode'] == game.teamAwayCode) | (games['teamHomeCode'] == game.teamAwayCode))].sort_values(by='gamesPlayedAway', ascending=False).head(5)

        if len(last5games) == 5:
            lastGame1WinAway = last5games["scoreAway"].iloc[0] < last5games["scoreHome"].iloc[0]
            lastGame2WinAway = last5games["scoreAway"].iloc[1] < last5games["scoreHome"].iloc[1]
            lastGame3WinAway = last5games["scoreAway"].iloc[2] < last5games["scoreHome"].iloc[2]
            lastGame4WinAway = last5games["scoreAway"].iloc[3] < last5games["scoreHome"].iloc[3]
            lastGame5WinAway = last5games["scoreAway"].iloc[4] < last5games["scoreHome"].iloc[4]
            #print(lastGame1WinAway, lastGame2WinAway, lastGame3WinAway, lastGame4WinAway, lastGame5WinAway)

            lastGame1AtHomeAway = last5games["teamAwayCode"].iloc[0] == game.teamAwayCode
            lastGame2AtHomeAway = last5games["teamAwayCode"].iloc[1] == game.teamAwayCode
            lastGame3AtHomeAway = last5games["teamAwayCode"].iloc[2] == game.teamAwayCode
            lastGame4AtHomeAway = last5games["teamAwayCode"].iloc[3] == game.teamAwayCode
            lastGame5AtHomeAway = last5games["teamAwayCode"].iloc[4] == game.teamAwayCode
            #print(lastGame1AtHomeAway, lastGame2AtHomeAway, lastGame3AtHomeAway, lastGame4AtHomeAway, lastGame5AtHomeAway)

            # Update the row with the history
            data.loc[index] = [lastGame1WinAway, lastGame1AtHomeAway, lastGame2WinAway, lastGame2AtHomeAway, lastGame3WinAway, lastGame3AtHomeAway, lastGame4WinAway, lastGame4AtHomeAway, lastGame5WinAway, lastGame5AtHomeAway]

    # Convert types         
    data['lastGame1WinAway'] = data['lastGame1WinAway'].astype('bool')
    data['lastGame2WinAway'] = data['lastGame2WinAway'].astype('bool')
    data['lastGame3WinAway'] = data['lastGame3WinAway'].astype('bool')
    data['lastGame4WinAway'] = data['lastGame4WinAway'].astype('bool')
    data['lastGame5WinAway'] = data['lastGame5WinAway'].astype('bool')
    data['lastGame1AtHomeAway'] = data['lastGame1AtHomeAway'].astype('bool')
    data['lastGame2AtHomeAway'] = data['lastGame2AtHomeAway'].astype('bool')
    data['lastGame3AtHomeAway'] = data['lastGame3AtHomeAway'].astype('bool')
    data['lastGame4AtHomeAway'] = data['lastGame4AtHomeAway'].astype('bool')
    data['lastGame5AtHomeAway'] = data['lastGame5AtHomeAway'].astype('bool')

    # Add results to the dataset
    games = pd.merge(games, data, left_index=True, right_index=True)
    
    games[['teamHomeCode', 'teamAwayCode', 'lastGame1WinHome', 'lastGame1AtHomeHome', 'lastGame2WinHome', 'lastGame2AtHomeHome', 'lastGame3WinHome', 'lastGame3AtHomeHome', 'lastGame4WinHome', 'lastGame4AtHomeHome', 'lastGame5WinHome', 'lastGame5AtHomeHome']].sample(5)

### Calculate the average shooting % for the last 5 games

In [17]:
if (use_parsed_data == False) & (calc_ratings == True):
    
    # Create a dataframe for the results - same size as dataset
    data = pd.DataFrame(index=range(0,len(games)), columns=[
        'statsHome.seasonTotalPossessions', 'statsHome.last5TotalPossessions',
        'statsHome.seasonTotalPoints', 'statsHome.last5TotalPoints',
        'statsHome.seasonOffensiveRating', 'statsHome.last5OffensiveRating',
        'statsHome.seasonDefensiveRating', 'statsHome.last5DefensiveRating',
        'statsAway.seasonTotalPossessions', 'statsAway.last5TotalPossessions',
        'statsAway.seasonTotalPoints', 'statsAway.last5TotalPoints',
        'statsAway.seasonOffensiveRating', 'statsAway.last5OffensiveRating',
        'statsAway.seasonDefensiveRating', 'statsAway.last5DefensiveRating'
    ])

    # Iterate through each game where the teams have played at least 5 games.
    for index, game in games.iterrows():

        # Get all previous games - Home
        allgames = games[(games['season'] == game.season) & (games['gamesPlayedHome'] < game.gamesPlayedHome) & ((games['teamAwayCode'] == game.teamHomeCode) | (games['teamHomeCode'] == game.teamHomeCode))].sort_values(by='gamesPlayedHome', ascending=False)

        totalPossessionsHome = allgames['statsHome.FgAtt'].sum() - allgames['statsHome.OffReb'].sum() + allgames['statsHome.Tov'].sum() + (0.4 * allgames['statsHome.FtAtt'].sum());
        last5PossessionsHome = allgames['statsHome.FgAtt'].head(5).sum() - allgames['statsHome.OffReb'].head(5).sum() + allgames['statsHome.Tov'].head(5).sum() + (0.4 * allgames['statsHome.FtAtt'].head(5).sum());

        totalPointsScoredHome = allgames['statsHome.Pts'].sum()
        last5PointsScoredHome = allgames['statsHome.Pts'].head(5).sum()

        totalOffensiveRatingHome = round(calc_percentage(totalPointsScoredHome, totalPossessionsHome) * 100, 2)
        last5OffensiveRatingHome = round(calc_percentage(last5PointsScoredHome, last5PossessionsHome) * 100, 2)

        totalPointsAgainstHome = allgames['statsHome.PtsAgainst'].sum()
        last5PointsAgainstHome = allgames['statsHome.PtsAgainst'].head(5).sum()

        totalDefensiveRatingHome = round(calc_percentage(totalPointsAgainstHome, totalPossessionsHome) * 100, 2)
        last5DefensiveRatingHome = round(calc_percentage(last5PointsAgainstHome, last5PossessionsHome) * 100, 2)
        
        # Get all previous games - Away
        allgames = games[(games['season'] == game.season) & (games['gamesPlayedAway'] < game.gamesPlayedAway) & ((games['teamAwayCode'] == game.teamAwayCode) | (games['teamHomeCode'] == game.teamAwayCode))].sort_values(by='gamesPlayedAway', ascending=False)

        totalPossessionsAway = allgames['statsHome.FgAtt'].sum() - allgames['statsHome.OffReb'].sum() + allgames['statsHome.Tov'].sum() + (0.4 * allgames['statsHome.FtAtt'].sum());
        last5PossessionsAway = allgames['statsHome.FgAtt'].head(5).sum() - allgames['statsHome.OffReb'].head(5).sum() + allgames['statsHome.Tov'].head(5).sum() + (0.4 * allgames['statsHome.FtAtt'].head(5).sum());

        totalPointsScoredAway = allgames['statsHome.Pts'].sum()
        last5PointsScoredAway = allgames['statsHome.Pts'].head(5).sum()

        totalOffensiveRatingAway = round(calc_percentage(totalPointsScoredAway, totalPossessionsAway) * 100, 2)
        last5OffensiveRatingAway = round(calc_percentage(last5PointsScoredAway, last5PossessionsAway) * 100, 2)

        totalPointsAgainstAway = allgames['statsHome.PtsAgainst'].sum()
        last5PointsAgainstAway = allgames['statsHome.PtsAgainst'].head(5).sum()

        totalDefensiveRatingAway = round(calc_percentage(totalPointsAgainstAway, totalPossessionsAway) * 100, 2)
        last5DefensiveRatingAway = round(calc_percentage(last5PointsAgainstAway, last5PossessionsAway) * 100, 2)

        data.loc[index] = [
            totalPossessionsHome, last5PossessionsHome, 
            totalPointsScoredHome, last5PointsScoredHome, 
            totalOffensiveRatingHome, last5OffensiveRatingHome,
            totalDefensiveRatingHome, last5DefensiveRatingHome,
            totalPossessionsAway, last5PossessionsAway, 
            totalPointsScoredAway, last5PointsScoredAway, 
            totalOffensiveRatingAway, last5OffensiveRatingAway,
            totalDefensiveRatingAway, last5DefensiveRatingAway
        ]

        # Update the row with the history
        # data.loc[index] = [lastGame1WinAway, lastGame1AtHomeAway, lastGame2WinAway, lastGame2AtHomeAway, lastGame3WinAway, lastGame3AtHomeAway, lastGame4WinAway, lastGame4AtHomeAway, lastGame5WinAway, lastGame5AtHomeAway]

    data.sample(5)

    # Convert types
    # data['lastGame1WinAway'] = data['lastGame1WinAway'].astype('bool')

    # Add results to the dataset
    games = pd.merge(games, data, left_index=True, right_index=True)

    #games[['teamHomeCode', 'teamAwayCode', 'lastGame1WinHome', 'lastGame1AtHomeHome', 'lastGame2WinHome', 'lastGame2AtHomeHome', 'lastGame3WinHome', 'lastGame3AtHomeHome', 'lastGame4WinHome', 'lastGame4AtHomeHome', 'lastGame5WinHome', 'lastGame5AtHomeHome']].sample(5)


TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
# test entry for stadium distance calc

coords_1 = (37.750267, -122.202853)
coords_2 = (34.04303865743706, -118.26711416244507)

geopy.distance.vincenty(coords_1, coords_2).miles


#### Save data to CSV to save on processing

In [None]:
if use_parsed_data == False:
    games.to_csv(output_parsed_data_filename, encoding='utf-8')

## Data Cleanse
Look at data types

In [None]:
games.dtypes

Transform the Time column into a datetime object.

In [None]:
games["time"] = pd.to_datetime(games["time"])

### Convert the home and away teams to one-hot encoding 

In [None]:
if use_team_hot_encoding == True:
    homeTeam = pd.get_dummies(games["teamHomeCode"], prefix='team')
    awayTeam = pd.get_dummies(games["teamAwayCode"], prefix='team')

    homeTeam = homeTeam.replace({0:np.nan})
    awayTeam = awayTeam.replace({0:np.nan})

    teams = homeTeam.fillna(awayTeam).fillna(0).astype('bool')
    # teams[['team_NYK', 'team_CLE', 'team_ATL', 'team_BOS']].head(2)
    
    games = pd.concat([games, teams], axis=1)
    
    games[['teamHomeCode', 'teamAwayCode', 'team_NYK', 'team_CLE']].head(2)

In [None]:
if use_team_home_away_hot_encoding == True:
    homeTeam = pd.get_dummies(games["teamHomeCode"], prefix='hometeam').astype('bool')
    awayTeam = pd.get_dummies(games["teamAwayCode"], prefix='awayteam').astype('bool')

    games = pd.concat([games, homeTeam, awayTeam], axis=1)
    
    games[['teamHomeCode', 'teamAwayCode', 'hometeam_NYK', 'hometeam_CLE', 'awayteam_NYK', 'awayteam_CLE']].head(2)

Drop the id fields for teams.

In [None]:
if (use_team_hot_encoding == True) | (use_team_home_away_hot_encoding == True) | (use_team_keys == False):
    games = games.drop(["teamAwayId", "teamHomeId"], axis=1)

In [None]:
games = games.drop(["teamAwayCode", "teamHomeCode"], axis=1)

In [None]:
games = games.drop(["location", "scoreQuarters", "time"], axis=1)

In [None]:
games = games.drop(statsColumns, axis=1)

In [None]:
#gamesQuarters = json_normalize(data=d['games'], record_path='scoreQuarters', meta=['id'])
#gamesQuarters.head(10)

In [None]:
games["homeWin"] = games["scoreHome"] > games["scoreAway"]

In [None]:
# Remove games scores ... not possible to predict the future, if you already know it :)
games = games.drop(["scoreAway", "scoreHome"], axis=1)

In [None]:
# Remove any games that have less than a 5 game history.
if calc_last_5_games == True:
    games = games[(pd.notnull(games['lastGame1WinHome'])) & (pd.notnull(games['lastGame1AtHomeAway']))]

In [None]:
# Fill any NaN stats with 0
games.fillna(0, inplace=True)
#games['statsHome.Ejections'].fillna(0, inplace=True)
#games['statsAway.Ejections'].fillna(0, inplace=True)

In [None]:
# Show any NaN
games[games.isnull().T.any().T]

In [None]:
games.sample(3)

In [None]:
msk = np.random.rand(len(games)) < 0.8
msk

train = games[msk]
test = games[~msk]

Y_train = train["homeWin"]
X_train = train.drop(["id","homeWin"], axis=1)

#
X_test  = test.drop(["id", "homeWin"], axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Reset the index
test.index = range(len(test.index))

In [None]:
X_test.sample(5)

### Logistic Regression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

In [None]:
coeff_df = pd.DataFrame(games.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_log = round((correct / total) * 100, 2)

print(correct_log, total, correct, incorrect)

### Support Vector Machines

In [None]:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_svc = round((correct / total) * 100, 2)

print(correct_svc, total, correct, incorrect)

### k-Nearest Neighbors

In [None]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_knn = round((correct / total) * 100, 2)

print(correct_knn, total, correct, incorrect)

### Gaussian Naive Bayes

In [None]:
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_gaussian = round((correct / total) * 100, 2)

print(correct_gaussian, total, correct, incorrect)

### Perceptron

In [None]:
perceptron = Perceptron(max_iter = 5)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_perceptron = round((correct / total) * 100, 2)

print(correct_perceptron, total, correct, incorrect)

### Linear SVC

In [None]:
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_linear_svc = round((correct / total) * 100, 2)

print(correct_linear_svc, total, correct, incorrect)

### Stochastic Gradient Descent

In [None]:
sgd = SGDClassifier(max_iter = 5)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_sgd = round((correct / total) * 100, 2)

print(correct_sgd, total, correct, incorrect)

### Decision Tree

In [None]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_decision_tree = round((correct / total) * 100, 2)

print(correct_decision_tree, total, correct, incorrect)

### Random Forest

In [None]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

In [None]:
result = pd.concat([test, pd.DataFrame(Y_pred, columns=['forecast'])], axis=1)
total = len(result)
correct = len(result[(result['homeWin'] == result['forecast'])])
incorrect = len(result[(result['homeWin'] != result['forecast'])])
correct_random_forest = round((correct / total) * 100, 2)

print(correct_random_forest, len(Y_pred), total, correct, incorrect)

## Model Evaluation Results

In [None]:
models = pd.DataFrame({
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree],
    'Correct': [correct_svc, correct_knn, correct_log, 
              correct_random_forest, correct_gaussian, correct_perceptron, 
              correct_sgd, correct_linear_svc, correct_decision_tree],
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree']})
models.sort_values(by='Correct', ascending=False)

In [None]:
result[(result['homeWin'] != result['forecast'])].sample(10)

In [None]:
result[(result['id'] == 33946) | (result['id'] == 34601)]

In [None]:
games[(games['id'] == 33946) | (games['id'] == 34601)]