**Section 1: Obtaining Data**

We first need to gather historical data in order to train our model. Specifically, we will use the following data points:
- Game ID
- Game date
- Season (i.e. 2020-21)

The following stats are needed for both the home and away team in total...
- Team name
- Points
- Rebounds (total, offensive and defensive)
- Assists
- Turnovers
- Steals
- Blocks
- Total free throws
- Field goals (total attempts, made and missed)
- 2 PT field goals (total attempts, made and missed)
- 3 PT field goals (total attempts, made and missed)


In [2]:
# Imports and definitions

import config
import requests
import json
import datetime
import pandas as pd
import math
import time
import glob
import pickle
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from pprint import pprint

headers = {
    'x-rapidapi-key': config.rapid_api_key,
    'x-rapidapi-host': config.rapid_api_host
    }

In [5]:
for year in ["2015", "2016", "2017", "2018", "2019", "2020"]:

    season_url = "https://api-nba-v1.p.rapidapi.com/games/league/standard/" + year
    response = requests.request("GET", season_url, headers=headers)

    with open('season_data/season_' + year + '.json', 'w') as json_file:
        json.dump(response.json(), json_file)

Next, we need to clean the data obtained from our API. Specifically, we need to only include games that have NBA teams, as we are provided with international matchups as well as the all-star game teams. We should also discard preseason games here.

In [7]:
season_start_2015 = datetime.datetime(2015, 10, 27)
season_start_2016 = datetime.datetime(2016, 10, 25)
season_start_2017 = datetime.datetime(2017, 10, 17)
season_start_2018 = datetime.datetime(2018, 10, 16)
season_start_2019 = datetime.datetime(2019, 10, 22)
season_start_2020 = datetime.datetime(2020, 12, 22)

""" TO BE SWITCHED PER SEASON """
curr_season = "2020"
curr_season_start = season_start_2020
""""""

list_of_teams = [
    "Atlanta Hawks",
    "Boston Celtics",
    "Brooklyn Nets",
    "Charlotte Hornets",
    "Chicago Bulls",
    "Cleveland Cavaliers",
    "Dallas Mavericks",
    "Denver Nuggets",
    "Detroit Pistons",
    "Golden State Warriors",
    "Houston Rockets",
    "Indiana Pacers",
    "LA Clippers",
    "Los Angeles Lakers",
    "Memphis Grizzlies",
    "Miami Heat",
    "Milwaukee Bucks",
    "Minnesota Timberwolves",
    "New Orleans Pelicans",
    "New York Knicks",
    "Oklahoma City Thunder",
    "Orlando Magic",
    "Philadelphia 76ers",
    "Phoenix Suns",
    "Portland Trail Blazers",
    "Sacramento Kings",
    "San Antonio Spurs",
    "Toronto Raptors",
    "Utah Jazz",
    "Washington Wizards"
]

data_builder = {
    "date": [], 
    "game_id": [],
    "season": [],
    "home_team": [],
    "home_team_id": [],
    "away_team": [],
    "away_team_id": []
    }

with open('season_data/season_' + curr_season + '.json') as f:

  season_data = json.load(f)
  for game in season_data["api"]["games"]:
    try:
        game_date = datetime.datetime.strptime(game["startTimeUTC"], '%Y-%m-%dT%H:%M:%S.%fZ')
    except Exception:
        game_date = datetime.datetime.strptime(game["startTimeUTC"], '%Y-%m-%d')

    if curr_season_start <= game_date and game["statusShortGame"] != "1" \
        and game["vTeam"]["fullName"] in list_of_teams \
        and game["hTeam"]["fullName"] in list_of_teams:
        data_builder["date"].append(game_date.strftime("%x"))
        data_builder["game_id"].append(game["gameId"])
        data_builder["season"].append(curr_season)
        data_builder["home_team"].append(game["hTeam"]["fullName"])
        data_builder["home_team_id"].append(game["hTeam"]["teamId"])
        data_builder["away_team"].append(game["vTeam"]["fullName"])
        data_builder["away_team_id"].append(game["vTeam"]["teamId"])

  game_stats_df = pd.DataFrame(data=data_builder)
  game_stats_df['date'] = pd.to_datetime(game_stats_df['date'])
  game_stats_df.sort_values(by='date', ascending=True, inplace=True)
  game_stats_df.reset_index(inplace=True, drop=True)

game_stats_df.head(10)

Unnamed: 0,date,game_id,season,home_team,home_team_id,away_team,away_team_id
0,2020-12-23,7509,2020,Brooklyn Nets,4,Golden State Warriors,11
1,2020-12-23,7510,2020,Los Angeles Lakers,17,LA Clippers,16
2,2020-12-24,7518,2020,Cleveland Cavaliers,7,Charlotte Hornets,5
3,2020-12-24,7519,2020,Indiana Pacers,15,New York Knicks,24
4,2020-12-24,7520,2020,Orlando Magic,26,Miami Heat,20
5,2020-12-24,7521,2020,Philadelphia 76ers,27,Washington Wizards,41
6,2020-12-24,7511,2020,Boston Celtics,2,Milwaukee Bucks,21
7,2020-12-24,7522,2020,Toronto Raptors,38,New Orleans Pelicans,23
8,2020-12-24,7523,2020,Chicago Bulls,6,Atlanta Hawks,1
9,2020-12-24,7525,2020,Memphis Grizzlies,19,San Antonio Spurs,31


Next, we will use the game ID's to pull statistics for each game.

In [8]:
game_stats_builder = {
    "h_points": [],
    "h_field_goals_attempted": [],
    "h_field_goals_made": [],
    "h_field_goals_missed": [],
    "h_free_throws_attempted": [],
    "h_free_throws_made": [],
    "h_free_throws_missed": [],
    "h_3_pt_attempted": [],
    "h_3_pt_made": [],
    "h_3_pt_missed": [],
    "h_2_pt_attempted": [],
    "h_2_pt_made": [],
    "h_2_pt_missed": [],
    "h_total_reb": [],
    "h_off_reb": [],
    "h_def_reb": [],
    "h_assists": [],
    "h_steals": [],
    "h_turnovers": [],
    "h_blocks": [],
    "h_fouls": [],

    "a_points": [],
    "a_field_goals_attempted": [],
    "a_field_goals_made": [],
    "a_field_goals_missed": [],
    "a_free_throws_attempted": [],
    "a_free_throws_made": [],
    "a_free_throws_missed": [],
    "a_3_pt_attempted": [],
    "a_3_pt_made": [],
    "a_3_pt_missed": [],
    "a_2_pt_attempted": [],
    "a_2_pt_made": [],
    "a_2_pt_missed": [],
    "a_total_reb": [],
    "a_off_reb": [],
    "a_def_reb": [],
    "a_assists": [],
    "a_steals": [],
    "a_turnovers": [],
    "a_blocks": [],
    "a_fouls": []
}

print(len(game_stats_df))
to_drop = []

for i in range(len(game_stats_df)): 

    curr_game_id = game_stats_df.loc[i, "game_id"]
    curr_home_id = game_stats_df.loc[i, "home_team_id"]

    stats_url = "https://api-nba-v1.p.rapidapi.com/statistics/games/gameId/" + curr_game_id

    response = requests.request("GET", stats_url, headers=headers)
    stats_data = response.json()["api"]["statistics"]

    home_ind = -1
    away_ind = -1

    print("Index: " + str(i) + " Game ID: " + curr_game_id)

    try:
        if stats_data[0]["teamId"] == curr_home_id:
            home_ind = 0
            away_ind = 1
        else:
            home_ind = 1
            away_ind = 0
    except:
        to_drop.append(i)
        print('An error occurred!')
        continue

    # Home team
    game_stats_builder["h_points"].append(stats_data[home_ind]["points"])
    game_stats_builder["h_field_goals_attempted"].append(stats_data[home_ind]["fga"])
    game_stats_builder["h_field_goals_made"].append(stats_data[home_ind]["fgm"])
    game_stats_builder["h_field_goals_missed"].append(\
        str(int(stats_data[home_ind]["fga"]) - int(stats_data[home_ind]["fgm"])))
    game_stats_builder["h_free_throws_attempted"].append(stats_data[home_ind]["fta"])
    game_stats_builder["h_free_throws_made"].append(stats_data[home_ind]["ftm"])
    game_stats_builder["h_free_throws_missed"].append(\
        str(int(stats_data[home_ind]["fta"]) - int(stats_data[home_ind]["ftm"])))
    game_stats_builder["h_3_pt_attempted"].append(stats_data[home_ind]["tpa"])
    game_stats_builder["h_3_pt_made"].append(stats_data[home_ind]["tpm"])
    game_stats_builder["h_3_pt_missed"].append(\
        str(int(stats_data[home_ind]["tpa"]) - int(stats_data[home_ind]["tpm"])))
        

    h_2_pt_attempted = int(stats_data[home_ind]["fga"]) - int(stats_data[home_ind]["tpa"])
    h_2_pt_made = int(stats_data[home_ind]["fgm"]) - int(stats_data[home_ind]["tpm"])
    h_2_pt_missed = h_2_pt_attempted - h_2_pt_made

    game_stats_builder["h_2_pt_attempted"].append(str(h_2_pt_attempted))
    game_stats_builder["h_2_pt_made"].append(str(h_2_pt_made))
    game_stats_builder["h_2_pt_missed"].append(str(h_2_pt_missed))

    game_stats_builder["h_total_reb"].append(stats_data[home_ind]["totReb"])
    game_stats_builder["h_off_reb"].append(stats_data[home_ind]["offReb"])
    game_stats_builder["h_def_reb"].append(stats_data[home_ind]["defReb"])
    game_stats_builder["h_assists"].append(stats_data[home_ind]["assists"])
    game_stats_builder["h_steals"].append(stats_data[home_ind]["steals"])
    game_stats_builder["h_turnovers"].append(stats_data[home_ind]["turnovers"])
    game_stats_builder["h_blocks"].append(stats_data[home_ind]["blocks"])
    game_stats_builder["h_fouls"].append(stats_data[home_ind]["pFouls"])

    # Away team
    game_stats_builder["a_points"].append(stats_data[away_ind]["points"])
    game_stats_builder["a_field_goals_attempted"].append(stats_data[away_ind]["fga"])
    game_stats_builder["a_field_goals_made"].append(stats_data[away_ind]["fgm"])
    game_stats_builder["a_field_goals_missed"].append(\
        str(int(stats_data[away_ind]["fga"]) - int(stats_data[away_ind]["fgm"])))
    game_stats_builder["a_free_throws_attempted"].append(stats_data[away_ind]["fta"])
    game_stats_builder["a_free_throws_made"].append(stats_data[away_ind]["ftm"])
    game_stats_builder["a_free_throws_missed"].append(\
        str(int(stats_data[away_ind]["fta"]) - int(stats_data[away_ind]["ftm"])))
    game_stats_builder["a_3_pt_attempted"].append(stats_data[away_ind]["tpa"])
    game_stats_builder["a_3_pt_made"].append(stats_data[away_ind]["tpm"])
    game_stats_builder["a_3_pt_missed"].append(\
        str(int(stats_data[away_ind]["tpa"]) - int(stats_data[away_ind]["tpm"])))

    a_2_pt_attempted = int(stats_data[away_ind]["fga"]) - int(stats_data[away_ind]["tpa"])
    a_2_pt_made = int(stats_data[away_ind]["fgm"]) - int(stats_data[away_ind]["tpm"])
    a_2_pt_missed = a_2_pt_attempted - a_2_pt_made

    game_stats_builder["a_2_pt_attempted"].append(str(a_2_pt_attempted))
    game_stats_builder["a_2_pt_made"].append(str(a_2_pt_made))
    game_stats_builder["a_2_pt_missed"].append(str(a_2_pt_missed))

    game_stats_builder["a_total_reb"].append(stats_data[away_ind]["totReb"])
    game_stats_builder["a_off_reb"].append(stats_data[away_ind]["offReb"])
    game_stats_builder["a_def_reb"].append(stats_data[away_ind]["defReb"])
    game_stats_builder["a_assists"].append(stats_data[away_ind]["assists"])
    game_stats_builder["a_steals"].append(stats_data[away_ind]["steals"])
    game_stats_builder["a_turnovers"].append(stats_data[away_ind]["turnovers"])
    game_stats_builder["a_blocks"].append(stats_data[away_ind]["blocks"])
    game_stats_builder["a_fouls"].append(stats_data[away_ind]["pFouls"])

add_stats_df = pd.DataFrame(data=game_stats_builder)
add_stats_df.head()


540
Index: 0 Game ID: 7509
Index: 1 Game ID: 7510
Index: 2 Game ID: 7518
Index: 3 Game ID: 7519
Index: 4 Game ID: 7520
Index: 5 Game ID: 7521
Index: 6 Game ID: 7511
Index: 7 Game ID: 7522
Index: 8 Game ID: 7523
Index: 9 Game ID: 7525
Index: 10 Game ID: 7526
Index: 11 Game ID: 7527
Index: 12 Game ID: 7528
Index: 13 Game ID: 7512
Index: 14 Game ID: 7514
Index: 15 Game ID: 7515
Index: 16 Game ID: 7513
Index: 17 Game ID: 7516
Index: 18 Game ID: 7517
Index: 19 Game ID: 7529
Index: 20 Game ID: 7534
Index: 21 Game ID: 7530
Index: 22 Game ID: 7531
Index: 23 Game ID: 7532
Index: 24 Game ID: 7533
Index: 25 Game ID: 7535
Index: 26 Game ID: 7536
Index: 27 Game ID: 7537
Index: 28 Game ID: 7538
Index: 29 Game ID: 7539
Index: 30 Game ID: 7547
Index: 31 Game ID: 7546
Index: 32 Game ID: 8221
Index: 33 Game ID: 7544
Index: 34 Game ID: 7545
Index: 35 Game ID: 7542
Index: 36 Game ID: 7541
Index: 37 Game ID: 7540
Index: 38 Game ID: 7543
Index: 39 Game ID: 8222
Index: 40 Game ID: 8223
Index: 41 Game ID: 822

Unnamed: 0,h_points,h_field_goals_attempted,h_field_goals_made,h_field_goals_missed,h_free_throws_attempted,h_free_throws_made,h_free_throws_missed,h_3_pt_attempted,h_3_pt_made,h_3_pt_missed,...,a_2_pt_made,a_2_pt_missed,a_total_reb,a_off_reb,a_def_reb,a_assists,a_steals,a_turnovers,a_blocks,a_fouls
0,125,92,42,50,32,26,6,35,15,20,...,27,39,47,13,34,26,6,18,6,24
1,109,81,38,43,31,24,7,29,9,20,...,30,23,40,11,29,22,10,16,3,29
2,121,87,46,41,20,15,5,30,14,16,...,29,17,32,8,24,29,10,15,4,17
3,121,94,46,48,29,21,8,34,8,26,...,27,30,40,5,35,25,8,16,8,27
4,113,88,42,46,30,19,11,28,10,18,...,35,28,43,7,36,25,11,22,2,21


In [9]:
game_stats_df = game_stats_df.drop(to_drop)

game_stats_df.reset_index(inplace=True, drop=True)
temp_df = pd.concat([game_stats_df, add_stats_df], axis=1)
temp_df.to_csv('season_data/' + curr_season + '_games.csv')

temp_df.head()

Unnamed: 0,date,game_id,season,home_team,home_team_id,away_team,away_team_id,h_points,h_field_goals_attempted,h_field_goals_made,...,a_2_pt_made,a_2_pt_missed,a_total_reb,a_off_reb,a_def_reb,a_assists,a_steals,a_turnovers,a_blocks,a_fouls
0,2020-12-23,7509,2020,Brooklyn Nets,4,Golden State Warriors,11,125,92,42,...,27,39,47,13,34,26,6,18,6,24
1,2020-12-23,7510,2020,Los Angeles Lakers,17,LA Clippers,16,109,81,38,...,30,23,40,11,29,22,10,16,3,29
2,2020-12-24,7518,2020,Cleveland Cavaliers,7,Charlotte Hornets,5,121,87,46,...,29,17,32,8,24,29,10,15,4,17
3,2020-12-24,7519,2020,Indiana Pacers,15,New York Knicks,24,121,94,46,...,27,30,40,5,35,25,8,16,8,27
4,2020-12-24,7520,2020,Orlando Magic,26,Miami Heat,20,113,88,42,...,35,28,43,7,36,25,11,22,2,21


In [3]:
# Combine season data to one CSV
extension = 'csv'
all_filenames = [i for i in glob.glob('season_data/*.{}'.format(extension))]

combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv.to_csv( "all_game_stats.csv", index=False, encoding='utf-8-sig')


In [4]:
# Load game CSV into dataframe
all_game_stats_df = pd.read_csv('all_game_stats.csv')
all_game_stats_df['date']= pd.to_datetime(all_game_stats_df['date'])
all_game_stats_df = all_game_stats_df.loc[:, ~all_game_stats_df.columns.str.contains('^Unnamed')]
all_game_stats_df = all_game_stats_df.dropna()
all_game_stats_df = all_game_stats_df.sort_values(by='game_id') # Sort by game ID to enforce
all_game_stats_df.reset_index(inplace=True, drop=True)

all_game_stats_df.head()

Unnamed: 0,date,game_id,season,home_team,home_team_id,away_team,away_team_id,h_points,h_field_goals_attempted,h_field_goals_made,...,a_2_pt_made,a_2_pt_missed,a_total_reb,a_off_reb,a_def_reb,a_assists,a_steals,a_turnovers,a_blocks,a_fouls
0,2015-10-28,110,2015,Atlanta Hawks,1,Detroit Pistons,10,94,82,37,...,25,42,59,23,36,23,5,15,3,15
1,2015-10-28,111,2015,Chicago Bulls,6,Cleveland Cavaliers,7,97,87,37,...,29,36,50,11,39,26,5,10,7,21
2,2015-10-28,112,2015,Golden State Warriors,11,New Orleans Pelicans,23,111,96,41,...,29,36,33,8,25,21,9,18,3,26
3,2015-10-28,113,2015,Orlando Magic,26,Washington Wizards,41,87,100,37,...,26,30,49,15,34,17,8,17,9,14
4,2015-10-28,114,2015,Boston Celtics,2,Philadelphia 76ers,27,112,85,39,...,27,34,46,14,32,12,11,22,6,22


**Section 2: Feature Calculation**

After retrieving the data, we now calculate the ELO rating and the performance of a team over the past ten games. These ideas were taken from the article here: https://towardsdatascience.com/predicting-the-outcome-of-nba-games-with-machine-learning-a810bb768f20

In [5]:
#Home and road team win probabilities implied by Elo ratings and home court adjustment
def win_probs(home_elo, away_elo, home_court_advantage) :
  h = math.pow(10, home_elo/400)
  r = math.pow(10, away_elo/400)
  a = math.pow(10, home_court_advantage/400) 

  denom = r + a*h
  home_prob = a*h / denom
  away_prob = r / denom 
  
  return home_prob, away_prob

  #odds the home team will win based on elo ratings and home court advantage

def home_odds_on(home_elo, away_elo, home_court_advantage) :
  h = math.pow(10, home_elo/400)
  r = math.pow(10, away_elo/400)
  a = math.pow(10, home_court_advantage/400)
  return a*h/r

#this function determines the constant used in the elo rating, based on margin of victory and difference in elo ratings
def elo_k(MOV, elo_diff):
  k = 20
  if MOV>0:
      multiplier=(MOV+3)**(0.8)/(7.5+0.006*(elo_diff))
  else:
      multiplier=(-MOV+3)**(0.8)/(7.5+0.006*(-elo_diff))
  return k*multiplier


#updates the home and away teams elo ratings after a game 

def update_elo(home_score, away_score, home_elo, away_elo, home_court_advantage) :
  home_prob, away_prob = win_probs(home_elo, away_elo, home_court_advantage) 

  home_score_int = int(home_score)
  away_score_int = int(away_score)

  if (home_score_int - away_score_int > 0) :
    home_win = 1 
    away_win = 0 
  else :
    home_win = 0 
    away_win = 1 
  
  k = elo_k(home_score_int - away_score_int, home_elo - away_elo)

  updated_home_elo = home_elo + k * (home_win - home_prob) 
  updated_away_elo = away_elo + k * (away_win - away_prob)

  return updated_home_elo, updated_away_elo


#takes into account prev season elo
def get_prev_elo(team, date, game_id, season, team_stats, elo_df):
    prev_game = team_stats[team_stats['game_id'] <= game_id]
    prev_game = prev_game[prev_game['game_id'] != game_id][(prev_game['home_team'] == team) | (prev_game['away_team'] == team)].sort_values(by='date').tail(1).iloc[0]

    if team == prev_game['home_team']:
      elo_rating = elo_df[elo_df['game_id'] == prev_game['game_id']]['h_elo_after'].values[0]
    else:
      elo_rating = elo_df[elo_df['game_id'] == prev_game['game_id']]['a_elo_after'].values[0]

    if prev_game['season'] != season:
      return (0.75 * elo_rating) + (0.25 * 1505)
    else :
      return elo_rating

In [6]:
elo_df = pd.DataFrame(columns=['game_id', 'home_team', 'away_team', 'h_elo_before', 'a_elo_before', 'h_elo_after', 'a_elo_after'])
teams_elo_df = pd.DataFrame(columns=['game_id','team', 'elo', 'date', 'home_or_away', 'season']) 

print(len(all_game_stats_df))

for index, row in all_game_stats_df.iterrows(): 
  game_id = row['game_id']
  game_date = row['date']
  season = row['season']
  game_id = row['game_id']
  h_team, a_team = row['home_team'], row['away_team']
  h_score, a_score = row['h_points'], row['a_points']

  print("Processing Index: " + str(index))

  if (h_team not in elo_df['home_team'].values and h_team not in elo_df['away_team'].values) :
    h_team_elo_before = 1500
  else :
    h_team_elo_before = get_prev_elo(h_team, game_date, game_id, season, all_game_stats_df, elo_df)

  if (a_team not in elo_df['home_team'].values and a_team not in elo_df['away_team'].values) :
    a_team_elo_before = 1500
  else :
    a_team_elo_before = get_prev_elo(a_team, game_date, game_id, season, all_game_stats_df, elo_df)

  h_team_elo_after, a_team_elo_after = update_elo(h_score, a_score, h_team_elo_before, \
    a_team_elo_before, 69)

  new_row = {'game_id': game_id, 'home_team': h_team, 'away_team': a_team, 'h_elo_before': \
    h_team_elo_before, 'a_elo_before': a_team_elo_before, 'h_elo_after' : h_team_elo_after, \
    'a_elo_after': a_team_elo_after}
  teams_row_one = {'game_id': game_id,'team': h_team, 'elo': h_team_elo_before,'date': game_date,\
    'home_or_away': 'Home', 'season': season}
  teams_row_two = {'game_id': game_id,'team': a_team, 'elo': a_team_elo_before, 'date':game_date,\
    'home_or_away': 'Away', 'season': season}
  
  elo_df = elo_df.append(new_row, ignore_index = True)
  teams_elo_df = teams_elo_df.append(teams_row_one, ignore_index=True)
  teams_elo_df = teams_elo_df.append(teams_row_two, ignore_index=True)

 Index: 6454
Processing Index: 6455
Processing Index: 6456
Processing Index: 6457
Processing Index: 6458
Processing Index: 6459
Processing Index: 6460
Processing Index: 6461
Processing Index: 6462
Processing Index: 6463
Processing Index: 6464
Processing Index: 6465
Processing Index: 6466
Processing Index: 6467
Processing Index: 6468
Processing Index: 6469
Processing Index: 6470
Processing Index: 6471
Processing Index: 6472
Processing Index: 6473
Processing Index: 6474
Processing Index: 6475
Processing Index: 6476
Processing Index: 6477
Processing Index: 6478
Processing Index: 6479
Processing Index: 6480
Processing Index: 6481
Processing Index: 6482
Processing Index: 6483
Processing Index: 6484
Processing Index: 6485
Processing Index: 6486
Processing Index: 6487
Processing Index: 6488
Processing Index: 6489
Processing Index: 6490
Processing Index: 6491
Processing Index: 6492
Processing Index: 6493
Processing Index: 6494
Processing Index: 6495
Processing Index: 6496
Processing Index: 649

In [7]:
dates = list(set([d.strftime("%m-%d-%Y") for d in teams_elo_df["date"]]))
dates = sorted(dates, key=lambda x: time.strptime(x, '%m-%d-%Y'))
teams = all_game_stats_df["away_team"]
dataset = pd.DataFrame(columns=dates)
dataset["team"] = teams.drop_duplicates()
dataset = dataset.set_index("team")
for index, row in teams_elo_df.iterrows():
  date = row["date"].strftime("%m-%d-%Y")
  team = row["team"]
  elo = row["elo"]
  dataset[date][team] = elo

teams_elo_df['elo'] = teams_elo_df['elo'].astype(float)

elo_df.head()

Unnamed: 0,game_id,home_team,away_team,h_elo_before,a_elo_before,h_elo_after,a_elo_after
0,110,Atlanta Hawks,Detroit Pistons,1500,1500,1486.082766,1513.917234
1,111,Chicago Bulls,Cleveland Cavaliers,1500,1500,1503.884686,1496.115314
2,112,Golden State Warriors,New Orleans Pelicans,1500,1500,1511.302715,1488.697285
3,113,Orlando Magic,Washington Wizards,1500,1500,1495.165763,1504.834237
4,114,Boston Celtics,Philadelphia 76ers,1500,1500,1511.776166,1488.223834


Below, we calculate the statistics of teams over the past 10 games

In [8]:
# Given a team and a date, this method will return that teams average stats over the previous n games

def get_avg_stats_last_n_games(team, game_date, season_team_stats, n) :
    prev_game_df = season_team_stats[season_team_stats['date'] < game_date][(season_team_stats\
        ['home_team'] == team) | (season_team_stats['away_team'] == team)].sort_values\
        (by='date').tail(n)

    prev_game_df.drop(columns = ['home_team_id', 'away_team_id'], inplace=True)

    h_df = prev_game_df.loc[:, prev_game_df.columns.str.startswith('h')]
    a_df = prev_game_df.loc[:, prev_game_df.columns.str.startswith('a')]

    h_df = h_df.rename({"home_team": "h_team"}, axis=1)
    a_df = a_df.rename({"away_team": "a_team"}, axis=1)

    h_df.columns = [x[2:] for x in h_df.columns]
    a_df.columns = [x[2:] for x in a_df.columns]

    df = pd.concat([h_df, a_df])
    df = df[df['team'] == team]
    df.drop(columns = ['team'], inplace=True)

    for column in df:
        if column != "date":
            df[column] = pd.to_numeric(df[column])

    return df.mean()

In [9]:
recent_performance_df = pd.DataFrame()

for season in all_game_stats_df['season'].unique() :
    season_team_stats = all_game_stats_df[all_game_stats_df['season'] == season].sort_values\
        (by='date').reset_index(drop=True)

    season_recent_performance_df = pd.DataFrame()

    for index, row in season_team_stats.iterrows(): 

        print("Processing Index " + str(index))

        game_id = row['game_id']
        game_date = row['date']
        h_team = row['home_team']
        a_team = row['away_team']

        h_team_recent_performance = get_avg_stats_last_n_games(h_team, game_date, \
            season_team_stats, 10)
        h_team_recent_performance.index = ['h_last_10_avg_' + x for x in \
            h_team_recent_performance.index]

        a_team_recent_performance = get_avg_stats_last_n_games(a_team, game_date, \
            season_team_stats, 10)
        a_team_recent_performance.index = ['a_last_10_avg_' + x for x in \
            a_team_recent_performance.index]  

        new_row = pd.concat([h_team_recent_performance, a_team_recent_performance])
        new_row['game_id'] = game_id

        # print(new_row)

        season_recent_performance_df = season_recent_performance_df.append(new_row, \
            ignore_index=True)
        season_recent_performance_df = season_recent_performance_df[new_row.index] 

    recent_performance_df = pd.concat([recent_performance_df, season_recent_performance_df])

recent_performance_df

 Index 782
Processing Index 783
Processing Index 784
Processing Index 785
Processing Index 786
Processing Index 787
Processing Index 788
Processing Index 789
Processing Index 790
Processing Index 791
Processing Index 792
Processing Index 793
Processing Index 794
Processing Index 795
Processing Index 796
Processing Index 797
Processing Index 798
Processing Index 799
Processing Index 800
Processing Index 801
Processing Index 802
Processing Index 803
Processing Index 804
Processing Index 805
Processing Index 806
Processing Index 807
Processing Index 808
Processing Index 809
Processing Index 810
Processing Index 811
Processing Index 812
Processing Index 813
Processing Index 814
Processing Index 815
Processing Index 816
Processing Index 817
Processing Index 818
Processing Index 819
Processing Index 820
Processing Index 821
Processing Index 822
Processing Index 823
Processing Index 824
Processing Index 825
Processing Index 826
Processing Index 827
Processing Index 828
Processing Index 829
Pr

Unnamed: 0,h_last_10_avg_points,h_last_10_avg_field_goals_attempted,h_last_10_avg_field_goals_made,h_last_10_avg_field_goals_missed,h_last_10_avg_free_throws_attempted,h_last_10_avg_free_throws_made,h_last_10_avg_free_throws_missed,h_last_10_avg_3_pt_attempted,h_last_10_avg_3_pt_made,h_last_10_avg_3_pt_missed,...,a_last_10_avg_2_pt_missed,a_last_10_avg_total_reb,a_last_10_avg_off_reb,a_last_10_avg_def_reb,a_last_10_avg_assists,a_last_10_avg_steals,a_last_10_avg_turnovers,a_last_10_avg_blocks,a_last_10_avg_fouls,game_id
0,,,,,,,,,,,...,,,,,,,,,,110.0
1,,,,,,,,,,,...,,,,,,,,,,111.0
2,,,,,,,,,,,...,,,,,,,,,,112.0
3,,,,,,,,,,,...,,,,,,,,,,113.0
4,,,,,,,,,,,...,,,,,,,,,,114.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,116.6,87.4,45.3,42.1,15.2,12.2,3.0,33.4,13.8,19.6,...,21.3,41.0,7.8,33.2,25.8,8.0,12.0,4.6,18.7,8039.0
536,101.8,89.6,36.1,53.5,24.1,18.1,6.0,42.8,11.5,31.3,...,28.9,44.8,13.0,31.8,24.2,7.3,15.7,5.6,20.5,8040.0
537,122.7,90.0,45.3,44.7,25.8,19.4,6.4,34.0,12.7,21.3,...,17.5,44.8,9.8,35.0,24.1,7.2,13.1,4.6,19.0,8041.0
538,107.1,90.0,39.2,50.8,22.5,18.7,3.8,28.8,10.0,18.8,...,19.7,42.7,8.7,34.0,25.9,6.7,12.1,4.1,16.9,8042.0


In [10]:
all_game_stats_df = all_game_stats_df.loc[:, ~all_game_stats_df.columns.str.contains('^Unnamed')]
final_team_stats = all_game_stats_df.iloc[0:, [0,1,2,3,4,5]].merge(elo_df.drop(columns=\
    ['home_team', 'away_team']), on='game_id').merge(recent_performance_df, on='game_id')

final_team_stats['h_points'] = all_game_stats_df['h_points']
final_team_stats['a_points'] = all_game_stats_df['a_points']

final_team_stats = final_team_stats.dropna()

final_team_stats.to_csv('final_team_stats.csv')

final_team_stats.head()

Unnamed: 0,date,game_id,season,home_team,home_team_id,away_team,h_elo_before,a_elo_before,h_elo_after,a_elo_after,...,a_last_10_avg_total_reb,a_last_10_avg_off_reb,a_last_10_avg_def_reb,a_last_10_avg_assists,a_last_10_avg_steals,a_last_10_avg_turnovers,a_last_10_avg_blocks,a_last_10_avg_fouls,h_points,a_points
18,2015-10-30,128,2015,New York Knicks,24,Atlanta Hawks,1522.93,1486.08,1508.230877,1500.782009,...,40.0,7.0,33.0,22.0,9.0,15.0,4.0,25.0,101,112
19,2015-10-30,129,2015,LA Clippers,16,Dallas Mavericks,1510.06,1516.81,1521.691142,1505.185211,...,48.0,6.0,42.0,24.0,7.0,8.0,3.0,25.0,104,88
20,2015-10-30,130,2015,Cleveland Cavaliers,7,Miami Heat,1522.58,1508.34,1530.431615,1500.494914,...,41.0,2.0,39.0,23.0,5.0,13.0,7.0,25.0,102,92
21,2015-10-30,131,2015,Orlando Magic,26,Oklahoma City Thunder,1495.17,1506.22,1488.708327,1512.674337,...,45.0,11.0,34.0,21.0,7.0,19.0,4.0,19.0,136,139
22,2015-10-30,132,2015,Philadelphia 76ers,27,Utah Jazz,1488.22,1494.67,1463.846581,1519.047546,...,38.0,4.0,34.0,15.0,4.0,12.0,5.0,25.0,71,99


**Section 3: Predicting Game Winners**

Continuing the afforementioned strategy of used the part performance and ELO rating of a team, we now look to train and test the model. We begin by adding a label to the final team statistics.

In [47]:
final_team_stats = pd.read_csv('final_team_stats.csv')
final_team_stats = final_team_stats.loc[:, ~final_team_stats.columns.str.contains('^Unnamed')]
final_team_stats.sort_values(by='date', ascending=True, inplace=True)
final_team_stats = final_team_stats.reset_index(drop=True)

# final_team_stats['true_winner'] = [1 if x > 0 else 0 for x in final_team_stats['h_points'] - final_team_stats['a_points']]
# final_team_stats.drop(columns=['h_points', 'a_points', 'date', 'game_id', 'season', 'home_team', 'away_team', 'home_team_id', 'h_elo_after', 'a_elo_after'], inplace=True)

# y = final_team_stats['true_winner']

# final_team_stats.drop(columns=['true_winner'], inplace=True)
# X = final_team_stats
# i_am_temp = X.iloc[7222]

X
# i_am_temp

Unnamed: 0,h_elo_before,a_elo_before,h_last_10_avg_points,h_last_10_avg_field_goals_attempted,h_last_10_avg_field_goals_made,h_last_10_avg_field_goals_missed,h_last_10_avg_free_throws_attempted,h_last_10_avg_free_throws_made,h_last_10_avg_free_throws_missed,h_last_10_avg_3_pt_attempted,...,a_last_10_avg_2_pt_made,a_last_10_avg_2_pt_missed,a_last_10_avg_total_reb,a_last_10_avg_off_reb,a_last_10_avg_def_reb,a_last_10_avg_assists,a_last_10_avg_steals,a_last_10_avg_turnovers,a_last_10_avg_blocks,a_last_10_avg_fouls
0,1522.930121,1486.082766,122.0,93.0,42.0,51.0,35.0,29.0,6.0,23.0,...,29.0,26.0,40.0,7.0,33.0,22.0,9.0,15.0,4.0,25.0
1,1510.061898,1516.814455,111.0,80.0,42.0,38.0,31.0,21.0,10.0,19.0,...,30.0,34.0,48.0,6.0,42.0,24.0,7.0,8.0,3.0,25.0
2,1522.583291,1508.343237,100.5,89.0,39.5,49.5,17.0,10.5,6.5,29.0,...,24.0,29.0,41.0,2.0,39.0,23.0,5.0,13.0,7.0,25.0
3,1495.165763,1506.216901,87.0,100.0,37.0,63.0,12.0,8.0,4.0,26.0,...,35.0,32.0,45.0,11.0,34.0,21.0,7.0,19.0,4.0,19.0
4,1488.223834,1494.670293,95.0,83.0,34.0,49.0,23.0,20.0,3.0,22.0,...,33.0,30.0,38.0,4.0,34.0,15.0,4.0,12.0,5.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7221,1392.287249,1607.991367,116.6,87.4,45.3,42.1,15.2,12.2,3.0,33.4,...,28.5,21.3,41.0,7.8,33.2,25.8,8.0,12.0,4.6,18.7
7222,1566.678186,1426.999326,101.8,89.6,36.1,53.5,24.1,18.1,6.0,42.8,...,29.5,28.9,44.8,13.0,31.8,24.2,7.3,15.7,5.6,20.5
7223,1490.013155,1607.632906,122.7,90.0,45.3,44.7,25.8,19.4,6.4,34.0,...,25.2,17.5,44.8,9.8,35.0,24.1,7.2,13.1,4.6,19.0
7224,1481.906971,1536.908597,107.1,90.0,39.2,50.8,22.5,18.7,3.8,28.8,...,26.1,19.7,42.7,8.7,34.0,25.9,6.7,12.1,4.1,16.9


In [32]:
# Perform train/test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

Having split the data appropriately, we now test various models. Specifically, a Naive Bayes classifier, a Logistic Regression model and a Random Forest classifier were tested. Using Grid Search to optimize parameters, we found the Random Forest classifier to be the most accurate although more models should be tested.

In [33]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_rf))

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

0.6526752767527675
Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [35]:
# Create the parameter grid based on the results of grid search 
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1600, num = 5)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(50, 110, num = 3)]
max_depth.append(None)
min_samples_split = [3, 5, 8]
min_samples_leaf = [3, 4, 5]
bootstrap = [True]
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(param_grid)

# Create a based model
rf = RandomForestClassifier(random_state=42)
# Instantiate the grid search model with 2-fold cross-validation
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 2, n_jobs = -1, verbose = 2)

{'bootstrap': [True],
 'max_depth': [50, 80, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [3, 5, 8],
 'n_estimators': [200, 550, 900, 1250, 1600]}


In [36]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred_best = best_grid.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_best))

Fitting 2 folds for each of 360 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 19.4min finished
0.6637453874538746


As our last step, we store the model in a pickle file for later use.

In [37]:
pickle.dump(best_grid, open('model.pkl', 'wb'))

In [43]:
loaded_model = pickle.load(open('model.pkl', 'rb'))
y_pred = loaded_model.predict(X_test)
probability_matrix = loaded_model.predict_proba(X_test)
print("Accuracy: " + str(metrics.accuracy_score(y_test, y_pred)))
print("Probability Matrix: " + str(probability_matrix))
print(probability_matrix)

z_arr = []
z_ser = None

for i in range(len(probability_matrix)):
    max_val = max(all_prob[i])
    max_arr.append(max_val)
    data = np.array(max_arr)
    z_ser = pd.Series(data) 

X_test["proba"] = z_ser
X

Accuracy: 0.6637453874538746
Probability Matrix: [[0.29789507 0.70210493]
 [0.29761101 0.70238899]
 [0.63097909 0.36902091]
 ...
 [0.56713867 0.43286133]
 [0.4129388  0.5870612 ]
 [0.61703624 0.38296376]]
[[0.29789507 0.70210493]
 [0.29761101 0.70238899]
 [0.63097909 0.36902091]
 ...
 [0.56713867 0.43286133]
 [0.4129388  0.5870612 ]
 [0.61703624 0.38296376]]


Unnamed: 0,h_elo_before,a_elo_before,h_last_10_avg_points,h_last_10_avg_field_goals_attempted,h_last_10_avg_field_goals_made,h_last_10_avg_field_goals_missed,h_last_10_avg_free_throws_attempted,h_last_10_avg_free_throws_made,h_last_10_avg_free_throws_missed,h_last_10_avg_3_pt_attempted,...,a_last_10_avg_2_pt_missed,a_last_10_avg_total_reb,a_last_10_avg_off_reb,a_last_10_avg_def_reb,a_last_10_avg_assists,a_last_10_avg_steals,a_last_10_avg_turnovers,a_last_10_avg_blocks,a_last_10_avg_fouls,proba
2512,1535.294580,1526.921635,108.90,87.40,40.8,46.60,20.00,14.7,5.30,35.1,...,31.00,43.6,10.9,32.7,23.30,7.70,12.3,4.6,17.90,0.702105
4701,1639.588293,1467.458905,114.20,89.70,39.7,50.00,26.80,21.8,5.00,35.1,...,28.60,47.7,10.5,37.2,24.70,7.60,14.5,3.9,19.70,0.702389
1829,1421.653656,1648.039826,106.10,91.00,41.9,49.10,17.30,13.0,4.30,28.3,...,19.60,44.4,10.4,34.0,26.60,8.10,14.0,4.8,19.60,0.630979
4965,1560.139477,1498.070154,107.70,93.80,38.5,55.30,27.50,18.9,8.60,34.4,...,29.60,46.5,12.1,34.4,25.10,8.40,13.8,5.4,19.10,0.671652
1218,1724.133123,1436.552195,96.60,79.80,35.8,44.00,24.60,19.4,5.20,17.5,...,37.70,42.0,14.1,27.9,19.30,8.80,12.3,3.1,20.60,0.781279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7039,1457.381597,1547.618478,114.50,91.40,41.9,49.50,21.00,16.4,4.60,37.4,...,23.60,45.9,8.8,37.1,23.40,8.40,14.1,5.6,21.00,0.777716
829,1525.201114,1730.143085,102.90,85.90,39.6,46.30,26.90,19.4,7.50,15.4,...,24.80,46.4,9.4,37.0,29.90,8.10,15.0,5.2,20.50,0.724011
1754,1427.446504,1617.505922,99.10,79.30,36.9,42.40,17.80,14.1,3.70,29.5,...,20.20,46.5,11.7,34.8,27.90,8.20,15.0,4.0,19.90,0.567139
667,1482.015206,1496.171322,100.30,81.00,37.3,43.70,21.80,16.5,5.30,25.8,...,31.20,44.1,10.1,34.0,21.60,6.70,12.0,3.4,20.40,0.587061


**Section 4: Debugging**

The code below was used for debugging purposes

In [39]:
all_prob = loaded_model.predict_proba(X)
max_arr = []
ser = None

for i in range(len(all_prob)):
    max_val = max(all_prob[i])
    max_arr.append(max_val)
    data = np.array(max_arr)
    ser = pd.Series(data) 

print(ser)


0       0.672172
1       0.685532
2       0.518288
3       0.506504
4       0.675415
          ...   
7221    0.750197
7222    0.700890
7223    0.560465
7224    0.727488
7225    0.518449
Length: 7226, dtype: float64


In [67]:
temp = {
    "h_elo_before": [1547.62],
    "a_elo_before": [1446.06],
    "h_points": [100.7],
    "h_field_goals_attempted": [88.1],
    "h_field_goals_made": [35.4],
    "h_field_goals_missed": [52.7],
    "h_free_throws_attempted": [24.4],
    "h_free_throws_made": [18.7],
    "h_free_throws_missed": [5.7],
    "h_3_pt_attempted": [41.6],
    "h_3_pt_made": [11.2],
    "h_3_pt_missed": [30.4],
    "h_2_pt_attempted": [46.5],
    "h_2_pt_made": [24.2],
    "h_2_pt_missed": [22.3],
    "h_total_reb": [38.9],
    "h_off_reb": [9.3],
    "h_def_reb": [29.6],
    "h_assists": [19.8],
    "h_steals": [7.8],
    "h_turnovers": [13.4],
    "h_blocks": [4.8],
    "h_fouls": [20.3],
    
    "a_points": [105.5],
    "a_field_goals_attempted": [86.9],
    "a_field_goals_made": [39.3],
    "a_field_goals_missed": [47.6],
    "a_free_throws_attempted": [23.0],
    "a_free_throws_made": [16.9],
    "a_free_throws_missed": [6.1],
    "a_3_pt_attempted": [29.9],
    "a_3_pt_made": [10.0],
    "a_3_pt_missed": [19.9],
    "a_2_pt_attempted": [57.0],
    "a_2_pt_made": [29.3],
    "a_2_pt_missed": [27.7],
    "a_total_reb": [44.7],
    "a_off_reb": [11.8],
    "a_def_reb": [32.9],
    "a_assists": [24.0],
    "a_steals": [7.3],
    "a_turnovers": [15.9],
    "a_blocks": [5.5],
    "a_fouls": [20.5],
}

temp_df = pd.DataFrame(data=temp)
temp_df.head()

Unnamed: 0,h_elo_before,a_elo_before,h_points,h_field_goals_attempted,h_field_goals_made,h_field_goals_missed,h_free_throws_attempted,h_free_throws_made,h_free_throws_missed,h_3_pt_attempted,...,a_2_pt_made,a_2_pt_missed,a_total_reb,a_off_reb,a_def_reb,a_assists,a_steals,a_turnovers,a_blocks,a_fouls
0,1547.62,1446.06,100.7,88.1,35.4,52.7,24.4,18.7,5.7,41.6,...,29.3,27.7,44.7,11.8,32.9,24.0,7.3,15.9,5.5,20.5


In [68]:
asd = loaded_model.predict_proba(temp_df)
# asd = loaded_model.predict_proba(i_am_temp.to_frame().T)
asd
# print(i_am_temp)

array([[0.34468183, 0.65531817]])