In [None]:
#importing relevant libraries
from bs4 import BeautifulSoup, Comment
import pandas as pd
import requests
import time
import os, os.path
from datetime import datetime

Web Scraping NBA Games

In [None]:
#asking for the first and last season to scrape; a season is identified by the year it ends in
X = int(input("Enter the ending year of the season you wish to being scraping from: "))
Y = 1 + int(input("Enter the ending year of the season you wish to scrape to: "))

#Y is one past the last requested season so that range() includes that season
seasons = list(range(X, Y))
print("\nThe basketball seasons that will webscraped are:", seasons, sep="\n")

Enter the ending year of the season you wish to being scraping from: 2015
Enter the ending year of the season you wish to scrape to: 2022

The basketball seasons that will webscraped are:
[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]


In [None]:
def scrape_season(season):
  """Save one link file per monthly schedule page of an NBA season.

  Downloads the season's schedule page from basketball-reference.com, collects
  every link whose href contains "games-" (the monthly schedule pages) and
  writes one small csv file per link into the current working directory. Each
  file holds nothing but the link itself: it is stored as the only column
  header of an otherwise empty table.

  Parameters:
    season: the year used in the schedule URL (``NBA_{season}_games.html``).

  Returns:
    None. Prints the page title on success, or a message when the schedule
    page could not be fetched.
  """
  URL = f'https://www.basketball-reference.com/leagues/NBA_{season}_games.html'
  page = requests.get(URL, timeout=30)

  if page.status_code != 200:
    #reporting the failure instead of silently producing no files for this season
    print(f'Schedule page for season {season} could not be reached (HTTP {page.status_code}): {URL}')
    time.sleep(5)  #pausing after failures too, so a refused request is not followed by an immediate retry
    return

  #importing the raw html in to beautifulsoup
  soup = BeautifulSoup(page.content, 'html.parser')
  title_tag = soup.find('title')
  if title_tag is not None:
    print(title_tag.get_text())

  #creating URLs with the months within the NBA season (each link kept only once)
  standings_links = []
  for a_tag in soup.find_all('a', href=True):
    path = str(a_tag['href'])
    if "games-" in path:
      link = f'https://basketball-reference.com{path}'
      if link not in standings_links:
        standings_links.append(link)

  time.sleep(5)

  #creating a csv file for each link to scrape box score data
  for url in standings_links:
    #e.g. ".../NBA_2017_games-december.html" -> "NBA_2017_games-december.csv"
    filename = url.replace('/', ' ').replace('.', ' ').split()[-2] + '.csv'
    #an empty table whose single column name is the link, so the file contains just the link
    pd.DataFrame(columns=[url]).to_csv(filename, index=False)

#saving the monthly schedule link files for every requested season
for season in seasons:
  scrape_season(season)

2014-15 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2017-18 NBA Schedule | Basketball-Reference.com
2018-19 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com


In [None]:
#local directory (Google Colab) in which the link files are stored
directory_path = '/content/'

#folder inside the local directory that will hold the boxscore pages
folder_name = 'boxscores_directory'
folder_path = os.path.join('/content', folder_name)

#the folder is only created (and reported) when it is not there yet
folder_is_present = os.path.exists(folder_path)
if not folder_is_present:
    os.makedirs(folder_path)
    print(f"Folder '{folder_name}' created successfully at {folder_path}")

In [None]:
#create an empty list to store the boxscore links
boxscore_links = []

for file_name in os.listdir(directory_path):
  file_path = os.path.join(directory_path, file_name)

  if not (os.path.isfile(file_path) and file_name.endswith(".csv")):
    continue

  #a link file holds the schedule link on its first line
  with open(file_path, 'r') as file:
    standings_link = file.readline().strip()

  #any other csv in the directory (for example a saved results table) is not a link file,
  #and passing its content to requests as a URL would raise an error
  if not standings_link.startswith(('http://', 'https://')):
    print(f'Skipping {file_name}: it does not contain a schedule link.')
    continue

  print(f'Fetching URL from file: {standings_link}')
  page = requests.get(standings_link, timeout=30)
  if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html.parser')

    #find all 'td' elements with class "center" containing '/boxscores/'
    for td in soup.find_all('td', class_="center"):
      a_tag = td.find('a')
      if a_tag and '/boxscores/' in a_tag.get('href', ''):
        boxscore_links.append('https://basketball-reference.com' + a_tag.get('href'))
  else:
    print(f'Box score link from {standings_link} could not be found/reached!')

  #pausing after every request, successful or not, to avoid sending requests back to back
  time.sleep(4)

Fetching URL from file: https://basketball-reference.com/leagues/NBA_2017_games-december.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2021_games-july.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2015_games-may.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2015_games-november.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2017_games-march.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2019_games-january.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2019_games-june.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2016_games-april.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2020_games-january.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2019_games-march.html
Fetching URL from file: https://basketball-reference.com/leagues/NBA_2022_games-nove

In [None]:
def scrape_boxscores(boxscore_links, output_folder):
  """Download every boxscore page that is not yet saved in output_folder.

  For each URL the part after '/boxscores/' is used as the file name. Files
  that already exist are skipped without any request. For new ones only the
  ``<div id="content">`` element of the page is written to disk. A progress
  line is printed after every URL.

  Parameters:
    boxscore_links: list of boxscore URLs.
    output_folder: existing folder in which the HTML files are stored.

  Returns:
    None.
  """
  #creating variables for progress counter
  y = len(boxscore_links)

  #listing the folder once; the set is kept up to date as new files are saved
  existing_files = set(os.listdir(output_folder))

  #creating a loop to save and scrape box scores from the list of box score urls
  for x, url in enumerate(boxscore_links, start=1):
    #creating an HTML file with the boxscore number as the name
    boxscore_number = url.split('/boxscores/')[-1]
    html_filename = f'{boxscore_number}'
    html_filepath = os.path.join(output_folder, html_filename)

    #checking if the file already exists in the output folder
    if html_filename in existing_files:
      print(f'Skipping {url} as {boxscore_number} already exists.')
    else:
      content_tag = None
      try:
        response = requests.get(url, timeout=30)
      except requests.RequestException as error:
        print(f'Request for {url} raised {error!r}')
      else:
        if response.status_code == 200:
          #retrieving only the html content under the "content" id
          soup = BeautifulSoup(response.content, 'html.parser')
          content_tag = soup.find('div', id='content')

      if content_tag is not None:
        #writing the content into the HTML file
        with open(html_filepath, 'w', encoding='utf-8') as html_file:
          html_file.write(str(content_tag))
        existing_files.add(html_filename)
        print(f'Saved content from {url} to {html_filepath}')
      else:
        #nothing is written, so the page is retried on the next run
        #instead of leaving a file that only contains the text "None"
        print(f'Failed to fetch data from {url}')

      time.sleep(4)

    #printing scraping/saving progress using the counter
    print(f'Scraping Progress: {x}/{y} ({round(((x/y)*100),2)}%)')

#path of the folder in the local directory where the scraped boxscores are stored as HTML files
output_folder = '/content/boxscores_directory'

scrape_boxscores(boxscore_links, output_folder)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping https://basketball-reference.com/boxscores/202102200MEM.html as 202102200MEM.html already exists.
Scraping Progress: 7698/10197 (75.49%)
Skipping https://basketball-reference.com/boxscores/202102200POR.html as 202102200POR.html already exists.
Scraping Progress: 7699/10197 (75.5%)
Skipping https://basketball-reference.com/boxscores/202102210NOP.html as 202102210NOP.html already exists.
Scraping Progress: 7700/10197 (75.51%)
Skipping https://basketball-reference.com/boxscores/202102210CLE.html as 202102210CLE.html already exists.
Scraping Progress: 7701/10197 (75.52%)
Skipping https://basketball-reference.com/boxscores/202102210NYK.html as 202102210NYK.html already exists.
Scraping Progress: 7702/10197 (75.53%)
Skipping https://basketball-reference.com/boxscores/202102210ORL.html as 202102210ORL.html already exists.
Scraping Progress: 7703/10197 (75.54%)
Skipping https://basketball-reference.com/boxscores/20210221

In [None]:
#counting the files currently stored in the boxscores directory (shown as the cell output)
output_folder = '/content/boxscores_directory'
len(os.listdir(output_folder))

10190

In [None]:
#compressing all the HTML files into a single zip file for download
import zipfile

def download_files(zip_filename, folder_path='/content/boxscores_directory', output_dir='/content'):
  """Compress every .html file of a folder into a single zip archive.

  Parameters:
    zip_filename: file name of the archive to create, e.g. 'boxscores.zip'.
    folder_path: folder that contains the HTML files; only names ending in
      '.html' are archived, and they are stored without any directory prefix.
    output_dir: folder in which the archive is written.

  Returns:
    None. A confirmation line is printed once the archive is written.
  """
  #designating the HTML files to compress (sorted so the archive order is reproducible)
  html_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.html'))
  zip_filepath = os.path.join(output_dir, zip_filename)

  with zipfile.ZipFile(zip_filepath, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for html_file in html_files:
      #second argument: name inside the archive (the bare file name)
      zipf.write(os.path.join(folder_path, html_file), html_file)

  print(f'Files have been successfully compressed into {zip_filename}')

#zipping the saved boxscore pages into an archive named boxscores.zip
download_files('boxscores.zip')

Files have been successfully compressed into boxscores.zip


In [None]:
#uploading and unzipping the zipfile with all the downloaded htmls to prevent rescraping data
import zipfile
import os

#defining file paths
zip_file_path = '/content/boxscores.zip'
output_folder = '/content/boxscores_directory'

#unzipping the zipfile and remembering which files the archive contains
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  archived_files = zip_ref.namelist()
  zip_ref.extractall(output_folder)

extracted_files = os.listdir(output_folder)
print("Files extracted to: '/content/boxscores_directory'")

#checking every archive member against the output folder; counting the folder
#against itself would always report success
missing_files = [name for name in archived_files
                 if not os.path.exists(os.path.join(output_folder, name))]
unzipped = len(archived_files) - len(missing_files)
if not missing_files:
  print(f'All files have been successfully unzipped. ({unzipped}/{len(archived_files)})')
else:
  print(f'Only {unzipped}/{len(archived_files)} files were unzipped. First missing files: {missing_files[:10]}')


Files extracted to: '/content/boxscores_directory'
All files have been successfully unzipped. (10190/10190)


In [None]:
def opening_html(boxscore):
  """Read the saved boxscore file at path `boxscore` (UTF-8) and return it parsed as a BeautifulSoup document."""
  with open(boxscore, mode='r', encoding='utf-8') as saved_page:
    markup = saved_page.read()
  return BeautifulSoup(markup, 'html.parser')

In [None]:
def reading_linescores(boxscore):
  """Read the line score (points per period and final total) of a saved boxscore.

  The line score table is stored inside an HTML comment of the page, so every
  comment is searched for the one containing ``id="line_score"`` and that
  comment is parsed as HTML on its own.

  Parameters:
    boxscore: path of the saved boxscore HTML file.

  Returns:
    DataFrame with a 'Team' column followed by one column per line score
    header; the header 'T' is renamed to 'Total'. All values are kept as the
    text found in the page.

  Raises:
    ValueError: when the number of scores found does not match the headers.
  """
  soup = opening_html(boxscore)

  #find all HTML comments in the document
  comments = soup.find_all(string=lambda text: isinstance(text, Comment))

  #initialize lists to store data
  teams = []
  scores = []
  column_headers = []

  #scraping and processing comments for desired stats
  for comment in comments:
      if 'id="line_score"' in comment:
          comment_soup = BeautifulSoup(comment, 'html.parser')
          rows = comment_soup.find_all('tr')
          for i, row in enumerate(rows):
              if i == 1:
                  #extract the column headers from table
                  column_headers = [th.text.strip() for th in row.find_all('th', {'data-stat': True})]
                  #renaming only the header that is exactly 'T'; replacing every 'T'
                  #would also turn overtime headers such as 'OT' into 'OTotal'
                  column_headers = ['Total' if header == 'T' else header for header in column_headers]
              else:
                  #extract the team name if available; rows without a team link are ignored
                  team_cell = row.find('th', {'data-stat': 'team'})
                  team_link = team_cell.find('a') if team_cell is not None else None
                  if team_link is not None:
                      teams.append(team_link.text)

                  #extract the non-empty scores from the table data cells (td)
                  scores.extend(text for text in (td.text.strip() for td in row.find_all('td')) if text)

  #split the scores list into two parts for each team
  half_len = len(scores) // 2
  team1_scores = scores[:half_len]
  team2_scores = scores[half_len:]

  #the first header belongs to the team column, the rest are score columns
  score_headers = column_headers[1:]
  if len(team1_scores) != len(score_headers) or len(team2_scores) != len(score_headers):
      raise ValueError(
          f'Line score of {boxscore} has {len(scores)} scores for {len(score_headers)} columns.')

  #create a dictionary with the data to turn into a dataframe
  data = {'Team': teams}
  for i, header in enumerate(score_headers):
      data[header] = [team1_scores[i], team2_scores[i]]

  return pd.DataFrame(data)

In [None]:
def reading_fourfactors(boxscore):
    """Read the Four Factors table of a saved boxscore.

    The table is looked up inside the HTML comments of the page (the comment
    containing ``id="four_factors"``). Every body row contributes the text of
    its ``td`` cells as one row of the result.

    Parameters:
        boxscore: path of the saved boxscore HTML file.

    Returns:
        DataFrame with the columns Pace, eFG%, TOV%, ORB%, FT/FGA and ORtg;
        the values are the cell texts as found in the page.
    """
    columns = ["Pace", "eFG%", "TOV%", "ORB%", "FT/FGA", "ORtg"]

    page = opening_html(boxscore)
    hidden_blocks = page.find_all(string=lambda node: isinstance(node, Comment))

    data = []
    for block in hidden_blocks:
        if 'id="four_factors"' not in block:
            continue
        # The comment text is parsed as a small HTML document of its own.
        table_soup = BeautifulSoup(block, 'html.parser')
        data.extend(
            [cell.get_text() for cell in team_row.select('td')]
            for team_row in table_soup.select('table.stats_table tbody tr')
        )

    return pd.DataFrame(data, columns=columns)

In [None]:
def reading_stats(boxscore, team_name, stype):
  """Read one team's basic or advanced player stats table from a saved boxscore.

  Parameters:
    boxscore: path of the saved boxscore HTML file.
    team_name: team identifier used in the table id ``box-{team_name}-game-{stype}``.
    stype: 'basic' or 'advanced'; selects both the table and the column names.

  Returns:
    DataFrame with one row per player row of the table body followed by a
    final "Team Totals" row built from the table footer. All values are the
    cell texts as found in the page.

  Raises:
    ValueError: for an unknown stype, or when the page has no such table.
  """
  if stype == 'basic':
    #defining column headers for BASIC player stats
    columns = [
        "Player", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "FT", "FTA",
        "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "+/-"
    ]
  elif stype == 'advanced':
    #defining column headers for ADVANCED player stats
    columns = [
        "Starters", "MP", "TS%", "eFG%", "3PAr", "FTr", "ORB%", "DRB%",
        "TRB%", "AST%", "STL%", "BLK%", "TOV%", "USG%", "ORtg", "DRtg", "BPM"
    ]
  else:
    raise ValueError(f"Unknown stype '{stype}'; expected 'basic' or 'advanced'.")

  soup = opening_html(boxscore)

  #finding the table containing player stats
  table = soup.find('table', {'id': f'box-{team_name}-game-{stype}'})
  if table is None:
    raise ValueError(f"No {stype} stats table for team '{team_name}' in {boxscore}.")

  #scraping and processing player data; rows without a row-header cell are skipped
  data = []
  for row in table.select('tbody tr'):
      player_cell = row.find('th', {'scope': 'row'})
      if player_cell:
          player_name = player_cell.get_text().split(' (Starter)')[0]  #remove "(Starter)"
          data.append([player_name] + [td.get_text() for td in row.select('td')])

  #scraping and processing team total data
  team_totals_row = [td.get_text() for td in table.select('tfoot td')]
  data.append(["Team Totals"] + team_totals_row)

  return pd.DataFrame(data, columns=columns)

In [None]:
def reading_general(data_type, boxscore):
  """Read general game information from the first-level heading of a saved boxscore.

  Parameters:
    data_type: 'season' or 'date'.
    boxscore: path of the saved boxscore HTML file.

  Returns:
    For 'season': the text after the last comma of the heading, with all
    spaces removed.
    For 'date': the text after "Score, " parsed as "%B %d, %Y" and returned as
    a "YYYY-MM-DD" string.

  Raises:
    ValueError: for an unknown data_type, or when the date text cannot be parsed.
  """
  soup = opening_html(boxscore)
  heading = soup.find('h1').get_text()

  if data_type == 'season':
    #NOTE(review): the 'date' branch parses the end of this same heading as "Month day, Year",
    #so this value looks like the calendar year of the game rather than the year the
    #season ends in -- confirm before relying on it as a season label
    return heading.split(",")[-1].replace(" ", "")

  if data_type == 'date':
    #everything after "Score, " is the written-out game date
    date_string = heading.split("Score, ")[-1]
    return datetime.strptime(date_string, "%B %d, %Y").strftime("%Y-%m-%d")

  raise ValueError(f"Unknown data_type '{data_type}'; expected 'season' or 'date'.")

In [None]:
#collecting the full path of every saved boxscore (.html) file
boxscores = [
    os.path.join('/content/boxscores_directory', html_name)
    for html_name in os.listdir('/content/boxscores_directory')
    if html_name.endswith('.html')
]
print(f'There are {len(boxscores)} filepaths in the boxscore list.')
print(boxscores[-1])

There are 10190 filepaths in the boxscore list.
/content/boxscores_directory/201411130GSW.html


In [None]:
def _to_numeric_where_possible(column):
  """Return `column` converted to numbers, or unchanged when a value is not numeric."""
  try:
    return pd.to_numeric(column)
  except (ValueError, TypeError):
    return column

base_columns = None
games = []
skipped_boxscores = []  #list to store skipped boxscores

#position of the first boxscore to process; `games` starts empty, so anything
#other than 0 leaves the earlier boxscores out of the final table
start_index = 0

#creating variables for progress counter
x = start_index
y = len(boxscores)

#creating a loop to scrape each boxscore file for the key stats
for boxscore in boxscores[start_index:]:
  #allows for loop to skip boxscores due to errors
  try:
    line_score = reading_linescores(boxscore)
    teams = list(line_score['Team'])

    #the four factors table covers both teams, so it is read once per game
    four_factors = reading_fourfactors(boxscore)

    stat_summaries = []
    #creating a loop for each team within a boxscore and turning numerical data in to numerical data types
    for team in teams:
      basic_stats = reading_stats(boxscore, team, 'basic').apply(_to_numeric_where_possible)
      advanced_stats = reading_stats(boxscore, team, 'advanced').apply(_to_numeric_where_possible)

      #the last row of each table is the "Team Totals" row
      totals = pd.concat([basic_stats.iloc[-1,:], advanced_stats.iloc[-1,:]], axis = 0)
      totals.index = totals.index.str.lower()

      #processing the highest stat in each category (players only, totals row excluded)
      max_stats = pd.concat([basic_stats.iloc[:-1].max(numeric_only = True), advanced_stats.iloc[:-1].max(numeric_only = True)])
      max_stats.index = max_stats.index.str.lower() + "_max"

      stat_summary = pd.concat([totals,max_stats], axis= 0)

      #removing duplicating data and appending it to a single list;
      #the column set of the first processed game is reused for every later game
      if base_columns is None:
        base_columns = list(stat_summary.index.drop_duplicates(keep= "first"))
        base_columns = [b for b in base_columns if "bpm" not in b]

      stat_summary = stat_summary[base_columns]
      stat_summaries.append(stat_summary)
    stat_summary = pd.concat(stat_summaries, axis= 1).T

    game = pd.concat([stat_summary, line_score[['Team', 'Total']]], axis=1)
    #the line score is read as text; as text '99' > '101' is True, so the totals
    #must be numbers before they are compared to decide the winner
    game['Total'] = pd.to_numeric(game['Total'])
    #assigning 0 or 1 depending on home or away team (0 = away)
    game['home'] = [0, 1]

    #appending home and away team stats in a single row
    game_opp = game.iloc[::-1].reset_index(drop=True)  #resetting the index
    game_opp.columns += "_opp"

    full_game = pd.concat([game, game_opp, four_factors], axis=1)
    full_game['season'] = reading_general('season', boxscore)
    full_game['date'] = reading_general('date', boxscore)
    full_game['won'] = full_game['Total'] > full_game['Total_opp']
    games.append(full_game)

    #printing scraping/saving progress using the counter
    x += 1
    print(f'Appending Progress: {x}/{y} ({round(((x/y)*100),2)}%)')
  except ValueError as e:
    print(f'Skipping boxscore due to error: {str(e)}')
    skipped_boxscores.append(boxscore)  #creating a list for all the skipped boxscores

print(f'Skipped boxscores: {skipped_boxscores}')

Appending Progress: 5755/10190 (56.48%)
Appending Progress: 5756/10190 (56.49%)
Appending Progress: 5757/10190 (56.5%)
Appending Progress: 5758/10190 (56.51%)
Appending Progress: 5759/10190 (56.52%)
Appending Progress: 5760/10190 (56.53%)
Appending Progress: 5761/10190 (56.54%)
Appending Progress: 5762/10190 (56.55%)
Appending Progress: 5763/10190 (56.56%)
Appending Progress: 5764/10190 (56.57%)
Appending Progress: 5765/10190 (56.58%)
Appending Progress: 5766/10190 (56.58%)
Appending Progress: 5767/10190 (56.59%)
Appending Progress: 5768/10190 (56.6%)
Appending Progress: 5769/10190 (56.61%)
Appending Progress: 5770/10190 (56.62%)
Appending Progress: 5771/10190 (56.63%)
Appending Progress: 5772/10190 (56.64%)
Appending Progress: 5773/10190 (56.65%)
Appending Progress: 5774/10190 (56.66%)
Appending Progress: 5775/10190 (56.67%)
Appending Progress: 5776/10190 (56.68%)
Appending Progress: 5777/10190 (56.69%)
Appending Progress: 5778/10190 (56.7%)
Appending Progress: 5779/10190 (56.71%)
App

In [None]:
#stacking the per-game frames into one table (with a fresh 0..n-1 index) and saving all the processed data into a csv file
nbagames_df = pd.concat(games, axis = 0, ignore_index=True)
nbagames_df.to_csv("nbagames_df.csv")

Predicting NBA Game Outcomes

In [None]:
#importing relevant libraries
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

#ridge classifier used by the feature selector below and by the backtests
rr = RidgeClassifier(alpha = 1)
#time-series cross-validation with 3 splits, passed to the feature selector as its cv strategy
split = TimeSeriesSplit(n_splits = 3)

#forward sequential feature selection scored with rr on the splits above
#NOTE(review): n_features_to_select= .11 is presumably interpreted by scikit-learn as a fraction of the available features -- confirm
sfs = SequentialFeatureSelector(rr, n_features_to_select= .11, direction = 'forward', cv = split)

In [None]:
#loading the processed games in chronological order
nbagames_df = pd.read_csv("nbagames_df.csv", index_col= 0)
nbagames_df = nbagames_df.sort_values('date').reset_index(drop= True)

#cleaning up the dataframe by removing columns unnecessary for machine learning
del_columns = ['player','player_opp','mp','mp.1','mp_opp','+/-','+/-_opp','starters','starters_opp']
nbagames_df = nbagames_df.drop(columns= del_columns)

#creating a new column for the "intended target result" for the machine learning prediction:
#the 'won' value of the same team's next row (rows are sorted by date)
next_game_won = nbagames_df.groupby('Team')['won'].shift(-1)

#encoding the target as integers: 1 = won, 0 = lost, 2 = the team has no later game in the data
nbagames_df['target'] = next_game_won.map({True: 1, False: 0}).fillna(2).astype(int)

In [None]:
#displaying the last five rows of one team (ATL) to inspect the target column
nbagames_df[nbagames_df['Team'] == 'ATL'].tail(5)

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,season,date,won,target
20210,29.0,75.0,0.387,10.0,36.0,0.278,23.0,27.0,0.852,8.0,...,97.1,0.453,17.2,18.2,0.307,93.7,2022,2022-04-17,True,1
20217,29.0,75.0,0.387,10.0,36.0,0.278,23.0,27.0,0.852,8.0,...,97.1,0.453,17.2,18.2,0.307,93.7,2022,2022-04-17,True,0
20229,41.0,87.0,0.471,12.0,40.0,0.3,11.0,14.0,0.786,7.0,...,101.1,0.54,16.9,18.9,0.126,103.9,2022,2022-04-19,False,1
20244,41.0,80.0,0.513,12.0,32.0,0.375,17.0,21.0,0.81,5.0,...,96.6,0.588,11.0,12.8,0.213,114.9,2022,2022-04-22,True,1
20256,30.0,75.0,0.4,15.0,42.0,0.357,11.0,20.0,0.55,11.0,...,86.9,0.5,15.2,25.0,0.147,99.0,2022,2022-04-24,True,2


In [None]:
#label and identifier columns that must not be used as model inputs
metadata = ['season', 'date', 'won', 'target', 'Team', 'Team_opp']

#every remaining column is treated as a feature
is_feature = ~nbagames_df.columns.isin(metadata)
non_metadata = nbagames_df.columns[is_feature]

#rescaling the feature columns with a MinMaxScaler fitted on the whole table
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(nbagames_df[non_metadata])
nbagames_df[non_metadata] = scaled_features

In [None]:
#displaying the table to check the rescaled feature columns
nbagames_df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,season,date,won,target
0,0.285714,0.188406,0.373206,0.413793,0.390625,0.491686,0.767442,0.777778,0.626604,0.368421,...,0.333333,0.417004,0.410448,0.683810,0.726115,0.553965,2014,2014-10-28,False,0
1,0.523810,0.594203,0.327751,0.137931,0.203125,0.279097,0.325581,0.476190,0.397900,0.684211,...,0.447090,0.253036,0.235075,0.750476,0.221338,0.414097,2014,2014-10-28,False,0
2,0.309524,0.347826,0.267943,0.137931,0.109375,0.432304,0.348837,0.317460,0.722287,0.421053,...,0.447090,0.210526,0.567164,0.586667,0.286624,0.216960,2014,2014-10-28,True,1
3,0.214286,0.275362,0.203349,0.103448,0.093750,0.356295,0.697674,0.603175,0.760793,0.289474,...,0.333333,0.145749,0.347015,0.476190,0.608280,0.335903,2014,2014-10-28,True,0
4,0.523810,0.594203,0.327751,0.137931,0.203125,0.279097,0.325581,0.476190,0.397900,0.684211,...,0.447090,0.253036,0.235075,0.750476,0.221338,0.414097,2014,2014-10-28,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20363,0.285714,0.217391,0.344498,0.379310,0.437500,0.408551,0.465116,0.476190,0.623104,0.210526,...,0.468254,0.376518,0.593284,0.354286,0.429936,0.323789,2022,2022-06-13,True,1
20364,0.452381,0.463768,0.344498,0.655172,0.656250,0.490499,0.162791,0.111111,1.000000,0.394737,...,0.365079,0.435223,0.470149,0.634286,0.122611,0.476872,2022,2022-06-16,False,0
20365,0.357143,0.289855,0.373206,0.379310,0.375000,0.466746,0.232558,0.174603,0.903151,0.289474,...,0.365079,0.390688,0.727612,0.523810,0.203822,0.321586,2022,2022-06-16,True,1
20366,0.452381,0.463768,0.344498,0.655172,0.656250,0.490499,0.162791,0.111111,1.000000,0.394737,...,0.365079,0.435223,0.470149,0.634286,0.122611,0.476872,2022,2022-06-16,False,2


In [None]:
#running the feature selector on the feature columns against the next-game target
sfs.fit(nbagames_df[non_metadata], nbagames_df['target'])

In [None]:
#keeping only the feature names that the fitted selector marks as selected, then showing them
selected_mask = sfs.get_support()
predictors = non_metadata[selected_mask].tolist()
predictors

['3pa',
 'drb',
 'ast%',
 'blk%',
 'usg%',
 'ft%_max',
 'drb%_max',
 'mp_opp.1',
 'blk_opp',
 'pts_opp',
 'usg%_opp',
 'ft%_max_opp',
 'orb_max_opp',
 'pf_max_opp',
 'Total_opp']

In [None]:
#backtesting season by season: the model is trained on all earlier seasons and predicts the next one
def backtest(data, model, predictors, start = 2, step = 1):
  """Evaluate `model` season by season, always training only on earlier seasons.

  The distinct values of data['season'] are sorted. Starting at position
  `start`, each season in turn becomes the test set, while every row with a
  smaller season value is the training set.

  Parameters:
    data: DataFrame with a 'season' column, a 'target' column and the predictor columns.
    model: object with fit(X, y) and predict(X) methods; it is refitted for every tested season.
    predictors: list of column names used as model inputs.
    start: position (in the sorted seasons) of the first season to predict.
    step: distance between the positions of consecutive tested seasons.

  Returns:
    DataFrame with the columns 'actual' and 'prediction', indexed like the
    tested rows of `data`. It is empty when there is no season at position
    `start` or later.
  """
  all_predictions = []

  seasons = sorted(data['season'].unique())

  for i in range(start, len(seasons), step):
    season = seasons[i]

    train = data[data['season'] < season]
    test = data[data['season'] == season]

    model.fit(train[predictors], train['target'])

    #re-attaching the test index so predictions line up with the actual targets
    preds = pd.Series(model.predict(test[predictors]), index= test.index)

    combined_df = pd.concat([test['target'], preds], axis = 1)
    combined_df.columns = ['actual', 'prediction']

    all_predictions.append(combined_df)

  #pd.concat fails on an empty list, so too few seasons yields an empty result instead
  if not all_predictions:
    return pd.DataFrame(columns=['actual', 'prediction'])
  return pd.concat(all_predictions)

#backtesting the ridge classifier with the selected predictors
predictions = backtest(nbagames_df, rr, predictors)

In [None]:
#scoring only the rows that have a next game (target 2 marks rows without one);
#the accuracy of the model's predictions is shown as the cell output
predictions = predictions[predictions['actual'] != 2]
accuracy_score(predictions['actual'], predictions['prediction'])

0.5136683777008476

In [None]:
#share of games won by the away (home = 0) and home (home = 1) teams
nbagames_df.groupby('home')['won'].mean()

home
0.0    0.492145
1.0    0.507855
dtype: float64

In [None]:
#improving the predictions by using rolling averages instead of only a single game;
#'Team' and 'season' are kept alongside the features and 'won' for the per-team, per-season grouping
rnbagames_df = nbagames_df[list(non_metadata) + ['won', 'Team', 'season']]
#(the bare expression below shows nothing, because it is not the last line of the cell)
rnbagames_df

def finding_teamavg(team, window = 10):
  """Return the rolling mean of a team's numeric columns.

  Parameters:
    team: DataFrame holding the games of one team in chronological order.
    window: number of consecutive rows averaged for each output row.

  Returns:
    DataFrame with the same index as `team` that contains only its numeric and
    boolean columns, as floats. The first ``window - 1`` rows are NaN because
    fewer than `window` games are available for them.
  """
  #text columns such as the team name cannot be averaged, so they are left out explicitly
  numeric_stats = team.select_dtypes(include=['number', 'bool']).astype(float)
  return numeric_stats.rolling(window).mean()

#computing the rolling averages separately for every team and season, so that a window
#never mixes games of different teams or seasons
rnbagames_df = rnbagames_df.groupby(['Team','season'], group_keys = False).apply(finding_teamavg)

  rolling = team.rolling(10).mean()


In [None]:
#marking every rolling-average column with the suffix '_10' so it can sit next to the single-game column
rolling_columns = [f'{stat_name}_10' for stat_name in rnbagames_df.columns]
rnbagames_df.columns = rolling_columns

#placing the rolling averages beside the single-game stats, then dropping every row with a
#missing value (this includes the early games of each team and season, which have no
#complete rolling window yet)
nbagames_df = pd.concat([nbagames_df, rnbagames_df], axis = 1).dropna()

In [None]:
#displaying the table with the added rolling-average columns
nbagames_df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,Total_opp_10,home_opp_10,Pace_10,eFG%_10,TOV%_10,ORB%_10,FT/FGA_10,ORtg_10,won_10,season_10
160,0.571429,0.449275,0.488038,0.413793,0.406250,0.475059,0.372093,0.380952,0.626604,0.447368,...,0.369231,0.4,0.487302,0.363158,0.557836,0.404571,0.306210,0.296916,0.5,2014.0
174,0.333333,0.246377,0.382775,0.275862,0.312500,0.395487,0.348837,0.380952,0.579930,0.210526,...,0.293269,0.3,0.317196,0.365992,0.546269,0.573714,0.276752,0.363326,0.2,2014.0
176,0.333333,0.246377,0.382775,0.275862,0.312500,0.395487,0.348837,0.380952,0.579930,0.210526,...,0.299038,0.3,0.315608,0.358300,0.552239,0.543429,0.272452,0.348678,0.2,2014.0
179,0.476190,0.304348,0.507177,0.068966,0.109375,0.216152,0.372093,0.349206,0.695449,0.342105,...,0.295192,0.8,0.383598,0.433806,0.449254,0.409524,0.339809,0.414097,0.7,2014.0
181,0.500000,0.405797,0.444976,0.172414,0.218750,0.330166,0.162791,0.158730,0.681447,0.421053,...,0.281731,0.6,0.359259,0.364372,0.505970,0.495429,0.256051,0.353084,0.6,2014.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20363,0.285714,0.217391,0.344498,0.379310,0.437500,0.408551,0.465116,0.476190,0.623104,0.210526,...,0.327885,0.7,0.403439,0.442915,0.479851,0.387048,0.357325,0.430176,0.9,2022.0
20364,0.452381,0.463768,0.344498,0.655172,0.656250,0.490499,0.162791,0.111111,1.000000,0.394737,...,0.345192,0.6,0.398413,0.465182,0.355224,0.464762,0.251752,0.507159,0.2,2022.0
20365,0.357143,0.289855,0.373206,0.379310,0.375000,0.466746,0.232558,0.174603,0.903151,0.289474,...,0.328846,0.6,0.398413,0.415182,0.523881,0.399238,0.339490,0.388987,0.9,2022.0
20366,0.452381,0.463768,0.344498,0.655172,0.656250,0.490499,0.162791,0.111111,1.000000,0.394737,...,0.335577,0.6,0.414550,0.460931,0.369403,0.461524,0.222293,0.490749,0.1,2022.0


In [None]:
def shifting_columns(df, team, column_name):
    """Add a '<column_name>_next' column holding each team's value from its next row.

    Parameters:
        df: DataFrame whose rows are ordered so that a team's next game is its next row.
        team: name of the column that identifies the team (used for grouping).
        column_name: name of the column to look ahead in.

    Returns:
        A new DataFrame with the extra column; the last row of every team gets
        a missing value. The frame passed in is left unchanged, which avoids
        pandas' SettingWithCopyWarning when `df` was derived from another frame.
    """
    next_column = df.groupby(team)[column_name].shift(-1)
    return df.assign(**{f'{column_name}_next': next_column})

#adding, for every row, the venue flag, opponent and date of the same team's next game
for next_game_column in ('home', 'Team_opp', 'date'):
    nbagames_df = shifting_columns(nbagames_df, 'Team', next_game_column)
nbagames_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{column_name}_next'] = next_column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{column_name}_next'] = next_column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{column_name}_next'] = next_column


Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,eFG%_10,TOV%_10,ORB%_10,FT/FGA_10,ORtg_10,won_10,season_10,home_next,Team_opp_next,date_next
160,0.571429,0.449275,0.488038,0.413793,0.40625,0.475059,0.372093,0.380952,0.626604,0.447368,...,0.363158,0.557836,0.404571,0.30621,0.296916,0.5,2014.0,0.0,TOR,2014-11-09
174,0.333333,0.246377,0.382775,0.275862,0.3125,0.395487,0.348837,0.380952,0.57993,0.210526,...,0.365992,0.546269,0.573714,0.276752,0.363326,0.2,2014.0,1.0,WAS,2014-11-08
176,0.333333,0.246377,0.382775,0.275862,0.3125,0.395487,0.348837,0.380952,0.57993,0.210526,...,0.3583,0.552239,0.543429,0.272452,0.348678,0.2,2014.0,1.0,UTA,2014-11-10
179,0.47619,0.304348,0.507177,0.068966,0.109375,0.216152,0.372093,0.349206,0.695449,0.342105,...,0.433806,0.449254,0.409524,0.339809,0.414097,0.7,2014.0,1.0,DET,2014-11-12
181,0.5,0.405797,0.444976,0.172414,0.21875,0.330166,0.162791,0.15873,0.681447,0.421053,...,0.364372,0.50597,0.495429,0.256051,0.353084,0.6,2014.0,1.0,MEM,2014-11-08


In [None]:
#creating a new dataframe to include opposing team data: each row is matched with the row of
#the team it plays next (same next-game date, and that team's next opponent is this team),
#which brings in the opponent's rolling averages
opponent_columns = rolling_columns + ['Team_opp_next', 'date_next', 'Team']
full_df = nbagames_df.merge(
    nbagames_df[opponent_columns],
    left_on=['Team', 'date_next'],
    right_on=['Team_opp_next', 'date_next'],
)

In [None]:
#checking the merge: the team/next-opponent columns from the left side (_x) and the right side (_y), with the shared next-game date
full_df[['Team_x', 'Team_opp_next_x', 'Team_y', 'Team_opp_next_y', 'date_next']].head(5)

Unnamed: 0,Team_x,Team_opp_next_x,Team_y,Team_opp_next_y,date_next
0,MIL,ORL,ORL,MIL,2014-11-14
1,MIA,BRK,BRK,MIA,2014-11-17
2,WAS,ORL,ORL,WAS,2014-11-15
3,ORL,MIL,MIL,ORL,2014-11-14
4,TOR,MEM,MEM,TOR,2014-11-19


In [None]:
#text (object dtype) columns cannot be used by the model, so they join the excluded metadata columns
text_columns = full_df.columns[full_df.dtypes == 'object'].tolist()
metadata = text_columns + metadata

#everything that is not metadata is a candidate feature
non_metadata = full_df.columns[~full_df.columns.isin(metadata)]

In [None]:
#re-running the feature selector, this time on the merged table that includes the rolling averages
sfs.fit(full_df[non_metadata], full_df['target'])

In [None]:
#backtesting again with the predictors selected from the merged rolling-average table
#(original note: "30 rolling 10 (0.5727)" -- presumably the feature count, rolling window and
#accuracy of an earlier run; TODO confirm)
predictors = list(non_metadata[sfs.get_support()])
predictions = backtest(full_df, rr, predictors)

accuracy_score(predictions['actual'], predictions['prediction'])

0.5731993917330999