<a href="https://colab.research.google.com/github/joshkatzenbach/nba_win_prediction/blob/main/Basketball_Reference_Data_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import files
import copy
from IPython.display import clear_output

In [None]:
class Rate_Limiter:
  """Sliding-window throttle allowing `max_requests` calls per `time_frame` seconds."""

  def __init__(self, max_requests, time_frame):
    self.time_frame = time_frame
    self.max_requests = max_requests
    # Timestamps (time.time()) of the requests still inside the window, oldest first
    self.request_times = []

  def log_request(self):
    """Register one request, sleeping first when the window is already full."""
    now = time.time()
    if len(self.request_times) >= self.max_requests:
      oldest_age = now - self.request_times[0]
      if oldest_age <= self.time_frame:
        # Window is full and its oldest entry has not expired yet: wait it out
        time.sleep(self.time_frame - oldest_age)
        now = time.time()
      # Forget every request that has fallen out of the window
      self.request_times = [stamp for stamp in self.request_times if now - stamp < self.time_frame]

    self.request_times.append(time.time())







In [None]:
# "data-stat" attribute values of the counting-stat cells read from each box-score table footer
basicTeamGameStats = [
  "fg", "fga", "fg3", "fg3a", "ft",
  "fta", "orb", "drb", "trb", "ast",
  "stl", "blk", "tov", "pf", "pts",
]

# "data-stat" attribute values of the advanced-stat cells
advTeamGameStats = [
  "ts_pct", "efg_pct", "fg3a_per_fga_pct", "fta_per_fga_pct",
  "orb_pct", "drb_pct", "trb_pct", "ast_pct",
  "stl_pct", "blk_pct", "tov_pct", "usg_pct",
  "off_rtg", "def_rtg",
]

# Month names, listed from october through september
months = [
  "october", "november", "december",
  "january", "february", "march",
  "april", "may", "june",
  "july", "august", "september",
]

# Shorter month list; this is the one Season.addGames iterates over
test_months = ["april", "may", "june"]

class gameData:
  """Box-score totals, pre-game records and pre-game averages for a single game.

  Every stat dictionary is keyed by the names in `basicTeamGameStats` and
  starts zero-filled, so reading a basic stat that was never stored yields 0.

  Attributes:
    gameID: identifier supplied by the caller.
    awayTeam, homeTeam, gameDate: strings, empty until assigned.
    homeWinner: bool, False until assigned.
    awayWins, awayLosses, homeWins, homeLosses: records stored by addRecord().
    awayBasicQ1 / awayBasicH1 / awayBasic (and the home* equivalents):
      first-quarter, first-half and full-game totals.
    awayAvg, homeAvg: per-game averages, see addAwayAvgStats()/addHomeAvgStats().
  """

  def __init__(self, gameID):
    self.gameID = gameID
    self.awayTeam = ""
    self.homeTeam = ""
    self.gameDate = ""
    self.homeWinner = False

    self.awayWins = 0
    self.awayLosses = 0
    self.homeWins = 0
    self.homeLosses = 0

    # Basic Stats: one independent zero-filled dict per team and period
    self.awayBasicH1 = dict.fromkeys(basicTeamGameStats, 0)
    self.homeBasicH1 = dict.fromkeys(basicTeamGameStats, 0)
    self.awayBasic = dict.fromkeys(basicTeamGameStats, 0)
    self.homeBasic = dict.fromkeys(basicTeamGameStats, 0)
    self.awayBasicQ1 = dict.fromkeys(basicTeamGameStats, 0)
    self.homeBasicQ1 = dict.fromkeys(basicTeamGameStats, 0)
    self.homeAvg = dict.fromkeys(basicTeamGameStats, 0)
    self.awayAvg = dict.fromkeys(basicTeamGameStats, 0)

  def addRecord(self, awayWins, awayLosses, homeWins, homeLosses):
    """Store both teams' win/loss records."""
    self.awayWins = awayWins
    self.awayLosses = awayLosses
    self.homeWins = homeWins
    self.homeLosses = homeLosses

  # Average Stats

  @staticmethod
  def _averaged(stats, num_games):
    """Return a copy of `stats` with every basic stat divided by `num_games`."""
    averaged = stats.copy()
    for stat in basicTeamGameStats:
      averaged[stat] = averaged[stat] / num_games
    return averaged

  def addHomeAvgStats(self, stats):
    """Set homeAvg from cumulative `stats`, using the stored home record as the game count.

    Does nothing when the stored record is 0-0, which avoids dividing by zero.
    """
    num_games = self.homeWins + self.homeLosses
    if num_games != 0:
      self.homeAvg = self._averaged(stats, num_games)

  def addAwayAvgStats(self, stats):
    """Set awayAvg from cumulative `stats`, using the stored away record as the game count.

    Does nothing when the stored record is 0-0, which avoids dividing by zero.
    """
    num_games = self.awayWins + self.awayLosses
    if num_games != 0:
      self.awayAvg = self._averaged(stats, num_games)

  def getAwayAvgStats(self, stat):
    return self.awayAvg[stat]

  def getHomeAvgStats(self, stat):
    return self.homeAvg[stat]

  def getAvgStats(self, stat):
    # Original name of the home-average getter; kept so existing callers keep working
    return self.getHomeAvgStats(stat)

  # Basic Stats H1

  def addAwayBasicH1(self, data, stat):
    self.awayBasicH1[stat] = data

  def addHomeBasicH1(self, data, stat):
    self.homeBasicH1[stat] = data

  def getAwayBasicH1(self, stat):
    return self.awayBasicH1[stat]

  def getHomeBasicH1(self, stat):
    return self.homeBasicH1[stat]

  # Basic Stats

  def addAwayBasic(self, data, stat):
    self.awayBasic[stat] = data

  def addHomeBasic(self, data, stat):
    self.homeBasic[stat] = data

  def getAwayBasic(self, stat):
    return self.awayBasic[stat]

  def getHomeBasic(self, stat):
    return self.homeBasic[stat]

  # Q1 Stats

  def addAwayBasicQ1(self, data, stat):
    self.awayBasicQ1[stat] = data

  def addHomeBasicQ1(self, data, stat):
    self.homeBasicQ1[stat] = data

  def getAwayBasicQ1(self, stat):
    return self.awayBasicQ1[stat]

  def getHomeBasicQ1(self, stat):
    return self.homeBasicQ1[stat]








In [None]:
def _storeTeamTotals(soup, table_ID, addAway, addHome):
  """Copy the footer totals of every table whose id matches `table_ID` into the game.

  `addAway` / `addHome` are called as add(value, stat_name). The first matching
  table is stored as the away team and any later one as the home team.
  Returns the number of matching tables.
  """
  tables = soup.find_all('table', id = re.compile(table_ID))
  for team, table in enumerate(tables):
    footer = table.find("tfoot")
    store = addAway if team == 0 else addHome
    for stat in basicTeamGameStats:
      cell = footer.find('td', {"data-stat" : stat})
      store(int(cell.text.strip()), stat)
  return len(tables)


def getGameStats(soup, game):
  """Fill `game` with the totals, winner, team codes and date of one box-score page.

  Args:
    soup: parsed box-score page (BeautifulSoup object).
    game: gameData instance to populate; its gameID must already be set.

  Raises:
    ValueError: if the page has fewer than two full-game totals tables or no
      "Description" meta tag, i.e. it is not a usable box score. Previously such
      a page produced an all-zero game or an opaque TypeError.
  """
  # Get Basic Game Stats for H1
  _storeTeamTotals(soup, "h1-basic", game.addAwayBasicH1, game.addHomeBasicH1)

  # Get Basic Stats for Game
  full_game_tables = _storeTeamTotals(soup, "game-basic", game.addAwayBasic, game.addHomeBasic)
  if full_game_tables < 2:
    raise ValueError("No full-game box score tables found for game " + str(game.gameID))

  # Get Basic Stats for Q1
  _storeTeamTotals(soup, "q1-basic", game.addAwayBasicQ1, game.addHomeBasicQ1)

  # Get Winner: home is the winner unless away scored strictly more points
  game.homeWinner = not (game.getAwayBasic("pts") > game.getHomeBasic("pts"))

  # Get Team Code: the description is split on spaces; token 0 is stored as the
  # away team and token 3 as the home team
  instance = soup.find('meta', {"name" : "Description"})
  if instance is None:
    raise ValueError("No Description meta tag found for game " + str(game.gameID))
  tokens = instance['content'].split(' ')
  game.homeTeam = tokens[3]
  game.awayTeam = tokens[0]

  # Get game Date: MM/DD/YYYY, assuming gameID starts with YYYYMMDD (e.g. "202411020CHO")
  game.gameDate = game.gameID[4:6] + '/' + game.gameID[6:8] + '/' + game.gameID[0:4]




In [None]:
class Season:
  """One NBA season of scraped games plus the running per-team records and stat totals.

  Attributes:
    games: gameData objects in the order they were scraped.
    year: season year used to build the schedule URLs.
    teams: team code -> cumulative basic-stat totals over the games seen so far.
    team_records: team code -> [wins, losses] over the games seen so far.
    games_analyzed: number of games appended to `games`.
    limiter: object whose log_request() is called before every HTTP request.
  """

  # <title> of the page served in place of content when the client is being throttled
  RATE_LIMIT_TITLE = "Rate Limited Request (429 error) | Sports-Reference.com"
  # Seconds to wait for one HTTP response before giving up instead of hanging forever
  REQUEST_TIMEOUT = 30

  def __init__(self, year, limiter):
    self.games = []
    self.year = year
    self.teams = {}
    self.team_records = {}
    self.games_analyzed = 0
    self.limiter = limiter

  def getMonthURL(self, month):
    """Return the schedule-page URL for `month` of this season."""
    return "https://www.basketball-reference.com/leagues/NBA_" + str(self.year) + "_games-" + month + ".html"

  def _fetchSoup(self, url):
    """Download `url` through the rate limiter and return it parsed.

    Raises:
      RuntimeError: if the response is a rate-limit (429) page. Carrying on would
        only produce empty or all-zero data while sending more requests.
    """
    self.limiter.log_request()
    response = requests.get(url, timeout = self.REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, "html.parser")
    rate_limit_message = soup.find('title', string = self.RATE_LIMIT_TITLE)
    if response.status_code == 429 or rate_limit_message is not None:
      raise RuntimeError("Rate limited while requesting " + url + "; wait and retry with a lower request rate")
    return soup

  def getGamesFromPage(self, url):
    """Return the schedule table's <tr> rows from `url`, or None if the page has no schedule table.

    Raises:
      RuntimeError: if the request was rate limited (previously this only printed
        "found!!" and the month was silently skipped).
    """
    soup = self._fetchSoup(url)
    schedule_table = soup.find('table', id = re.compile("schedule"))

    # check for valid month webpage
    if schedule_table is None:
      return None

    return schedule_table.find('tbody').find_all('tr')

  def _foldIntoTotals(self, team, game_totals, games_played):
    """Add `game_totals` to `team`'s running totals.

    Returns the team's per-game averages from BEFORE this game, or None when
    there is no earlier game to average over.
    """
    running = self.teams.get(team)
    if running is None:
      # First game seen for this team: its totals seed the running totals
      self.teams[team] = copy.deepcopy(game_totals)
      return None

    averages = None
    if games_played != 0:
      averages = {stat: total / games_played for stat, total in running.items()}
    for stat in basicTeamGameStats:
      running[stat] += game_totals[stat]
    return averages

  def _recordGame(self, game1):
    """Attach pre-game records and averages to `game1`, then fold its result into the season."""
    # Create the record if it doesn't exist already
    home_record = self.team_records.setdefault(game1.homeTeam, [0, 0])
    away_record = self.team_records.setdefault(game1.awayTeam, [0, 0])

    # Add the pre-game record to the game
    game1.addRecord(away_record[0], away_record[1], home_record[0], home_record[1])

    # Update the record that is stored in a season
    if game1.homeWinner:
      home_record[0] += 1
      away_record[1] += 1
    else:
      home_record[1] += 1
      away_record[0] += 1

    # Cumulative Home Stats
    home_avg = self._foldIntoTotals(game1.homeTeam, game1.homeBasic, game1.homeWins + game1.homeLosses)
    if home_avg is not None:
      game1.homeAvg = home_avg

    # Cumulative Away Stats
    away_avg = self._foldIntoTotals(game1.awayTeam, game1.awayBasic, game1.awayWins + game1.awayLosses)
    if away_avg is not None:
      game1.awayAvg = away_avg

    self.games.append(game1)
    self.games_analyzed += 1

  def addGames(self, month_names = None):
    """Scrape every game listed on the schedule pages of the given months.

    Args:
      month_names: month names to scrape, in chronological order. Defaults to
        `test_months`, which is what this method always used.
        NOTE(review): records and averages only cover the months scraped, so
        pass `months` for a full season - confirm that is the intent.

    Raises:
      RuntimeError: if any request is rate limited.
    """
    if month_names is None:
      month_names = test_months

    for month in month_names:
      games = self.getGamesFromPage(self.getMonthURL(month))

      # Check for valid month webpage
      if games is None:
        continue

      for game in games:
        td_tag = game.find('td', {'data-stat' : 'visitor_team_name'})
        if td_tag is None:
          continue
        # The part of the cell's csk attribute after the '.' is used as the box-score id
        html_adder = td_tag['csk'].split('.')[1]

        newUrl = "https://www.basketball-reference.com/boxscores/" + str(html_adder) + ".html"
        game_soup = self._fetchSoup(newUrl)
        game1 = gameData(html_adder)
        getGameStats(game_soup, game1)
        self._recordGame(game1)

        print("\r" + str(self.year) + " - Games Analyzed: " + str(self.games_analyzed) + "             ", end = "")
    print("\r" + str(self.year) + " Season Complete. Games: " + str(self.games_analyzed))

In [None]:
def getData(start_year, end_year, max_requests = 15, time_frame = 60):
  """Scrape every season from `start_year` through `end_year` (inclusive).

  Args:
    start_year: first season year to scrape.
    end_year: last season year to scrape.
    max_requests: most requests allowed per `time_frame` seconds, shared by all
      seasons. The previous hard-coded 60 per 60 seconds got every request
      rate limited; Sports Reference's bot policy is understood to cap clients
      at about 20 requests per minute - confirm the current limit before raising.
    time_frame: length of the rate-limit window in seconds.

  Returns:
    List of Season objects, one per year, in ascending year order.
  """
  rate_limiter = Rate_Limiter(max_requests, time_frame)
  seasons = []
  for season_year in range(start_year, end_year + 1):
    season = Season(season_year, rate_limiter)
    season.addGames()
    seasons.append(season)
  return seasons

In [None]:
# Scratch check: dict.copy() gives an independent dictionary, so bumping every
# value in dict_1 afterwards must leave dict_2 untouched.
dict_1 = dict.fromkeys(basicTeamGameStats, 0)
dict_2 = dict(dict_1)

for stat in basicTeamGameStats:
  dict_1[stat] = dict_1[stat] + 1

print(dict_1, dict_2, sep = "\n")

{'fg': 1, 'fga': 1, 'fg3': 1, 'fg3a': 1, 'ft': 1, 'fta': 1, 'orb': 1, 'drb': 1, 'trb': 1, 'ast': 1, 'stl': 1, 'blk': 1, 'tov': 1, 'pf': 1, 'pts': 1}
{'fg': 0, 'fga': 0, 'fg3': 0, 'fg3a': 0, 'ft': 0, 'fta': 0, 'orb': 0, 'drb': 0, 'trb': 0, 'ast': 0, 'stl': 0, 'blk': 0, 'tov': 0, 'pf': 0, 'pts': 0}


In [None]:
def addStats(stats_names, stat_list, master_list):
  """Append stat_list[name] to master_list for each name in stats_names, in order."""
  for name in stats_names:
    value = stat_list[name]
    master_list.append(value)

def loadDataFrame(seasons):
  """Flatten every game of every season into rows plus the matching column titles.

  Returns a tuple (rows, header). Each row holds the four general fields, then
  for the home team and afterwards the away team: wins, losses, and the
  average, Q1, H1 and full-game value of every stat in basicTeamGameStats.
  """
  # (column-title prefix, attribute prefix) per team, in output order
  sides = (("Home", "home"), ("Away", "away"))
  # (column-title infix, attribute suffix) per stat group, in output order
  groups = (("Avg ", "Avg"), ("Q1 ", "BasicQ1"), ("H1 ", "BasicH1"), ("", "Basic"))

  # General Headers, then one run of columns per side
  header = ["GameID", "Date", "Home Team", "Away Team"]
  for title, _ in sides:
    header.extend([title + " Wins", title + " Losses"])
    for infix, _ in groups:
      header.extend(title + " " + infix + stat for stat in basicTeamGameStats)

  allGames = []
  for season in seasons:
    for game in season.games:
      # General Game Data
      singleGame = [game.gameID, game.gameDate, game.homeTeam, game.awayTeam]

      for _, prefix in sides:
        singleGame.append(getattr(game, prefix + "Wins"))
        singleGame.append(getattr(game, prefix + "Losses"))
        for _, suffix in groups:
          values = getattr(game, prefix + suffix)
          singleGame.extend([values[stat] for stat in basicTeamGameStats])

      # Add to all Data
      allGames.append(singleGame)

  return allGames, header


def makeExcelSheet(data, sheet_name, titles):
  """Write `data` to "<sheet_name>.xlsx" with `titles` as the header row, then download the file via Colab."""
  workbook_name = sheet_name + ".xlsx"
  frame = pd.DataFrame(data)
  frame.to_excel(workbook_name, header = titles, index = False)
  files.download(workbook_name)

In [None]:
# game1 = gameData("202411020CHO")
# getGameStats(soup, game1)

# print(game1.homeTeam)
# print(game1.awayTeam)
# print(game1.gameDate)

# time.sleep(3600)
# Scrape the 2000 through 2024 seasons (inclusive), flatten the games into rows,
# and export them as Test_3.xlsx.
# NOTE: despite its name, `year` holds the whole list of Season objects returned by getData.
year = getData(2000, 2024)
data, headers = loadDataFrame(year)
makeExcelSheet(data, "Test_3", headers)


found!!
found!!
found!!
2000 Season Complete. Games: 0
found!!
found!!
found!!
2001 Season Complete. Games: 0
found!!
found!!
found!!
2002 Season Complete. Games: 0
found!!
found!!
found!!
2003 Season Complete. Games: 0
found!!
found!!
found!!
2004 Season Complete. Games: 0
found!!
found!!
found!!
2005 Season Complete. Games: 0


KeyboardInterrupt: 

In [None]:
# Re-export the already-built rows under a different file name (Test_1.xlsx);
# relies on `data` and `headers` still being defined from the cell that calls loadDataFrame.
makeExcelSheet(data, "Test_1", headers)

In [None]:
# Inspection: win count recorded for "GSW" in the first scraped season
# (team_records maps team code -> [wins, losses]).
year[0].team_records["GSW"][0]

In [None]:
# Spot-check: for every basic stat, print three individual game totals from the
# first season next to PHI's cumulative total divided by 3.
# NOTE(review): presumably games 0, 14 and 27 are PHI's first three games - verify.
for stat in range(len(basicTeamGameStats)):
  print(basicTeamGameStats[stat], year[0].games[0].awayBasic[basicTeamGameStats[stat]], sep = ": ")
  print(basicTeamGameStats[stat], year[0].games[14].homeBasic[basicTeamGameStats[stat]], sep = ": ")
  print(basicTeamGameStats[stat], year[0].games[27].homeBasic[basicTeamGameStats[stat]], sep = ": ")
  print(basicTeamGameStats[stat], year[0].teams["PHI"][basicTeamGameStats[stat]] / 3, sep = ": ")

In [None]:
  # NOTE(review): this cell is an indented fragment, not a complete definition.
  # It reads `soup` and `game`, which are not defined in this cell, and calls
  # addAwayAdvH1 / addHomeAdvH1 / addAwayAdv / addHomeAdv, none of which the
  # gameData class in this notebook defines. It looks like it was meant to sit
  # inside getGameStats - confirm before reusing it.

  # Get Adv Stats for H1: parse each advanced stat from the table footer as a float
  table_ID = "h1-advanced"
  tables = soup.find_all('table', id = re.compile(table_ID))
  for team in range(len(tables)):
    footer = tables[team].find("tfoot")
    for stat in range(len(advTeamGameStats)):
      data = float(footer.find('td', {"data-stat" : advTeamGameStats[stat]}).text.strip())
      # The first matching table is stored as the away team, any later one as home
      if (team == 0):
        game.addAwayAdvH1(data, advTeamGameStats[stat])
      else:
        game.addHomeAdvH1(data, advTeamGameStats[stat])


  # Get Adv Stats for the full game, same procedure as above
  table_ID = "game-advanced"
  tables = soup.find_all('table', id = re.compile(table_ID))
  for team in range(len(tables)):
    footer = tables[team].find("tfoot")
    for stat in range(len(advTeamGameStats)):
      data = float(footer.find('td', {"data-stat" : advTeamGameStats[stat]}).text.strip())
      if (team == 0):
        game.addAwayAdv(data, advTeamGameStats[stat])
      else:
        game.addHomeAdv(data, advTeamGameStats[stat])

In [None]:
# Demo of clear_output: prints 0 through 4, clearing the cell output before each
# print and pausing half a second between them.
for stat in range(5):
  clear_output(wait=True)
  print(stat)
  time.sleep(.5)