## NBA Games Scrape

The purpose of this notebook is to scrape basketball-reference.com for the box score statistics of every game in the 2014-2018 NBA seasons. To accomplish this, the notebook first queries each team's games page for a season and builds the box score ID for each of that team's home games (a box score ID is the game date followed by the home team's handle, so each game is picked up from its home team's list). Then the notebook queries each of those box score pages and returns that game's box score statistics. I grab both the home team's stats and their opponent's stats, because I want the full representation of the game captured by the box score. Lastly, I write all of these box score statistics to JSON files, one file per team per year.

In [1]:
import os
import bs4
import csv
import sys
import json
import time
import requests
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
# Maps each full team name (as it is looked up from a box score page) to the
# handle used in basketball-reference.com URLs.
# Charlotte appears twice on purpose: the franchise uses a different handle
# under its Bobcats name (CHA) than under its Hornets name (CHO).
# Insertion order is the order in which teams are scraped.
team_handles_dict = {
    'Toronto Raptors': 'TOR',
    'Boston Celtics': 'BOS',
    'Philadelphia 76ers': 'PHI',
    'Cleveland Cavaliers': 'CLE',
    'Indiana Pacers': 'IND',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Washington Wizards': 'WAS',
    'Detroit Pistons': 'DET',
    'Charlotte Hornets': 'CHO',
    'Charlotte Bobcats': 'CHA',
    'New York Knicks': 'NYK',
    'Brooklyn Nets': 'BRK',
    'Chicago Bulls': 'CHI',
    'Orlando Magic': 'ORL',
    'Atlanta Hawks': 'ATL',
    'Houston Rockets': 'HOU',
    'Golden State Warriors': 'GSW',
    'Portland Trail Blazers': 'POR',
    'Oklahoma City Thunder': 'OKC',
    'Utah Jazz': 'UTA',
    'New Orleans Pelicans': 'NOP',
    'San Antonio Spurs': 'SAS',
    'Minnesota Timberwolves': 'MIN',
    'Denver Nuggets': 'DEN',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Sacramento Kings': 'SAC',
    'Dallas Mavericks': 'DAL',
    'Memphis Grizzlies': 'MEM',
    'Phoenix Suns': 'PHO',
}

In [3]:
# Translates a team's position in the list passed to get_team_stats into a
# venue label (presumably the box score page lists the visiting team first --
# TODO confirm against a live page).
home_away_dict = {
    0: 'away',
    1: 'home',
}


#### get_game_id_for_team: 
- Returns a list of game IDs for a given team and season by scraping the team's games page on basketball-reference.com. Only the team's home games are returned; each ID is the game date (`YYYYMMDD`) followed by `0` and the team handle.

In [4]:
def get_game_id_for_team(team_handle, year):
    """Return the box score IDs of a team's home games for one season.

    Fetches the team's games page on basketball-reference.com, reads the
    ``games`` table, and builds one ID per home game in the form
    ``<YYYYMMDD>0<team_handle>``. Away games are left out.

    Parameters
    ----------
    team_handle : str
        Team handle used in the site's URLs (e.g. ``'TOR'``).
    year : str or int
        Season identifier used in the games page URL.

    Returns
    -------
    list of str
        Home-game IDs in the order the games appear in the table.

    Raises
    ------
    requests.HTTPError
        If the games page responds with an HTTP error status.
    ValueError
        If the page does not contain the expected ``games`` table.
    """
    response = requests.get(
        f'https://www.basketball-reference.com/teams/{team_handle}/{year}_games.html',
        timeout=30,
    )
    # Fail loudly on error / rate-limit responses instead of trying to parse them.
    response.raise_for_status()
    season_page = BeautifulSoup(response.text, 'html.parser')

    games_table = season_page.find('table', {'id': 'games'})
    if games_table is None or games_table.tbody is None:
        raise ValueError(f'no games table found for {team_handle} in {year}')

    games = {}
    # Pair date and location within each row so a row lacking either cell can
    # never borrow a value from a neighbouring row.
    for row in games_table.tbody.find_all('tr'):
        date_cell = row.find('td', {'data-stat': 'date_game'})
        location_cell = row.find('td', {'data-stat': 'game_location'})
        if date_cell is None or location_cell is None:
            continue
        game_date = date_cell['csk'].replace('-', '')
        # An empty location cell marks a home game; any text marks an away game.
        games[game_date] = 'away' if location_cell.text else 'home'

    return [game_date + '0' + team_handle
            for game_date, venue in games.items() if venue == 'home']

#### get_team_stats: 
- Returns the counting stats for an individual game, one row for each team playing in the game, by pulling the values from the footer of each team's basic box score table on the box score page

In [5]:
def get_team_stats(teams, page, gid):
    """Collect the footer-row stats of each team's basic box score table.

    Parameters
    ----------
    teams : list of str
        Team handles, in the order they should be labelled via
        ``home_away_dict`` (index 0 first).
    page : BeautifulSoup
        Parsed box score page.
    gid : str
        Game ID; stored as the first element of every returned row.

    Returns
    -------
    list of list
        One row per team: ``[gid, HANDLE, venue, stat, stat, ...]`` where the
        stats are the text of every footer cell except ``plus_minus``.
    """
    # First pass: locate every team's footer cells before building any rows.
    located = [
        (
            page.find('div', attrs={'id': f'all_box-{slug}-game-basic'})
                .find('tfoot')
                .find_all('td'),
            slug.upper(),
        )
        for slug in teams
    ]

    # Second pass: turn each team's footer cells into a flat stats row.
    game_stats = []
    for position, (footer_cells, handle) in enumerate(located):
        stats_row = [gid, handle, home_away_dict[position]]
        stats_row.extend(
            cell.text
            for cell in footer_cells
            if cell.attrs['data-stat'] != 'plus_minus'
        )
        game_stats.append(stats_row)
    return game_stats

#### get_box_score_stats: 
- Queries the page containing the box score for a single game, and returns the box score stats for both teams for a single game

In [6]:
def get_box_score_stats(gid):
    """Fetch one game's box score page and return both teams' stats.

    Parameters
    ----------
    gid : str
        Game ID used in the box score URL.

    Returns
    -------
    list of list
        The rows produced by ``get_team_stats`` for the teams named in the
        page's scorebox.

    Raises
    ------
    requests.HTTPError
        If the box score page responds with an HTTP error status.
    ValueError
        If the page has no scorebox element.
    KeyError
        If a team name in the scorebox is not a key of ``team_handles_dict``.
    """
    response = requests.get(
        f'https://www.basketball-reference.com/boxscores/{gid}.html',
        timeout=30,
    )
    # Fail loudly on error / rate-limit responses instead of trying to parse them.
    response.raise_for_status()
    box_score_page = BeautifulSoup(response.text, 'html.parser')

    # attrs must be a dict of attribute filters ({'class': 'scorebox'}), not
    # the set literal {'class', 'scorebox'}.
    scorebox = box_score_page.find('div', attrs={'class': 'scorebox'})
    if scorebox is None:
        raise ValueError(f'no scorebox found on box score page for game {gid}')

    # Team names sit in <strong> tags; strip newlines before the handle lookup.
    bs_page_teams = [
        team_handles_dict[item.text.replace('\n', '')]
        for item in scorebox.find_all('strong')
    ]
    return get_team_stats(bs_page_teams, box_score_page, gid)

I needed to separate 2014 into its own scrape because after the 2014 season, the Charlotte Bobcats changed their team name to the Charlotte Hornets. On basketball-reference.com, these two monikers have different team slugs to access their corresponding pages, hence the need to split the scrape into two parts.

In [7]:
# Seasons to scrape, kept as strings because they are compared against '2014'
# and interpolated into URLs and file paths.
years_to_gather = [str(season) for season in range(2014, 2019)]

In [None]:
# %%capture cap --no-stderr
for year in years_to_gather:
    os.makedirs(f"../data/{year}", exist_ok=True)
    print(f"getting {year} data")

    # Charlotte has two handles in team_handles_dict: 'CHA' is scraped for 2014
    # and 'CHO' for every other year, so skip whichever one does not apply.
    inactive_handle = 'CHO' if year == '2014' else 'CHA'

    for team in team_handles_dict.values():
        if team == inactive_handle:
            # Skip before logging so the output never claims a scrape started
            # for a handle that is not scraped this year.
            continue

        start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"started scraping games for {team} at {start}")

        game_stats = []
        for gid in get_game_id_for_team(team, year):
            game_stats.append(get_box_score_stats(gid))
            # Pause between box score requests (presumably to avoid hammering
            # the site -- confirm against its current crawl policy).
            time.sleep(2)

        finished = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"finished scraping games for {team} at {finished}")

        # One JSON file per team per year, holding a list of per-game stats.
        team_games_list = f'../data/{year}/{team}_games_list_{year}.json'
        with open(team_games_list, 'w') as f:
            json.dump(game_stats, f)
        print(f"saved {year} {team} game data in {team_games_list}")
        print("\n")

# with open('../output/season_scraping_logs.txt', 'w') as f:
#     f.write(cap.stdout)

getting 2014 data
started scraping games for TOR at 2019-12-05 22:28:54
finished scraping games for TOR at 2019-12-05 22:30:56
saved 2014 TOR game data in ../data/2014/TOR_games_list_2014.json


started scraping games for BOS at 2019-12-05 22:30:56


In [None]:
# Record the interpreter and library versions this notebook was last run with.
version_report = [
    "System and module version information:",
    "\n",
    f'Python version: {sys.version_info}',
    f'Beautiful Soup version: {bs4.__version__}',
    f'requests version: {requests.__version__}',
    f'last updated: {datetime.now().strftime("%Y-%m-%d %H:%M")}',
]
for report_line in version_report:
    print(report_line)