In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime, time, timedelta
from tqdm import tqdm

In [2]:
# Lookup table: full franchise name -> three-letter team abbreviation.
# data_extraction() uses it to translate the two team names scraped from a
# play-by-play page into abbreviations, which are then compared against 'BOS'
# and stored in the "Team" column.
# NOTE(review): the codes (e.g. BRK, CHO, PHO) look like the ones used in
# basketball-reference URLs rather than the official NBA codes -- confirm
# before joining against other data sources.
teams = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    "Charlotte Hornets": "CHO",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Orleans Pelicans": "NOP",
    "New York Knicks": "NYK",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Washington Wizards": "WAS"
}

### Get Play-by-Play Links for Each Box Score

In [21]:
# Fetch the team's season schedule page, which links to each game's box score
url_team = 'https://www.basketball-reference.com/teams/BOS/2018_games.html'
response_team = requests.get(url_team, timeout=30)
# Fail loudly on an HTTP error (e.g. rate limiting) instead of silently parsing
# an error page and ending up with an empty link list
response_team.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup_team = BeautifulSoup(response_team.content, 'html.parser')

# Rewrite every box-score link into the URL of the matching play-by-play page.
# The hrefs are expected to look like '/boxscores/<game>.html', so the third
# '/'-separated piece is the game file name.
pbp_prefix = 'https://www.basketball-reference.com/boxscores/pbp/'
boxscore_links = [
    pbp_prefix + link['href'].split('/', 2)[2]
    for link in soup_team.find_all('a', href=True)
    if 'boxscores' in link['href'] and 'html' in link['href']
]

### Extract Play-by-Play Data from All Box Scores

In [4]:
def data_extraction(x):
    """Scrape one play-by-play page into a DataFrame of events.

    Parameters
    ----------
    x : str
        URL of a play-by-play page (as built in ``boxscore_links``).

    Returns
    -------
    pandas.DataFrame
        One row per table row that contains ``<td>`` cells, with columns
        ``BOS_Score``, ``Home_Away``, ``OPP_Score``, ``Raw``, ``Team`` and
        ``Time``. Rows whose cell spans 5 or 6 columns (e.g. "Start of 1st
        quarter", jump balls) carry empty strings for ``Team``,
        ``Home_Away`` and both scores. For all other rows ``Home_Away`` is
        0 when BOS is the first-listed team and 1 when it is the second.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the page has no table, or neither team in the game is BOS.
    KeyError
        If a scraped team name is missing from ``teams``.
    """
    # Column order matches the layout of the previously saved output
    # (alphabetical), so the exported CSV keeps the same shape.
    columns = ["BOS_Score", "Home_Away", "OPP_Score", "Raw", "Team", "Time"]

    r = requests.get(x, timeout=30)
    # Surface HTTP errors here rather than as an IndexError further down
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    # The first table on the page is taken to be the play-by-play table
    tables = soup.find_all('table')
    if not tables:
        raise ValueError(f"No play-by-play table found at {x}")
    pbp_table = tables[0]

    # The 2nd and 3rd <strong> elements hold the two team names; the first of
    # them owns the left-hand event column and the first number of the score
    strong = soup.find_all('strong')
    team_A = teams[strong[1].get_text().replace('\n', '')]
    team_B = teams[strong[2].get_text().replace('\n', '')]

    # Which side BOS is on is fixed for the whole game, so decide it once
    if team_B == 'BOS':
        bos_listed_first, home_away = False, 1
    elif team_A == 'BOS':
        bos_listed_first, home_away = True, 0
    else:
        raise ValueError(
            f"Neither team is BOS ({team_A} vs {team_B}) at {x}"
        )

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for row in pbp_table.find_all("tr"):
        tds = row.find_all("td")
        if not tds:
            continue  # header rows contain only <th> cells

        clock = tds[0].get_text()

        # Full-width rows (period markers, jump balls) belong to neither team
        if row.find("td", {"colspan": "5"}) or row.find("td", {"colspan": "6"}):
            records.append({"Team": '',
                            "Time": clock,
                            "Raw": tds[1].get_text(),
                            "Home_Away": '',
                            "BOS_Score": '',
                            "OPP_Score": ''})
            continue

        # Score cell reads "<team_A points>-<team_B points>"
        first, second = tds[3].get_text().split('-')
        if bos_listed_first:
            bos_score, opp_score = first, second
        else:
            bos_score, opp_score = second, first

        # An empty left-hand cell means the event belongs to team_B, whose
        # description sits in the sixth cell
        left_event = tds[1].get_text()
        if left_event.strip() == "":
            event, team = tds[5].get_text(), team_B
        else:
            event, team = left_event, team_A

        records.append({"Team": team,
                        "Time": clock,
                        "Raw": event,
                        "Home_Away": home_away,
                        "BOS_Score": bos_score,
                        "OPP_Score": opp_score})

    return pd.DataFrame(records, columns=columns)

In [22]:
# Scrape every game and tag its rows with a 1-based game number (in the order
# the links appear in boxscore_links).
game_frames = []
for game, x in enumerate(tqdm(boxscore_links), start=1):
    data = data_extraction(x)
    data['Game'] = game
    game_frames.append(data)

# Concatenate once at the end instead of re-copying the growing frame on every
# iteration. Each game's own row index is kept (no ignore_index), as before.
# The name `empty` is retained because the following cells refer to it.
empty = pd.concat(game_frames) if game_frames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████| 101/101 [03:33<00:00,  2.12s/it]


In [23]:
# Preview only the first rows rather than rendering the whole combined frame
empty.head(10)

Unnamed: 0,BOS_Score,Home_Away,OPP_Score,Raw,Team,Time,Game
0,,,,Start of 1st quarter,,12:00.0,1
1,,,,Jump ball: K. Love vs. A. Horford (K. Irving g...,,12:00.0,1
2,2,0,0,K. Irving makes 2-pt jump shot from 10 ft (ass...,BOS,11:44.0,1
3,2,0,0,D. Rose misses 2-pt layup from 1 ft (block by ...,CLE,11:27.0,1
4,2,0,0,Defensive rebound by A. Horford,BOS,11:23.0,1
5,2,0,0,G. Hayward misses 3-pt jump shot from 25 ft,BOS,11:21.0,1
6,2,0,0,Defensive rebound by D. Rose,CLE,11:18.0,1
7,2,0,0,K. Love misses 2-pt jump shot from 15 ft,CLE,11:02.0,1
8,2,0,0,Defensive rebound by J. Brown,BOS,10:59.0,1
9,2,0,0,J. Tatum misses 2-pt layup from 2 ft (block by...,BOS,10:51.0,1


In [24]:
# Save the combined play-by-play frame. The row index is written as the first
# (unnamed) CSV column; it restarts for every game because the per-game frames
# were concatenated without re-indexing.
empty.to_csv('BOS-2018_Raw.csv', index=True)