In [2]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime, time, timedelta
from tqdm import tqdm

In [3]:
# Lookup table: full franchise name -> three-letter code.
# data_extraction() indexes this with the team names scraped from each page,
# so the keys must match that text exactly.
teams = dict([
    ("Atlanta Hawks", "ATL"),
    ("Boston Celtics", "BOS"),
    ("Brooklyn Nets", "BRK"),
    ("Charlotte Hornets", "CHO"),
    ("Chicago Bulls", "CHI"),
    ("Cleveland Cavaliers", "CLE"),
    ("Dallas Mavericks", "DAL"),
    ("Denver Nuggets", "DEN"),
    ("Detroit Pistons", "DET"),
    ("Golden State Warriors", "GSW"),
    ("Houston Rockets", "HOU"),
    ("Indiana Pacers", "IND"),
    ("Los Angeles Clippers", "LAC"),
    ("Los Angeles Lakers", "LAL"),
    ("Memphis Grizzlies", "MEM"),
    ("Miami Heat", "MIA"),
    ("Milwaukee Bucks", "MIL"),
    ("Minnesota Timberwolves", "MIN"),
    ("New Orleans Pelicans", "NOP"),
    ("New York Knicks", "NYK"),
    ("Oklahoma City Thunder", "OKC"),
    ("Orlando Magic", "ORL"),
    ("Philadelphia 76ers", "PHI"),
    ("Phoenix Suns", "PHO"),
    ("Portland Trail Blazers", "POR"),
    ("Sacramento Kings", "SAC"),
    ("San Antonio Spurs", "SAS"),
    ("Toronto Raptors", "TOR"),
    ("Utah Jazz", "UTA"),
    ("Washington Wizards", "WAS"),
])

### Build the play-by-play URL for every box score

In [33]:
# Download the team's season schedule page. A timeout keeps a stalled
# connection from hanging the notebook, and raise_for_status() stops the cell
# on an HTTP error instead of silently parsing an error page (which would just
# produce an empty or misleading link list).
url_team = 'https://www.basketball-reference.com/teams/BOS/2022_games.html'
response_team = requests.get(url_team, timeout=30)
response_team.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup_team = BeautifulSoup(response_team.content, 'html.parser')

# Keep every link whose href mentions both 'boxscores' and 'html', and point
# it at the play-by-play ("pbp") version of that page.
# NOTE(review): the split below assumes hrefs shaped like
# '/boxscores/<file>.html' -- confirm if the site layout changes.
boxscore_links = []
for link in soup_team.find_all('a', href=True):
    href = link['href']
    if 'boxscores' in href and 'html' in href:
        # Everything after the second '/' is the page's file name.
        page_name = href.split('/', 2)[2]
        boxscore_links.append('https://www.basketball-reference.com/boxscores/pbp/' + page_name)

# # Print the list of box score links
# boxscore_links

### Parse the play-by-play table of each box score

In [14]:
# One entry per period: (text that marks the period's last row in "Raw",
# period label, game minutes elapsed when that period ends).
_PERIODS = (
    ("End of 1st quarter", "Q1", 12),
    ("End of 2nd quarter", "Q2", 24),
    ("End of 3rd quarter", "Q3", 36),
    ("End of 4th quarter", "Q4", 48),
    ("End of 1st overtime", "OT1", 53),
    ("End of 2nd overtime", "OT2", 58),
    ("End of 3rd overtime", "OT3", 63),
    ("End of 4th overtime", "OT4", 68),
)

# Column order of the scraped fields; matches the output recorded in this notebook.
_PBP_COLUMNS = ["BOS_Score", "Home_Away", "OPP_Score", "Raw", "Team", "Time"]


def _parse_pbp_rows(pbp_table, first_team, second_team):
    """Convert the <tr> rows of a play-by-play table into a list of record dicts.

    ``first_team`` owns the events found in cell 1 of a row and the left-hand
    side of the score; ``second_team`` owns cell 5 and the right-hand side.
    NOTE(review): presumably first = visitor and second = home -- confirm.

    Raises:
        ValueError: if neither team is 'BOS' (the score columns are BOS-centric).
    """
    if first_team == 'BOS':
        bos_listed_first, home_away = True, 0
    elif second_team == 'BOS':
        bos_listed_first, home_away = False, 1
    else:
        raise ValueError(
            f"expected 'BOS' to be one of the teams, got {first_team!r} and {second_team!r}"
        )

    records = []
    for row in pbp_table.find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            continue  # no <td> cells -> nothing to record for this row

        clock = cells[0].get_text()

        # A cell spanning 5 or 6 columns is a full-width note (jump ball,
        # start/end of a period, ...): it has no team and no score.
        if row.find("td", {"colspan": "5"}) or row.find("td", {"colspan": "6"}):
            records.append({"Team": '',
                            "Time": clock,
                            "Raw": cells[1].get_text(),
                            "Home_Away": np.nan,
                            "BOS_Score": np.nan,
                            "OPP_Score": np.nan})
            continue

        left, right = cells[3].get_text().split('-')
        bos_score, opp_score = (left, right) if bos_listed_first else (right, left)

        # The play text sits in cell 1 for the first team, otherwise in cell 5.
        if cells[1].get_text().strip() == "":
            event, team = cells[5].get_text(), second_team
        else:
            event, team = cells[1].get_text(), first_team

        records.append({"Team": team,
                        "Time": clock,
                        "Raw": event,
                        "Home_Away": home_away,
                        "BOS_Score": bos_score,
                        "OPP_Score": opp_score})
    return records


def data_extraction(x):
    """Scrape one play-by-play page into a tidy per-event DataFrame.

    Args:
        x: URL of the play-by-play page to download.

    Returns:
        pandas.DataFrame with one row per table row and the columns
        BOS_Score / OPP_Score (running score, forward-filled; rows before the
        first scored play stay NaN), Home_Away (0 when BOS is the first-listed
        team, 1 when it is the second), Raw (event text), Team (code of the
        acting team, '' for full-width notes), Time (clock shown in the table,
        parsed with '%M:%S.%f'), Q (period label: Q1-Q4, OT1-OT4),
        Time_48 (game time elapsed as a Timedelta) and Result ('BOS' if the
        final BOS score is higher than the opponent's, else 'OPP').

    Raises:
        requests.HTTPError: the page request returned an error status.
        ValueError: the page has no table, the table has no plays, or BOS is
            not one of the two teams.
        KeyError: a scraped team name is missing from the ``teams`` lookup.
    """
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
    r = requests.get(x, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    tables = soup.find_all('table')
    if not tables:
        raise ValueError(f"no <table> found on {x}")

    # The 2nd and 3rd <strong> tags hold the two team names.
    strong = soup.find_all('strong')
    team_A = teams[strong[1].get_text().replace('\n', '')]
    team_B = teams[strong[2].get_text().replace('\n', '')]

    records = _parse_pbp_rows(tables[0], team_A, team_B)
    if not records:
        raise ValueError(f"no play-by-play rows found on {x}")

    df = pd.DataFrame(records, columns=_PBP_COLUMNS)
    df['Time'] = pd.to_datetime(df['Time'], format='%M:%S.%f')

    # Tag the row that closes each period, then back-fill so every earlier row
    # of that period carries the same label.
    period = pd.Series(np.nan, index=df.index, dtype=object)
    for marker, label, _ in _PERIODS:
        period[df['Raw'].str.contains(marker, regex=False)] = label
    df['Q'] = period.bfill()

    # Elapsed game time = minutes at the end of the period - clock remaining.
    end_minutes = df['Q'].map({label: minutes for _, label, minutes in _PERIODS})
    clock_remaining = df['Time'] - df['Time'].dt.normalize()
    df['Time_48'] = pd.to_timedelta(end_minutes, unit='min') - clock_remaining

    # Note rows carry no score; give them the most recent known values.
    for col in ('BOS_Score', 'OPP_Score', 'Home_Away'):
        df[col] = pd.to_numeric(df[col]).ffill()

    bos_won = df['BOS_Score'].iloc[-1] > df['OPP_Score'].iloc[-1]
    df['Result'] = 'BOS' if bos_won else 'OPP'
    return df

In [34]:
# Scrape every game, tag its rows with a 1-based game number, and combine the
# per-game frames with a single concat. Concatenating inside the loop re-copies
# all previously collected rows on every iteration.
game_frames = []
for game, x in enumerate(tqdm(boxscore_links), start=1):
    data = data_extraction(x)
    data['Game'] = game
    game_frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# were no links to scrape.
empty = pd.concat(game_frames) if game_frames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████| 106/106 [03:52<00:00,  2.20s/it]


In [35]:
# Preview only the first rows instead of rendering the whole combined frame.
empty.head(10)

Unnamed: 0,BOS_Score,Home_Away,OPP_Score,Raw,Team,Time,Q,Time_48,Result,Game
0,,,,Jump ball: R. Williams vs. M. Robinson (K. Wal...,,1900-01-01 00:12:00,Q1,00:00:00,OPP,1
1,0.0,0.0,3.0,J. Randle makes 3-pt jump shot from 25 ft,NYK,1900-01-01 00:11:37,Q1,00:00:23,OPP,1
2,0.0,0.0,3.0,Turnover by G. Williams (lost ball; steal by K...,BOS,1900-01-01 00:11:18,Q1,00:00:42,OPP,1
3,0.0,0.0,3.0,K. Walker misses 2-pt layup at rim,NYK,1900-01-01 00:11:13,Q1,00:00:47,OPP,1
4,0.0,0.0,3.0,Offensive rebound by M. Robinson,NYK,1900-01-01 00:11:11,Q1,00:00:49,OPP,1
5,0.0,0.0,5.0,M. Robinson makes 2-pt dunk at rim,NYK,1900-01-01 00:11:11,Q1,00:00:49,OPP,1
6,0.0,0.0,5.0,J. Tatum misses 3-pt jump shot from 25 ft,BOS,1900-01-01 00:11:00,Q1,00:01:00,OPP,1
7,0.0,0.0,5.0,Offensive rebound by R. Williams,BOS,1900-01-01 00:10:58,Q1,00:01:02,OPP,1
8,0.0,0.0,5.0,M. Smart misses 3-pt jump shot from 23 ft,BOS,1900-01-01 00:10:51,Q1,00:01:09,OPP,1
9,0.0,0.0,5.0,Defensive rebound by M. Robinson,NYK,1900-01-01 00:10:49,Q1,00:01:11,OPP,1


In [36]:
# Count the missing values in each column of the combined frame.
empty.isna().sum()

BOS_Score    106
Home_Away    106
OPP_Score    106
Raw            0
Team           0
Time           0
Q              0
Time_48        0
Result         0
Game           0
dtype: int64

In [37]:
# Check the dtype pandas assigned to each column.
empty.dtypes

BOS_Score            float64
Home_Away            float64
OPP_Score            float64
Raw                   object
Team                  object
Time          datetime64[ns]
Q                     object
Time_48      timedelta64[ns]
Result                object
Game                   int64
dtype: object

In [38]:
# Save the combined frame to CSV in the working directory. The DataFrame index
# is written too (to_csv default), as the first, unnamed column.
empty.to_csv('BOS-2022_Raw.csv')