In [30]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from io import StringIO

In [31]:
def get_team_codes():
    """Return the Basketball Reference codes of the 30 current NBA teams.

    Returns:
        list[str]: Three-letter team codes in alphabetical order. A fresh list
        is built on every call, so callers may modify the result freely.
    """
    team_codes = (
        'ATL', 'BOS', 'BRK', 'CHI', 'CHO', 'CLE',
        'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
        'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
        'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO',
        'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
    )
    return list(team_codes)

In [32]:
def get_number_stats_categories():
    """Return the names of the numeric statistics in the advanced game log.

    These are the columns that hold numbers (e.g. PtsFwd) rather than labels
    (e.g. Opponent), using the column names this notebook assigns to the
    Basketball Reference advanced game log.

    Returns:
        list[str]: The 20 numeric stat column names. A fresh list is built on
        every call.
    """
    scoring_and_ratings = ['PtsFwd', 'PtsAgt', 'ORtg', 'DRtg', 'Pace']
    rate_stats = ['FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%']
    team_four_factors = ['eFG%', 'TOV%', 'ORB%', 'FT/FGA']
    opponent_four_factors = ['OppeFG%', 'OppTOV%', 'DRB%', 'OppFT/FGA']
    return (
        scoring_and_ratings
        + rate_stats
        + team_four_factors
        + opponent_four_factors
    )

In [33]:
def scrape_team_advanced_gamelog(team_code, year=2024, delay=5, timeout=30):
    """
    Scrapes the advanced game log for a specific team and year.

    Failures are reported with a printed message rather than an exception, so
    callers must check the result for None before using it.

    Args:
        team_code (str): Team code (e.g., 'CHO' for Charlotte).
        year (int): Season year (e.g., 2024 for the 2023-24 season).
        delay (float): Seconds to wait before sending the request, to avoid
            overloading the server.
        timeout (float): Seconds to wait for the server before giving up on
            the request.

    Returns:
        pandas.DataFrame: The 'tgl_advanced' table exactly as parsed by
        pandas.read_html (no cleaning is applied here), or None if the page
        or table is missing or the request fails.
    """
    url = f"https://www.basketball-reference.com/teams/{team_code}/{year}/gamelog-advanced/"

    # Add a polite delay to avoid overloading the server
    time.sleep(delay)

    try:
        # Fetch the page; without a timeout a stalled connection would hang forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Check if the page contains valid data
        if "Page Not Found" in response.text:
            print(f"Warning: No data found for {team_code} {year}")
            return None

        # Parse the HTML and extract the advanced game log table
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', id='tgl_advanced')

        if table is None:
            print(f"Warning: No table found for {team_code} {year}")
            return None

        # Convert the HTML table to a DataFrame
        return pd.read_html(StringIO(str(table)))[0]

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred for {team_code}: {http_err}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred for {team_code}: {conn_err}")
    except requests.exceptions.Timeout as timeout_err:
        print(f"Request timed out for {team_code}: {timeout_err}")
    except Exception as err:
        print(f"An error occurred for {team_code}: {err}")

    # Every handled failure falls through to here
    return None


In [34]:
# Scrape regular season stats
cho_2024_raw = scrape_team_advanced_gamelog('CHO')
if cho_2024_raw is None:
    # The scraper signals failure by printing a message and returning None;
    # stop here with a clear error instead of an AttributeError further down.
    raise RuntimeError("Could not scrape the advanced game log for CHO")

# Set column names to the stats they represent (second level of the scraped header)
cho_2024 = cho_2024_raw.copy()
cho_2024.columns = cho_2024.columns.get_level_values(1)

# Make Game Number (G) the index and drop unnecessary columns
cho_2024 = (
    cho_2024
    .set_index('G')
    .drop(columns=['Rk', 'Unnamed: 18_level_1', 'Unnamed: 23_level_1'])
)

# Rename columns. Several labels occur twice in the scraped table (e.g. 'Opp' is both
# the opponent name at position 2 and the opponent's points at position 5), so the
# label-based renames are applied first and the positional ones then overwrite the
# duplicates. The full label list is assigned at once instead of editing the Index's
# backing array in place, since pandas treats an Index as immutable.
label_renames = {'Unnamed: 3_level_1': 'Home/Away', 'Tm': 'PtsFwd', 'Opp': 'PtsAgt'}
position_renames = {2: 'Opponent', 20: 'OppeFG%', 21: 'OppTOV%', 23: 'OppFT/FGA'}
column_names = [label_renames.get(label, label) for label in cho_2024.columns]
for position, label in position_renames.items():
    column_names[position] = label
cho_2024.columns = column_names

# Drop extraneous rows that do not house game info:
#   - NaN index: rows repeating the collective (top-level) headers
#   - index "G": rows repeating the individual stat headers
is_game_row = cho_2024.index.notnull() & (cho_2024.index != "G")
cho_2024 = cho_2024[is_game_row]

# Encode Home/Away (missing = home -> '1', '@' = away -> '0') and W/L ('W' -> '1',
# 'L' -> '0'). The codes are kept as strings, as before. assign() returns a new frame,
# which avoids writing into a filtered frame (SettingWithCopyWarning).
cho_2024 = cho_2024.assign(**{
    'Home/Away': cho_2024['Home/Away'].fillna('1').replace('@', '0'),
    'W/L': cho_2024['W/L'].replace({'W': '1', 'L': '0'}),
})

In [35]:
# Display the cleaned game log (indexed by game number, G) to check the cleaning steps
cho_2024

Unnamed: 0_level_0,Date,Home/Away,Opponent,W/L,PtsFwd,PtsAgt,ORtg,DRtg,Pace,FTr,...,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA,OppeFG%,OppTOV%,DRB%,OppFT/FGA
G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023-10-25,1,ATL,1,116,110,112.2,106.4,103.4,.302,...,4.8,4.7,.564,16.3,28.6,.221,.446,10.0,76.5,.290
2,2023-10-27,1,DET,0,99,111,92.3,103.5,107.2,.386,...,12.1,12.1,.415,14.9,21.2,.295,.511,19.5,71.4,.239
3,2023-10-30,1,BRK,0,121,133,116.2,127.7,104.2,.208,...,7.7,9.3,.531,8.7,18.2,.198,.632,12.2,75.6,.264
4,2023-11-01,0,HOU,0,119,128,117.7,126.6,101.1,.278,...,8.9,17.0,.561,13.7,32.6,.200,.649,15.3,80.6,.226
5,2023-11-04,0,IND,1,125,124,127.4,126.4,98.1,.261,...,5.1,20.8,.580,11.7,32.4,.261,.602,13.0,72.5,.129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,2024-04-07,1,OKC,0,118,121,119.6,122.7,98.6,.169,...,12.2,17.6,.645,16.8,21.2,.133,.621,15.2,74.4,.149
79,2024-04-09,1,DAL,0,104,130,103.7,129.6,100.3,.236,...,8.0,7.0,.494,10.9,17.0,.180,.629,12.8,66.0,.140
80,2024-04-10,0,ATL,1,115,114,118.5,117.5,97.0,.278,...,11.3,4.8,.595,13.6,13.9,.266,.634,16.8,80.0,.122
81,2024-04-12,0,BOS,0,98,131,99.8,133.4,98.2,.289,...,3.1,11.1,.520,18.9,9.8,.250,.587,5.2,64.4,.087


In [36]:
# Take the stats and make them rolling averages over the previous games.
# The window is currently the past 5 games but may be interesting to optimize.
ROLLING_WINDOW = 5

number_stats = get_number_stats_categories()

# For every stat that is a number, compute the rolling average:
#   - astype(float): convert explicitly, in case the scraped values are still text
#   - shift(): move each value down one game so a game's average only uses earlier games
stat_avgs = (
    cho_2024[number_stats]
    .astype(float)
    .shift()
    .rolling(ROLLING_WINDOW)
    .mean()
    .add_prefix('Avg_')
)

# Replace each raw stat column with its rolling-average counterpart
rolling_avgs = pd.concat([cho_2024.drop(columns=number_stats), stat_avgs], axis=1)

# Drop the first ROLLING_WINDOW rows because their averages are all NaN
# (there are not yet enough previous games to fill the window)
rolling_avgs = rolling_avgs.drop(index=rolling_avgs.index[:ROLLING_WINDOW])

rolling_avgs

Unnamed: 0_level_0,Date,Home/Away,Opponent,W/L,Avg_PtsFwd,Avg_PtsAgt,Avg_ORtg,Avg_DRtg,Avg_Pace,Avg_FTr,...,Avg_STL%,Avg_BLK%,Avg_eFG%,Avg_TOV%,Avg_ORB%,Avg_FT/FGA,Avg_OppeFG%,Avg_OppTOV%,Avg_DRB%,Avg_OppFT/FGA
G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,2023-11-05,0,DAL,0,116.0,121.2,113.16,118.12,102.80,0.2870,...,7.72,12.78,0.5302,13.06,26.60,0.2350,0.5680,14.00,75.32,0.2296
7,2023-11-08,1,WAS,0,116.4,124.0,113.30,120.56,103.02,0.2690,...,7.72,12.66,0.5244,11.34,25.04,0.2150,0.5886,13.90,75.74,0.2244
8,2023-11-10,0,WAS,1,119.8,128.2,117.26,125.36,102.28,0.2804,...,6.84,12.10,0.5554,12.14,26.58,0.2218,0.6018,11.84,76.16,0.2178
9,2023-11-12,0,NYK,0,120.4,125.0,118.42,122.84,101.78,0.2898,...,7.26,11.84,0.5456,11.92,32.76,0.2150,0.5890,12.08,78.00,0.2036
10,2023-11-14,1,MIA,0,118.0,125.2,117.80,125.14,100.24,0.2576,...,6.56,10.00,0.5398,11.34,32.90,0.1898,0.5844,10.66,75.42,0.2044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,2024-04-07,1,OKC,0,105.8,113.4,112.06,120.24,94.44,0.2266,...,5.46,6.48,0.5544,11.34,14.62,0.1836,0.5576,11.06,72.92,0.1912
79,2024-04-09,1,DAL,0,110.0,114.6,115.06,119.98,95.60,0.2032,...,7.04,9.30,0.5882,12.62,14.16,0.1698,0.5616,11.64,73.30,0.2030
80,2024-04-10,0,ATL,1,107.2,114.6,110.52,118.04,96.98,0.2254,...,8.00,9.84,0.5608,13.04,15.00,0.1898,0.5540,12.20,70.74,0.1510
81,2024-04-12,0,BOS,0,109.4,113.8,112.00,116.34,97.66,0.2568,...,9.62,9.68,0.5754,14.40,14.44,0.2232,0.5628,13.50,71.86,0.1462
