In [1]:
# Team abbreviations in the form https://www.basketball-reference.com uses.
# The cells below iterate over this list in order, so the order here is the
# order in which teams are downloaded and processed.
TEAMS = [
    "ATL",  # Atlanta Hawks
    "BOS",  # Boston Celtics
    "BRK",  # Brooklyn Nets
    "CHO",  # Charlotte Hornets
    "CHI",  # Chicago Bulls
    "CLE",  # Cleveland Cavaliers
    "DAL",  # Dallas Mavericks
    "DEN",  # Denver Nuggets
    "DET",  # Detroit Pistons
    "GSW",  # Golden State Warriors
    "HOU",  # Houston Rockets
    "IND",  # Indiana Pacers
    "LAC",  # Los Angeles Clippers
    "LAL",  # Los Angeles Lakers
    "MEM",  # Memphis Grizzlies
    "MIA",  # Miami Heat
    "MIL",  # Milwaukee Bucks
    "MIN",  # Minnesota Timberwolves
    "NOP",  # New Orleans Pelicans
    "NYK",  # New York Knicks
    "OKC",  # Oklahoma City Thunder
    "ORL",  # Orlando Magic
    "PHI",  # Philadelphia 76ers
    "PHO",  # Phoenix Suns
    "POR",  # Portland Trail Blazers
    "SAC",  # Sacramento Kings
    "SAS",  # San Antonio Spurs
    "TOR",  # Toronto Raptors
    "UTA",  # Utah Jazz
    "WAS",  # Washington Wizards
]

# Seasons to download, in the order they are processed.
YEARS = [2025, 2024]

# Local files the downloaded pages are written to.
# The box-score and fixtures templates are filled as .format(year, team).
# STATS_HTMLS_PATH is not referenced in the visible cells; presumably it is
# filled the same way (confirm before use).
STATS_HTMLS_PATH = "stats html pages/{}/{}.html"
FIXTURES_HTMLS_PATH = "fixtures html pages/{}/{}.html"
BOX_SCORES_HTMLS_PATH = "box scores html pages/{}/{}.html"

# Remote pages on basketball-reference.
# The box-score and fixtures templates are filled as .format(team, year).
# URL_STATS is not referenced in the visible cells; presumably it is filled
# the same way (confirm before use).
URL_STATS = "https://www.basketball-reference.com/teams/{}/{}.html"
URL_FIXTURES = "https://www.basketball-reference.com/teams/{}/{}_games.html"
URL_BOX_SCORES = "https://www.basketball-reference.com/teams/{}/{}/gamelog/"

In [2]:
# Pages are fetched with Selenium because the site's tables appear to be rendered with JavaScript (not verified)
import time
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
# Headless mode means the browser won't open on my device
options = Options()
options.add_argument("--headless")
driver = webdriver.Edge(options=options)

# Pause between page loads. Sports Reference's published bot policy rate-limits
# automated traffic (about 20 requests per minute at the time of writing);
# without a pause the saved pages are likely to be error pages rather than data.
REQUEST_DELAY_SECONDS = 4

# Download the game-log (box score) page for every team in every season.
try:
    for year in YEARS:
        for team in TEAMS:
            driver.get(URL_BOX_SCORES.format(team, year))
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            # Create "<root>/<year>/" on first use so a fresh checkout does not
            # fail with FileNotFoundError.
            html_path = Path(BOX_SCORES_HTMLS_PATH.format(year, team))
            html_path.parent.mkdir(parents=True, exist_ok=True)
            html_path.write_text(driver.page_source, encoding="utf-8")

            time.sleep(REQUEST_DELAY_SECONDS)
except Exception:
    # The driver is normally closed by a later cell; if this cell fails, close
    # it here so no headless browser process is left running.
    driver.quit()
    raise


In [4]:
# Getting future fixtures
CURRENT_SEASON_YEAR = 2025

# Pause between page loads. Sports Reference's published bot policy rate-limits
# automated traffic (about 20 requests per minute at the time of writing).
REQUEST_DELAY_SECONDS = 4

# Download the schedule page of every team for the current season.
try:
    for team in TEAMS:
        driver.get(URL_FIXTURES.format(team, CURRENT_SEASON_YEAR))
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Create "<root>/<year>/" on first use so a fresh checkout does not
        # fail with FileNotFoundError.
        html_path = Path(FIXTURES_HTMLS_PATH.format(CURRENT_SEASON_YEAR, team))
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(driver.page_source, encoding="utf-8")

        time.sleep(REQUEST_DELAY_SECONDS)
finally:
    # Always release the browser, even when a request or a write fails.
    driver.quit()

In [5]:
# Cleaning the box scores to make it easier to work with, as well as adding the win percentage
import pandas as pd

def clean_box_scores(df, team):
    """Tidy one team's raw game-log table and add a pre-game win percentage.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table as read from the game-log page. Must contain the columns
        "Unnamed: 3", "Tm", "Opp.1", "W/L", "Date", "Unnamed: 24", "Rk" and "G".
    team : str
        Abbreviation stored in the new "Team" column of every row.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame in which:
        * "Home" is 1 where "Unnamed: 3" was missing and 0 otherwise,
        * "Win" is 1 where "W/L" was exactly "W" and 0 otherwise,
        * columns whose name ended in ".1" are renamed "Opp <name>",
        * "Win Percentage" is the share of wins over all *earlier* rows
          (0 for the first row),
        * "Team" sits in the third column position.
    """
    renamed_columns = {
        "Unnamed: 3": "Home",
        "Tm": "Points Scored",
        "Opp.1": "Opp Points Scored",
        "W/L": "Win",
    }
    # rename()/drop() without inplace return new frames, so the caller's frame
    # keeps its original column names.
    cleaned = df.rename(columns=renamed_columns).drop(columns=["Unnamed: 24", "Rk", "G"])

    # Remaining ".1" columns are the opponent's copy of a stat.
    cleaned.columns = [
        f"Opp {col[:-2]}" if col.endswith(".1") else col for col in cleaned.columns
    ]

    cleaned["Home"] = cleaned["Home"].isna().astype("int64")
    cleaned["Win"] = (cleaned["Win"] == "W").astype("int64")
    cleaned["Date"] = pd.to_datetime(cleaned["Date"])

    # shift(1) makes each row see only the games before it; the first row has
    # no history, so its missing value is replaced by 0.
    cleaned["Win Percentage"] = cleaned["Win"].expanding().mean().shift(1).fillna(0)

    cleaned["Team"] = team
    ordered_columns = cleaned.columns.to_list()
    ordered_columns.insert(2, ordered_columns.pop(ordered_columns.index("Team")))
    return cleaned[ordered_columns]


In [6]:
from bs4 import BeautifulSoup
from io import StringIO

In [7]:
# Parse every saved game-log page, extract the box-score table and save it as a cleaned CSV
for year in YEARS:
    for team in TEAMS:
        html_path = BOX_SCORES_HTMLS_PATH.format(year, team)
        with open(html_path, "r", encoding="utf-8") as f:
            page = f.read()

        soup = BeautifulSoup(page, "html.parser")
        # Remove the grouped-header rows and the header rows repeated inside
        # the table body so that only one header row and the data rows remain.
        for header_class in ("over_header", "thead"):
            for tr in soup.find_all("tr", class_=header_class):
                tr.decompose()
        box_scores_html = StringIO(str(soup.find(id="tgl_basic")))

        try:
            raw_box_scores = pd.read_html(box_scores_html)[0]
            box_scores = clean_box_scores(raw_box_scores, team)
            csv_path = f"team box scores/{year}/{team} box scores.csv"
            box_scores.to_csv(csv_path, index=False)
        except Exception as e:
            print(e)
            print(f"Failed to read {team} for {year}")
            # NOTE(review): this leaves only the inner team loop; the remaining
            # teams of this year are skipped and the next year is still processed.
            break

In [8]:
def remove_duplicate_games(df):
    """Keep one row per game, dropping the copy seen from the other team's side.

    Two rows describe the same game when they share the same "Date" and the
    same unordered pair of "Team" and "Opp". The first such row is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Date", "Team" and "Opp". It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the repeated games, re-indexed from 0.
    """
    # Build a per-game key; sorting the two teams makes it the same from
    # either side of the matchup.
    game_ids = []
    for date, team, opp in zip(df["Date"], df["Team"], df["Opp"]):
        matchup = "_".join(sorted((team, opp)))
        game_ids.append(f"{date}_{matchup}")

    # Positional boolean mask, so the key never has to be stored on the frame
    # and the caller's frame is left exactly as it was passed in.
    is_repeat = pd.Series(game_ids, dtype="object").duplicated(keep="first").to_numpy()
    return df.loc[~is_repeat].reset_index(drop=True)

In [9]:
# Load every per-team, per-season CSV (seasons outermost, teams innermost)
# and stack them into one frame that is also written to disk
new_dfs = [
    pd.read_csv(f"team box scores/{year}/{team} box scores.csv")
    for year in YEARS
    for team in TEAMS
]

dfs = pd.concat(new_dfs, ignore_index=True)
dfs.to_csv("all box scores.csv", index=False)

In [10]:
def get_rolling_stats(df, relevant_team_stats, dfs):
    """Build pre-game features for one team: its own and its opponents' running averages.

    Parameters
    ----------
    df : pandas.DataFrame
        Box scores of a single team, one row per game. Must contain "Date",
        "Home", "Team", "Opp", "Win", "Win Percentage" and every column named
        in ``relevant_team_stats``. Rows are assumed to be in chronological
        order, because the team's own averages are taken over the preceding rows.
    relevant_team_stats : iterable of str
        Names of the numeric columns to average.
    dfs : pandas.DataFrame
        Box scores of all teams, with the same columns as ``df``. "Date" must
        be comparable with the "Date" values of ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per game of ``df`` except the first (which has no history), with
        the identifying columns, "Rolling <stat> Average" for the team,
        "Rolling Opp <stat> Average" for the opponent and "Opp Win Percentage".
        Opponent columns are float and hold NaN when the opponent has no
        earlier game in ``dfs``.
    """
    stats = list(relevant_team_stats)
    rolling_df = df[["Date", "Home", "Team", "Opp", "Win", "Win Percentage"]].copy()

    # Team side: mean over all earlier rows (shift(1) excludes the game itself).
    for stat in stats:
        rolling_df[f"Rolling {stat} Average"] = df[stat].expanding().mean().shift(1)

    opp_columns = [f"Rolling Opp {stat} Average" for stat in stats] + ["Opp Win Percentage"]

    # Split the full table by team once instead of re-filtering it for every game.
    games_by_team = {team: games for team, games in dfs.groupby("Team", sort=False)}

    # NOTE(review): opponent history is limited only by team and date. If `dfs`
    # spans several seasons, the opponent averages include earlier seasons,
    # while the team's own averages above only cover the rows of `df`.
    records = []
    for _, game in df.iterrows():
        opponent_games = games_by_team.get(game["Opp"])
        if opponent_games is not None:
            opponent_games = opponent_games[opponent_games["Date"] < game["Date"]]

        if opponent_games is None or opponent_games.empty:
            # No earlier game for this opponent: nothing to average.
            records.append(dict.fromkeys(opp_columns, float("nan")))
            continue

        record = {
            f"Rolling Opp {stat} Average": opponent_games[stat].mean() for stat in stats
        }
        opponent_wins = int((opponent_games["Win"] == 1).sum())
        record["Opp Win Percentage"] = opponent_wins / len(opponent_games)
        records.append(record)

    # Build the opponent columns in one go as floats; attaching them by
    # position keeps them aligned with the rows of `df` whatever its index is.
    opponent_df = pd.DataFrame(records, columns=opp_columns, dtype="float64")
    for column in opp_columns:
        rolling_df[column] = opponent_df[column].to_numpy()

    # Drop the first row as there is no earlier game to average over.
    return rolling_df.iloc[1:]


In [11]:
# Box-score columns that are turned into rolling averages for every team
relevant_team_stats = [
    'Points Scored',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
]

# One rolling-stats frame per (season, team) box-score frame
rolling_dfs = [
    get_rolling_stats(team_box_scores, relevant_team_stats, dfs)
    for team_box_scores in new_dfs
]

# Stack everything and discard rows with any missing feature
final_merged_df = pd.concat(rolling_dfs).reset_index(drop=True).dropna()

# Make sure every opponent feature column is numeric ("Opp" itself holds the
# opponent's abbreviation and is left alone)
opponent_feature_columns = [
    col for col in final_merged_df.columns if "Opp" in col and col != "Opp"
]
for col in opponent_feature_columns:
    final_merged_df[col] = pd.to_numeric(final_merged_df[col])
final_merged_df.dropna(inplace=True)

# Each game appears once per team; keep a single row per game, oldest first
full_rolling_df = remove_duplicate_games(final_merged_df)
full_rolling_df = full_rolling_df.sort_values("Date").reset_index(drop=True)
full_rolling_df.to_csv("rolling stats.csv", index=False)


In [12]:
from datetime import datetime, timedelta
# Full team name (as it appears in the "Opponent" column of the schedule
# table) -> team abbreviation
abbreviations = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    'Charlotte Hornets': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS'
}


def clean_fixtures(df, team, today=None, days_ahead=7):
    """Reduce a team's schedule table to the games of the coming days.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw schedule table. Must contain "Date" (formatted like
        "Wed, Oct 23, 2024"), "Unnamed: 5" ("@" marks an away game) and
        "Opponent" (full team name). It is not modified.
    team : str
        Abbreviation stored in the "Team" column of every row.
    today : datetime-like, optional
        First day of the window. Defaults to the current local date. Any
        time-of-day part is ignored.
    days_ahead : int, optional
        Length of the window in days; the last day is included. Defaults to 7.

    Returns
    -------
    pandas.DataFrame
        Columns "Date", "Home" (0 for away, 1 otherwise), "Team" and "Opp"
        (abbreviation), restricted to dates from ``today`` through
        ``today + days_ahead``. The original row index is kept.

    Raises
    ------
    KeyError
        If an opponent name is not a key of ``abbreviations``.
    """
    fixtures = df[["Date", "Unnamed: 5", "Opponent"]].rename(
        columns={"Unnamed: 5": "Home", "Opponent": "Opp"}
    )
    fixtures["Date"] = pd.to_datetime(fixtures["Date"], format="%a, %b %d, %Y")
    fixtures["Home"] = (fixtures["Home"] != "@").astype("int64")
    fixtures["Opp"] = fixtures["Opp"].map(lambda name: abbreviations[name])
    fixtures.insert(2, "Team", team)

    # "Date" carries no time of day, so compare against midnight of the start
    # day; comparing against the current clock time would always exclude the
    # games scheduled for today.
    window_start = pd.Timestamp(datetime.today() if today is None else today).normalize()
    window_end = window_start + timedelta(days=days_ahead)
    in_window = fixtures["Date"].between(window_start, window_end)
    return fixtures[in_window]

In [13]:
# Parse every saved schedule page, keep the upcoming games and save them as CSVs
future_fixtures = []
for team in TEAMS:
    html_path = FIXTURES_HTMLS_PATH.format(CURRENT_SEASON_YEAR, team)
    with open(html_path, "r", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Remove the header rows repeated inside the table body
    for tr in soup.find_all("tr", class_="thead"):
        tr.decompose()
    fixtures_html = StringIO(str(soup.find(id="games")))

    try:
        raw_fixtures = pd.read_html(fixtures_html)[0]
        fixtures = clean_fixtures(raw_fixtures, team)
        future_fixtures.append(fixtures)
        csv_path = f"future fixtures/{CURRENT_SEASON_YEAR}/{team} fixtures.csv"
        fixtures.to_csv(csv_path, index=False)
    except Exception as e:
        print(e)
        print(f"Failed to read {team} for {CURRENT_SEASON_YEAR}")
        # Stop at the first team that cannot be processed
        break

In [14]:
def get_latest_stats(df, rolling_stats, stats=None):
    """Attach each side's most recent rolling stats to a team's upcoming games.

    Parameters
    ----------
    df : pandas.DataFrame
        Upcoming games of a single team with the columns "Team" and "Opp".
        The team is read from the first row. It is not modified.
    rolling_stats : pandas.DataFrame
        Historical rows with "Team", "Date", "Win Percentage" and a
        "Rolling <stat> Average" column per stat. Rows may be in any order.
    stats : iterable of str, optional
        Stat names to copy. Defaults to the notebook-level
        ``relevant_team_stats`` list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with "Win Percentage", "Rolling <stat> Average",
        "Rolling Opp <stat> Average" and "Opp Win Percentage" added. Values are
        NaN for a team that has no row in ``rolling_stats``. An empty ``df`` is
        returned as an unchanged copy.
    """
    if stats is None:
        stats = relevant_team_stats
    stats = list(stats)

    new_df = df.copy()
    if new_df.empty:
        # No upcoming games for this team: there is no row to read the team from.
        return new_df

    team_columns = ["Win Percentage"] + [f"Rolling {stat} Average" for stat in stats]

    # Pick each team's row with the greatest "Date". The frame's row order
    # cannot be relied on: it is whatever order the per-season frames were
    # concatenated in, so the last row of a team need not be its latest game.
    latest_by_team = (
        rolling_stats.sort_values("Date", kind="mergesort")
        .drop_duplicates(subset="Team", keep="last")
        .set_index("Team")[team_columns]
    )

    # Team side: the same values on every row.
    team_name = new_df["Team"].iloc[0]
    team_stats = latest_by_team.reindex([team_name]).iloc[0]
    for column in team_columns:
        new_df[column] = team_stats[column]

    # Opponent side: look up every row's opponent; unknown teams give NaN.
    opponent_stats = latest_by_team.reindex(new_df["Opp"])
    for stat in stats:
        new_df[f"Rolling Opp {stat} Average"] = opponent_stats[f"Rolling {stat} Average"].to_numpy()
    new_df["Opp Win Percentage"] = opponent_stats["Win Percentage"].to_numpy()

    return new_df
        

In [15]:
# Attach the latest rolling stats to every team's upcoming fixtures
games_to_predict = [
    get_latest_stats(team_fixtures, final_merged_df)
    for team_fixtures in future_fixtures
]

# Stack all teams, keep one row per game and order the games by date
games_to_predict_df = remove_duplicate_games(
    pd.concat(games_to_predict).reset_index(drop=True)
)
games_to_predict_df = games_to_predict_df.sort_values("Date")

In [16]:
# Save the upcoming games and their features, without the row index
games_to_predict_df.to_csv(path_or_buf="games to predict.csv", index=False)