In [1]:
import pandas as pd
import altair as alt

In [2]:
# Scrape every HTML table found on ESPN's league-wide NBA standings page.
list_of_df = pd.read_html(
    "https://www.espn.com/nba/standings/_/group/league"
)
# Keep the table at position 1: it carries the W / L / PCT / PPG ... columns
# that the rest of the notebook works with.
df = list_of_df[1]

In [3]:
# (team code, full team name) pairs. The derived lists are written onto df
# positionally below, so this order has to match the row order of df.
_team_code_name_pairs = [
    ('OKC', 'Oklahoma City Thunder'),
    ('CLE', 'Cleveland Cavaliers'),
    ('BOS', 'Boston Celtics'),
    ('HOU', 'Houston Rockets'),
    ('NY', 'New York Knicks'),
    ('LAL', 'Los Angeles Lakers'),
    ('DEN', 'Denver Nuggets'),
    ('IND', 'Indiana Pacers'),
    ('LAC', 'Los Angeles Clippers'),
    ('MIN', 'Minnesota Timberwolves'),
    ('GS', 'Golden State Warriors'),
    ('MEM', 'Memphis Grizzlies'),
    ('MIL', 'Milwaukee Bucks'),
    ('DET', 'Detroit Pistons'),
    ('ORL', 'Orlando Magic'),
    ('ATL', 'Atlanta Hawks'),
    ('SAC', 'Sacramento Kings'),
    ('CHI', 'Chicago Bulls'),
    ('DAL', 'Dallas Mavericks'),
    ('MIA', 'Miami Heat'),
    ('PHX', 'Phoenix Suns'),
    ('POR', 'Portland Trail Blazers'),
    ('SA', 'San Antonio Spurs'),
    ('TOR', 'Toronto Raptors'),
    ('BKN', 'Brooklyn Nets'),
    ('PHI', 'Philadelphia 76ers'),
    ('NO', 'New Orleans Pelicans'),
    ('CHA', 'Charlotte Hornets'),
    ('WSH', 'Washington Wizards'),
    ('UTAH', 'Utah Jazz'),
]
team_names = [full_name for _code, full_name in _team_code_name_pairs]
team_codes = [code for code, _full_name in _team_code_name_pairs]

# Additional abbreviations that resolve to the same full team names.
bonus_team_codes = {
    'SAS': 'San Antonio Spurs',
    'UTA': 'Utah Jazz',
    'GSW': 'Golden State Warriors',
    'BRK': 'Brooklyn Nets',
    'PHO': 'Phoenix Suns',
    'NYK': 'New York Knicks',
    'WAS': 'Washington Wizards',
    'NOP': 'New Orleans Pelicans',
}

# Code -> full name lookup: the primary codes first, then the extra abbreviations.
team_code_to_name = dict(_team_code_name_pairs)
team_code_to_name.update(bonus_team_codes)

# Logo address pattern; "{}" receives the lower-cased team code.
logo_url_template = "https://a.espncdn.com/i/teamlogos/nba/500/{}.png"

# Attach name, code and logo URL to the standings rows (by position).
df['team'] = team_names
df['team_code'] = team_codes
df['logo_url'] = [
    logo_url_template.format(code.lower()) for code in df['team_code']
]
df.head()

Unnamed: 0,W,L,PCT,GB,HOME,AWAY,DIV,CONF,PPG,OPP PPG,DIFF,STRK,L10,team,team_code,logo_url
0,68,14,0.829,-,35-6,32-8,12-4,39-13,120.5,107.6,12.9,W4,8-2,Oklahoma City Thunder,OKC,https://a.espncdn.com/i/teamlogos/nba/500/okc.png
1,64,18,0.78,4,34-7,30-11,12-4,41-11,121.9,112.4,9.5,L1,6-4,Cleveland Cavaliers,CLE,https://a.espncdn.com/i/teamlogos/nba/500/cle.png
2,61,21,0.744,7,28-13,33-8,14-2,39-13,116.3,107.2,9.1,W2,8-2,Boston Celtics,BOS,https://a.espncdn.com/i/teamlogos/nba/500/bos.png
3,52,30,0.634,16,29-12,23-17,13-3,31-21,114.3,109.8,4.5,L3,6-4,Houston Rockets,HOU,https://a.espncdn.com/i/teamlogos/nba/500/hou.png
4,51,31,0.622,17,27-14,24-17,12-4,34-18,115.8,111.7,4.1,W1,6-4,New York Knicks,NY,https://a.espncdn.com/i/teamlogos/nba/500/ny.png


In [4]:
# Independent two-column copy (team name, logo URL) so later changes to it do not touch df.
team_logo_df = df.loc[:, ['team', 'logo_url']].copy()

In [5]:
# Scatter plot that draws each team's logo at (points per game, winning percentage).
chart = (
    alt.Chart(df)
    .mark_image(width=40, height=40)
    .encode(
        x=alt.X('PPG:Q', scale=alt.Scale(domain=[100, 130]), title='Points Per Game'),
        y=alt.Y('PCT:Q', title='Winning Percentage'),
        url='logo_url:N',  # image mark is rendered from the URL stored in this column
        tooltip=['team', 'W', 'L', 'PCT'],
    )
    .properties(
        width=600,
        height=300,
        # The title names the two plotted measures; the previous
        # "Wins vs Losses" text matched neither axis.
        title='NBA Teams: Winning Percentage vs Points Per Game',
    )
    .interactive()  # Make the chart interactive
)

chart


In [6]:
# Column name for the season year (e.g. 2024).
SEASON_YEAR_FIELD = "season_year"

# Common prefix of every page address below.
_BBREF_ROOT = "https://www.basketball-reference.com"

# Team-level season pages; "{}" is filled with the season year via str.format.
PLAYOFF_TEAM_STATS_URL = _BBREF_ROOT + "/playoffs/NBA_{}.html"
IN_SEASON_TEAM_STATS_URL = _BBREF_ROOT + "/leagues/NBA_{}.html"

# Player-level advanced-stats pages; "{}" is filled with the season year.
PLAYOFF_PLAYER_STATS_URL = _BBREF_ROOT + "/playoffs/NBA_{}_advanced.html"
IN_SEASON_PLAYER_STATS_URL = _BBREF_ROOT + "/leagues/NBA_{}_advanced.html"

# Position of the advanced-stats table within the list that pd.read_html
# returns for the team-level playoff / in-season page.
PLAYOFF_ADVANCED_STATS_INDEX = 20
IN_SEASON_ADVANCED_STATS_INDEX = 10

# Name of the team column in playoff tables vs in-season tables.
PLAYOFF_TEAM_FIELD = "Tm"
IN_SEASON_TEAM_FIELD = "Team"


def load_advanced_stats_by_season_part(year: int, is_playoff: bool = False) -> pd.DataFrame:
    """
    Load team-level advanced stats for either the playoff or the in-season part of an NBA season.

    The page for the requested part is read with ``pd.read_html`` and the table at the
    configured position (PLAYOFF_ADVANCED_STATS_INDEX / IN_SEASON_ADVANCED_STATS_INDEX) is
    cleaned up: multi-level headers are flattened, all-empty columns are dropped, ``*``
    characters are removed from team names and the "League Average" row is removed.

    :param year: The NBA season year (e.g., 2024 for the 2023-2024 season).
    :param is_playoff: Boolean indicating whether to load playoff stats (True) or in-season stats (False).

    :return: A DataFrame containing advanced stats for the specified part of the season.
             The team column is named PLAYOFF_TEAM_FIELD for playoffs and
             IN_SEASON_TEAM_FIELD otherwise.
    """
    if is_playoff:
        url = PLAYOFF_TEAM_STATS_URL.format(year)
        table_index = PLAYOFF_ADVANCED_STATS_INDEX
        team_field = PLAYOFF_TEAM_FIELD
    else:
        url = IN_SEASON_TEAM_STATS_URL.format(year)
        table_index = IN_SEASON_ADVANCED_STATS_INDEX
        team_field = IN_SEASON_TEAM_FIELD

    stats = pd.read_html(url)[table_index]

    # Flatten a multi-level header into single names: "<group>_<column>", or just
    # "<column>" when the group level is an auto-generated "Unnamed..." placeholder.
    # Guarded so a table that already has flat headers is left as-is (joining a plain
    # string would otherwise splice "_" between its characters).
    if isinstance(stats.columns, pd.MultiIndex):
        stats.columns = [
            "_".join(col) if "Unnamed" not in col[0] else col[1]
            for col in stats.columns
        ]

    # Drop spacer columns that contain no values at all.
    stats = stats.dropna(how="all", axis=1)

    # Remove "*" markers from team names. regex=False makes the literal match
    # explicit: as a regular expression a bare "*" is an invalid pattern, and the
    # default of `regex` differs between pandas versions.
    cleaned_names = stats[team_field].str.replace("*", "", regex=False)
    stats = stats.assign(**{team_field: cleaned_names})

    # The summary row is not a team; keep everything else.
    return stats.loc[stats[team_field] != "League Average"].copy()


def load_advanced_player_stats_by_season_part(year: int, is_playoff: bool = False) -> pd.DataFrame:
    """
    Load advanced player stats for either playoff or in-season parts of the NBA season for a given year.

    The first table on the page is used. All-empty columns are dropped, and these rows
    are removed:

    * repeated header rows (``Rk == "Rk"``),
    * the "League Average" row,
    * combined multi-team rows, i.e. rows whose team value is a number followed by
      "TM" ("2TM", "3TM", ...), so a traded player keeps only the per-team rows.
      NOTE(review): this generalizes the earlier ``== "2TM"`` check and assumes totals
      over three or more teams are labelled the same way - confirm against the page.

    :param year: The NBA season year (e.g., 2024 for the 2023-2024 season).
    :param is_playoff: Boolean indicating whether to load playoff stats (True) or in-season stats (False).

    :return: A DataFrame containing advanced player stats for the specified part of the season.
             The original row labels are kept (the index is not reset).
    """
    if is_playoff:
        url = PLAYOFF_PLAYER_STATS_URL.format(year)
        team_field = PLAYOFF_TEAM_FIELD
    else:
        url = IN_SEASON_PLAYER_STATS_URL.format(year)
        team_field = IN_SEASON_TEAM_FIELD

    # Assumes the first table on the page contains the advanced player stats.
    players = pd.read_html(url)[0].dropna(how="all", axis=1)

    is_header_row = players["Rk"] == "Rk"
    is_league_average = players["Player"] == "League Average"
    # astype(str) keeps the match safe when the team column holds missing values.
    is_multi_team_total = players[team_field].astype(str).str.fullmatch(r"\d+TM")

    keep = ~(is_header_row | is_league_average | is_multi_team_total)
    return players.loc[keep].copy()


def load_advanced_stats_by_year_all(year: int, stats_level: str) -> pd.DataFrame:
    """
    Load playoff advanced stats for a given year and attach the matching in-season stats.

    :param year: The NBA season year (e.g., 2024 for the 2023-2024 season).
    :param stats_level: The granularity of stats to load, either 'player' or 'team'.

    :return: A DataFrame with the playoff rows left-joined to the in-season rows
             (on "Player" for player stats, on the team column for team stats).
             Columns present on both sides get the suffixes "_playoff" and
             "_in_season"; a SEASON_YEAR_FIELD column holding ``year`` is appended.
    :raises ValueError: If stats_level is neither 'player' nor 'team'.

    NOTE(review): the in-season frame is indexed by the join key; if it has several
    rows for one key (e.g. a player listed once per team), the left join repeats
    the playoff row for each of them - confirm this is intended.
    """
    if stats_level == "player":
        load_part = load_advanced_player_stats_by_season_part
        join_key = "Player"
    elif stats_level == "team":
        load_part = load_advanced_stats_by_season_part
        join_key = PLAYOFF_TEAM_FIELD
    else:
        raise ValueError("stats_level must be either 'player' or 'team'.")

    # Give the in-season team column the playoff name so both sides agree on it
    # (it is the join key for team stats and a suffixed column for player stats).
    in_season_df = load_part(year, is_playoff=False).rename(
        columns={IN_SEASON_TEAM_FIELD: PLAYOFF_TEAM_FIELD}
    )
    playoff_df = load_part(year, is_playoff=True)

    merged_df = playoff_df.join(
        in_season_df.set_index(join_key),
        on=join_key,
        how="left",
        lsuffix="_playoff",
        rsuffix="_in_season",
    )
    merged_df[SEASON_YEAR_FIELD] = year
    return merged_df

In [7]:
# Rename the 'team' column to PLAYOFF_TEAM_FIELD, the column name the logo lookup
# is later indexed and joined on. Done in place on the existing frame.
team_logo_df.rename(columns={'team': PLAYOFF_TEAM_FIELD}, inplace=True)

In [8]:
def load_multiple_years(start_year: int, end_year: int, stats_level: str) -> pd.DataFrame:
    """
    Load playoff-vs-in-season advanced stats for every season in a range and stack them.

    :param start_year: First season year to load (inclusive).
    :param end_year: Last season year to load (inclusive).
    :param stats_level: Passed through to load_advanced_stats_by_year_all
                        ('player' or 'team').

    :return: The per-year DataFrames concatenated in year order with a fresh 0..n-1 index.
    :raises ValueError: If start_year is greater than end_year (there would be
                        nothing to concatenate).
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than end_year ({end_year})."
        )

    yearly_frames = [
        load_advanced_stats_by_year_all(season, stats_level=stats_level)
        for season in range(start_year, end_year + 1)
    ]
    return pd.concat(yearly_frames, ignore_index=True)

# Team-level playoff vs in-season stats for the 2019-2024 seasons:
#   1. load and stack all seasons,
#   2. replace every run of whitespace in a column name with "_",
#   3. left-join the logo lookup on the team column.
all_teams_2019_2024_df = (
    load_multiple_years(2019, 2024, stats_level="team")
    .pipe(
        lambda frame: frame.set_axis(
            frame.columns.str.replace(r'\s+', '_', regex=True), axis=1
        )
    )
    .join(
        team_logo_df.set_index(PLAYOFF_TEAM_FIELD),
        on=PLAYOFF_TEAM_FIELD,
        how="left",
    )
)

# Persist the result next to the notebook, without the row index.
all_teams_2019_2024_df.to_csv("all_teams_playoff_vs_season_stats_2019_2024.csv", index=False)

In [9]:
# Player-level playoff vs in-season stats for the 2019-2024 seasons.
all_players_2019_2024_df = load_multiple_years(2019, 2024, stats_level="player")

# Translate each playoff team abbreviation into the full team name.
# Series.map does the dictionary lookup column-wise (no per-row Python call) and
# leaves a missing value where an abbreviation has no entry in team_code_to_name.
all_players_2019_2024_df["Tm"] = all_players_2019_2024_df["Tm_playoff"].map(team_code_to_name)

# Persist the result next to the notebook, without the row index.
all_players_2019_2024_df.to_csv("all_players_playoff_vs_season_stats_2019_2024.csv", index=False)

In [10]:
# Sanity check: distinct playoff team abbreviations that did not resolve to a
# full team name. An empty array means every abbreviation was mapped.
all_players_2019_2024_df.loc[all_players_2019_2024_df["Tm"].isna(), "Tm_playoff"].unique()

array([], dtype=object)