<head>
  <link href="https://fonts.cdnfonts.com/css/pf-tempesta-seven" rel="stylesheet">
  <link rel="stylesheet" href="https://use.typekit.net/bbp6pmz.css">
</head>

# NBA API Stats




In the following, we want to forecast the future success of potential NBA players and their performance in the NBA draft, as well as ascertain which colleges tend to produce the most successful NBA players. We analyze and evaluate player and game statistics to ascertain patterns and trends that contribute to successful NBA transitions.


In [1]:
# data processing
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from itertools import product
from lxml import html
import time
import requests
import json

# nba api stat endpoints
from nba_api.stats.static import teams
from nba_api.stats.static import players
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import playerawards
from nba_api.stats.endpoints import boxscoreadvancedv2
from nba_api.stats.endpoints import leaguegamefinder

import circlify
import plotly.express as px
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as clr
%matplotlib inline
from pylab import *
import warnings
warnings.filterwarnings('ignore')


<div class="mycode">

We will primarily be using `nba_api.stats.endpoints` for each API endpoint supported by [stats.nba.com](https://stats.nba.com/). For example, we use API endpoints such as `commonplayerinfo`, `drafthistory`

</div>


### Getting Team and Player IDs

The package also includes utilities for fetching player and team information available under `nba_api.stats.static`.

In [2]:
from nba_api.stats.static import teams

# get_teams returns a list of 30 dictionaries, each an NBA team
nba_teams = teams.get_teams()
print(f"Number of teams fetched: {len(nba_teams)}")

Number of teams fetched: 30


In [3]:
from IPython.display import display, HTML
nba_teams_df = pd.DataFrame(nba_teams)

# Function to generate the URL based on team ID
def generate_team_logo_url(team_id):
    return f"https://cdn.nba.com/logos/nba/{team_id}/primary/L/logo.svg"

# Apply the function to create a new column with the logo URLs
nba_teams_df['logo_url'] = nba_teams_df['id'].apply(generate_team_logo_url)
nba_teams_df = nba_teams_df[['logo_url', 'id', 'full_name', 'abbreviation', 'state', 'year_founded']]

# Function to display image in HTML format
def path_to_image_html(path):
    return f'<img src="{path}" width="50" >'

# Convert the DataFrame to HTML and display images
html_table = nba_teams_df.head(6).to_html(escape=False, formatters=dict(logo_url=path_to_image_html))
display(HTML(html_table))

Unnamed: 0,logo_url,id,full_name,abbreviation,state,year_founded
0,,1610612737,Atlanta Hawks,ATL,Georgia,1949
1,,1610612738,Boston Celtics,BOS,Massachusetts,1946
2,,1610612739,Cleveland Cavaliers,CLE,Ohio,1970
3,,1610612740,New Orleans Pelicans,NOP,Louisiana,2002
4,,1610612741,Chicago Bulls,CHI,Illinois,1966
5,,1610612742,Dallas Mavericks,DAL,Texas,1980


#### Get All NBA Players 

In [4]:
from nba_api.stats.static import players

# get_players returns a list of dictionaries, each representing a player.
nba_players = players.get_players()
print(f"Number of players fetched: {len(nba_players)}")

nba_active_players = players.get_active_players()
print(f"Number of active players fetched: {len(nba_active_players)}")

Number of players fetched: 4900
Number of active players fetched: 531


----

## CommonTeamRoster

In [5]:
from nba_api.stats.endpoints import CommonTeamRoster
all_nba_teams = [team["id"] for team in nba_teams]

In [6]:
%%script false

# List to hold data frames of all players info
all_players_info = []

# Loop through each team ID and get the roster data
for team_id in all_nba_teams:
    common_team_roster = CommonTeamRoster(
        team_id=team_id,
        league_id_nullable='00',  # NBA league ID
        season='2023-24',
        timeout=200
    )
    team_roster_df = common_team_roster.get_data_frames()[0]
    all_players_info.append(team_roster_df)

# Concatenate all individual team data frames into a single data frame
all_players_info_df = pd.concat(all_players_info, ignore_index=True)
all_players_info_df.to_csv("data/nba_players.csv",index=False)

Couldn't find program: 'false'


In [7]:
all_players_info_df = pd.read_csv("data/nba_players.csv")
all_players_info_df.head()

Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED
0,1610612737,2023,0,Jalen Johnson,Jalen,jalen-johnson,1.0,F,6-8,219,"DEC 18, 2001",22.0,2,Duke,1630552,#20 Pick in 2021 Draft
1,1610612737,2023,0,Trent Forrest,Trent,trent-forrest,2.0,G,6-4,210,"JUN 12, 1998",25.0,3,Florida State,1630235,Signed on 08/08/22
2,1610612737,2023,0,Seth Lundy,Seth,seth-lundy,3.0,G-F,6-4,220,"APR 02, 2000",24.0,R,Penn State,1641754,#46 Pick in 2023 Draft
3,1610612737,2023,0,Kobe Bufkin,Kobe,kobe-bufkin,4.0,G,6-5,195,"SEP 21, 2003",20.0,R,Michigan,1641723,#15 Pick in 2023 Draft
4,1610612737,2023,0,Dejounte Murray,Dejounte,dejounte-murray,5.0,G,6-5,180,"SEP 19, 1996",27.0,6,Washington,1627749,Traded from SAS on 06/30/22


---

### Colleges

In [8]:
# Calculate the value counts and filter to values > 1
filtered_school_counts_df = all_players_info_df['SCHOOL'].value_counts().reset_index()
filtered_school_counts_df.columns = ['SCHOOL', 'Count']
filtered_school_counts_df = filtered_school_counts_df[filtered_school_counts_df['Count'] > 1]


# Dictionary for replacements to the 'SCHOOL' column
replacements = {'Southern California': 'Southern<br>California',
                'North Carolina': 'North<br>Carolina', 'NBA G League Ignite': 'NBA<br>G League', 'State': 'St.'}
filtered_school_counts_df['SCHOOL'] = filtered_school_counts_df['SCHOOL'].replace(replacements, regex=True)

In [9]:
def get_color(name, number):
    return list(sns.color_palette(palette=name, n_colors=number).as_hex())

# Create custom colormap and convert to a list of hex colors
cmap = clr.LinearSegmentedColormap.from_list('custom blue', ['#FDE3FE', '#E8E08D', '#FCA611', '#FB831E', '#FF4F2D', '#FA6094', '#1C3858'], N=92)
colors = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]

# Set the custom palette
custom_palette = sns.set_palette(sns.color_palette(colors))
pal_vi = get_color(custom_palette, len(filtered_school_counts_df))

In [10]:
import circlify

# compute circle positions:
circles = circlify.circlify(
    filtered_school_counts_df['Count'].tolist(),
    target_enclosure=circlify.Circle(x=0, y=0, r=1),
    show_enclosure=False
)

bubble_df = pd.DataFrame({
    'x': [cir.x for cir in circles],
    'y': [cir.y for cir in circles],
    'r': [cir.r for cir in circles],
    'l': filtered_school_counts_df.sort_values('Count').SCHOOL.values,
    's': [math.pi * (cir.r ** 2) for cir in circles],
    'k': filtered_school_counts_df.sort_values('Count').Count.values
})

bubble_df["rank"] = bubble_df.sort_values(by="k").index
font_size = 90 * bubble_df.r.values
font_colors = ['white' if i > 9 else '#F4F4F4' if i > 8 else 'black' for i in bubble_df['k']]

In [11]:
import plotly.express as px

# Create the scatter plot
fig = px.scatter(bubble_df, x="x", y="y", custom_data=["l", "rank", "k"], color="rank", width=600, height=500,
                 size="s", hover_name="l", size_max=70, text="l",
                 color_continuous_scale=pal_vi, opacity=0.95)

# Update trace and layout settings
fig.update_traces(
    hovertemplate="%{customdata[0]}<br>%{customdata[2]}",
    texttemplate="<b style='letter-spacing: 0.25px;text-transform:uppercase;font-family:Roboto Condensed;'>%{customdata[0]}</b><br>%{customdata[2]}",
    textfont_color=font_colors,
    textfont_size=font_size
)

fig.update_layout(
    showlegend=False, coloraxis_showscale=False, xaxis_visible=False, yaxis_visible=False,
    yaxis_scaleanchor="x", yaxis_scaleratio=0.95, plot_bgcolor='rgba(0, 0, 0, 0)', paper_bgcolor='rgba(0, 0, 0, 0)',
    #title={'text': "<b>NBA Player Background</b>", 'y': 0.97, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    font=dict(
        family="Roboto",
        size=12,
        color="black"
    )
)

fig.show()

----

### Team Player Dashboard

In [12]:
%%script false

from nba_api.stats.endpoints import TeamPlayerDashboard

all_team_headlines = [] # List to hold data frames of all team info
all_player_headlines = []  # List to hold data frames of all player info

# Loop through each team ID and get the player dashboard data
for team_id in all_nba_teams:
    team_dashboard = TeamPlayerDashboard(
        team_id=team_id,
        season="2023-24",
        league_id_nullable='00'
        )
    all_team_headlines.append(team_dashboard.get_data_frames()[0])
    all_player_headlines.append(team_dashboard.get_data_frames()[1])

# Combine all individual team data frames into a single data frame
team_info_df = pd.concat(all_team_headlines, ignore_index=True)
team_players_info_df = pd.concat(all_player_headlines, ignore_index=True)

Couldn't find program: 'false'


In [13]:
#team_info_df.to_csv("data/team_dashboard.csv", index=False)
#team_players_info_df.to_csv("data/team_player_dashboard.csv", index=False)
team_players_info_df = pd.read_csv("data/team_player_dashboard.csv")
team_info_df = pd.read_csv("data/team_dashboard.csv")

team_info_df = team_info_df[['TEAM_ID', 'TEAM_NAME', 'GROUP_VALUE', 'GP', 'FGM', 'FGA', 'FG3M',
                             'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'PTS', 'PLUS_MINUS']]

team_info_df = team_info_df.rename(
    columns={'GP': 'GP_TEAM', 'FGM': 'FGM_TEAM', 'FGA': 'FGA_TEAM', 'FTM': 'FTM_TEAM',
    'FTA': 'FTA_TEAM', 'OREB': 'OREB_TEAM', 'DREB': 'DREB_TEAM', 'REB': 'REB_TEAM',
    'AST': 'AST_TEAM', 'TOV': 'TOV_TEAM', 'STL': 'STL_TEAM', 'BLK': 'BLK_TEAM',
    'PTS': 'PTS_TEAM', 'PF': 'PF_TEAM', 'FG3M': 'FG3M_TEAM', 'FG3A': 'FG3A_TEAM'
    })


# Identify and resolve duplicates based on most games played (GP)
team_players_info_df['GP'] = team_players_info_df['GP'].astype(int)  # Ensure GP is integer
team_players_info_df = team_players_info_df.loc[team_players_info_df.groupby('PLAYER_ID')['GP'].idxmax()]

In [14]:
# Join with previous dataset all_players_info_df on PLAYER_ID
merged_df = pd.merge(all_players_info_df, team_players_info_df, on='PLAYER_ID', how='inner')

In [15]:
merged_df = merged_df[[
    'TeamID', 'SEASON', 'PLAYER_ID', 'PLAYER',
    'GP', 'W', 'L', 'MIN', 'FGM', 'FGA', 
    'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB',
    'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'PTS',
    'PLUS_MINUS', 'NBA_FANTASY_PTS',
    'POSITION', 'HEIGHT', 'WEIGHT', 'BIRTH_DATE', 'AGE', 'EXP', 'SCHOOL', 'HOW_ACQUIRED']]

# Rename the 'SCHOOL' column to 'COLLEGE'
merged_df = merged_df.rename(columns={'TeamID': 'TEAM_ID'})
merged_df.head()

Unnamed: 0,TEAM_ID,SEASON,PLAYER_ID,PLAYER,GP,W,L,MIN,FGM,FGA,...,PLUS_MINUS,NBA_FANTASY_PTS,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,HOW_ACQUIRED
0,1610612737,2023,1630552,Jalen Johnson,56,27,29,1888.866667,359,702,...,-76,2027.1,F,6-8,219,"DEC 18, 2001",22.0,2,Duke,#20 Pick in 2021 Draft
1,1610612737,2023,1630235,Trent Forrest,38,18,20,413.761667,34,90,...,-94,304.2,G,6-4,210,"JUN 12, 1998",25.0,3,Florida State,Signed on 08/08/22
2,1610612737,2023,1641754,Seth Lundy,9,2,7,51.983333,4,17,...,-13,21.4,G-F,6-4,220,"APR 02, 2000",24.0,R,Penn State,#46 Pick in 2023 Draft
3,1610612737,2023,1641723,Kobe Bufkin,17,8,9,196.003333,34,92,...,-35,187.1,G,6-5,195,"SEP 21, 2003",20.0,R,Michigan,#15 Pick in 2023 Draft
4,1610612737,2023,1627749,Dejounte Murray,78,34,44,2783.343333,672,1463,...,-154,3209.2,G,6-5,180,"SEP 19, 1996",27.0,6,Washington,Traded from SAS on 06/30/22


------

## Player Statistics

In [16]:
merged_df2 = pd.merge(merged_df, team_info_df, on='TEAM_ID', how='inner')

In [17]:
merged_df2 = merged_df2[[ 'SEASON', 
'PLAYER_ID', 'PLAYER', 'GP', 'W', 'L', 'MIN',
       'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB',
       'AST', 'TOV', 'STL', 'BLK', 'PF', 'PTS', 'PLUS_MINUS_x',
       'NBA_FANTASY_PTS', 'POSITION', 'HEIGHT', 'WEIGHT', 'BIRTH_DATE', 'AGE',
       'EXP', 'SCHOOL', 'HOW_ACQUIRED', 
       'TEAM_ID', 'TEAM_NAME',
       'GP_TEAM',
       'FGM_TEAM', 'FGA_TEAM', 'FG3M_TEAM', 'FG3A_TEAM', 'FTM_TEAM',
       'FTA_TEAM', 'OREB_TEAM', 'DREB_TEAM', 'REB_TEAM', 'AST_TEAM',
       'TOV_TEAM', 'STL_TEAM', 'BLK_TEAM', 'PF_TEAM', 'PTS_TEAM',
       'PLUS_MINUS_y']]


### Points Per Game

In [18]:
# Calculate points per game
merged_df2['PPG'] = merged_df2['PTS'] / merged_df2['GP']

### Player Impact Estimate (PIE) score




(PTS + FGM + FTM - FGA - FTA + DREB + (.5 * OREB) + AST + STL + (.5 * BLK) - PF - TO) / (GmPTS + GmFGM + GmFTM - GmFGA - GmFTA + GmDREB + (.5 * GmOREB) + GmAST + GmSTL + (.5 * GmBLK) - GmPF - GmTO)

The Player Impact Estimate (PIE) is a comprehensive metric that estimates a player's overall statistical contribution against the total statistics in games they play. The NBA's official definition of PIE is:

$
\text{PIE} = \frac{\text{Player's Stats} + \text{Team's Stats} - \text{Opponent's Stats}}{\text{Team's Total Stats} + \text{Opponent's Total Stats}}
$

<div class = "mycode">

1. **Player's Stats**: Include points, field goals made, field goals attempted, free throws made, free throws attempted, offensive rebounds, defensive rebounds, assists, steals, blocks, turnovers, and personal fouls.
2. **Team's Stats**: Sum of all players' stats on the team.
3. **Opponent's Stats**: Sum of all players' stats on the opponent team.
4. **Team's Total Stats**: Total of all the team stats, including points, field goals, free throws, rebounds, assists, steals, blocks, turnovers, and fouls.
5. **Opponent's Total Stats**: Same as above, but for the opponent team.

</div>

The formula is: $\text{PIE} = \frac{\text{PTS} + \text{FGM} + \text{FTM} - \text{FGA} - \text{FTA} + \text{OREB} + \text{DREB} + \text{AST} + \text{STL} + \text{BLK} - \text{TO} - \text{PF}}{\text{Team's Total Stats} + \text{Opponent's Total Stats}}$



In [19]:
# Calculate player's contribution
player_contrib = (merged_df2["PTS"] + merged_df2["FGM"] + merged_df2["FTM"] -
                  merged_df2["FGA"] - merged_df2["FTA"] + merged_df2["OREB"] +
                  merged_df2["DREB"] + merged_df2["AST"] + merged_df2["STL"] +
                  merged_df2["BLK"] - merged_df2['TOV'] - merged_df2["PF"])

# Calculate team and opponent total stats
team_total_stats = (merged_df2['PTS_TEAM'] + merged_df2['FGM_TEAM'] + merged_df2['FTM_TEAM'] -
                    merged_df2['FGA_TEAM'] - merged_df2['FTA_TEAM'] + merged_df2['OREB_TEAM'] +
                    merged_df2['DREB_TEAM'] + merged_df2['AST_TEAM'] + merged_df2['STL_TEAM'] +
                    merged_df2['BLK_TEAM'] - merged_df2['TOV_TEAM'] - merged_df2['PF_TEAM'])

# Calculate PIE
merged_df2['PIE'] = (player_contrib/team_total_stats)*100

------

### PER


Functions for Calculating PER Variants:


- calculate_uPER: Calculates the unadjusted Player Efficiency Rating (uPER) using basic player statistics such as points, rebounds, assists, turnovers, and games played.
- calculate_aPER: Adjusts uPER based on the team's pace relative to the league average, yielding the adjusted PER (aPER).
- calculate_nPER: Normalizes aPER so that the league average is set to a constant value of 15, resulting in the normalized PER (nPER).

In [20]:
from nba_api.stats.endpoints import leaguedashteamstats, playercareerstats

def fetch_pace_data(season):
    team_stats = leaguedashteamstats.LeagueDashTeamStats(season=season, measure_type_detailed_defense='Advanced')
    team_data = team_stats.get_data_frames()[0]
    pace_data = team_data[['TEAM_ID', 'PACE']]
    league_pace = pace_data['PACE'].mean()
    return pace_data, league_pace

# Fetch player stats and team pace data
team_pace_data, league_pace = fetch_pace_data(season = '2023-24')

In [21]:
def calculate_PER(df, league_pace):
    # Calculate the basic components of PER
    df['REB'] = df['OREB'] + df['DREB']
    df['PER'] = (df['PTS'] + df['REB'] + df['AST'] + df['STL'] + df['BLK'] - (df['FGA'] - df['FGM']) - (df['FTA'] - df['FTM']) - df['TOV'])
    df['uPER'] = (df['PTS'] + df['REB'] + df['AST'] - df['TOV']) / df['GP']
    df['aPER'] = df['uPER'] * (league_pace / df['PACE'])

    league_average_aPER = df['aPER'].mean(skipna=True)
    df['nPER'] = df['aPER'] * (15 / league_average_aPER)

    return df

# Apply the function to calculate PER and uPER
merged_df3 = pd.merge(merged_df2, team_pace_data, on='TEAM_ID', how='inner')
merged_df3 = calculate_PER(merged_df3, league_pace)

In [22]:
# Sorting players by PER in descending order
overall_rankings = merged_df3.sort_values(by='nPER', ascending=False)

# Selecting top players
top_players = overall_rankings.head(45)

# Displaying the top players
print("Top Players Based on uPER:")
top_players[['PLAYER', 'SCHOOL', 'PIE', 'PPG', 'PER', 'uPER', 'aPER', 'nPER']].reset_index()

Top Players Based on uPER:


Unnamed: 0,index,PLAYER,SCHOOL,PIE,PPG,PER,uPER,aPER,nPER
0,102,Luka Doncic,Real Madrid,25.667828,33.857143,2580,48.871429,48.167181,53.186126
1,321,Joel Embiid,Kansas,15.146262,34.692308,1510,47.487179,47.96139,52.958892
2,114,Nikola Jokic,Mega Basket,28.636135,26.392405,3039,44.708861,45.498291,50.239141
3,220,Giannis Antetokounmpo,Filathlitikos,25.131051,30.438356,2655,45.054795,44.445308,49.076439
4,398,Shai Gilgeous-Alexander,Kentucky,21.912429,30.053333,2416,39.626667,38.958822,43.01827
5,175,Anthony Davis,Kentucky,23.352704,24.684211,2548,38.736842,37.884896,41.832443
6,266,Jalen Brunson,Villanova,20.11665,28.727273,1972,36.662338,37.881232,41.828397
7,369,Domantas Sabonis,Gonzaga,25.818452,19.426829,2679,37.97561,37.853568,41.79785
8,18,Jayson Tatum,Duke,17.978804,26.851351,2045,37.351351,37.797499,41.735939
9,186,LeBron James,St. Vincent-St. Mary HS (OH),20.171378,25.661972,2126,37.802817,36.971413,40.823777


-------

In [23]:

# Group by college and calculate the average points per game
avg_per_by_college = merged_df3.groupby('SCHOOL')['nPER'].mean().reset_index()

# Rename the columns for clarity
avg_per_by_college.columns = ['SCHOOL', 'AVG_PER']
avg_per_by_college['AVG_PER'] = round(avg_per_by_college['AVG_PER'],3)

# Calculate the count of players from each school
school_count = merged_df3['SCHOOL'].value_counts().reset_index()
school_count.columns = ['SCHOOL', 'COUNT']

# Merge the average PPG and player count DataFrames
avg_per_with_count = pd.merge(school_count, avg_per_by_college, on='SCHOOL')

# Sort by average points per game in descending order

avg_per_with_count['PER_WEIGHTED'] = (log(avg_per_with_count['COUNT'])*avg_per_with_count['AVG_PER']) + avg_per_with_count['AVG_PER']
avg_per_with_count['PER_WEIGHTED'] = round(avg_per_with_count['PER_WEIGHTED'], 3)

avg_per_with_count = avg_per_with_count.sort_values(by='PER_WEIGHTED', ascending=False)
filtered_per_count = avg_per_with_count[avg_per_with_count['PER_WEIGHTED'] > 20]

# Display the result
filtered_per_count.head(20)

Unnamed: 0,SCHOOL,COUNT,AVG_PER,PER_WEIGHTED
0,Kentucky,27,22.202,95.376
1,Duke,24,19.723,82.404
6,Gonzaga,10,20.362,67.247
5,Villanova,10,18.732,61.864
37,Mega Basket,4,24.955,59.55
44,Oklahoma,3,27.301,57.294
10,Arizona,9,17.865,57.118
47,Georgia,3,26.13,54.837
8,Southern California,10,16.526,54.579
15,Florida State,8,17.598,54.192


In [24]:
#merged_df.to_csv("data/merged_df.csv", index=None)