In [1]:
!pip install nba_api pandas matplotlib seaborn scikit-learn

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from nba_api.stats.endpoints import leaguedashplayerstats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Set visualization style: apply seaborn's "whitegrid" theme to the figures below
sns.set(style="whitegrid")
# Render matplotlib figures inline, directly beneath the cell that draws them
%matplotlib inline

Defaulting to user installation because normal site-packages is not writeable


In [2]:
def get_nba_data(season='2023-24'):
    """Download per-game player stats for one season and join Base with Advanced.

    Two `LeagueDashPlayerStats` requests are issued (measure types 'Base' and
    'Advanced', both 'PerGame' / 'Regular Season'), separated by a 0.5 s pause.

    Args:
        season: Season label passed straight to the endpoint, e.g. '2023-24'.

    Returns:
        The Base table left-joined on PLAYER_ID and PLAYER_NAME with a fixed
        subset of Advanced columns; every Base row is kept.
    """
    print(f"Fetching data for {season} season via NBA API...")

    def fetch_table(measure_type):
        # Both requests differ only in the measure type; the first frame
        # returned by the endpoint is the one used.
        endpoint = leaguedashplayerstats.LeagueDashPlayerStats(
            season=season,
            per_mode_detailed='PerGame',
            season_type_all_star='Regular Season',
            measure_type_detailed_defense=measure_type,
        )
        return endpoint.get_data_frames()[0]

    base_stats = fetch_table('Base')
    time.sleep(0.5)  # Short pause for API politeness
    advanced_stats = fetch_table('Advanced')

    # Only these Advanced columns are carried over to the merged table.
    join_keys = ['PLAYER_ID', 'PLAYER_NAME']
    advanced_metrics = ['OFF_RATING', 'DEF_RATING', 'NET_RATING',
                        'TS_PCT', 'USG_PCT', 'AST_PCT', 'REB_PCT']
    return base_stats.merge(
        advanced_stats[join_keys + advanced_metrics],
        on=join_keys,
        how='left',
    )

# Pull the merged per-game table for the default season
df = get_nba_data()

# Keep only players averaging more than 15 minutes per game
df['MIN'] = df['MIN'].astype(float)
df_filtered = df.loc[df['MIN'].gt(15)].reset_index(drop=True)
print(f"Data Loaded. Players: {len(df_filtered)}")

Fetching data for 2023-24 season via NBA API...
Data Loaded. Players: 336


In [3]:
# Columns fed to PCA / K-Means. The order is kept as-is because it also
# determines the column order of the cluster summary table.
features = [
    'PTS', 'REB', 'AST', 'STL', 'BLK',
    'FG_PCT', 'FG3_PCT', 'FT_PCT',
    'USG_PCT', 'TS_PCT', 'AST_PCT',
    'FG3A',
]

# Missing values become 0, then every feature is standardized
X = df_filtered.loc[:, features].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [4]:
# Project the standardized features onto two principal components (used for plotting)
pca = PCA(n_components=2)
pca_data = pca.fit_transform(X_scaled)
pca_df = pd.DataFrame(pca_data, columns=['PC1', 'PC2']).assign(
    Player=df_filtered['PLAYER_NAME']
)

# Cluster on the full standardized feature matrix, not on the 2-D projection
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(X_scaled)
pca_df['Cluster'] = kmeans.labels_
df_filtered['Cluster'] = pca_df['Cluster']

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Cluster',
                palette='viridis', s=100, ax=ax)

# Label only the 15 highest-usage players so the chart stays readable
stars = df_filtered.nlargest(15, 'USG_PCT')['PLAYER_NAME'].tolist()
star_points = pca_df.loc[pca_df['Player'].isin(stars), ['Player', 'PC1', 'PC2']]
for name, pc1, pc2 in star_points.itertuples(index=False):
    # Nudge the label slightly to the right of its marker
    ax.text(pc1 + 0.1, pc2, name, fontsize=9)

ax.set_title('NBA Player Archetypes (NBA API Data)')
plt.show()

In [6]:
print("Cluster Averages:")
# Mean of every clustering feature per cluster, shaded column-wise with a heat map
display(
    df_filtered
    .groupby('Cluster')[features]
    .mean()
    .style
    .background_gradient(cmap='coolwarm')
)

Cluster Averages:


Cluster,PTS,REB,AST,STL,BLK,FG_PCT,FG3_PCT,FT_PCT,USG_PCT,TS_PCT,AST_PCT,FG3A
0,20.550575,5.211494,5.417241,1.056322,0.524138,0.472747,0.367609,0.824184,0.256506,0.582678,0.253782,5.717241
1,8.719375,3.296875,2.048125,0.711875,0.35125,0.43805,0.355925,0.784131,0.162587,0.557375,0.128775,3.655
2,9.504,7.864,1.54,0.648,1.184,0.62376,0.03284,0.62376,0.15088,0.63624,0.0934,0.084
3,11.567857,6.194643,2.080357,0.785714,0.876786,0.532714,0.381429,0.758411,0.175518,0.616554,0.112286,2.617857
4,3.55,3.525,1.425,0.8125,0.35,0.3155,0.184875,0.236875,0.133875,0.3455,0.122875,1.65
