In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [102]:
df = pd.read_csv('synergy_all_playtypes_2015_to_2025.csv')
df

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,TYPE_GROUPING,PERCENTILE_Isolation,GP_Isolation,POSS_PCT_Isolation,...,TOV_POSS_PCT_Misc,SF_POSS_PCT_Misc,PLUSONE_POSS_PCT_Misc,SCORE_POSS_PCT_Misc,EFG_PCT_Misc,POSS_Misc,PTS_Misc,FGM_Misc,FGA_Misc,FGMX_Misc
0,22015,708,Kevin Garnett,1610612750,MIN,Minnesota Timberwolves,Offensive,,,,...,0.824,0.000,0.00,0.176,0.000,0.4,0.1,0.0,0.0,0.0
1,22015,977,Kobe Bryant,1610612747,LAL,Los Angeles Lakers,Offensive,0.292,66.0,0.199,...,0.407,0.051,0.00,0.254,0.119,0.9,0.5,0.0,0.3,0.3
2,22015,1495,Tim Duncan,1610612759,SAS,San Antonio Spurs,Offensive,,,,...,0.803,0.015,0.00,0.167,0.600,1.1,0.3,0.0,0.1,0.0
3,22015,1713,Vince Carter,1610612763,MEM,Memphis Grizzlies,Offensive,0.021,60.0,0.049,...,0.286,0.071,0.04,0.500,0.500,0.5,0.5,0.1,0.2,0.1
4,22015,1717,Dirk Nowitzki,1610612742,DAL,Dallas Mavericks,Offensive,0.938,75.0,0.046,...,0.383,0.000,0.00,0.500,0.222,0.8,0.7,0.0,0.1,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4698,22024,1642358,AJ Johnson,1610612764,WAS,Washington Wizards,Offensive,,,,...,,,,,,,,,,
4699,22024,1642366,Quinten Post,1610612744,GSW,Golden State Warriors,Offensive,,,,...,0.889,0.000,0.00,0.000,0.000,0.5,0.0,0.0,0.0,0.0
4700,22024,1642367,Jonathan Mogbo,1610612761,TOR,Toronto Raptors,Offensive,0.779,52.0,0.031,...,0.600,0.000,0.00,0.200,0.375,0.5,0.2,0.1,0.2,0.1
4701,22024,1642377,Jaylen Wells,1610612763,MEM,Memphis Grizzlies,Offensive,0.341,71.0,0.059,...,0.303,0.091,0.00,0.455,0.375,0.5,0.4,0.1,0.2,0.1


In [3]:
# Dropping columns that will not be used in the analysis
columns_to_drop = [
    'PLAYER_ID', 
     'TYPE_GROUPING',
     'SEASON_ID',
     'PLAYER_NAME'
]

# Drop columns with 'team' in the name
columns_to_drop += [col for col in df.columns if 'team' in col.lower()]

# Drop columns starting with FGA, FGMX, FGM, and POSS but keep those that have _PCT followed by other text
columns_to_drop += [col for col in df.columns if (col.startswith(('FGA', 'FGMX', 'FGM', 'POSS')) and not '_PCT' in col)]

# Drop columns with PERCENTILE
columns_to_drop += [col for col in df.columns if 'PERCENTILE' in col]

# Drop columns starting with PTS
columns_to_drop += [col for col in df.columns if col.startswith('PTS')]

# Drop columns starting with GP
columns_to_drop += [col for col in df.columns if col.startswith('GP')]

df['Player'] = df['SEASON_ID'].astype(str).str[-4:] + '_' + df['PLAYER_NAME']

df.drop(columns=columns_to_drop, inplace=True)

df.set_index('Player', inplace=True)
# Display the updated DataFrame
df

Unnamed: 0_level_0,POSS_PCT_Isolation,PPP_Isolation,FG_PCT_Isolation,FT_POSS_PCT_Isolation,TOV_POSS_PCT_Isolation,SF_POSS_PCT_Isolation,PLUSONE_POSS_PCT_Isolation,SCORE_POSS_PCT_Isolation,EFG_PCT_Isolation,POSS_PCT_Transition,...,EFG_PCT_OffRebound,POSS_PCT_Misc,PPP_Misc,FG_PCT_Misc,FT_POSS_PCT_Misc,TOV_POSS_PCT_Misc,SF_POSS_PCT_Misc,PLUSONE_POSS_PCT_Misc,SCORE_POSS_PCT_Misc,EFG_PCT_Misc
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015_Kevin Garnett,,,,,,,,,,0.093,...,,0.121,0.235,0.000,0.176,0.824,0.000,0.00,0.176,0.000
2015_Kobe Bryant,0.199,0.712,0.260,0.120,0.037,0.116,0.007,0.333,0.289,0.096,...,0.604,0.044,0.525,0.095,0.237,0.407,0.051,0.00,0.254,0.119
2015_Tim Duncan,,,,,,,,,,0.045,...,0.725,0.113,0.242,0.600,0.121,0.803,0.015,0.00,0.167,0.600
2015_Vince Carter,0.049,0.300,0.077,0.100,0.250,0.100,0.000,0.150,0.077,0.095,...,0.389,0.068,1.000,0.400,0.393,0.286,0.071,0.04,0.500,0.500
2015_Dirk Nowitzki,0.046,1.066,0.489,0.164,0.066,0.131,0.000,0.541,0.500,0.046,...,0.500,0.046,0.850,0.222,0.467,0.383,0.000,0.00,0.500,0.222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024_AJ Johnson,,,,,,,,,,0.282,...,,,,,,,,,,
2024_Quinten Post,,,,,,,,,,0.148,...,0.429,0.070,0.000,0.000,0.056,0.889,0.000,0.00,0.000,0.000
2024_Jonathan Mogbo,0.031,1.000,0.500,0.200,0.100,0.200,0.100,0.500,0.500,0.249,...,0.357,0.078,0.400,0.375,0.080,0.600,0.000,0.00,0.200,0.375
2024_Jaylen Wells,0.059,0.814,0.395,0.047,0.093,0.047,0.023,0.372,0.421,0.244,...,0.545,0.045,0.788,0.333,0.333,0.303,0.091,0.00,0.455,0.375


In [4]:
duplicated_mask = df.index.duplicated(keep=False)
duplicate_players = df.index[duplicated_mask].unique()
# Create a copy of the original DataFrame to work with
df_combined = df.copy()

# For each player with multiple entries, replace with their average stats
for player in duplicate_players:
    # Get all rows for this player
    player_rows = df.loc[player]
    
    # Calculate mean for all stats
    player_mean = player_rows.mean()
    
    # Remove all duplicate rows
    df_combined = df_combined.drop(index=player)
    
    # Add back the averaged row
    player_mean.name = player
    df_combined = pd.concat([df_combined, pd.DataFrame(player_mean).T])

Found 420 players with multiple entries


In [5]:
def cosine_similarity(A, B):
    """
    Compute cosine similarity between A (target) and B (others).
    """
    # Convert to numpy arrays if they aren't already
    A = np.asarray(A)
    B = np.asarray(B)
    
    # Compute norms of the matrices
    A_norm = np.sqrt(np.sum(A**2, axis=1))
    B_norm = np.sqrt(np.sum(B**2, axis=1))
    
    # Handle zero vectors to avoid division by zero
    # Replace zero norms with 1
    A_norm[A_norm == 0] = 1
    B_norm[B_norm == 0] = 1
    
    # Compute dot product
    dot_product = np.dot(A, B.T)
    
    # Reshape
    A_norm = A_norm.reshape(-1, 1)
    B_norm = B_norm.reshape(1, -1)
    
    # Compute similarity
    similarity = dot_product / (A_norm * B_norm)
    
    return similarity

In [6]:
def MinMaxScaler(vector):
    """
    Apply min-max scaling to a vector, transforming values to range [0, 1].
    """
    # Convert to numpy array if it's not already
    vector = np.asarray(vector)
    
    # Get min and max values
    min_val = np.min(vector)
    max_val = np.max(vector)
    
    # Handle the case where min == max (constant vector)
    if min_val == max_val:
        return np.zeros_like(vector)
    
    # Apply min-max scaling
    scaled_vector = (vector - min_val) / (max_val - min_val)
    
    return scaled_vector

In [105]:
def collab_filter(df, target_user, k):

  # Check if the target player is in the DataFrame
  if target_user not in df.index:
    print(f"Error: {target_user} not found in the dataset.")
    return None

  
  # Centering the data around the mean for each player
  player_means = df.mean(axis=1)
  df_scaled = df.sub(player_means, axis=0).fillna(0)
  
  # Separating the target player from the rest
  target_stats = df_scaled.loc[target_user].values.reshape(1,-1)
  # Extract the player name (everything after the first underscore)
  player_name = target_user.split('_', 1)[1]

  # Create a mask to find all rows where this player appears
  player_mask = ~df_scaled.index.str.contains(f"_{player_name}$")

  # Filter the dataframe to exclude all seasons of the target player
  other_player_stats = df_scaled[player_mask]
  
  # Calculate cosine similarity
  similarity_scores = cosine_similarity(target_stats, other_player_stats)[0]

  # Minmax scaling the similarity scores
  similarity_scores = MinMaxScaler(similarity_scores.reshape(-1,1))
  similarity_scores = similarity_scores.flatten()

  filtered_indices = df.index[player_mask]
  target_user_sim_scores = pd.Series(similarity_scores, index=filtered_indices)

  # Getting the top k most similar players and their stats
  top_k_sim_players = target_user_sim_scores.nlargest(k)

  # Finding which stats the target player is missing
  unscored = df.loc[target_user][df.loc[target_user].isna()].index


  predictions = []
  for stat in unscored:
      # Extract the stat type (everything before the last underscore)
      stat_parts = stat.split('_')
      
      if len(stat_parts) > 2:
          stat_type = '_'.join(stat_parts[:-1])
      else:
          stat_type = stat_parts[0] 
      
      # Find all columns with the same stat type but different play styles
      related_cols = [col for col in df.columns if col.startswith(stat_type + '_') and col != stat]
      
      # Finding the stats of the top k similar players for the missing stat
      relevant_stats = df.loc[top_k_sim_players.index, stat]

      
      # For players with missing values, fill with mean of related stats for that player
      for player in top_k_sim_players.index:

          if pd.isna(relevant_stats[player]):
              # Get this player's values for related stats
              player_related_values = df.loc[player, related_cols]
              
              # Calculate mean of non-NaN values
              player_related_mean = player_related_values.mean(skipna=True)
              
              # If we have a valid mean, use it; otherwise keep as NaN
              if not pd.isna(player_related_mean):
                  relevant_stats[player] = player_related_mean
      
      # If we still have NaNs after trying to fill with related stats, use global mean
      if relevant_stats.isna().any():
          # Get global mean for this stat from players who have it
          global_stat_mean = df[stat].mean(skipna=True)
          relevant_stats = relevant_stats.fillna(global_stat_mean)
      
      # Calculating the predicted stat for the missing stat
      prediction = round(np.sum(relevant_stats * top_k_sim_players)/np.sum(top_k_sim_players), 2)
      predictions.append((stat, prediction))
  
  return predictions, top_k_sim_players

In [106]:
predictions, top_k_sim_players = collab_filter(df_combined, '2024_Devin Booker', 5)
print(predictions)
print(top_k_sim_players.index)

[('POSS_PCT_Transition', 0.13), ('PPP_Transition', 1.12), ('FG_PCT_Transition', 0.51), ('FT_POSS_PCT_Transition', 0.13), ('TOV_POSS_PCT_Transition', 0.11), ('SF_POSS_PCT_Transition', 0.11), ('PLUSONE_POSS_PCT_Transition', 0.02), ('SCORE_POSS_PCT_Transition', 0.5), ('EFG_PCT_Transition', 0.6), ('POSS_PCT_Cut', 0.09), ('PPP_Cut', 1.06), ('FG_PCT_Cut', 0.5), ('FT_POSS_PCT_Cut', 0.11), ('TOV_POSS_PCT_Cut', 0.1), ('SF_POSS_PCT_Cut', 0.1), ('PLUSONE_POSS_PCT_Cut', 0.03), ('SCORE_POSS_PCT_Cut', 0.48), ('EFG_PCT_Cut', 0.56), ('POSS_PCT_Misc', 0.07), ('PPP_Misc', 0.82), ('FG_PCT_Misc', 0.55), ('FT_POSS_PCT_Misc', 0.17), ('TOV_POSS_PCT_Misc', 0.33), ('SF_POSS_PCT_Misc', 0.06), ('PLUSONE_POSS_PCT_Misc', 0.01), ('SCORE_POSS_PCT_Misc', 0.41), ('EFG_PCT_Misc', 0.58)]
Index(['2023_Paul George', '2023_Jayson Tatum', '2023_Kyle Kuzma',
       '2024_Kevin Durant', '2024_Harrison Barnes'],
      dtype='object')


In [107]:
def display_predicted_stats(player_name, predicted_stats, df):
    """
    Display predicted stats with rankings compared to other players from the same year.
    """
    # If no missing stats, return
    if not predicted_stats:
        return
    
    # Extract year from player name
    year = player_name.split('_')[0]
    
    # Extract player name
    name = player_name.replace('_', ' ')
    
    # Group stats by playstyle
    playstyle_stats = {}
    for stat_name, predicted_value in predicted_stats:
        playstyle = stat_name.split('_')[-1]
        if playstyle not in playstyle_stats:
            playstyle_stats[playstyle] = []
        playstyle_stats[playstyle].append((stat_name, predicted_value))
    
    # Display stats for each playstyle
    for playstyle, stats in playstyle_stats.items():
        print(f"\nIf {name} played with a {playstyle} playstyle, their stats would be:")
        
        # For each stat, find its ranking among players from the same year with same playstyle
        for stat_name, predicted_value in stats:
            # Extract base stat name without playstyle
            base_stat_name = '_'.join(stat_name.split('_')[:-1])
            
            # Get all values for this specific stat+playstyle from players in the same year
            year_mask = df.index.str.startswith(year)
            playstyle_values = df.loc[year_mask, stat_name].dropna().sort_values(ascending=False)
            
            # If there are no values for comparison, skip ranking
            if len(playstyle_values) == 0:
                formatted_value = f"{predicted_value:.3f}" if 'PCT' in stat_name else f"{predicted_value:.2f}"
                print(f"  {base_stat_name}: {formatted_value} (no other players for comparison)")
                continue
                
            # Calculate rank (add the predicted value and find its position)

            combined_values = pd.concat([playstyle_values, pd.Series([predicted_value], index=['predicted'])])

            combined_values = combined_values.sort_values(ascending=False)
            rank = combined_values.index.get_loc('predicted') + 1
            
            # Format value based on stat type
            if 'PCT' in stat_name:
                formatted_value = f"{predicted_value:.3f}"
            else:
                formatted_value = f"{predicted_value:.2f}"
                
            # Get ordinal suffix
            if rank % 10 == 1 and rank != 11:
                suffix = 'st'
            elif rank % 10 == 2 and rank != 12:
                suffix = 'nd'
            elif rank % 10 == 3 and rank != 13:
                suffix = 'rd'
            else:
                suffix = 'th'
                
            # Print stat with ranking
            print(f"  {base_stat_name}: {formatted_value} ({rank}{suffix} out of {len(playstyle_values) + 1})")

In [108]:
display_predicted_stats('2023_Devin Booker', predictions, df_combined)


If 2023 Devin Booker played with a Transition playstyle, their stats would be:
  POSS_PCT: 0.130 (267th out of 342)
  PPP: 1.12 (184th out of 342)
  FG_PCT: 0.510 (196th out of 342)
  FT_POSS_PCT: 0.130 (140th out of 342)
  TOV_POSS_PCT: 0.110 (164th out of 342)
  SF_POSS_PCT: 0.110 (164th out of 342)
  PLUSONE_POSS_PCT: 0.020 (187th out of 342)
  SCORE_POSS_PCT: 0.500 (179th out of 342)
  EFG_PCT: 0.600 (179th out of 342)

If 2023 Devin Booker played with a Cut playstyle, their stats would be:
  POSS_PCT: 0.090 (108th out of 282)
  PPP: 1.06 (250th out of 282)
  FG_PCT: 0.500 (265th out of 282)
  FT_POSS_PCT: 0.110 (181st out of 282)
  TOV_POSS_PCT: 0.100 (39th out of 282)
  SF_POSS_PCT: 0.100 (190th out of 282)
  PLUSONE_POSS_PCT: 0.030 (150th out of 282)
  SCORE_POSS_PCT: 0.480 (267th out of 282)
  EFG_PCT: 0.560 (235th out of 282)

If 2023 Devin Booker played with a Misc playstyle, their stats would be:
  POSS_PCT: 0.070 (82nd out of 308)
  PPP: 0.82 (31st out of 308)
  FG_PCT: 0.

In [62]:
def create_player_comparison_radar(target_player, similar_players, df, k=5):
    """
    Create a radar chart comparing a target player with similar players using average stats across all play types.
    """
    # Extract player names for the legend
    target_name = target_player.replace('_', ' ')
    
    # Limit to top k similar players
    top_k_similar = similar_players.head(k)
    
    # Define the playstyles and stat types
    playstyles = ['Isolation', 'Transition', 'PRBallHandler', 'PRRollman', 
                  'Postup', 'Spotup', 'Handoff', 'Cut', 'OffScreen', 'OffRebound']
    key_stat_types = ['PPP', 'EFG_PCT', 'FG_PCT', 'SCORE_POSS_PCT', 'TOV_POSS_PCT', 'FT_POSS_PCT', 'SF_POSS_PCT', 'PLUSONE_POSS_PCT']
    
    # Function to find a player's dominant playstyle
    def get_dominant_playstyle(player_id):
        poss_pcts = {}
        for style in playstyles:
            col = f'POSS_PCT_{style}'
            if col in df.columns and not pd.isna(df.loc[player_id, col]):
                poss_pcts[style] = df.loc[player_id, col]
        
        if poss_pcts:
            return max(poss_pcts.items(), key=lambda x: x[1])[0]
        return "Unknown"
    
    # Get target player's dominant playstyle
    target_dominant_style = get_dominant_playstyle(target_player)
    
    # Calculate average stats across all playstyles for each stat type
    avg_stats = {}
    for stat_type in key_stat_types:
        # Get columns for this stat type across all playstyles
        stat_cols = [f"{stat_type}_{style}" for style in playstyles if f"{stat_type}_{style}" in df.columns]
        
        if not stat_cols:
            continue
            
        # Calculate average for target player and each similar player
        avg_stats[stat_type] = {}
        
        # For target player
        target_values = df.loc[target_player, stat_cols].dropna()
        if len(target_values) > 0:
            avg_stats[stat_type][target_player] = target_values.mean()
        else:
            avg_stats[stat_type][target_player] = 0
            
        # For similar players
        for player in top_k_similar.index:
            player_values = df.loc[player, stat_cols].dropna()
            if len(player_values) > 0:
                avg_stats[stat_type][player] = player_values.mean()
            else:
                avg_stats[stat_type][player] = 0
    
    # Get the list of stats that have data
    valid_stats = list(avg_stats.keys())[:len(key_stat_types)]
    
    if len(valid_stats) == 0:
        print("No valid stats found across playstyles.")
        return None
    
    # Create readable labels for the chart
    stat_labels = [stat.replace('PCT', '%') for stat in valid_stats]
    
    # Collect all values for normalization
    all_values = {}
    for stat in valid_stats:
        all_values[stat] = []
        for player in [target_player] + list(top_k_similar.index):
            if player in avg_stats[stat]:
                all_values[stat].append(avg_stats[stat][player])
    
    # Initialize the plot
    fig = go.Figure()
    
    # Add the target player
    target_stats = [avg_stats[stat][target_player] for stat in valid_stats]
    
    
    # Add trace for target player with dominant playstyle in label
    fig.add_trace(go.Scatterpolar(
        r=target_stats,
        theta=stat_labels,
        fill='toself',
        name=f"{target_name} ({target_dominant_style})",
        hovertemplate='%{theta}: %{customdata:.3f}<extra></extra>',
        customdata=target_stats
    ))
    
    # Add similar players
    for i, player in enumerate(top_k_similar.index, 1):
        # Get player's dominant playstyle
        player_dominant_style = get_dominant_playstyle(player)
        
        player_name = player.replace('_', ' ')
        player_stats = [avg_stats[stat][player] for stat in valid_stats]
        
        
        # Add trace for similar player with dominant playstyle in label
        fig.add_trace(go.Scatterpolar(
            r=player_stats,
            theta=stat_labels,
            fill='toself',
            name=f"{player_name} ({player_dominant_style}, Sim: {top_k_similar[player]:.2f})",
            hovertemplate='%{theta}: %{customdata:.3f}<extra></extra>',
            customdata=player_stats
        ))
    
    # Update layout
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                showticklabels=False
            )
        ),
        title=dict(
            text=f"<b>{target_name} vs {k} Most Similar Players</b><br><sup>Average Across All Playstyles</sup>",
            x=0.5,
            font=dict(size=18)
        ),
        legend=dict(
            orientation="h",  
            yanchor="top",    
            y=-0.15,          
            xanchor="center", 
            x=0.5,            
            font=dict(size=10), 
            itemwidth=30      
        ),
        margin=dict(l=80, r=80, t=100, b=150), 
        height=800,          
        width=800
    )
    
    return fig

In [44]:
fig = create_player_comparison_radar('2023_Zach LaVine', top_k_sim_players, df_combined)

In [45]:
fig.show()