Start with

- Formations (under center, shotgun, pistol)
- Personnel (% 11, % mult TEs, % no TEs, % mult RBs, % no RBs, % extra OL)

- % Pass
- % Pass neutral downs
- QB Scrambles

- ADOT
- % Screens
- % Long
- % passes from play-action
- % passes from under center vs shotgun vs pistol
- number of receivers needed to account for 80% of targets

- % runs middle, guard/tackle, edge
- % rushes from under center vs. shotgun vs. pistol
- number of rushers needed to account for 20% of rushes

In [1]:
''' Imports '''

import pandas as pd
import polars as pl
import numpy as np
import math

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import percentileofscore

from prep_data import load_pbp_participation_data, load_stats_team_tendencies_offense, load_stats_team_tendencies_defense

In [9]:
# Load the offensive team-tendency table. Later cells select rows from it through
# its 'posteam' and 'season' index levels, so this cell must run before them.
offense_tendencies = load_stats_team_tendencies_offense()

# Preview the first rows as plain text without column truncation
print(offense_tendencies.head().to_string())

KeyboardInterrupt: 

In [2]:
# Load the play-by-play / participation data used by the personnel cells below
pbp_data = load_pbp_participation_data()

# Preview the first rows as plain text without column truncation
print(pbp_data.head().to_string())

  play_id          game_id old_game_id home_team away_team season_type week posteam posteam_type defteam side_of_field yardline_100   game_date quarter_seconds_remaining half_seconds_remaining game_seconds_remaining game_half quarter_end drive   sp  qtr down goal_to_go   time   yrdln ydstogo ydsnet                                                                                              desc play_type yards_gained shotgun no_huddle qb_dropback qb_kneel qb_spike qb_scramble pass_length pass_location air_yards yards_after_catch run_location run_gap field_goal_result kick_distance extra_point_result two_point_conv_result home_timeouts_remaining away_timeouts_remaining timeout timeout_team td_team td_player_name td_player_id posteam_timeouts_remaining defteam_timeouts_remaining total_home_score total_away_score posteam_score defteam_score score_differential posteam_score_post defteam_score_post score_differential_post no_score_prob opp_fg_prob opp_safety_prob opp_td_prob   fg_prob safet

In [8]:
# List the distinct personnel-group labels present in the play-by-play data
print(pbp_data['OffensePersonnelGroup'].unique().tolist())

['21', '11', 'Other', '12', '13', '22']


In [4]:
''' Features '''
# '% Pass Neutral Downs', '% Under Center Neutral Downs', '% Shotgun Neutral Downs',

# Columns of `offense_tendencies` used by the correlation, variance, scaling, PCA
# and cluster-summary cells below.
OFFENSE_FEATURES = [
    'Plays / Game', 'Drives / Game', 
    '% Pass',  'Scrambles / Game',
    '% Plays Plays_11_Personnel', '% Plays Plays_Mult_RBs', '% Plays Plays_Zero_RBs', '% Plays Plays_Mult_TEs', '% Plays Plays_Zero_TEs', '% Plays Plays_Extra_OL',
    '% Under Center', '% Shotgun', 'Shotgun % Pass', 'Under Center % Pass',
    'ADOT', 'ADOT to Sticks', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare',
    '% Rush Inside', '% Rush Outside', 'MaxRushAttemptsShare',
]

# Smaller subset of OFFENSE_FEATURES plotted by `offense_team_spider_chart`.
VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game', 
                '% Plays Plays_11_Personnel',
                '% Under Center', 'ADOT', 'Avg Time to Throw', 'MaxTargetShare', 
                '% Rush Outside', 'MaxRushAttemptsShare']

# VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game', 
#                 '% Plays Plays_11_Personnel', '% Plays Plays_Mult_RBs', '% Plays Plays_Mult_TEs',
#                 '% Under Center', 'Shotgun % Pass', 'Under Center % Pass', 
#                 'ADOT', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare', 
#                 '% Rush Outside', 'MaxRushAttemptsShare']

# Visualize

In [5]:
''' Visualize a team '''
# TODO - % rushes from under center vs shotgun

def offense_team_spider_chart(team: str, season: int, features=None):
    """Show one team-season as a percentile radar chart next to a value table.

    Parameters
    ----------
    team : str
        Value matched against the 'posteam' index level of `offense_tendencies`.
    season : int
        Value matched against the 'season' index level of `offense_tendencies`.
    features : list of str, optional
        Columns of `offense_tendencies` to chart. Defaults to the module-level
        `VIZ_FEATURES`, looked up when the function is called.

    Raises
    ------
    ValueError
        If `offense_tendencies` has no row for the requested team and season.
    """
    # Resolve the default at call time so a later re-assignment of VIZ_FEATURES is honoured
    features = list(VIZ_FEATURES) if features is None else list(features)

    ## Data ##
    # Select the single team-season row
    idx = offense_tendencies.index
    team_mask = (idx.get_level_values('posteam') == team) & (idx.get_level_values('season') == season)
    team_sl = offense_tendencies.loc[team_mask, features]
    if team_sl.empty:
        raise ValueError(f'No offensive tendencies found for {season} {team}')
    team_feature_vals = team_sl.iloc[0].tolist()

    # Format each value and compute its percentile within the whole table
    vals_fmt = []
    pct_scores = []
    pct_scores_fmt = []
    for feature, val in zip(features, team_feature_vals):
        pct_score = percentileofscore(offense_tendencies[feature].tolist(), val, kind='weak') / 100

        # Any feature with '%' in its name is shown as a percentage, not only those
        # starting with it (covers 'Shotgun % Pass' / 'Under Center % Pass').
        # Assumes these columns are stored as 0-1 fractions like '% Pass' -- TODO confirm.
        vals_fmt.append(f'{val:.1%}' if '%' in feature else f'{val:.2f}')
        pct_scores.append(pct_score)
        pct_scores_fmt.append(f'{pct_score:.1%}')

    ## Figure ##
    # Left: polar chart of percentiles. Right: table of raw values and percentiles.
    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[4, 3],
        horizontal_spacing=0.1,
        specs=[[{"type": "polar"}, {"type": "domain"}]]
    )

    fig.add_trace(
        go.Scatterpolar(
            r=pct_scores,
            theta=features,
            opacity=0.7,
            fill='toself'
        ),
        row=1, col=1
    )
    fig.update_layout(
        title_text=f"Team: {season} {team}",
        polar=dict(radialaxis_range=(0, 1)),
        margin=dict(b=50, r=50, l=75, t=75)
    )

    fig.add_trace(
        go.Table(
            columnwidth=[2, 1, 1],
            header={
                "values": ['Feature', 'Value', 'Percentile'],
            },
            cells={
                "values": [features, vals_fmt, pct_scores_fmt]
            }
        ),
        row=1, col=2
    )

    fig.show()

offense_team_spider_chart('PHI', 2024)

NameError: name 'offense_tendencies' is not defined

In [37]:
''' Team personnel spider chart '''

# Display order of the personnel groupings
PERSONNEL_COLS = ['11', '12', '13', '21', '22', 'Other']

# Play counts per team / season / personnel grouping
offense_personnel = (
    pbp_data
    .groupby(['posteam', 'season', 'OffensePersonnelGroup'])
    .aggregate(Plays=('posteam', 'size'))
)

# Share of each team-season's plays run from each grouping
team_season_plays = offense_personnel.groupby(level=['posteam', 'season'])['Plays'].transform('sum')
offense_personnel['% Plays'] = offense_personnel['Plays'] / team_season_plays

# Percentile of that share among all team-seasons, computed within each grouping
offense_personnel['% Plays %ile'] = (
    offense_personnel
    .groupby(level='OffensePersonnelGroup')['% Plays']
    .rank(pct=True)
)

# Order the personnel level as listed in PERSONNEL_COLS
offense_personnel = offense_personnel.reindex(labels=PERSONNEL_COLS, level='OffensePersonnelGroup')

# print(offense_personnel.loc[offense_personnel.index.get_level_values(2) == '11',:].sort_values(by='% Plays', ascending=False).to_string())

def offense_personnel_spider_chart(team: str, season: int):
    """Show a team-season's personnel usage as a polar line chart next to a table.

    Parameters
    ----------
    team : str
        Value matched against the 'posteam' index level of `offense_personnel`.
    season : int
        Value matched against the 'season' index level of `offense_personnel`.

    Raises
    ------
    ValueError
        If `offense_personnel` has no rows for the requested team and season
        (previously this produced an empty chart without any message).
    """

    ## Data ##

    # Rows for the requested team-season, one per personnel grouping
    idx = offense_personnel.index
    team_mask = (idx.get_level_values('posteam') == team) & (idx.get_level_values('season') == season)
    team_sl = offense_personnel.loc[team_mask, :]
    if team_sl.empty:
        raise ValueError(f'No personnel data found for {season} {team}')

    # Personnel labels, play shares and their percentiles
    cols = team_sl.index.get_level_values('OffensePersonnelGroup').tolist()
    vals = team_sl['% Plays'].tolist()
    percentiles = team_sl['% Plays %ile'].tolist()

    ## Figure ##

    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[4, 3],
        horizontal_spacing=0.1,
        specs=[[{"type": "polar"}, {"type": "domain"}]]
    )

    # Build the closed polar line with plotly express, then copy its traces into the subplot
    spider = px.line_polar(
        r=vals,
        theta=cols,
        line_close=True,
    )
    for trace in spider.data:
        fig.add_trace(trace, row=1, col=1)

    fig.update_layout(
        title_text=f"Offensive Personnel: {season} {team}",
        polar=dict(radialaxis_range=(0, 1)),
        margin=dict(b=50, r=50, l=75, t=75)
    )

    fig.add_trace(
        go.Table(
            columnwidth=[2, 1, 1],
            header={
                "values": ['Personnel', 'Value', 'Percentile'],
            },
            cells={
                "values": [cols, [f'{val:.1%}' for val in vals], [f'{p:.1%}' for p in percentiles]]
            }
        ),
        row=1, col=2
    )

    fig.show()


print(offense_personnel.head().to_string())

offense_personnel_spider_chart('NO', 2024)

                                      Plays   % Plays  % Plays %ile
posteam season OffensePersonnelGroup                               
ARI     2018   11                       654  0.703226      0.691964
               12                       161  0.173118      0.334821
               13                        22  0.023656      0.413462
               21                        62  0.066667      0.653659
               22                        24  0.025806      0.666667


In [None]:
''' Correlation Matrix '''

# Pairwise correlations between the offensive tendency features
corr_matrix = offense_tendencies[OFFENSE_FEATURES].corr()

# Heatmap on a diverging colour scale centred at zero, feature labels along the top
fig = (
    px.imshow(
        corr_matrix,
        aspect="auto",
        color_continuous_scale=px.colors.diverging.PRGn,
    )
    .update_xaxes(side="top")
    .update_coloraxes(cmid=0, showscale=False)
    .update_layout(title='Feature Correlations', margin={'r': 25, 'b': 25})
)
fig.show()

In [None]:
''' Variance '''

feature_values = offense_tendencies[OFFENSE_FEATURES]

# Ten features with the largest variance (ascending sort, so the largest come last)
top_ten_variance = feature_values.var().sort_values().tail(10)

# Bar chart of `top_ten_variance`
fig = px.bar(
    x=top_ten_variance,
    y=top_ten_variance.index,
    title="Offense Tendencies: High Variance Features",
).update_layout(xaxis_title="Variance", yaxis_title="Features")
fig.show()

# Same ranking using scipy's `trimmed_var` with its default limits
top_ten_trim_variance = feature_values.apply(trimmed_var).sort_values().tail(10)

# Bar chart of `top_ten_trim_variance`
fig = px.bar(
    x=top_ten_trim_variance,
    y=top_ten_trim_variance.index,
    title="Offense Tendencies: High Variance Features (Trimmed)",
).update_layout(xaxis_title="Trimmed Variance", yaxis_title="Features")
fig.show()


In [None]:
# Distribution of plays per game across all team-seasons
PLAYS_PER_GAME = 'Plays / Game'
fig = px.histogram(offense_tendencies, x=PLAYS_PER_GAME, title=PLAYS_PER_GAME)
fig.show()

# Model Preprocessing

In [None]:
''' Transform and Scale '''

# ## Log transform data
# transformed_data = pd.DataFrame(np.log(offense_tendencies[OFFENSE_FEATURES]), columns=OFFENSE_FEATURES).replace(math.inf, 0).replace(-(math.inf), 0)

## Standardize every feature column
scaler = StandardScaler()
scaled_data = scaler.fit_transform(offense_tendencies[OFFENSE_FEATURES])

# Wrap the scaled array in a DataFrame. It gets a default positional index;
# later cells join back to `offense_tendencies` by row position.
scaled_data_df = pd.DataFrame(data=scaled_data, columns=OFFENSE_FEATURES)

print(f"scaled DF type: {type(scaled_data_df)}")
print(f"scaled DF shape: {scaled_data_df.shape}")
scaled_data_df.head()

In [None]:
''' Variance of Scaled / Transformed Data '''

# Ten scaled features with the largest variance (ascending sort, largest last)
top_ten_variance = scaled_data_df.var().sort_values().tail(10)

# Bar chart of `top_ten_variance`
fig = px.bar(
    x=top_ten_variance,
    y=top_ten_variance.index,
    title="Scaled Data: High Variance Features",
).update_layout(xaxis_title="Variance", yaxis_title="Features")
fig.show()

# PCA

In [None]:
''' PCA '''

# Fit a PCA that keeps every component, to see how variance is distributed
pca = PCA(random_state=42)
pca_component_data = pca.fit_transform(scaled_data_df)

print('Total variance:', scaled_data_df.var().sum())
print('Singular values:\n', pca.singular_values_)
print('Explained variance:\n', pca.explained_variance_.round(5))
print('Ratio:\n', pca.explained_variance_ratio_.round(3))
print(pca.feature_names_in_)

# Line chart of cumulative explained-variance ratio by number of components
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
fig = px.line(
    x=list(range(1, len(cumulative_ratio) + 1)),
    y=cumulative_ratio,
    title="Explained variance",
)
fig.update_layout(xaxis_title="Principal Component", yaxis_title="Cumulative Explained Variance (%)")
fig.show()

In [None]:
''' PCA - final '''

# Number of components kept after inspecting the full PCA above
PCA_N_COMPONENTS = 8
COMPONENT_COLS = [f'Component {n}' for n in range(1, PCA_N_COMPONENTS + 1)]

# Fit the reduced PCA and project the scaled features onto it
pca_final = PCA(n_components=PCA_N_COMPONENTS, random_state=42)
pca_component_data_final = pca_final.fit_transform(scaled_data_df)

# Compare retained variance with the total variance of the scaled data
total_variance = scaled_data_df.var().sum()
expl_variance = pca_final.explained_variance_.sum()

print(f'Data set variance: {total_variance:,.3f}')
print(f'PCA explained variance: {expl_variance:,.3f} ({round((expl_variance / total_variance) * 100, 2)}%)')

# One row per component, one column per feature
pcs = pd.DataFrame(pca_final.components_, columns=OFFENSE_FEATURES)

# Bar chart of feature weights for the first two components
for n in range(2):      #PCA_N_COMPONENTS):
    pc = pcs.loc[n].sort_values(ascending=False)

    fig = px.bar(x=pc, y=pc.index, title=f"PC{n+1}: Greatest contributors")
    fig.update_layout(
        xaxis_title="Correlation",
        yaxis_title="Features",
        yaxis={'dtick': 1, 'categoryorder': 'total ascending'},
    )
    fig.show()

    comp_expl_variance = pca_final.explained_variance_[n]
    print(f'Explained variance: {comp_expl_variance:,} ({round((comp_expl_variance / total_variance) * 100, 2)}%)')

# Component scores, one row per row of `scaled_data_df` (positional index)
pca_component_df = pd.DataFrame(data=pca_component_data_final, columns=COMPONENT_COLS)
print(pca_component_df.shape)
print(pca_component_df.head().to_string())

# Attach the scores to the original table by row position. Columns left over from
# a previous run of this cell are dropped first so re-running it is safe.
stale_component_cols = [col for col in offense_tendencies.columns if col.startswith("Component")]
offense_tendencies = (
    offense_tendencies
    .drop(columns=stale_component_cols)
    .reset_index()
    .merge(pca_component_df, left_index=True, right_index=True, how='left')
    .set_index(['posteam', 'season'])
)


print('PCA values')
print(offense_tendencies.head().to_string())

In [None]:
''' Visualize PCA - Correlation Chart '''

# Display names for the components (currently just the column names)
COMPONENT_NAMES = COMPONENT_COLS

# Feature-by-component matrix of PCA weights: rows are features, columns are components
corr_matrix = pd.DataFrame(
    pca_final.components_.T,
    index=OFFENSE_FEATURES,
    columns=COMPONENT_NAMES,
)

# Heatmap on a diverging colour scale centred at zero
fig = (
    px.imshow(
        corr_matrix,
        aspect="auto",
        color_continuous_scale=px.colors.diverging.PRGn,
    )
    .update_xaxes(side="top")
    .update_coloraxes(cmid=0, showscale=False)
    .update_layout(
        title='Offense Tendency Factors',
        margin={'r': 25, 'b': 25, 't': 75},
        height=700,
        width=900,
    )
)
fig.show()

In [None]:
''' Visualize PCA - Items '''

# Names of the percentile columns, one per PCA component
COMPONENT_PERCENTILES = [f'Component {n} Percentile' for n in range(1, PCA_N_COMPONENTS + 1)]

# Percentile-rank every component score across all rows of `offense_tendencies`
for n, percentile_col in enumerate(COMPONENT_PERCENTILES, start=1):
    offense_tendencies[percentile_col] = offense_tendencies[f'Component {n}'].rank(pct=True)

def visualize_team_pca(team: str, season: int):
    """Chart a team-season's PCA component percentiles beside its raw feature values.

    Parameters
    ----------
    team : str
        Value matched against the 'posteam' index level of `offense_tendencies`.
    season : int
        Value matched against the 'season' index level of `offense_tendencies`.

    Raises
    ------
    ValueError
        If `offense_tendencies` has no row for the requested team and season.
    """

    ## Data ##
    # Select the single team-season row
    idx = offense_tendencies.index
    team_mask = (idx.get_level_values('posteam') == team) & (idx.get_level_values('season') == season)
    team_sl = offense_tendencies.loc[team_mask, :]
    if team_sl.empty:
        raise ValueError(f'No offensive tendencies found for {season} {team}')

    # PCA component percentiles (polar chart) and raw feature values (table)
    team_component_pct_ranks = team_sl[COMPONENT_PERCENTILES].iloc[0].tolist()
    team_feature_vals = team_sl[OFFENSE_FEATURES].iloc[0].tolist()

    # Format each feature value and its percentile within the whole table
    vals_fmt = []
    pct_scores = []
    for feature, val in zip(OFFENSE_FEATURES, team_feature_vals):
        pct_score = percentileofscore(offense_tendencies[feature].tolist(), val, kind='weak') / 100

        # Any feature with '%' in its name is shown as a percentage, not only those
        # starting with it (covers 'Shotgun % Pass' / 'Under Center % Pass').
        # Assumes these columns are stored as 0-1 fractions like '% Pass' -- TODO confirm.
        vals_fmt.append(f'{val:.1%}' if '%' in feature else f'{val:.2f}')
        pct_scores.append(f'{pct_score:.1%}')

    ## Figure ##

    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[4, 3],
        horizontal_spacing=0.1,
        specs=[[{"type": "polar"}, {"type": "domain"}]]
    )

    fig.add_trace(
        go.Scatterpolar(
            r=team_component_pct_ranks,
            theta=COMPONENT_NAMES,
            opacity=0.7,
            fill='toself'
        ),
        row=1, col=1
    )
    fig.update_layout(
        title_text=f"Team: {season} {team}",
        polar=dict(radialaxis_range=(0, 1)),
        margin=dict(b=50, r=50, l=75, t=75)
    )

    # The table lists raw features, so its header says 'Feature'; the components are on the chart
    fig.add_trace(
        go.Table(
            columnwidth=[2, 1, 1],
            header={
                "values": ['Feature', 'Value', 'Percentile'],
            },
            cells={
                "values": [OFFENSE_FEATURES, vals_fmt, pct_scores]
            }
        ),
        row=1, col=2
    )

    fig.show()


visualize_team_pca('DET', 2024)

# # Visualize top teams from each component
# for n in range(1, PCA_N_COMPONENTS + 1):
#     component = f'Component {n} Percentile'

#     top_teams = offense_tendencies.sort_values(by=component, ascending=False).head(10)
    
#     top_team = top_teams.index[0]
#     visualize_team_pca(top_team[0], top_team[1])

#     print(top_teams.to_string())

In [None]:
# Distribution of the top receiver's target share across all team-seasons
fig = px.histogram(offense_tendencies, x='MaxTargetShare')
fig.show()

In [None]:
''' Visualize PCA Components - 3D '''

# sl = offense_tendencies.sample(frac=0.25, random_state=42)

# Components shown on the x, y and z axes. The title is built from this list so it
# always names what is plotted (it used to say "Top 3" while showing 1, 3 and 5).
pca_axes = ['Component 1', 'Component 3', 'Component 5']

fig = px.scatter_3d(
    data_frame=offense_tendencies,
    x=pca_axes[0],
    y=pca_axes[1],
    z=pca_axes[2],
    title=f"PCA Components - {' vs '.join(pca_axes)}"
)
fig.show()

# KMeans

In [None]:
''' Kmeans Clustering '''

# Fit k-means for k = 2..9 and record, for each k, the inertia (within-cluster sum
# of squared distances) and the silhouette score
n_clusters = range(2, 10)
inertia_values = []
silhouette_scores = []

for k in n_clusters:
    # Model
    kmeans = KMeans(n_clusters=k, n_init='auto', init='k-means++', random_state=42)

    # Fit
    kmeans.fit(pca_component_df)

    # Score
    inertia_values.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(pca_component_df, kmeans.labels_))

# Both charts share the same x values: the k actually fitted. The inertia chart used
# to be labelled 1..8, which shifted the elbow one cluster to the left.
cluster_counts = list(n_clusters)

# Line plot of inertia vs number of clusters
fig = px.line(
    x=cluster_counts,
    y=inertia_values,
    title="Kmeans - Inertia by Number of Clusters"
)
fig.update_layout(xaxis_title="Num Clusters", yaxis_title="Inertia")
fig.show()

# Line plot of silhouette score vs number of clusters
fig = px.line(
    x=cluster_counts,
    y=silhouette_scores,
    title="K-Means Model: Silhouette Score vs Number of Clusters"
)
fig.update_layout(xaxis_title="Num Clusters", yaxis_title="Silhouette Score")
fig.show()

In [None]:
''' Clustering - Final '''

N_CLUSTERS = 4

# Final cluster model with the chosen number of clusters
kmeans_final = KMeans(n_clusters=N_CLUSTERS, n_init='auto', init='k-means++', random_state=42)
kmeans_final.fit(pca_component_df)

# Cluster assignment per row, and distance from each row to every centroid
labels = kmeans_final.labels_
distances_array = kmeans_final.transform(pca_component_df)

# For every row keep only the distance to the centroid of its own cluster
distances_to_centroid = [
    row_distances[row_label]
    for row_label, row_distances in zip(labels, distances_array)
]

# Attach 1-based cluster labels (as strings) and centroid distances to the original table
offense_tendencies['Cluster KMEANS'] = pd.Series(labels + 1, index=offense_tendencies.index).astype(str)
offense_tendencies['Distance to KMEANS Centroid'] = distances_to_centroid

cluster_sizes = offense_tendencies['Cluster KMEANS'].value_counts().sort_index()
print(cluster_sizes.to_string())

In [None]:
''' Visualize KMeans - 3D PCA '''

# First three PCA components, coloured by k-means cluster label
fig = px.scatter_3d(
    offense_tendencies,
    x='Component 1', y='Component 2', z='Component 3',
    color='Cluster KMEANS',
    title='KMeans Clusters',
)
fig.show()

In [None]:
''' Visualize Clusters '''
## Spider Chart each cluster

# Mean of every feature per cluster, plus the number of rows in the cluster
agg_dict = dict.fromkeys(OFFENSE_FEATURES, 'mean')
agg_dict['Cluster KMEANS'] = 'size'

avgs_by_cluster = (
    offense_tendencies
    .groupby('Cluster KMEANS')
    .aggregate(agg_dict)
    .rename(columns={'Cluster KMEANS': '# Teams'})
)
print(avgs_by_cluster.head().to_string())

# visualize_team_pca()


In [None]:
# Echo the full clustering feature list for reference
print(OFFENSE_FEATURES)

In [None]:
''' Visualize a team '''

# Extended feature set for the team chart. `offense_team_spider_chart` is already
# defined in the first "Visualize a team" cell and reads VIZ_FEATURES when it is
# called, so re-assigning the list is enough. The byte-for-byte duplicate of the
# function that used to live in this cell has been dropped.
VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game',
                '% Plays Plays_11_Personnel', '% Plays Plays_Mult_RBs', '% Plays Plays_Mult_TEs',
                '% Under Center', 'Shotgun % Pass', 'Under Center % Pass',
                'ADOT', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare',
                '% Rush Outside', 'MaxRushAttemptsShare']

offense_team_spider_chart('MIN', 2024)

In [None]:
''' Visualize Clusters '''
## Spider Chart each cluster



# NOTE(review): this cell was carried over from a SKU-clustering notebook and
# referenced names that do not exist here (`kmeans_sku_profiles`, `FEATURES`, the
# 'SKU' and 'Velocity' columns). It is ported to `offense_tendencies` and
# OFFENSE_FEATURES; the 'Velocity' breakdown has no counterpart and is removed.

# Mean of every feature per cluster, plus the number of team-seasons in the cluster
cluster_groups = offense_tendencies.groupby('Cluster KMEANS')
avgs_by_cluster = cluster_groups[OFFENSE_FEATURES].mean()
avgs_by_cluster['# Teams'] = cluster_groups.size()
print(avgs_by_cluster.head().to_string())

# Percentile of each cluster average within the distribution of all team-seasons
# (same `percentileofscore(..., kind='weak')` convention as the team charts)
avg_pct_ranks = pd.DataFrame(
    {
        feature: [
            percentileofscore(offense_tendencies[feature].tolist(), avg, kind='weak') / 100
            for avg in avgs_by_cluster[feature]
        ]
        for feature in OFFENSE_FEATURES
    },
    index=avgs_by_cluster.index,
)

# One polar bar chart + table per cluster
for cluster in range(1, N_CLUSTERS + 1):

    ## Data ##

    # Cluster labels are stored as strings ('1', '2', ...)
    cluster_label = str(cluster)
    cluster_slice = offense_tendencies.loc[offense_tendencies['Cluster KMEANS'] == cluster_label, :]
    num_teams = len(cluster_slice)

    cluster_feature_ranks = avg_pct_ranks.loc[cluster_label, OFFENSE_FEATURES].tolist()
    cluster_feature_averages = avgs_by_cluster.loc[cluster_label, OFFENSE_FEATURES]

    ## Figure ##

    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "polar"}, {"type": "domain"}]])

    # Spider
    fig.add_trace(
        go.Barpolar(
            r=cluster_feature_ranks,
            theta=OFFENSE_FEATURES,
            opacity=0.7
        ),
        row=1, col=1
    )
    fig.update_layout(title_text=f"Cluster {cluster}: {num_teams} Teams", polar=dict(radialaxis_range=(0, 1)))

    # Table
    fig.add_trace(
        go.Table(
            header={
                "values": ['Feature', 'Average Value'],
            },
            cells={
                "values": [OFFENSE_FEATURES, cluster_feature_averages.round(2)]
            }
        ),
        row=1, col=2
    )

    ## Show ##

    fig.show()

    # Spread of distances to the cluster centroid
    centroid_distances = cluster_slice['Distance to KMEANS Centroid']
    print(f'Avg Distance to Centroid: {centroid_distances.mean():.3f}')
    print(f'Min Distance to Centroid: {centroid_distances.min():.3f}')
    print(f'Max Distance to Centroid: {centroid_distances.max():.3f}')

    # Most typical team-seasons of the cluster (closest to its centroid)
    print(cluster_slice.sort_values(by='Distance to KMEANS Centroid', ascending=True).head(15).to_string())
    


# DBSCAN

In [None]:
''' K Nearest Neighbors '''
## GOAL: estimate ideal parameter values for epsilon (eps)

# k equals the smallest `min_samples` tried in the DBSCAN grid below (2 x number of features)
k_neighbors = len(OFFENSE_FEATURES) * 2

# Init model
k_neighbors_model = NearestNeighbors(n_neighbors=k_neighbors)

# Fit and find nearest neighbors
k_neighbors_model.fit(pca_component_df)
distances, indices = k_neighbors_model.kneighbors(pca_component_df)

# k-distance curve: each point's distance to the k-th (last returned) neighbour,
# sorted ascending; the "elbow" suggests eps. The previous version plotted column 1
# (the single nearest neighbour), which ignores the k that was fitted.
distances = np.sort(distances[:, -1])

fig = px.line(y=distances)
fig.update_layout(
    xaxis_title='Team-seasons (sorted by distance)',
    yaxis_title=f'Distance to neighbor {k_neighbors} (EPS)'
)
fig.show()

In [None]:
''' Run DBSCAN '''

# Params
eps_options = [i / 10.0 for i in range(28, 37, 1)]              # eps grid: 2.8 to 3.6 in steps of 0.1
min_samples_options = list(range(len(OFFENSE_FEATURES) * 2, len(OFFENSE_FEATURES) * 3 + 1))

# One result record per (eps, min_samples) combination
models = []

for eps in eps_options:
    for min_samples in min_samples_options:
        # Create model
        dbscan_model = DBSCAN(eps=eps, min_samples=min_samples)

        # Fit
        print('Fitting')
        dbscan_model.fit(pca_component_df)
        labels = dbscan_model.labels_

        print('Processing')
        # Number of clusters, ignoring noise (label -1)
        is_noise = labels == -1
        n_clusters_ = len(set(labels)) - (1 if is_noise.any() else 0)
        n_noise_ = int(is_noise.sum())

        ## Score on non-noise points only (boolean mask, so no index assumption)
        df_scaled_no_noise = pca_component_df.loc[~is_noise, :]
        labels_no_noise = labels[~is_noise]

        print('Scoring')
        # Both scores are undefined with fewer than two clusters. They are recorded as
        # NaN rather than 0 so such models sort last and cannot top the ascending
        # Davies-Bouldin ranking.
        ss_score = np.nan
        davies_score = np.nan
        if len(np.unique(labels_no_noise)) > 1:
            print('Silhouette')
            # Silhouette
            ss_score = silhouette_score(df_scaled_no_noise, labels_no_noise)    #, sample_size=int(len(scaled_data_df) * 0.25))

            print('Davies Bouldin')
            # Davies-Bouldin
            davies_score = davies_bouldin_score(df_scaled_no_noise, labels_no_noise)

        models.append({
            'eps': eps,
            'min_samples': min_samples,
            'n_clusters': n_clusters_,
            'n_noise': n_noise_,
            'silhouette': ss_score,
            'davies': davies_score
        })

        # Print results
        print('------------- Model -------------')
        print(f'Eps: {eps}')
        print(f'Min Samples: {min_samples}')
        print()
        print(f'Number of clusters: {n_clusters_}')
        print(f'Number of noisy points: {n_noise_}')
        print(f"Silhouette Coefficient: {ss_score:.3f}")
        print(f"Davies-Bouldin Score: {davies_score:.3f}")
        print()

results_df = pd.DataFrame.from_records(data=models)

# Higher silhouette is better; NaN rows sort last
print('\n------------ Top Silhouette Scores ------------')
print(results_df.sort_values(by='silhouette', ascending=False).head(10).to_string())

# Lower Davies-Bouldin is better; NaN rows sort last
print('\n------------ Top Davies Scores ------------')
print(results_df.sort_values(by='davies', ascending=True).head(10).to_string())
