Start with

- Formations (under center, shotgun, pistol)
- Personnel (% 11, % mult TEs, % no TEs, % mult RBs, % no RBs, % extra OL)

- % Pass
- % Pass neutral downs
- QB Scrambles

- ADOT
- % Screens
- % Long
- % passes from play-action
- % passes from under center vs shotgun vs pistol
- Number of receivers accounting for the top 80% of targets

- % runs middle, guard/tackle, edge
- % rushes from under center vs. shotgun vs. pistol
- Number of rushers needed to account for 20% of rushes

In [19]:
''' Imports '''

import pandas as pd
import polars as pl
import numpy as np
import math

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

from scipy.stats.mstats import trimmed_var
from scipy.stats import percentileofscore

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, HDBSCAN, OPTICS
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, davies_bouldin_score

from prep_data import load_pbp_participation_data, load_stats_team_tendencies_offense, load_stats_team_tendencies_defense

# Get Data

In [2]:
# Load the offensive tendencies table via the loader imported from the local `prep_data` module.
# NOTE(review): presumably one row per team-season (a later cell re-indexes on
# ['posteam', 'season']) — confirm against prep_data.
offense_tendencies = load_stats_team_tendencies_offense()

# Plain-text preview of the first rows; to_string() prints every column instead of eliding them.
print(offense_tendencies.head().to_string())

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAttem

In [3]:
''' Features '''
# Neutral-down features currently left out of the model:
# '% Pass Neutral Downs', '% Under Center Neutral Downs', '% Shotgun Neutral Downs',

# Columns of `offense_tendencies` fed to scaling / PCA / clustering.
OFFENSE_FEATURES = [
    'Plays / Game', 'Drives / Game',
    '% Pass',  'Scrambles / Game',
    '% Plays 11 Personnel', '% Plays Mult RBs', '% Plays Zero RBs', '% Plays Mult TEs', '% Plays Zero TEs', '% Plays Extra OL',
    '% Under Center', '% Shotgun', 'Shotgun % Pass', 'Under Center % Pass',
    'ADOT', 'ADOT to Sticks', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare',
    '% Rush Inside', '% Rush Outside', 'MaxRushAttemptsShare',
]

# Reduced subset shown on the radar charts. Every entry must also be in
# OFFENSE_FEATURES, because the per-cluster averages are only computed for those columns.
# ('% Plays Plays_11_Personnel' was a typo: the column is '% Plays 11 Personnel'.)
VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game',
                '% Plays 11 Personnel',
                '% Under Center', 'ADOT', 'Avg Time to Throw', 'MaxTargetShare',
                '% Rush Outside', 'MaxRushAttemptsShare']


# Preprocessing & Transformation

In [4]:
''' Transform and Scale '''

# Disabled experiment: log-transform the features before scaling.
# transformed_data = pd.DataFrame(np.log(offense_tendencies[OFFENSE_FEATURES]), columns=OFFENSE_FEATURES).replace(math.inf, 0).replace(-(math.inf), 0)

# Standardise every model feature (zero mean, unit variance per column).
feature_matrix = offense_tendencies.loc[:, OFFENSE_FEATURES]
scaler = StandardScaler()
scaler.fit(feature_matrix)
scaled_data = scaler.transform(feature_matrix)

# Wrap the scaled array back into a labelled frame.
# The row index is the default 0..n-1 range, not the index of `offense_tendencies`.
scaled_data_df = pd.DataFrame(data=scaled_data, columns=OFFENSE_FEATURES)

print("scaled DF type:", type(scaled_data_df))
print("scaled DF shape:", scaled_data_df.shape)
scaled_data_df.head()

scaled DF type: <class 'pandas.core.frame.DataFrame'>
scaled DF shape: (288, 23)


Unnamed: 0,Plays / Game,Drives / Game,% Pass,Scrambles / Game,% Plays 11 Personnel,% Plays Mult RBs,% Plays Zero RBs,% Plays Mult TEs,% Plays Zero TEs,% Plays Extra OL,...,Under Center % Pass,ADOT,ADOT to Sticks,Avg Time to Throw,% Passes Behind LOS,% Passes Deep,MaxTargetShare,% Rush Inside,% Rush Outside,MaxRushAttemptsShare
0,0.753261,1.547027,-0.124128,-1.524084,-0.128782,-0.415819,-0.456262,0.086337,2.085008,-0.766646,...,0.417428,2.060821,1.669651,-1.494379,-2.29763,0.119912,0.508965,-1.080209,-0.528517,1.955732
1,0.811265,1.547027,0.139443,-0.815344,-0.6789,-0.770463,0.174885,0.656896,2.427424,-0.213182,...,0.878225,1.694859,1.534729,-1.087927,-1.056597,-0.261757,1.331324,-1.290108,-1.367631,-1.300776
2,-2.378942,2.039292,-0.033871,-0.714096,0.444104,0.095923,-0.504965,-0.171846,-0.531032,-0.712385,...,0.31971,0.35444,0.371642,-0.019875,0.401781,0.203802,0.560056,-1.958417,-1.963273,1.691972
3,0.811265,1.054761,1.284133,0.197141,-1.418218,-0.46433,0.320567,-0.21119,6.632459,-0.628432,...,-0.312585,-0.806263,-0.78233,-0.31025,1.630594,0.234546,-0.744999,-2.758313,1.91719,-1.751149
4,1.449306,0.316363,0.820029,2.019614,-0.544652,-0.523518,-0.410866,0.169814,2.738886,0.337401,...,1.268601,-0.159931,-0.509666,0.133184,0.709167,-0.135225,1.295998,-2.264137,0.744357,0.173475


In [5]:
''' PCA '''

# Number of principal components to keep (settled on after an initial exploratory run)
PCA_N_COMPONENTS = 8
COMPONENT_COLS = [f'Component {n}' for n in range(1, PCA_N_COMPONENTS + 1)]

# Instantiate transformer
pca_final = PCA(n_components=PCA_N_COMPONENTS, random_state=42)

# Project the standardised features onto the principal components
pca_component_data_final = pca_final.fit_transform(scaled_data_df)

# Evaluate components: variance retained by the kept components vs. total feature variance
total_variance = scaled_data_df.var().sum()
expl_variance = pca_final.explained_variance_.sum()

print(f'Data set variance: {total_variance:,.3f}')
print(f'PCA explained variance: {expl_variance:,.3f} ({round((expl_variance / total_variance) * 100, 2)}%)')

# One row per component, one column per original feature
pcs = pd.DataFrame(pca_final.components_, columns=OFFENSE_FEATURES)

# Bar charts of feature weights for the first two components
for n in range(2):
    pc = pcs.iloc[n].sort_values(ascending=False)

    fig = px.bar(
        x=pc,
        y=pc.index,
        title=f"PC{n+1}: Greatest contributors"
    )
    fig.update_layout(
        # `components_` holds the component weights (loadings), not correlations
        xaxis_title="Loading",
        yaxis_title="Features",
        yaxis={'dtick': 1, 'categoryorder':'total ascending'},
    )
    fig.show()

    comp_expl_variance = pca_final.explained_variance_[n]
    print(f'Explained variance: {comp_expl_variance:,} ({round((comp_expl_variance / total_variance) * 100, 2)}%)')

# Make df of PCA scores (default 0..n-1 row index; column names shared via COMPONENT_COLS)
pca_component_df = pd.DataFrame(data=pca_component_data_final, columns=COMPONENT_COLS)
print(pca_component_df.shape)
print(pca_component_df.head().to_string())

# Add PCA scores to original dataframe.
# Drop any 'Component ...' columns left by a previous run so the cell can be re-executed,
# then attach the scores by row position and restore the (posteam, season) index.
stale_component_cols = [col for col in offense_tendencies.columns if col.startswith("Component")]
offense_tendencies = offense_tendencies.drop(columns=stale_component_cols)
offense_tendencies = (
    offense_tendencies
    .reset_index()
    .merge(pca_component_df, left_index=True, right_index=True, how='left')
    .set_index(['posteam', 'season'])
)


print('PCA values')
print(offense_tendencies.head().to_string())

Data set variance: 23.080
PCA explained variance: 16.057 (69.57%)


Explained variance: 3.311568411489921 (14.35%)


Explained variance: 3.0242110924776706 (13.1%)
(288, 8)
   Component 1  Component 2  Component 3  Component 4  Component 5  Component 6  Component 7  Component 8
0     3.842643     1.454941     2.237340     0.062339    -0.062076     0.267089    -2.481289     0.746971
1     3.749161     1.064472     1.907287     0.895464     1.478514     0.661571    -1.258971     0.158441
2     1.779855    -0.705578     0.737906    -1.275948    -1.421019    -0.474468    -1.426616    -0.595679
3    -3.392099     0.402182     0.697725     2.887922     3.695395     3.589385    -1.327002    -2.289453
4    -3.498633     1.408981     0.697070     1.094104     2.523561     1.370010     0.098868     0.099497
PCA values
                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mul

In [6]:
''' t-SNE '''

# Embedding hyper-parameters
PERPLEXITY = 3
TSNE_N_COMPONENTS = 3
TSNE_N_COMPONENT_NAMES = [f'TSNE Component {idx}' for idx in range(1, TSNE_N_COMPONENTS + 1)]

# Seeded embedder so repeated runs use the same random state
tsne_model = TSNE(
    n_components=TSNE_N_COMPONENTS,
    perplexity=PERPLEXITY,
    random_state=42,
)

# Embed the standardised features and report the final KL divergence
y = tsne_model.fit_transform(scaled_data_df)
print(f'Divergence:', tsne_model.kl_divergence_)

# Embedding coordinates as a labelled frame (default 0..n-1 row index)
tsne_df = pd.DataFrame(data=y, columns=TSNE_N_COMPONENT_NAMES)

# Visualize
def tsne_chart(colors: list = None):
    '''Scatter plot of the t-SNE embedding held in the notebook-level `tsne_df`.

    Draws a 2-D scatter when `TSNE_N_COMPONENTS` is 2, otherwise a 3-D scatter of the
    first three embedding columns.

    Args:
        colors: optional per-row values passed to Plotly Express as `color`
            (e.g. cluster labels); None draws every point in a single colour.

    Returns:
        The Plotly figure (not shown; the caller calls `.show()`).
    '''

    if TSNE_N_COMPONENTS == 2:
        fig = px.scatter(
            x=tsne_df['TSNE Component 1'],
            y=tsne_df['TSNE Component 2'],
            color=colors,
        )
        fig.update_layout(
            xaxis_title='TSNE Component 1',
            yaxis_title='TSNE Component 2',
        )
    else:
        fig = px.scatter_3d(
            x=tsne_df['TSNE Component 1'],
            y=tsne_df['TSNE Component 2'],
            z=tsne_df['TSNE Component 3'],
            color=colors,
        )
        # 3-D traces are drawn on `layout.scene`; the cartesian xaxis/yaxis titles
        # do not apply to them, so the titles have to be set on the scene axes.
        fig.update_layout(
            scene=dict(
                xaxis_title='TSNE Component 1',
                yaxis_title='TSNE Component 2',
                zaxis_title='TSNE Component 3',
            )
        )

    return fig

# Show: text preview of the embedding, then the uncoloured scatter
print(tsne_df.shape)
print(tsne_df.head().to_string())

# No `colors` argument here; the clustering cell re-draws the chart coloured by label.
fig = tsne_chart()
fig.show()

Divergence: 1.3754627704620361
(288, 3)
   TSNE Component 1  TSNE Component 2  TSNE Component 3
0         -9.827073         -4.631319        -21.720177
1        -10.529233         -4.237568        -21.582500
2         12.725067        -10.481440         -7.266101
3          5.337359          1.106098         22.323767
4          4.048865         -0.909762         22.221865


# Clustering

In [9]:
''' Input '''

# Cluster on the PCA scores; work on a copy so `pca_component_df` itself is never modified.
cluster_input = pca_component_df.copy()
cluster_n_features = cluster_input.shape[1]

print(f'{cluster_n_features = }')
print(cluster_input.head().to_string())

cluster_n_features = 8
   Component 1  Component 2  Component 3  Component 4  Component 5  Component 6  Component 7  Component 8
0     3.842643     1.454941     2.237340     0.062339    -0.062076     0.267089    -2.481289     0.746971
1     3.749161     1.064472     1.907287     0.895464     1.478514     0.661571    -1.258971     0.158441
2     1.779855    -0.705578     0.737906    -1.275948    -1.421019    -0.474468    -1.426616    -0.595679
3    -3.392099     0.402182     0.697725     2.887922     3.695395     3.589385    -1.327002    -2.289453
4    -3.498633     1.408981     0.697070     1.094104     2.523561     1.370010     0.098868     0.099497


In [17]:
''' Model '''

# Name and tunable parameters; both are echoed in the chart title below
MODEL_NAME = 'Spectral Clustering'

PARAMS = {'n_clusters': 4}

# Seeded spectral clustering on a nearest-neighbours affinity graph
cluster_model = SpectralClustering(
    n_clusters=PARAMS['n_clusters'],
    affinity="nearest_neighbors",
    eigen_solver="arpack",
    random_state=42,
)

# Fit (fit() returns the estimator) and pull one label per input row
labels = cluster_model.fit(cluster_input).labels_

# Cluster / noise counts, treating the label -1 as noise
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
n_noise_ = sum(1 for label in labels if label == -1)

# Store the labels on the team table (assigned by row position)
offense_tendencies['Cluster'] = labels

# t-SNE scatter coloured by cluster; labels cast to str before being used as colours
fig = tsne_chart(labels.astype(str))

params_str = ' | '.join(f'{key} = {val}' for key, val in PARAMS.items())
fig.update_layout(title=f'{MODEL_NAME}<br><sup>{params_str}</sup>')
fig.show()

In [25]:
''' Visualize Clusters '''

CLUSTER_COL = 'Cluster'

# Distinct cluster labels in ascending order
CLUSTERS_LIST = offense_tendencies[CLUSTER_COL].drop_duplicates().sort_values(ascending=True).tolist()

# min / mean / max of every model feature per cluster, plus the cluster's row count
agg_dict: dict = {
    **{name: ['min', 'mean', 'max'] for name in OFFENSE_FEATURES},
    CLUSTER_COL: 'size',
}

avgs_by_cluster = offense_tendencies.groupby(CLUSTER_COL).aggregate(agg_dict)

# Flatten the (feature, statistic) column pairs into '<feature> <statistic>' names,
# then give the row-count column a readable label
avgs_by_cluster.columns = avgs_by_cluster.columns.map(' '.join)
avgs_by_cluster = avgs_by_cluster.rename(columns={f'{CLUSTER_COL} size': '# Teams'})
print(avgs_by_cluster.head().to_string())


def get_feature_pct_scores(features: list, feature_vals: list, data: pd.DataFrame = None):
    '''Percentile rank (0-1) of each value within the matching column of `data`.

    Args:
        features: column names to rank against.
        feature_vals: one value per entry of `features`, in the same order.
        data: frame supplying the reference distributions. Defaults to the
            notebook-level `offense_tendencies`, which is what existing callers rely on.

    Returns:
        List of floats in [0, 1], one per feature: the share of rows in
        `data[feature]` that are less than or equal to the value (`kind='weak'`).

    Raises:
        IndexError: if `feature_vals` is shorter than `features`.
        KeyError: if a feature is not a column of `data`.
    '''
    if data is None:
        data = offense_tendencies

    # percentileofscore works on a 0-100 scale; divide to get a 0-1 fraction
    return [
        percentileofscore(data[feature].tolist(), feature_vals[idx], kind='weak') / 100
        for idx, feature in enumerate(features)
    ]


def visualize_cluster_features(cluster: int):
    '''Show a radar chart and summary table for one cluster, then print its teams.

    Reads the notebook-level `offense_tendencies`, `avgs_by_cluster`, `CLUSTER_COL`
    and `OFFENSE_FEATURES`. For every model feature the cluster's mean is ranked
    against all rows of `offense_tendencies`; the radar plots those percentiles and
    the table lists feature, mean value and percentile.

    Args:
        cluster: cluster label as stored in `offense_tendencies[CLUSTER_COL]`.

    Returns:
        None. The figure is shown and the ten rows with the highest '% Pass' are printed.

    Raises:
        IndexError: if `cluster` has no row in `avgs_by_cluster`.
    '''

    feature_cols = [f'{feature} mean' for feature in OFFENSE_FEATURES]

    ## Data ##

    # Filter to cluster
    cluster_slice = offense_tendencies.loc[offense_tendencies[CLUSTER_COL] == cluster, :]
    cluster_avgs_slice = avgs_by_cluster.loc[avgs_by_cluster.index.get_level_values(CLUSTER_COL) == cluster, :]

    # Cluster stats
    n_teams = len(cluster_slice)

    # Feature averages
    cluster_feature_vals = cluster_avgs_slice[feature_cols].values.tolist()[0]

    # Percent-style features are shown as percentages, everything else as a plain decimal.
    # The check is "name contains '%'" so 'Shotgun % Pass' / 'Under Center % Pass' are
    # covered too, not only names that start with '%'.
    # NOTE(review): assumes every '%'-named column is stored as a 0-1 fraction like the
    # '%'-prefixed ones — confirm in prep_data.
    cluster_feature_vals_fmt = [
        f'{val:.1%}' if '%' in feature else f'{val:,.2f}'
        for feature, val in zip(OFFENSE_FEATURES, cluster_feature_vals)
    ]

    # Feature percentiles
    cluster_feature_pctiles = get_feature_pct_scores(features=OFFENSE_FEATURES, feature_vals=cluster_feature_vals)
    cluster_feature_pctiles_fmt = [f'{s:.1%}' for s in cluster_feature_pctiles]
    # Highest valid index into the colour scale, so a percentile of 1.0 maps to the last colour
    color_scale_len = len(px.colors.diverging.PRGn) - 1

    ## Figure ##

    # Radar
    radar = px.line_polar(
        r=cluster_feature_pctiles,
        theta=OFFENSE_FEATURES,
        line_close=True,
        color_discrete_sequence=['#44546a'],
    )

    # Table: percentile cells are shaded on the diverging scale; the extreme
    # percentiles get white text, the rest dark text
    pctile_colors = [px.colors.diverging.PRGn[int(p * color_scale_len)] for p in cluster_feature_pctiles]
    text_colors = [
        'white' if (p <= .15 or p >= 0.85) else '#323232'
        for p in cluster_feature_pctiles
    ]

    tbl = go.Table(
        columnwidth=[2,1,1],
        header=dict(
            fill_color='#CCCCCC',
            font=dict(weight='bold'),
            line=dict(color='#323232', width=1),
            values=['Feature', 'Avg Value', 'Percentile'],
        ),
        cells=dict(
            fill_color=['white', 'white', pctile_colors],
            font=dict(weight=['bold', 'normal', 'normal'], color=['#323232', '#323232', text_colors]),
            line=dict(color='#323232', width=1),
            values=[OFFENSE_FEATURES, cluster_feature_vals_fmt, cluster_feature_pctiles_fmt]
        )
    )

    # Figure: radar on the left, table on the right
    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[4,3],
        horizontal_spacing=0.1,
        specs=[[{"type": "polar"}, {"type": "domain"}]]
    )

    for trace in radar.data:
        fig.add_trace(
            trace, row=1, col=1
        )
    fig.add_trace(
        tbl, row=1, col=2
    )


    # Formatting (row/col restrict the trace styling to the radar subplot)
    fig.update_traces(
        fill='toself',
        opacity=0.7,
        mode='lines+markers+text',
        marker_size=7,
        row=1, col=1
    )

    fig.update_polars(
        bgcolor='#e1e1e1',
        angularaxis=dict(
            linecolor='#CCCCCC',
            showgrid=True,
            gridcolor='#fafafa',
            showticklabels=True,
            ticks=""
        ),
        radialaxis=dict(
            gridcolor='#fafafa',
        )
    )

    fig.update_layout(
        title_text=f"<b>Cluster {cluster}</b><br><sup>{n_teams = :,}</sup>",
        # Percentiles are fractions, so pin the radial axis to 0-1 for every cluster
        polar=dict(radialaxis_range=(0,1)),
        margin=dict(b=50, r=50, l=75, t=75),
        height=500,
        width=1400,
        showlegend=True,
    )

    fig.show()
    # pio.write_image(fig, f'Cluster 2.png', scale=6, height=500, width=1400)

    # Rows of this cluster with the highest '% Pass'
    print(cluster_slice.sort_values(by='% Pass', ascending=False).head(10).to_string())


# One radar + table figure (and team printout) per cluster label, in ascending label order
for cluster in CLUSTERS_LIST:
    visualize_cluster_features(cluster=cluster)

         Plays / Game min  Plays / Game mean  Plays / Game max  Drives / Game min  Drives / Game mean  Drives / Game max  % Pass min  % Pass mean  % Pass max  Scrambles / Game min  Scrambles / Game mean  Scrambles / Game max  % Plays 11 Personnel min  % Plays 11 Personnel mean  % Plays 11 Personnel max  % Plays Mult RBs min  % Plays Mult RBs mean  % Plays Mult RBs max  % Plays Zero RBs min  % Plays Zero RBs mean  % Plays Zero RBs max  % Plays Mult TEs min  % Plays Mult TEs mean  % Plays Mult TEs max  % Plays Zero TEs min  % Plays Zero TEs mean  % Plays Zero TEs max  % Plays Extra OL min  % Plays Extra OL mean  % Plays Extra OL max  % Under Center min  % Under Center mean  % Under Center max  % Shotgun min  % Shotgun mean  % Shotgun max  Shotgun % Pass min  Shotgun % Pass mean  Shotgun % Pass max  Under Center % Pass min  Under Center % Pass mean  Under Center % Pass max  ADOT min  ADOT mean   ADOT max  ADOT to Sticks min  ADOT to Sticks mean  ADOT to Sticks max  Avg Time to Throw min  

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game       ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAtte

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAttem

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAttem

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAttem

In [None]:
''' Visualize Clusters '''
## Spider Chart each cluster


def visualize_cluster(cluster: int, alg_name: str):
    '''Show a radar chart and table of `VIZ_FEATURES` for one cluster of one algorithm.

    Cluster labels are read from the `offense_tendencies` column named
    f'Cluster {alg_name}'. For each feature in `VIZ_FEATURES` the cluster mean is
    ranked against all rows of `offense_tendencies`.

    Args:
        cluster: cluster label; it is matched against the stored labels by string
            value, so integer and string label columns both work.
        alg_name: suffix of the label column, e.g. 'DBSCAN' -> 'Cluster DBSCAN'.

    Returns:
        None. The per-cluster averages are printed and the figure is shown.

    Raises:
        KeyError: if the label column or a feature column does not exist.
        IndexError: if no cluster with the given label exists.
    '''

    features = VIZ_FEATURES
    cluster_col = f'Cluster {alg_name}'

    ## Data ##

    # Cluster feature averages, plus the number of rows per cluster
    agg_dict = {feature: 'mean' for feature in OFFENSE_FEATURES}
    agg_dict[cluster_col] = 'size'

    avgs_by_cluster = offense_tendencies.groupby(cluster_col).aggregate(agg_dict)
    avgs_by_cluster = avgs_by_cluster.rename(columns={cluster_col: '# Teams'})
    print(avgs_by_cluster.head().to_string())

    # Row of averages for the requested cluster. Both sides are compared as strings:
    # comparing integer labels directly with str(cluster) would never match.
    is_requested_cluster = avgs_by_cluster.index.astype(str) == str(cluster)
    cluster_sl = avgs_by_cluster.loc[is_requested_cluster, :]
    n_teams = cluster_sl['# Teams'].values[0]

    # Feature values
    cluster_avg_vals = cluster_sl[features].values.tolist()[0]

    # Feature value percentiles (shared helper, same ranking as the other cluster visualiser)
    pct_scores = get_feature_pct_scores(features=features, feature_vals=cluster_avg_vals)
    pct_scores_fmt = [f'{pct_score:.1%}' for pct_score in pct_scores]

    # Percent-style features are shown as percentages, the rest as plain decimals
    vals_fmt = [
        f'{val:.1%}' if '%' in feature else f'{val:.2f}'
        for feature, val in zip(features, cluster_avg_vals)
    ]

    ## Figure ##

    # Radar on the left, table on the right
    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[4,3],
        horizontal_spacing=0.1,
        specs=[[{"type": "polar"}, {"type": "domain"}]]
    )

    fig.add_trace(
        go.Scatterpolar(
            r=pct_scores,
            theta=features,
            opacity=0.7,
            fill='toself'
        ),
        row=1, col=1
    )
    fig.update_layout(
        title_text=f"Cluster {cluster}: {n_teams} teams",
        # Percentiles are fractions, so pin the radial axis to 0-1
        polar=dict(radialaxis_range=(0,1)),
        margin=dict(b=50, r=50, l=75, t=75)
    )

    fig.add_trace(
        go.Table(
            columnwidth=[2,1,1],
            header={
                "values": ['Feature', 'Value', 'Percentile'],
            },
            cells={
                "values": [features, vals_fmt, pct_scores_fmt]
            }
        ),
        row=1, col=2
    )

    fig.show()

# for cluster in range(1, N_CLUSTERS+1):
#     visualize_cluster(cluster=cluster, alg_name='KMEANS')

#     cluster_teams = offense_tendencies.loc[offense_tendencies['Cluster KMEANS'] == str(cluster),:]
#     cluster_teams = cluster_teams.sort_values(by='Distance to KMEANS Centroid', ascending=True)

#     print(cluster_teams[OFFENSE_FEATURES].head().to_string())


In [None]:
''' Viz Clusters '''


# NOTE(review): N_CLUSTERS_FINAL and the 'Cluster DBSCAN' column are not created by any
# cell visible in this notebook — confirm they exist before running this cell.
for cluster in range(N_CLUSTERS_FINAL):
    visualize_cluster(cluster=cluster, alg_name='DBSCAN')

    # Rows whose DBSCAN label equals this cluster (labels compared as strings)
    in_cluster = offense_tendencies['Cluster DBSCAN'] == str(cluster)
    cluster_teams = offense_tendencies.loc[in_cluster, :]
    # cluster_teams = cluster_teams.sort_values(by='Distance to KMEANS Centroid', ascending=True)

    print(cluster_teams[OFFENSE_FEATURES].head().to_string())