Candidate offensive-tendency features to start with:

- Formations (under center, shotgun, pistol)
- Personnel (% 11, % mult TEs, % no TEs, % mult RBs, % no RBs, % extra OL)

- % Pass
- % Pass neutral downs
- QB Scrambles

- ADOT
- % Screens
- % Long
- % passes from play-action
- % passes from under center vs shotgun vs pistol
- Number of receivers accounting for the top 80% of targets

- % runs middle, guard/tackle, edge
- % rushes from under center vs. shotgun vs. pistol
- Number of rushers needed to account for 20% of rushes

In [None]:
''' Imports '''

import pandas as pd
import polars as pl
import numpy as np
import math

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import percentileofscore

from prep_data import load_pbp_participation_data, load_stats_team_tendencies_offense, load_stats_team_tendencies_defense

# Get Data

In [None]:
# Pull the offensive team-tendency table from the project's prep_data helper.
offense_tendencies = load_stats_team_tendencies_offense()

# Plain-text preview of the first five rows.
print(offense_tendencies.head(n=5).to_string())

In [None]:
''' Features '''
# Candidate columns that are currently left out:
# '% Pass Neutral Downs', '% Under Center Neutral Downs', '% Shotgun Neutral Downs',

# Columns that are scaled and used as input for PCA and clustering.
OFFENSE_FEATURES = [
    'Plays / Game', 'Drives / Game', 
    '% Pass',  'Scrambles / Game',
    '% Plays 11 Personnel', '% Plays Mult RBs', '% Plays Zero RBs', '% Plays Mult TEs', '% Plays Zero TEs', '% Plays Extra OL',
    '% Under Center', '% Shotgun', 'Shotgun % Pass', 'Under Center % Pass',
    'ADOT', 'ADOT to Sticks', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare',
    '% Rush Inside', '% Rush Outside', 'MaxRushAttemptsShare',
]

# Subset of features shown on the per-cluster radar charts.
# Each entry should also appear in OFFENSE_FEATURES so that a per-cluster
# average exists for it (the old '% Plays Plays_11_Personnel' spelling did not).
VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game', 
                '% Plays 11 Personnel',
                '% Under Center', 'ADOT', 'Avg Time to Throw', 'MaxTargetShare', 
                '% Rush Outside', 'MaxRushAttemptsShare']

# Wider alternative for the radar charts:
# VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game', 
#                 '% Plays 11 Personnel', '% Plays Mult RBs', '% Plays Mult TEs',
#                 '% Under Center', 'Shotgun % Pass', 'Under Center % Pass', 
#                 'ADOT', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare', 
#                 '% Rush Outside', 'MaxRushAttemptsShare']

# Fail fast if the two lists drift apart again.
_unknown_viz_features = [f for f in VIZ_FEATURES if f not in OFFENSE_FEATURES]
assert not _unknown_viz_features, f'VIZ_FEATURES not in OFFENSE_FEATURES: {_unknown_viz_features}'

# Preprocessing & Transformation

In [None]:
''' Transform and Scale '''

# Disabled alternative: log-transform the features before scaling.
# transformed_data = pd.DataFrame(np.log(offense_tendencies[OFFENSE_FEATURES]), columns=OFFENSE_FEATURES).replace(math.inf, 0).replace(-(math.inf), 0)

# Standardize every clustering feature with StandardScaler.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(offense_tendencies.loc[:, OFFENSE_FEATURES])

# Wrap the scaled array in a DataFrame. No index is passed, so the frame gets a
# default 0..n-1 index rather than the index of `offense_tendencies`.
scaled_data_df = pd.DataFrame(data=scaled_data, columns=OFFENSE_FEATURES)

print("scaled DF type:", type(scaled_data_df))
print("scaled DF shape:", scaled_data_df.shape)
scaled_data_df.head()

In [None]:
''' PCA '''

# Number of principal components to keep (chosen after an initial try)
PCA_N_COMPONENTS = 8
COMPONENT_COLS = [f'Component {n}' for n in range(1, PCA_N_COMPONENTS + 1)]

# How many components get a loading bar chart below
N_COMPONENTS_TO_PLOT = min(2, PCA_N_COMPONENTS)

# Instantiate transformer
pca_final = PCA(n_components=PCA_N_COMPONENTS, random_state=42)

# Project the scaled team features onto the principal components
pca_component_data_final = pca_final.fit_transform(scaled_data_df)

# Evaluate components: share of the scaled data's total variance that is kept
total_variance = scaled_data_df.var().sum()
expl_variance = pca_final.explained_variance_.sum()

print(f'Data set variance: {total_variance:,.3f}')
print(f'PCA explained variance: {expl_variance:,.3f} ({round((expl_variance / total_variance) * 100, 2)}%)')

# One row per component, one column per feature (the component loadings)
pcs = pd.DataFrame(pca_final.components_, columns=OFFENSE_FEATURES)

# Create bar charts of each feature's loading on the leading components
for n in range(N_COMPONENTS_TO_PLOT):
    pc = pcs.iloc[n].sort_values(ascending=False)

    fig = px.bar(
        x=pc,
        y=pc.index,
        title=f"PC{n+1}: Greatest contributors"
    )
    fig.update_layout(
        # `components_` holds loadings (weights), not correlations
        xaxis_title="Loading",
        yaxis_title="Features",
        yaxis={'dtick': 1, 'categoryorder':'total ascending'},
    )
    fig.show()

    comp_expl_variance = pca_final.explained_variance_[n]
    print(f'Explained variance: {comp_expl_variance:,.3f} ({round((comp_expl_variance / total_variance) * 100, 2)}%)')

# Make df of PCA scores (default 0..n-1 index, one row per input row)
pca_component_df = pd.DataFrame(data=pca_component_data_final, columns=COMPONENT_COLS)
print(pca_component_df.shape)
print(pca_component_df.head().to_string())

# Add PCA scores to original dataframe.
# Dropping any existing "Component ..." columns first keeps this cell re-runnable.
stale_component_cols = [col for col in offense_tendencies.columns if col.startswith("Component")]
offense_tendencies = offense_tendencies.drop(columns=stale_component_cols)
# The scores are matched to the teams by row position: reset_index() gives the
# team frame the same 0..n-1 index as `pca_component_df` before merging.
offense_tendencies = offense_tendencies.reset_index().merge(pca_component_df, left_index=True, right_index=True, how='left').set_index(['posteam', 'season'])


print(f'PCA values')
print(offense_tendencies.head().to_string())

In [None]:
''' t-SNE '''
# Placeholder cell: it contains only this title, no t-SNE code is present.

# Clustering

In [None]:
''' KMeans Input '''

# KMeans is run on the scaled feature frame (not on the PCA scores).
# This is an alias of `scaled_data_df`, not a copy.
kmeans_input = scaled_data_df

In [None]:
''' Kmeans Clustering '''

# Fit KMeans for k = 2..9 and keep track of the inertia and silhouette score of
# each fit, so the number of clusters can be picked from the two plots below.
n_clusters = range(2,10)
inertia_values = []
silhouette_scores = []

for i in n_clusters:
    # Model
    kmeans = KMeans(n_clusters=i, n_init='auto', init='k-means++', random_state=42)

    # Fit
    kmeans.fit(kmeans_input)

    # Score
    ss = silhouette_score(kmeans_input, kmeans.labels_)   #, sample_size=int(len(pca_component_df) * 0.25))

    inertia_values.append(kmeans.inertia_)
    silhouette_scores.append(ss)

# Create a line plot of inertia vs number of clusters.
# The x values must be the k values that were actually fitted (2..9); a plain
# 1..len(inertia_values) counter would shift the elbow one cluster to the left.
fig = px.line(
    x=list(n_clusters),
    y=inertia_values,
    title="Kmeans - Inertia by Number of Clusters"
)
fig.update_layout(xaxis_title="Num Clusters", yaxis_title="Inertia")
fig.show()

# Create a line plot of `silhouette_scores` vs `n_clusters`
fig = px.line(
    x=list(n_clusters),
    y=silhouette_scores,
    title="K-Means Model: Silhouette Score vs Number of Clusters"
)
fig.update_layout(xaxis_title="Num Clusters", yaxis_title="Silhouette Score")
fig.show()

In [None]:
''' Clustering - Final '''

N_CLUSTERS = 5

# Final model, fitted with the cluster count chosen above
kmeans_final = KMeans(n_clusters=N_CLUSTERS, n_init='auto', init='k-means++', random_state=42)
kmeans_final.fit(kmeans_input)

# `transform` returns one distance per centroid for every row
labels = kmeans_final.labels_
distances_array = kmeans_final.transform(kmeans_input)

# For each row keep only the distance to the centroid of its own cluster
distances_to_centroid = [
    row_distances[row_label]
    for row_distances, row_label in zip(distances_array, labels)
]

# Store the results on the team frame; cluster ids are shifted to start at 1
# and kept as strings
offense_tendencies['Cluster KMEANS'] = labels + 1
offense_tendencies['Cluster KMEANS'] = offense_tendencies['Cluster KMEANS'].astype(str)
offense_tendencies['Distance to KMEANS Centroid'] = distances_to_centroid

# Cluster sizes
print(offense_tendencies['Cluster KMEANS'].value_counts().sort_index().to_string())

In [None]:
''' Visualize KMeans - 3D PCA '''

# Teams plotted on the first three principal components, colored by KMeans cluster
fig = px.scatter_3d(
    offense_tendencies,
    x='Component 1', y='Component 2', z='Component 3',
    color='Cluster KMEANS',
    title='KMeans Clusters',
)
fig.show()

In [None]:
''' Visualize Clusters '''
## Spider Chart each cluster


def visualize_cluster(cluster: int, alg_name: str):
    """Show a radar chart and a summary table for one cluster.

    For every feature in `VIZ_FEATURES` the cluster's mean value is computed and
    ranked as a percentile against all rows of `offense_tendencies`. The
    percentiles are drawn on a polar chart; the table next to it lists each
    feature with its mean value and percentile.

    Args:
        cluster: Cluster id. It is compared as `str(cluster)` with the values of
            the cluster column.
        alg_name: Algorithm suffix of the cluster column, i.e. the column
            `f'Cluster {alg_name}'` of `offense_tendencies` is used.

    Raises:
        KeyError: If a required column is missing from `offense_tendencies`.
        ValueError: If no row belongs to the requested cluster.
    """

    features = VIZ_FEATURES
    cluster_col = f'Cluster {alg_name}'

    ## Data ##

    # Average the clustering features AND every plotted feature, so a feature
    # that is only listed in VIZ_FEATURES still gets a cluster average.
    agg_features = list(dict.fromkeys([*OFFENSE_FEATURES, *features]))

    missing_cols = [col for col in [*agg_features, cluster_col] if col not in offense_tendencies.columns]
    if missing_cols:
        raise KeyError(f'Columns missing from offense_tendencies: {missing_cols}')

    # Cluster feature averages, plus the number of rows per cluster
    agg_dict = {feature: 'mean' for feature in agg_features}
    agg_dict[cluster_col] = 'size'

    avgs_by_cluster = offense_tendencies.groupby(cluster_col).aggregate(agg_dict)
    avgs_by_cluster = avgs_by_cluster.rename(columns={cluster_col: '# Teams'})
    print(avgs_by_cluster.head().to_string())

    # Pick the row of the requested cluster
    cluster_key = str(cluster)
    if cluster_key not in avgs_by_cluster.index:
        raise ValueError(
            f'Cluster {cluster_key!r} not found in {cluster_col!r}; '
            f'available clusters: {list(avgs_by_cluster.index)}'
        )
    n_teams = avgs_by_cluster.at[cluster_key, '# Teams']

    # Feature values
    cluster_avg_vals = avgs_by_cluster.loc[cluster_key, features].tolist()

    # Feature value percentiles
    vals_fmt = []
    pct_scores = []
    pct_scores_fmt = []
    for feature, val in zip(features, cluster_avg_vals):
        # Share (0-1) of all rows whose value is <= the cluster average
        pct_score = percentileofscore(offense_tendencies[feature].tolist(), val, kind='weak') / 100

        # Features whose name starts with '%' are shown as percentages
        val_fmt = f'{val:.1%}' if feature[0] == '%' else f'{val:.2f}'
        vals_fmt.append(val_fmt)
        pct_scores.append(pct_score)
        pct_scores_fmt.append(f'{pct_score:.1%}')

    ## Figure ##

    # Left: radar chart of percentiles. Right: table of values.
    fig = make_subplots(
        rows=1, cols=2, 
        column_widths=[4,3],
        horizontal_spacing=0.1,
        specs=[[{"type": "polar"}, {"type": "domain"}]]
    )

    fig.add_trace(
        go.Scatterpolar(
            r=pct_scores,
            theta=features,
            opacity=0.7,
            fill='toself'
        ),
        row=1, col=1
    )
    fig.update_layout(
        title_text=f"Cluster {cluster}: {n_teams} teams",
        polar=dict(radialaxis_range=(0,1)),
        margin=dict(b=50, r=50, l=75, t=75)
    )

    fig.add_trace(
        go.Table(
            columnwidth=[2,1,1],
            header={
                "values": ['Feature', 'Value', 'Percentile'],
            },
            cells={
                "values": [features, vals_fmt, pct_scores_fmt]
            }
        ),
        row=1, col=2
    )

    fig.show()

# For every KMeans cluster: draw its radar chart, then list the members that
# lie closest to the cluster centroid.
for cluster in range(1, N_CLUSTERS + 1):
    visualize_cluster(cluster=cluster, alg_name='KMEANS')

    in_cluster = offense_tendencies['Cluster KMEANS'] == str(cluster)
    cluster_teams = offense_tendencies[in_cluster].sort_values(by='Distance to KMEANS Centroid')

    print(cluster_teams[OFFENSE_FEATURES].head().to_string())


# DBSCAN

In [None]:
# DBSCAN is run on the scaled feature frame (not on the PCA scores).
# This is an alias of `scaled_data_df`, not a copy.
dbscan_input = scaled_data_df

In [None]:
''' K Nearest Neighbors '''
## GOAL: estimate ideal parameter values for epsilon (eps)

# Init model (default neighbor count)
k_neighbors_model = NearestNeighbors()

# Fit and find nearest neighbors of every row among the same rows
k_neighbors_model.fit(dbscan_input)
neighbor_distances, indices = k_neighbors_model.kneighbors(dbscan_input)

# Sort each neighbor column on its own and keep column 1.
# Column 0 is skipped: the query rows are the fitted rows themselves, so each
# row's closest match is expected to be itself.
distances = np.sort(neighbor_distances, axis=0)[:, 1]

# Plot sorted distances, find the "elbow"
fig = px.line(
    data_frame=distances
)
# Rows are team-seasons here, so label the axes accordingly
fig.update_layout(
    xaxis_title='Team-seasons (sorted by distance)',
    yaxis_title='Nearest-neighbor distance (EPS)',
)
fig.show()

In [None]:
''' Run DBSCAN '''

# Params
eps_options = [i / 10.0 for i in range(40, 50, 1)]              # eps candidates 4.0, 4.1, ..., 4.9
# min_samples_options = [i for i in range(2, (PCA_N_COMPONENTS*2) + 1, 1)]
min_samples_options = [5]

# Results, one entry per fitted model (all three lists are index-aligned)
models = []
ss_scores = []
davies_scores = []

for eps in eps_options:
    for min_samples in min_samples_options:
        # Create model
        dbscan_model = DBSCAN(eps=eps, min_samples=min_samples)

        # Fit
        print(f'Fitting')
        dbscan_model.fit(dbscan_input)
        labels = dbscan_model.labels_

        print(f'Processing')
        # Number of clusters, ignoring noise (label -1)
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)

        ## Score
        # Only rows that were assigned to a cluster are scored. A boolean mask
        # is used so the selection does not depend on the frame's index labels.
        is_clustered = labels != -1
        df_scaled_no_noise = dbscan_input.loc[is_clustered, :]
        labels_no_noise = labels[is_clustered]

        print(f'Scoring')
        # Both scores need at least 2 clusters. With fewer they stay NaN, so
        # these models sort last in both rankings below. A placeholder of 0
        # would look like the best possible Davies-Bouldin score and would
        # beat negative silhouette scores.
        ss_score = np.nan
        davies_score = np.nan
        if len(np.unique(labels_no_noise)) > 1:
            print(f'Silhouette')
            # Silhouette (higher is better)
            ss_score = silhouette_score(df_scaled_no_noise, labels_no_noise)    #, sample_size=int(len(scaled_data_df) * 0.25))

            print(f'Davies Bouldin')
            # Davies-Bouldin (lower is better)
            davies_score = davies_bouldin_score(df_scaled_no_noise, labels_no_noise)

        ss_scores.append(ss_score)
        davies_scores.append(davies_score)

        model = {
            'eps': eps,
            'min_samples': min_samples,
            'n_clusters': n_clusters_,
            'n_noise': n_noise_,
            'silhouette': ss_score,
            'davies': davies_score
        }
        models.append(model)


        # Print results
        print(f'------------- Model -------------')
        print(f'Eps: {eps}')
        print(f'Min Samples: {min_samples}')
        print()
        print(f'Number of clusters: {n_clusters_}')
        print(f'Number of noisy points: {n_noise_}')
        print(f"Silhouette Coefficient: {ss_score:.3f}")
        print(f"Davies-Bouldin Score: {davies_score:.3f}")
        print()

results_df = pd.DataFrame.from_records(data=models)

# NaN scores (models with fewer than 2 clusters) are placed last by sort_values
print(f'\n------------ Top Silhouette Scores ------------')
print(results_df.sort_values(by='silhouette', ascending=False).head(10).to_string())

print(f'\n------------ Top Davies Scores ------------')
print(results_df.sort_values(by='davies', ascending=True).head(10).to_string())


In [None]:
''' Final DBSCAN '''

# Chosen parameters (previously tried values are kept in the trailing comments)
OPTIMAL_EPS = 4.7 #0.58
OPTIMAL_MIN_SAMPLES = 5 #9

# Build and fit the final model; `fit` returns the model itself
dbscan_model_final = DBSCAN(eps=OPTIMAL_EPS, min_samples=OPTIMAL_MIN_SAMPLES)
labels = dbscan_model_final.fit(dbscan_input).labels_

# Label -1 marks noise and is not counted as a cluster
N_CLUSTERS_FINAL = len(np.unique(labels[labels != -1]))
N_NOISE_FINAL = int(np.count_nonzero(labels == -1))

print(f'Number of clusters: {N_CLUSTERS_FINAL}')
print(f'Number of noisy points: {N_NOISE_FINAL}')

## Store the labels on the team frame as strings (noise becomes '-1')
offense_tendencies['Cluster DBSCAN'] = labels
offense_tendencies['Cluster DBSCAN'] = offense_tendencies['Cluster DBSCAN'].astype(str)

# Cluster sizes
print(offense_tendencies['Cluster DBSCAN'].value_counts().sort_index().to_string())

In [None]:
''' Visualize DBSCAN - 3D PCA '''

# Teams plotted on the first three principal components, colored by DBSCAN cluster
fig = px.scatter_3d(
    offense_tendencies,
    x='Component 1', y='Component 2', z='Component 3',
    color='Cluster DBSCAN',
    title='DBSCAN Clusters',
)
fig.show()

In [None]:
# Full dump of every row that DBSCAN labelled '1'
print(offense_tendencies[offense_tendencies['Cluster DBSCAN'].eq('1')].to_string())

In [None]:
''' Viz Clusters '''

# For every DBSCAN cluster id 0..N_CLUSTERS_FINAL-1: draw its radar chart,
# then preview its first members. The rows are not sorted, since there is no
# centroid distance for DBSCAN.
for cluster in range(N_CLUSTERS_FINAL):
    visualize_cluster(cluster=cluster, alg_name='DBSCAN')

    in_cluster = offense_tendencies['Cluster DBSCAN'] == str(cluster)
    cluster_teams = offense_tendencies[in_cluster]

    print(cluster_teams[OFFENSE_FEATURES].head().to_string())