In [15]:
''' Imports '''

import pandas as pd
import polars as pl
import numpy as np
import math

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy.stats import percentileofscore
from scipy.stats.mstats import trimmed_var

from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler, Normalizer

from prep_data import load_pbp_participation_data, load_stats_team_tendencies_offense, load_stats_team_tendencies_defense

# Get Data

In [2]:
# Load the offensive team-tendency table through the project's prep_data helper.
offense_tendencies = load_stats_team_tendencies_offense()

# Preview the first five rows as plain text so that no column is elided.
print(offense_tendencies.head(n=5).to_string())

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAttem

In [3]:
''' Features '''
# Columns not currently part of OFFENSE_FEATURES:
# '% Pass Neutral Downs', '% Under Center Neutral Downs', '% Shotgun Neutral Downs',

# Full feature set, one column name per line, in the original order.
OFFENSE_FEATURES = [
    # Per-game volume
    'Plays / Game',
    'Drives / Game',
    # Pass rate and scrambles
    '% Pass',
    'Scrambles / Game',
    # Personnel usage
    '% Plays 11 Personnel',
    '% Plays Heavy Personnel',
    '% Plays Mult RBs',
    '% Plays Zero RBs',
    '% Plays Mult TEs',
    '% Plays Zero TEs',
    '% Plays Extra OL',
    # Formation
    '% Under Center',
    '% Shotgun',
    'Shotgun % Pass',
    'Under Center % Pass',
    # Passing
    'ADOT',
    'ADOT to Sticks',
    'Avg Time to Throw',
    '% Passes Behind LOS',
    '% Passes Deep',
    'MaxTargetShare',
    # Rushing
    '% Rush Inside',
    '% Rush Outside',
    'MaxRushAttemptsShare',
]

# Smaller subset of the columns above, kept separately for visualizations.
VIZ_FEATURES = [
    'Plays / Game',
    '% Pass',
    'Scrambles / Game',
    '% Plays 11 Personnel',
    '% Plays Heavy Personnel',
    '% Under Center',
    'ADOT',
    'Avg Time to Throw',
    'MaxTargetShare',
    '% Rush Outside',
    'MaxRushAttemptsShare',
]

# Visualize

In [10]:
''' Feature Distributions '''

def feature_distributions(
    feature_df: pd.DataFrame,
    features: list[str],
    n_cols: int = 3,
    width: int = 900,
    height: int = 1500,
    title: str = 'Feature Distributions',
) -> go.Figure:
    '''Draw one histogram per feature on a shared grid of subplots.

    Args:
        feature_df: Frame holding a column for every name in `features`.
        features: Column names to plot; each name is also used as its subplot title.
        n_cols: Number of subplot columns in the grid.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.
        title: Figure title; it is wrapped in `<b>` tags so it renders bold.

    Returns:
        A plotly figure with `ceil(len(features) / n_cols)` rows of histograms.

    Raises:
        ValueError: If `features` is empty or `n_cols` is smaller than 1.
        KeyError: If a requested feature is not a column of `feature_df`.
    '''
    features = list(features)
    if not features:
        # Without this guard the spacing computation below divides by zero.
        raise ValueError('features must contain at least one column name')
    if n_cols < 1:
        raise ValueError('n_cols must be at least 1')

    n_rows = math.ceil(len(features) / n_cols)

    # Every cell uses the default "xy" subplot type, so no explicit specs are needed.
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        # Per-gap spacing shrinks with the row count so the gaps never sum to more than half the height.
        vertical_spacing=0.5 / n_rows,
        subplot_titles=features,
    )

    for position, feature in enumerate(features):
        # Fill the grid left-to-right, top-to-bottom; plotly rows and columns are 1-based.
        row, col = divmod(position, n_cols)
        for trace in px.histogram(x=feature_df[feature]).data:
            fig.add_trace(trace, row=row + 1, col=col + 1)

    # Format
    fig.update_annotations(font=dict(size=12, weight='bold'))
    fig.update_traces(
        marker=dict(color='#44546a', opacity=0.8)
    )
    fig.update_xaxes(
        linecolor='#f0f0f0', mirror=True,
    )
    fig.update_yaxes(
        linecolor='#f0f0f0', mirror=True,
    )
    fig.update_layout(
        title=f'<b>{title}</b>',
        margin=dict(t=100, l=25, r=25, b=25),
        width=width,
        height=height,
    )

    return fig

# Histogram grid for the offensive features before any scaling is applied.
fig = feature_distributions(feature_df=offense_tendencies, features=OFFENSE_FEATURES)
fig.show()

In [11]:
''' Correlation Matrix '''

# Pairwise correlations between the selected offensive features.
corr_matrix = offense_tendencies[OFFENSE_FEATURES].corr()

# Heatmap on a diverging palette centred on zero, with the x-axis labels on top
# and the colorbar hidden.
fig = (
    px.imshow(
        corr_matrix,
        aspect="auto",
        color_continuous_scale=px.colors.diverging.PRGn,
    )
    .update_xaxes(side="top")
    .update_coloraxes(showscale=False, cmid=0)
    .update_layout(margin={'r': 25, 'b': 25}, title='Feature Correlations')
)
fig.show()

In [13]:
''' Variance '''
# NOTE - AKA features with largest number scale

# Ten largest per-feature variances, kept in ascending order
top_ten_variance = (
    offense_tendencies[OFFENSE_FEATURES]
    .var()
    .sort_values()
    .tail(10)
)

# Horizontal bar chart of `top_ten_variance`
fig = px.bar(
    x=top_ten_variance,
    y=top_ten_variance.index,
    title="Offense Tendencies: High Variance Features",
).update_layout(yaxis_title="Features", xaxis_title="Variance")
fig.show()

# Same ranking, using scipy's trimmed variance for each column
top_ten_trim_variance = (
    offense_tendencies[OFFENSE_FEATURES]
    .apply(trimmed_var)
    .sort_values()
    .tail(10)
)

# Horizontal bar chart of `top_ten_trim_variance`
fig = px.bar(
    x=top_ten_trim_variance,
    y=top_ten_trim_variance.index,
    title="Offense Tendencies: High Variance Features (Trimmed)",
).update_layout(yaxis_title="Features", xaxis_title="Trimmed Variance")
fig.show()


# Preprocessing

- The percentage features with low values are fairly right-skewed, but everything else is roughly normal even without scaling
    - e.g., % Plays Mult RBs and the other personnel features

- Standardization and normalization produce similar results; normalization just produces some very small value ranges (sklearn's `Normalizer` rescales each row to unit norm)
- Go with whichever is recommended for the algorithm (e.g., standardization for PCA, perhaps normalization for HDBSCAN)


In [17]:
''' Transform and Scale '''
# NOTE - distributions aren't overly skewed or large scale, so probably no log transforms needed

# Standardize: fit the scaler, then transform the same feature columns.
scaler = StandardScaler().fit(offense_tendencies[OFFENSE_FEATURES])
scaled_data = scaler.transform(offense_tendencies[OFFENSE_FEATURES])
scaled_data_df = pd.DataFrame(data=scaled_data, columns=OFFENSE_FEATURES)

print(scaled_data_df.shape, scaled_data_df.head().to_string(), sep='\n')

# Normalize.
# NOTE(review): sklearn's Normalizer rescales each row (sample) to unit norm; it does
# not rescale individual columns (features) - confirm that is what is wanted here.
norm = Normalizer().fit(offense_tendencies[OFFENSE_FEATURES])
normalized_data = norm.transform(offense_tendencies[OFFENSE_FEATURES])
normalized_data_df = pd.DataFrame(data=normalized_data, columns=OFFENSE_FEATURES)

print(normalized_data_df.shape, normalized_data_df.head().to_string(), sep='\n')

(288, 24)
   Plays / Game  Drives / Game    % Pass  Scrambles / Game  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  % Under Center  % Shotgun  Shotgun % Pass  Under Center % Pass      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  MaxTargetShare  % Rush Inside  % Rush Outside  MaxRushAttemptsShare
0      0.753261       1.547027 -0.124128         -1.524084             -0.128782                -0.145093         -0.415819         -0.456262          0.086337          2.085008         -0.766646        1.423627  -1.423627        1.427770             0.417428  2.060821        1.669651          -1.494379            -2.297630       0.119912        0.508965      -1.080209       -0.528517              1.955732
1      0.811265       1.547027  0.139443         -0.815344             -0.678900                -0.004189         -0.770463          0.174885          0.656896          2

In [19]:
''' Distributions After Processing '''

# Same histogram grid for each processed frame: StandardScaler first, then Normalizer.
for processed_df, chart_title in (
    (scaled_data_df, 'Feature Distributions - StandardScaler'),
    (normalized_data_df, 'Feature Distributions - Normalizer'),
):
    fig = feature_distributions(processed_df, OFFENSE_FEATURES)
    fig.update_layout(title_text=chart_title)
    fig.show()

In [24]:
# Closer look at a single column of the normalized frame.
fig = px.histogram(data_frame=normalized_data_df['% Plays 11 Personnel'])
fig.show()

In [34]:
''' Variance after processing '''

# Ten largest per-feature variances of the normalized frame, kept in ascending order
top_ten_variance = (
    normalized_data_df[OFFENSE_FEATURES]
    .var()
    .sort_values()
    .tail(10)
)

# Horizontal bar chart of `top_ten_variance`
fig = px.bar(
    x=top_ten_variance,
    y=top_ten_variance.index,
    title="Normalized Data: High Variance Features",
).update_layout(yaxis_title="Features", xaxis_title="Variance")
fig.show()

# t-SNE

t-SNE is not typically used for feature extraction ahead of other learning models because its output is not deterministic (i.e., the t-SNE function doesn't return exactly the same values on every run)

Use roughly the square root of N (the number of samples) as the perplexity?

t-SNE on PCA:  
https://stats.stackexchange.com/questions/263539/clustering-on-the-output-of-t-sne

How to use t-SNE:  
https://distill.pub/2016/misread-tsne/

Seems that t-SNE is most safely used as a visualization tool. It visualizes high-dimensional, non-linear data in 2D or 3D while preserving local proximities. Possible uses:
1. Exploration, reduce dimensions to 2 or 3 for visualization and get shape of data
1. After PCA, to further reduce dimensions to 2 or 3 for visualization
1. To visualize clusters after clustering

In [None]:
''' t-SNE Experiment '''

# Candidate perplexities: 5, 10, ..., 45
perplexities = list(range(5, 50, 5))

for n_components in (2, 3):
    divergences = []

    for p in perplexities:
        # Seeded model for this (n_components, perplexity) pair
        tsne_model = TSNE(random_state=42, perplexity=p, n_components=n_components)
        y = tsne_model.fit_transform(normalized_data_df)

        # Keep the KL divergence reported by the fitted model
        divergences.append(tsne_model.kl_divergence_)

    # One divergence-vs-perplexity line chart per embedding dimensionality
    fig = (
        px.line(
            x=perplexities,
            y=divergences,
            markers=True,
            title=f't-SNE Perplexity - {n_components} Components',
        )
        .update_layout(yaxis_title="Divergence", xaxis_title="Perplexity Values")
        .update_traces(line_width=1, line_color="red")
    )
    fig.show()

In [30]:
''' t-SNE Final '''

# Settings for the final embedding
PERPLEXITY = 5
TSNE_N_COMPONENTS = 2
TSNE_N_COMPONENT_NAMES = [f'TSNE Component {n+1}' for n in range(TSNE_N_COMPONENTS)]

# Seeded model, fitted on the normalized feature frame
tsne_model = TSNE(
    random_state=42,
    perplexity=PERPLEXITY,
    n_components=TSNE_N_COMPONENTS,
)
y = tsne_model.fit_transform(normalized_data_df)
print('Divergence:', tsne_model.kl_divergence_)

# Embedding as a frame with one named column per component
tsne_df = pd.DataFrame(data=y, columns=TSNE_N_COMPONENT_NAMES)

print(tsne_df.shape, tsne_df.head().to_string(), sep='\n')

Divergence: 0.7807890176773071
(288, 2)
   TSNE Component 1  TSNE Component 2
0         -1.857478         59.293114
1         -1.513246         60.363895
2         23.078861         36.652054
3         -5.649301        -40.041359
4        -12.626587        -38.758751


In [31]:

def tsne_chart(colors: list = None) -> 'go.Figure':
    '''Scatter-plot the t-SNE embedding held in the notebook-level `tsne_df`.

    Draws a 2-D scatter when `TSNE_N_COMPONENTS` equals 2 and a 3-D scatter otherwise.
    Both `tsne_df` and `TSNE_N_COMPONENTS` are read from the enclosing notebook scope,
    so the cell that defines them has to run before this function is called.

    Args:
        colors: Forwarded unchanged to plotly express as `color`; `None` applies
            no colour mapping.

    Returns:
        The plotly express scatter figure.
    '''
    # The first two components are plotted in either case.
    axes = dict(x='TSNE Component 1', y='TSNE Component 2')

    if TSNE_N_COMPONENTS == 2:
        return px.scatter(data_frame=tsne_df, color=colors, **axes)

    # Any other component count is drawn in 3-D and needs a third embedding column.
    return px.scatter_3d(data_frame=tsne_df, color=colors, z='TSNE Component 3', **axes)

# Plot the embedding without any colour grouping.
fig = tsne_chart(colors=None)
fig.show()