In [49]:
''' Imports '''

import pandas as pd
import polars as pl
import numpy as np
import math

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy.stats import percentileofscore
from scipy.stats.mstats import trimmed_var

from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

from prep_data import load_pbp_participation_data, load_stats_team_tendencies_offense, load_stats_team_tendencies_defense

In [50]:
# Load the offensive team-tendency table through the project loader
offense_tendencies = load_stats_team_tendencies_offense()

# Plain-text preview of the first five rows
print(offense_tendencies.head(n=5).to_string())

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAttem

In [51]:
''' Features '''
# Not currently included in the model:
# '% Pass Neutral Downs', '% Under Center Neutral Downs', '% Shotgun Neutral Downs',

# Columns used for scaling, PCA and t-SNE, grouped by theme (order matters:
# it defines the column order of every derived frame).
OFFENSE_FEATURES = [
    # Volume
    'Plays / Game',
    'Drives / Game',
    # Run / pass mix
    '% Pass',
    'Scrambles / Game',
    # Personnel
    '% Plays 11 Personnel',
    '% Plays Heavy Personnel',
    '% Plays Mult RBs',
    '% Plays Zero RBs',
    '% Plays Mult TEs',
    '% Plays Zero TEs',
    '% Plays Extra OL',
    # Formation
    '% Under Center',
    '% Shotgun',
    'Shotgun % Pass',
    'Under Center % Pass',
    # Passing
    'ADOT',
    'ADOT to Sticks',
    'Avg Time to Throw',
    '% Passes Behind LOS',
    '% Passes Deep',
    'MaxTargetShare',
    # Rushing
    '% Rush Inside',
    '% Rush Outside',
    'MaxRushAttemptsShare',
]

# Shorter subset of OFFENSE_FEATURES intended for visualisations
VIZ_FEATURES = [
    'Plays / Game',
    '% Pass',
    'Scrambles / Game',
    '% Plays 11 Personnel',
    '% Plays Heavy Personnel',
    '% Under Center',
    'ADOT',
    'Avg Time to Throw',
    'MaxTargetShare',
    '% Rush Outside',
    'MaxRushAttemptsShare',
]

# Earlier, wider candidate for VIZ_FEATURES (kept for reference):
# VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game', 
#                 '% Plays Plays_11_Personnel', '% Plays Plays_Mult_RBs', '% Plays Plays_Mult_TEs',
#                 '% Under Center', 'Shotgun % Pass', 'Under Center % Pass', 
#                 'ADOT', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare', 
#                 '% Rush Outside', 'MaxRushAttemptsShare']

# Visualize

In [52]:
''' Correlation Matrix '''

# Pairwise correlations between the model features
corr_matrix = offense_tendencies[OFFENSE_FEATURES].corr()

# Heatmap on a diverging palette centred at zero; feature labels along the top
fig = px.imshow(corr_matrix, color_continuous_scale=px.colors.diverging.PRGn, aspect="auto")
fig.update_xaxes(side="top")
fig.update_coloraxes(cmid=0, showscale=False)
fig.update_layout(title='Feature Correlations', margin={'r': 25, 'b': 25})
fig.show()

In [53]:
''' Variance '''

feature_frame = offense_tendencies[OFFENSE_FEATURES]

# Ten highest-variance features (ascending), plain and trimmed
top_ten_variance = feature_frame.var().sort_values().tail(10)
top_ten_trim_variance = feature_frame.apply(trimmed_var).sort_values().tail(10)

# One horizontal bar chart per ranking
for ranked, chart_title, axis_label in (
    (top_ten_variance, "Offense Tendencies: High Variance Features", "Variance"),
    (top_ten_trim_variance, "Offense Tendencies: High Variance Features (Trimmed)", "Trimmed Variance"),
):
    fig = px.bar(x=ranked, y=ranked.index, title=chart_title)
    fig.update_layout(xaxis_title=axis_label, yaxis_title="Features")
    fig.show()


# Model Preprocessing

In [None]:
''' Transform and Scale '''
# NOTE - use standard scaling for PCA

## Scale data
# Standardise every model feature so that no feature dominates PCA purely
# because of its units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(offense_tendencies[OFFENSE_FEATURES])

# Carry the source index over so each scaled row keeps the same key as its row
# in `offense_tendencies` instead of an anonymous 0..n-1 position.
scaled_data_df = pd.DataFrame(
    scaled_data,
    columns=OFFENSE_FEATURES,
    index=offense_tendencies.index,
)

print(scaled_data_df.shape)
print(scaled_data_df.head().to_string())

scaled DF type: <class 'pandas.core.frame.DataFrame'>
scaled DF shape: (224, 24)


Unnamed: 0,Plays / Game,Drives / Game,% Pass,Scrambles / Game,% Plays 11 Personnel,% Plays Heavy Personnel,% Plays Mult RBs,% Plays Zero RBs,% Plays Mult TEs,% Plays Zero TEs,...,Under Center % Pass,ADOT,ADOT to Sticks,Avg Time to Throw,% Passes Behind LOS,% Passes Deep,MaxTargetShare,% Rush Inside,% Rush Outside,MaxRushAttemptsShare
0,-2.331593,2.343784,-0.039531,-0.796092,0.375049,-0.092093,0.152784,-0.5272,-0.170789,-0.524651,...,0.367672,0.523527,0.517104,-0.171047,0.25909,0.241612,0.55476,-2.019394,-2.054735,1.714133
1,0.919371,1.291758,1.210869,0.118792,-1.451306,-0.379128,-0.39639,0.396326,-0.208703,7.602769,...,-0.266346,-0.691822,-0.665958,-0.475782,1.573069,0.271512,-0.736058,-2.837903,1.963484,-1.725784
2,1.569563,0.502738,0.770569,1.94856,-0.594612,-0.141177,-0.454407,-0.421931,0.158448,3.185271,...,1.319149,-0.015061,-0.386421,-0.010418,0.58778,-0.088111,1.282674,-2.332227,0.749015,0.197049
3,0.50735,-0.518347,0.777948,0.48355,-0.436346,-0.23242,-0.34663,-0.057475,-0.011029,3.191052,...,-0.346983,-0.306309,-0.245658,-0.553849,1.821468,1.36597,-1.358682,-2.35504,0.313172,-0.613659
4,1.147112,0.843099,0.854663,0.387875,0.197556,-0.184903,-0.436722,-0.5272,0.072687,0.92436,...,-2.479953,-0.97047,-1.079155,-1.117736,1.134455,0.196839,-1.34598,-1.439858,-0.504901,-0.343834


In [56]:
''' Variance of Scaled / Transformed Data '''

# Ten features with the largest variance after scaling (ascending order)
top_ten_variance = scaled_data_df.var().sort_values().tail(10)

# Horizontal bar chart of `top_ten_variance`
fig = px.bar(x=top_ten_variance, y=top_ten_variance.index, title="Scaled Data: High Variance Features")
fig.update_layout(xaxis_title="Variance", yaxis_title="Features")
fig.show()

# PCA

In [57]:
''' PCA '''

# Exploratory PCA keeping every component, used to decide how many to retain
pca = PCA(random_state=42)

# Fit on the scaled features and project them onto the components
pca_component_data = pca.fit_transform(scaled_data_df)

print('Total variance:', scaled_data_df.var().sum())
print('Singular values:\n', pca.singular_values_)
print('Explained variance:\n', pca.explained_variance_.round(5))
print('Ratio:\n', pca.explained_variance_ratio_.round(3))
print(pca.feature_names_in_)

# NOTE(review): in the recorded run the last singular value is ~0, which means
# at least one feature is an exact linear combination of the others
# (presumably '% Under Center' + '% Shotgun' == 1 -- confirm, and consider
# dropping one of the pair).

# Line chart of cumulative explained variance by number of components.
# The values are 0-1 ratios, so the y axis is tick-formatted as percentages to
# agree with the "(%)" in its label.
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
fig = px.line(
    x=list(range(1, len(cumulative_ratio) + 1)),
    y=cumulative_ratio,
    title="Cumulative explained variance"
)
fig.update_layout(
    xaxis_title="Principal Component",
    yaxis_title="Cumulative Explained Variance (%)",
    yaxis_tickformat=".0%",
)
fig.show()

Total variance: 24.10762331838565
Singular values:
 [2.78336619e+01 2.68058057e+01 2.50252777e+01 2.02933055e+01
 1.94727558e+01 1.82699157e+01 1.70402455e+01 1.63405143e+01
 1.59044997e+01 1.44548814e+01 1.36975454e+01 1.34397006e+01
 1.25450995e+01 1.14254609e+01 1.11387505e+01 1.07372387e+01
 9.60174666e+00 8.57664446e+00 5.46656752e+00 3.34913831e+00
 2.42907134e+00 1.74699247e+00 1.03067296e+00 2.85698686e-15]
Explained variance:
 [3.47405 3.2222  2.80836 1.84672 1.7004  1.49682 1.30211 1.19737 1.13432
 0.93697 0.84136 0.80998 0.70574 0.58539 0.55638 0.51699 0.41342 0.32986
 0.13401 0.0503  0.02646 0.01369 0.00476 0.     ]
Ratio:
 [0.144 0.134 0.116 0.077 0.071 0.062 0.054 0.05  0.047 0.039 0.035 0.034
 0.029 0.024 0.023 0.021 0.017 0.014 0.006 0.002 0.001 0.001 0.    0.   ]
['Plays / Game' 'Drives / Game' '% Pass' 'Scrambles / Game'
 '% Plays 11 Personnel' '% Plays Heavy Personnel' '% Plays Mult RBs'
 '% Plays Zero RBs' '% Plays Mult TEs' '% Plays Zero TEs'
 '% Plays Extra OL' '%

In [58]:
''' PCA - final '''

# Number of components to keep, chosen after the exploratory PCA run
PCA_N_COMPONENTS = 8
COMPONENT_COLS = [f'Component {n}' for n in range(1, PCA_N_COMPONENTS + 1)]

# How many leading components get a loadings chart (raise up to PCA_N_COMPONENTS)
N_COMPONENTS_TO_PLOT = 2

# Instantiate transformer
pca_final = PCA(n_components=PCA_N_COMPONENTS, random_state=42)

# Project the scaled features onto the retained components
pca_component_data_final = pca_final.fit_transform(scaled_data_df)

# Evaluate components: share of total (scaled) variance that is retained
total_variance = scaled_data_df.var().sum()
expl_variance = pca_final.explained_variance_.sum()

print(f'Data set variance: {total_variance:,.3f}')
print(f'PCA explained variance: {expl_variance:,.3f} ({round((expl_variance / total_variance) * 100, 2)}%)')

# Loadings: one row per component, one column per feature
pcs = pd.DataFrame(pca_final.components_, columns=OFFENSE_FEATURES)

# Bar chart of feature loadings for the leading components
for n in range(min(N_COMPONENTS_TO_PLOT, PCA_N_COMPONENTS)):
    pc = pcs.iloc[n].sort_values(ascending=False)

    fig = px.bar(
        x=pc,
        y=pc.index,
        title=f"PC{n+1}: Greatest contributors"
    )
    fig.update_layout(
        # `components_` holds component weights (loadings), not correlations
        xaxis_title="Loading",
        yaxis_title="Features",
        yaxis={'dtick': 1, 'categoryorder':'total ascending'},
    )
    fig.show()

    comp_expl_variance = pca_final.explained_variance_[n]
    print(f'Explained variance: {comp_expl_variance:,} ({round((comp_expl_variance / total_variance) * 100, 2)}%)')

# Make df of PCA scores (positional 0..n-1 index, same row order as the input)
pca_component_df = pd.DataFrame(data=pca_component_data_final, columns=COMPONENT_COLS)
print(pca_component_df.shape)
print(pca_component_df.head().to_string())

# Add PCA scores to the original dataframe.
# Any column starting with "Component" left by a previous run is dropped first
# so the cell can be re-executed. Scores are attached by row position, which is
# the order they were computed in; this avoids a reset_index / merge /
# set_index round trip that depended on the index being named
# ['posteam', 'season'], and raises if the row counts ever disagree.
stale_component_cols = [col for col in offense_tendencies.columns if col.startswith("Component")]
offense_tendencies = offense_tendencies.drop(columns=stale_component_cols).assign(
    **{col: pca_component_data_final[:, i] for i, col in enumerate(COMPONENT_COLS)}
)


print(f'PCA values')
print(offense_tendencies.head().to_string())

Data set variance: 24.108
PCA explained variance: 17.048 (70.72%)


Explained variance: 3.4740481340483558 (14.41%)


Explained variance: 3.222202788519839 (13.37%)
(224, 8)
   Component 1  Component 2  Component 3  Component 4  Component 5  Component 6  Component 7  Component 8
0    -1.174916    -1.414879     1.749422    -0.534725     0.874502    -0.466387    -1.961966    -0.768966
1     2.907426     1.041547    -2.534225     1.033480    -1.286341     6.208372    -1.656764     3.034321
2     2.909212     1.744247    -1.583371     1.825540    -0.306463     2.587239     0.286811     2.006733
3     2.968790     1.307310    -2.042675     0.444853    -0.356500     2.745625    -1.558250     1.592760
4     3.112452    -0.114299    -2.719902     0.835783    -1.142196     0.693303    -1.657998    -0.695891
PCA values
                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mul

In [59]:
''' Visualize PCA - Correlation Chart '''

# Generic labels for now; the commented list is an example of descriptive names
# COMPONENT_NAMES = ['Small Cartons', 'High Volume', 'High Inventory - Light', 'High Inventory - Heavy']
COMPONENT_NAMES = COMPONENT_COLS

# Feature-by-component matrix of PCA weights (rows: features, columns: components)
corr_matrix = pd.DataFrame(
    pca_final.components_,
    index=COMPONENT_NAMES,
    columns=OFFENSE_FEATURES,
).T

# Heatmap on a diverging palette centred at zero; component labels along the top
fig = px.imshow(corr_matrix, color_continuous_scale=px.colors.diverging.PRGn, aspect="auto")
fig.update_xaxes(side="top")
fig.update_coloraxes(showscale=False, cmid=0)
fig.update_layout(
    title='Offense Tendency Factors',
    margin={'r': 25, 'b': 25, 't': 75},
    height=700,
    width=900,
)
fig.show()

In [60]:
''' Visualize PCA - Items '''

# Names of the percentile columns, one per retained component
COMPONENT_PERCENTILES = [f'Component {n} Percentile' for n in range(1, PCA_N_COMPONENTS + 1)]

# Percentile-rank every component score across all rows
for n, percentile_col in enumerate(COMPONENT_PERCENTILES, start=1):
    offense_tendencies[percentile_col] = offense_tendencies[f'Component {n}'].rank(pct=True)

def visualize_team_pca(team: str, season: int):
    """Show one team-season's PCA profile next to its raw feature values.

    Left panel: polar chart of the row's percentile rank on each PCA component.
    Right panel: table listing every model feature with the row's value and
    that value's percentile among all rows of `offense_tendencies`.

    Reads the module-level `offense_tendencies`, `OFFENSE_FEATURES`,
    `COMPONENT_PERCENTILES` and `COMPONENT_NAMES`.

    Args:
        team: value to match in the 'posteam' index level.
        season: value to match in the 'season' index level.

    Raises:
        IndexError: if no row matches the given team and season.
    """

    ## Data ##
    # Get slice from offensive tendencies
    index = offense_tendencies.index
    is_team_season = (index.get_level_values('posteam') == team) & (index.get_level_values('season') == season)
    team_sl = offense_tendencies.loc[is_team_season, :]

    # Fail with a message naming the lookup rather than a bare "list index out
    # of range" from the `[0]` below
    if team_sl.empty:
        raise IndexError(f"No offense tendencies row for team={team!r}, season={season!r}")

    # PCA Component %iles
    team_component_pct_ranks = team_sl[COMPONENT_PERCENTILES].values.tolist()[0]

    # Feature values
    team_feature_vals = team_sl[OFFENSE_FEATURES].values.tolist()[0]

    # Formatted feature values and their percentiles across all rows
    vals_fmt = []
    pct_scores = []
    for feature, val in zip(OFFENSE_FEATURES, team_feature_vals):
        pct_score = percentileofscore(offense_tendencies[feature].tolist(), val, kind='weak') / 100

        # Percent-format any feature with '%' in its name, not only those that
        # start with it (e.g. 'Shotgun % Pass', 'Under Center % Pass').
        # Assumes those columns are stored as 0-1 fractions like the
        # leading-'%' ones -- TODO confirm against prep_data.
        is_pct_feature = '%' in feature
        vals_fmt.append(f'{val:.1%}' if is_pct_feature else f'{val:.2f}')
        pct_scores.append(f'{pct_score:.1%}')

    ## Figure ##

    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[4,3],
        horizontal_spacing=0.1,
        specs=[[{"type": "polar"}, {"type": "domain"}]]
    )

    # Radar of component percentiles
    fig.add_trace(
        go.Scatterpolar(
            r=team_component_pct_ranks,
            theta=COMPONENT_NAMES,
            opacity=0.7,
            fill='toself'
        ),
        row=1, col=1
    )
    fig.update_layout(
        title_text=f"Team: {season} {team}",
        polar=dict(radialaxis_range=(0,1)),
        margin=dict(b=50, r=50, l=75, t=75)
    )

    # Table of raw features; the first column lists features, so it is headed
    # 'Feature' rather than 'Component'
    fig.add_trace(
        go.Table(
            columnwidth=[2,1,1],
            header={
                "values": ['Feature', 'Value', 'Percentile'],
            },
            cells={
                "values": [OFFENSE_FEATURES, vals_fmt, pct_scores]
            }
        ),
        row=1, col=2
    )

    fig.show()


# Example profile for a single team-season
visualize_team_pca(team='DET', season=2024)

# # Visualize top teams from each component
# for n in range(1, PCA_N_COMPONENTS + 1):
#     component = f'Component {n} Percentile'

#     top_teams = offense_tendencies.sort_values(by=component, ascending=False).head(10)
    
#     top_team = top_teams.index[0]
#     visualize_team_pca(top_team[0], top_team[1])

#     print(top_teams.to_string())

In [61]:
# Distribution of the 'MaxTargetShare' column across all rows
fig = px.histogram(offense_tendencies, x='MaxTargetShare')
fig.show()

In [62]:
''' Visualize PCA Components - Scatter '''

# sl = offense_tendencies.sample(frac=0.25, random_state=42)

# Every row plotted on the first two components
fig = px.scatter(offense_tendencies, x='Component 1', y='Component 2', title='PCA Components - Top 2')
fig.show()

# Same rows with the third component added as depth
fig = px.scatter_3d(
    offense_tendencies,
    x='Component 1', y='Component 2', z='Component 3',
    title='PCA Components - Top 3',
)
fig.show()

# t-SNE

Not typically used for feature extraction ahead of subsequent learning models because its output is not deterministic (i.e., the t-SNE function doesn't return exactly the same values every time).

Use roughly the square root of N (the number of samples) as the perplexity?

t-SNE on PCA:  
https://stats.stackexchange.com/questions/263539/clustering-on-the-output-of-t-sne

How to use t-SNE:  
https://distill.pub/2016/misread-tsne/

It seems that t-SNE is most safely used as a visualization tool. It visualizes high-dimensional, non-linear data in 2 or 3 dimensions while preserving local proximities. It could be used:
1. During exploration, to reduce dimensions to 2 or 3 for visualization and get the shape of the data
1. After PCA, to further reduce dimensions to 2 or 3 for visualization
1. After clustering, to visualize the clusters

In [73]:
''' t-SNE '''


# Candidate perplexities: 5, 10, ..., 145
perplexities = list(range(5, 150, 5))

# For each target dimensionality, fit t-SNE on the scaled features at every
# perplexity and chart the resulting KL divergence
for n_components in (2, 3):
    divergences = []

    for p in perplexities:
        tsne_model = TSNE(n_components=n_components, perplexity=p, random_state=42)
        y = tsne_model.fit_transform(scaled_data_df)
        divergences.append(tsne_model.kl_divergence_)

    fig = px.line(
        x=perplexities,
        y=divergences,
        markers=True,
        title=f't-SNE Perplexity - {n_components} Components',
    )
    fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
    fig.update_traces(line_color="red", line_width=1)
    fig.show()

In [74]:
''' t-SNE Final '''

# Settings for the final embedding of the scaled features
PERPLEXITY = 30
TSNE_N_COMPONENTS = 2
TSNE_N_COMPONENT_NAMES = [f'TSNE Component {k}' for k in range(1, TSNE_N_COMPONENTS + 1)]

# Fit the model and embed the scaled features in one step
tsne_model = TSNE(
    n_components=TSNE_N_COMPONENTS,
    perplexity=PERPLEXITY,
    random_state=42,
)
y = tsne_model.fit_transform(scaled_data_df)
print('Divergence:', tsne_model.kl_divergence_)

# Embedding as a frame, one column per t-SNE dimension
tsne_df = pd.DataFrame(y, columns=TSNE_N_COMPONENT_NAMES)

print(tsne_df.shape)
print(tsne_df.head().to_string())

Divergence: 0.9622150659561157
(224, 2)
   TSNE Component 1  TSNE Component 2
0          6.305297         -1.726348
1        -11.902147         -1.672906
2        -11.353816         -1.134172
3        -11.203395         -1.481916
4         -9.270196         -2.297225


In [75]:
# Choose the 2-D or 3-D scatter from the embedding's own column count rather
# than the global TSNE_N_COMPONENTS, which the PCA --> t-SNE cells rebind to 3;
# running cells out of order would otherwise request a 'TSNE Component 3'
# column that `tsne_df` does not have.
tsne_dims = tsne_df.shape[1]

if tsne_dims == 2:
    fig = px.scatter(
        data_frame=tsne_df,
        x='TSNE Component 1',
        y='TSNE Component 2',
    )
else:
    fig = px.scatter_3d(
        data_frame=tsne_df,
        x='TSNE Component 1',
        y='TSNE Component 2',
        z='TSNE Component 3',
    )

fig.show()

# PCA --> t-SNE

In [70]:
''' t-SNE '''

# Dimensionality and column names for t-SNE run on the PCA scores
TSNE_N_COMPONENTS = 3
TSNE_N_COMPONENT_NAMES = [f'TSNE Component {k}' for k in range(1, TSNE_N_COMPONENTS + 1)]

# Candidate perplexities: 5, 10, ..., 145
perplexities = list(range(5, 150, 5))
divergences = []

# Fit t-SNE on the PCA scores at every perplexity and keep its KL divergence
for p in perplexities:
    tsne_model = TSNE(n_components=TSNE_N_COMPONENTS, perplexity=p, random_state=42)
    y = tsne_model.fit_transform(pca_component_df)
    divergences.append(tsne_model.kl_divergence_)

# Divergence as a function of perplexity
fig = px.line(x=perplexities, y=divergences, markers=True)
fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
fig.update_traces(line_color="red", line_width=1)
fig.show()

In [67]:
''' PCA --> t-SNE Final '''

# All settings for this embedding are declared here, as the earlier
# "t-SNE Final" cell does for its own run. Without this, the cell inherits
# TSNE_N_COMPONENTS / TSNE_N_COMPONENT_NAMES from whichever t-SNE cell ran last
# (the earlier one sets 2), which would produce a 2-column frame that the 3-D
# scatter after this cell cannot plot.
PERPLEXITY = 130
TSNE_N_COMPONENTS = 3
TSNE_N_COMPONENT_NAMES = [f'TSNE Component {n+1}' for n in range(TSNE_N_COMPONENTS)]

# Model
tsne_model = TSNE(n_components=TSNE_N_COMPONENTS, perplexity=PERPLEXITY, random_state=42)

# Fit on the PCA scores (not the raw scaled features)
y = tsne_model.fit_transform(pca_component_df)
print(f'Divergence:', tsne_model.kl_divergence_)

# Results df
pca_to_tsne_df = pd.DataFrame(y, columns=TSNE_N_COMPONENT_NAMES)

print(pca_to_tsne_df.shape)
print(pca_to_tsne_df.head().to_string())

Divergence: 0.14924678206443787
(224, 3)
   TSNE Component 1  TSNE Component 2  TSNE Component 3
0         -0.929687          0.684162         -0.176145
1          2.412799         -1.113160         -0.642777
2          2.136342         -0.752396         -0.570859
3          2.099987         -0.946875         -0.585605
4          1.206051         -1.496678         -0.551259


In [68]:
# 3-D view of the t-SNE embedding computed from the PCA scores
fig = px.scatter_3d(
    pca_to_tsne_df,
    x='TSNE Component 1', y='TSNE Component 2', z='TSNE Component 3',
)
fig.show()