In [1]:
''' Imports '''

import pandas as pd
import polars as pl
import numpy as np
import math
from time import time
from datetime import timedelta

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy.stats.mstats import trimmed_var
from scipy.stats import percentileofscore

from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralClustering, DBSCAN, HDBSCAN, OPTICS, estimate_bandwidth
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

from prep_data import load_pbp_participation_data, load_stats_team_tendencies_offense, load_stats_team_tendencies_defense

In [2]:
# Load the offensive tendencies table through the prep_data helper
offense_tendencies = load_stats_team_tendencies_offense()

# Plain-text preview of the first rows with every column printed
offense_preview = offense_tendencies.head()
print(offense_preview.to_string())

                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  QBScrambles     IAY  IAY_ToSticks  TotalTimeToThrow  Pass_BehindLOS  Pass_Deep  Sacks  Rush_Plays  Rush_Attempts  Rush_Inside  Rush_Outside  Plays_11_Personnel  Plays_Heavy_Personnel  Plays_Mult_RBs  Plays_Zero_RBs  Plays_Mult_TEs  Plays_Zero_TEs  Plays_Extra_OL  Plays / Game  Drives / Game    % Pass  % Pass Neutral Downs  Scrambles / Game      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  % Rush Inside  % Rush Outside  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  Shotgun Plays  Under Center Plays  Shotgun Neutral_Down_Plays  Under Center Neutral_Down_Plays  Shotgun % Pass  Under Center % Pass  % Under Center  % Shotgun  % Under Center Neutral Downs  % Shotgun Neutral Downs  MaxTargets  MaxTargetShare  N_Receivers_FivePctTargetShare  MaxRushAttempts  MaxRushAttem

In [3]:
''' Features '''
# Candidate features currently left out of OFFENSE_FEATURES:
# '% Pass Neutral Downs', '% Under Center Neutral Downs', '% Shotgun Neutral Downs',

# Columns of `offense_tendencies` used as model inputs
OFFENSE_FEATURES = [
    'Plays / Game', 'Drives / Game', 
    '% Pass',  'Scrambles / Game',
    '% Plays 11 Personnel', '% Plays Heavy Personnel', '% Plays Mult RBs', '% Plays Zero RBs', '% Plays Mult TEs', '% Plays Zero TEs', '% Plays Extra OL',
    '% Under Center', '% Shotgun', 'Shotgun % Pass', 'Under Center % Pass',
    'ADOT', 'ADOT to Sticks', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare',
    '% Rush Inside', '% Rush Outside', 'MaxRushAttemptsShare',
]

# Smaller subset intended for visualizations. Every entry must be a real column
# name: the personnel column is '% Plays 11 Personnel' (the previous
# '% Plays Plays_11_Personnel' does not exist in the data).
VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game', 
                '% Plays 11 Personnel',
                '% Under Center', 'ADOT', 'Avg Time to Throw', 'MaxTargetShare', 
                '% Rush Outside', 'MaxRushAttemptsShare']

# Wider alternative (disabled), with the column names corrected the same way:
# VIZ_FEATURES = ['Plays / Game', '% Pass', 'Scrambles / Game', 
#                 '% Plays 11 Personnel', '% Plays Mult RBs', '% Plays Mult TEs',
#                 '% Under Center', 'Shotgun % Pass', 'Under Center % Pass', 
#                 'ADOT', 'Avg Time to Throw', '% Passes Behind LOS', '% Passes Deep', 'MaxTargetShare', 
#                 '% Rush Outside', 'MaxRushAttemptsShare']

# Initial Visualization

In [4]:
''' Correlation Matrix '''

# Pairwise correlations between the model features
corr_matrix = offense_tendencies.loc[:, OFFENSE_FEATURES].corr()

# Diverging palette centred on zero; the colour bar itself is hidden
heatmap_options = {
    'color_continuous_scale': px.colors.diverging.PRGn,
    'aspect': "auto",
}
fig = px.imshow(corr_matrix, **heatmap_options)
fig.update_xaxes(side="top")
fig.update_coloraxes(cmid=0, showscale=False)
fig.update_layout(title='Feature Correlations', margin={'r': 25, 'b': 25})
fig.show()

In [5]:
''' Variance '''

def _top_variance_chart(top_values, title, xaxis_title):
    """Bar chart of a Series of per-feature values, with feature names on the y axis."""
    chart = px.bar(x=top_values, y=top_values.index, title=title)
    chart.update_layout(xaxis_title=xaxis_title, yaxis_title="Features")
    return chart


feature_frame = offense_tendencies[OFFENSE_FEATURES]

# Ten largest raw variances (sorted ascending, so the largest sit in the tail)
top_ten_variance = feature_frame.var().sort_values().tail(10)
fig = _top_variance_chart(
    top_ten_variance,
    "Offense Tendencies: High Variance Features",
    "Variance",
)
fig.show()

# Same ranking using scipy's trimmed variance with its default limits
top_ten_trim_variance = feature_frame.apply(trimmed_var).sort_values().tail(10)
fig = _top_variance_chart(
    top_ten_trim_variance,
    "Offense Tendencies: High Variance Features (Trimmed)",
    "Trimmed Variance",
)
fig.show()


In [None]:
''' Feature Distributions '''

# Disabled sketch: histogram of a single feature ('Plays / Game').
# Nothing in this cell runs as written; un-comment to inspect one feature's distribution.
# fig = px.histogram(
#     data_frame=offense_tendencies,
#     x='Plays / Game',
#     title='Plays / Game'
# )
# fig.show()

# Model Preprocessing

In [57]:
# Copy only the model features so later steps cannot modify `offense_tendencies`
model_input = offense_tendencies.loc[:, OFFENSE_FEATURES].copy()

# Sanity check: dimensions, then a plain-text preview of the first rows
print(model_input.shape)
model_input_preview = model_input.head()
print(model_input_preview.to_string())

(288, 24)
                Plays / Game  Drives / Game    % Pass  Scrambles / Game  % Plays 11 Personnel  % Plays Heavy Personnel  % Plays Mult RBs  % Plays Zero RBs  % Plays Mult TEs  % Plays Zero TEs  % Plays Extra OL  % Under Center  % Shotgun  Shotgun % Pass  Under Center % Pass      ADOT  ADOT to Sticks  Avg Time to Throw  % Passes Behind LOS  % Passes Deep  MaxTargetShare  % Rush Inside  % Rush Outside  MaxRushAttemptsShare
posteam season                                                                                                                                                                                                                                                                                                                                                                                                                        
ARI     2016         42.6875          7.875  0.585652            0.1250              0.554905                 0.368960          0.060029        

In [58]:
''' Transform and Scale '''

# Log transform data (disabled)
# transformed_data = pd.DataFrame(np.log(offense_tendencies[OFFENSE_FEATURES]), columns=OFFENSE_FEATURES).replace(math.inf, 0).replace(-(math.inf), 0)

# Standardize every feature: learn the scaling parameters, then apply them
scaler = StandardScaler()
scaler.fit(model_input)
scaled_data = scaler.transform(model_input)

# Wrap the scaled array in a DataFrame that reuses the feature names.
# The rows get a fresh positional index; the index of `model_input` is not carried over.
scaled_data_df = pd.DataFrame(data=scaled_data, columns=model_input.columns)

print("scaled DF type:", type(scaled_data_df))
print("scaled DF shape:", scaled_data_df.shape)
scaled_data_df.head()

scaled DF type: <class 'pandas.core.frame.DataFrame'>
scaled DF shape: (288, 24)


Unnamed: 0,Plays / Game,Drives / Game,% Pass,Scrambles / Game,% Plays 11 Personnel,% Plays Heavy Personnel,% Plays Mult RBs,% Plays Zero RBs,% Plays Mult TEs,% Plays Zero TEs,...,Under Center % Pass,ADOT,ADOT to Sticks,Avg Time to Throw,% Passes Behind LOS,% Passes Deep,MaxTargetShare,% Rush Inside,% Rush Outside,MaxRushAttemptsShare
0,0.753261,1.547027,-0.124128,-1.524084,-0.128782,-0.145093,-0.415819,-0.456262,0.086337,2.085008,...,0.417428,2.060821,1.669651,-1.494379,-2.29763,0.119912,0.508965,-1.080209,-0.528517,1.955732
1,0.811265,1.547027,0.139443,-0.815344,-0.6789,-0.004189,-0.770463,0.174885,0.656896,2.427424,...,0.878225,1.694859,1.534729,-1.087927,-1.056597,-0.261757,1.331324,-1.290108,-1.367631,-1.300776
2,-2.378942,2.039292,-0.033871,-0.714096,0.444104,-0.134363,0.095923,-0.504965,-0.171846,-0.531032,...,0.31971,0.35444,0.371642,-0.019875,0.401781,0.203802,0.560056,-1.958417,-1.963273,1.691972
3,0.811265,1.054761,1.284133,0.197141,-1.418218,-0.429171,-0.46433,0.320567,-0.21119,6.632459,...,-0.312585,-0.806263,-0.78233,-0.31025,1.630594,0.234546,-0.744999,-2.758313,1.91719,-1.751149
4,1.449306,0.316363,0.820029,2.019614,-0.544652,-0.184776,-0.523518,-0.410866,0.169814,2.738886,...,1.268601,-0.159931,-0.509666,0.133184,0.709167,-0.135225,1.295998,-2.264137,0.744357,0.173475


In [59]:
''' Variance of Scaled / Transformed Data '''

# Per-feature variance after scaling, ascending; keep the ten largest
scaled_variances = scaled_data_df.var().sort_values(ascending=True)
top_ten_variance = scaled_variances.iloc[-10:]

# Bar chart of the ten features, feature names on the y axis
fig = px.bar(x=top_ten_variance, y=top_ten_variance.index, title="Scaled Data: High Variance Features")
fig.update_layout(
    xaxis_title="Variance",
    yaxis_title="Features",
)
fig.show()

In [60]:
''' t-SNE '''

## Experiment ##
# perplexities = [i for i in range(5, 50, 5)]
perplexities = list(range(3, 10))

for n_components in (2,):    # 3 was also tried
    # KL divergence of each fit, in the same order as `perplexities`
    divergences = []

    for perplexity in perplexities:
        # Fit one embedding per perplexity value (fixed seed for repeatability)
        tsne_model = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)
        y = tsne_model.fit_transform(scaled_data_df)
        divergences.append(tsne_model.kl_divergence_)

        # Scatter of the first two embedding dimensions
        fig = px.scatter(x=y[:, 0], y=y[:, 1])
        fig.update_layout(
            title=f'TSNE<br><sup>perplexity = {perplexity}</sup>',
            xaxis_title='TSNE Component 1',
            yaxis_title='TSNE Component 2',
            height=300,
        )
        fig.show()

    # # Graph divergence
    # fig = px.line(
    #     x=perplexities, 
    #     y=divergences,
    #     markers=True,
    #     title=f't-SNE Perplexity - {n_components} Components'
    # )
    # fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
    # fig.update_traces(line_color="red", line_width=1)
    # fig.show()


In [61]:
## Final ##
PERPLEXITY = 3
TSNE_N_COMPONENTS = 3
TSNE_N_COMPONENT_NAMES = ['TSNE Component ' + str(k) for k in range(1, TSNE_N_COMPONENTS + 1)]

# Fit the chosen configuration on the scaled features (fixed seed for repeatability)
tsne_model = TSNE(
    n_components=TSNE_N_COMPONENTS,
    perplexity=PERPLEXITY,
    random_state=42,
)
y = tsne_model.fit_transform(scaled_data_df)
print('Divergence:', tsne_model.kl_divergence_)

# Embedding as a DataFrame, one named column per t-SNE dimension
tsne_df = pd.DataFrame(data=y, columns=TSNE_N_COMPONENT_NAMES)

print(tsne_df.shape)
tsne_preview = tsne_df.head()
print(tsne_preview.to_string())

Divergence: 1.111440896987915
(288, 3)
   TSNE Component 1  TSNE Component 2  TSNE Component 3
0         -1.848989        -12.595859         19.566944
1         -2.228079        -12.805877         19.685261
2          2.961018        -10.405793          1.730735
3         -9.918005         14.153601         -4.530596
4        -10.032577         12.465304         -5.676982


In [62]:

def tsne_chart(colors: list = None):
    """Scatter plot of the t-SNE embedding held in the notebook-level `tsne_df`.

    Draws a 2D scatter when `TSNE_N_COMPONENTS == 2`, otherwise a 3D scatter
    that also uses 'TSNE Component 3'.

    Parameters
    ----------
    colors : list, optional
        One value per row of `tsne_df`, forwarded to plotly's `color`
        argument (callers in this notebook pass cluster labels as strings).
        `None` draws every point in the default colour.

    Returns
    -------
    The plotly figure; the caller is responsible for calling `.show()`.
    """
    if TSNE_N_COMPONENTS == 2:
        fig = px.scatter(
            x=tsne_df['TSNE Component 1'],
            y=tsne_df['TSNE Component 2'],
            color=colors,
        )
        fig.update_layout(
            xaxis_title='TSNE Component 1',
            yaxis_title='TSNE Component 2',
        )
    else:
        fig = px.scatter_3d(
            x=tsne_df['TSNE Component 1'],
            y=tsne_df['TSNE Component 2'],
            z=tsne_df['TSNE Component 3'],
            color=colors,
        )
        # 3D axes live under `layout.scene`; top-level xaxis_title / yaxis_title
        # are ignored by scatter_3d, so the titles must be set on the scene.
        fig.update_layout(
            scene=dict(
                xaxis_title='TSNE Component 1',
                yaxis_title='TSNE Component 2',
                zaxis_title='TSNE Component 3',
            )
        )

    return fig


fig = tsne_chart()
fig.show()

# Feature Selection

1. Raw Features
    1. Could do Pearson feature selection variant; instead of features with highest correlation to Y, features with most variance, then remove features with collinearity > some threshold
1. PCA

In [63]:
''' PCA '''

# Unrestricted PCA (all components kept) to see how the variance is distributed
pca = PCA(random_state=42)

# Fit on the scaled features and project them onto the components
pca_component_data = pca.fit_transform(scaled_data_df)

print('Total variance:', scaled_data_df.var().sum())
print('Singular values:\n', pca.singular_values_)
print('Explained variance:\n', pca.explained_variance_.round(5))
print('Ratio:\n', pca.explained_variance_ratio_.round(3))
print(pca.feature_names_in_)

# Line chart of cumulative explained variance by number of components.
# The ratios are fractions in [0, 1]; multiply by 100 so the plotted values
# match the "(%)" unit in the y-axis label.
cumulative_explained_pct = pca.explained_variance_ratio_.cumsum() * 100
fig = px.line(
    x=np.arange(1, len(cumulative_explained_pct) + 1),
    y=cumulative_explained_pct,
    title="Explained variance"
)
fig.update_layout(xaxis_title="Principal Component", yaxis_title="Cumulative Explained Variance (%)")
fig.show()

Total variance: 24.083623693379792
Singular values:
 [3.20369384e+01 2.95276734e+01 2.85297044e+01 2.27546587e+01
 2.25571021e+01 2.07322119e+01 1.94703016e+01 1.82701242e+01
 1.77868255e+01 1.62653683e+01 1.56951284e+01 1.51599986e+01
 1.39930397e+01 1.36205780e+01 1.26891401e+01 1.20669209e+01
 1.07817970e+01 9.79707909e+00 6.49558018e+00 3.93526793e+00
 3.12306319e+00 2.06230672e+00 1.23832053e+00 2.14711774e-07]
Explained variance:
 [3.57619 3.03792 2.83604 1.80409 1.7729  1.49765 1.32088 1.16306 1.10234
 0.92182 0.85832 0.80079 0.68225 0.64641 0.56103 0.50735 0.40504 0.33443
 0.14701 0.05396 0.03398 0.01482 0.00534 0.     ]
Ratio:
 [0.148 0.126 0.118 0.075 0.074 0.062 0.055 0.048 0.046 0.038 0.036 0.033
 0.028 0.027 0.023 0.021 0.017 0.014 0.006 0.002 0.001 0.001 0.    0.   ]
['Plays / Game' 'Drives / Game' '% Pass' 'Scrambles / Game'
 '% Plays 11 Personnel' '% Plays Heavy Personnel' '% Plays Mult RBs'
 '% Plays Zero RBs' '% Plays Mult TEs' '% Plays Zero TEs'
 '% Plays Extra OL' '

In [64]:
''' PCA - final '''

# Set number of PCA components to use after initial try
PCA_N_COMPONENTS = 8
PCA_COMPONENT_COLS = [f'PCA Component {n}' for n in range(1, PCA_N_COMPONENTS + 1)]

# Instantiate transformer
pca_final = PCA(n_components=PCA_N_COMPONENTS, random_state=42)

# Project the scaled features onto the retained components
pca_component_data_final = pca_final.fit_transform(scaled_data_df)

# Evaluate components: share of the total variance the retained components keep
total_variance = scaled_data_df.var().sum()
expl_variance = pca_final.explained_variance_.sum()

print(f'Data set variance: {total_variance:,.3f}')
print(f'PCA explained variance: {expl_variance:,.3f} ({round((expl_variance / total_variance) * 100, 2)}%)')

# One row per component, one column per input feature.
# NOTE: these are the entries of `pca_final.components_` (component weights),
# which the chart below labels "Correlation".
pcs = pd.DataFrame(pca_final.components_, columns=OFFENSE_FEATURES)

# Create bar charts of contribution (first two components only)
for n in range(2):      #PCA_N_COMPONENTS):
    pc = pcs.loc[n].sort_values(ascending=False)

    fig = px.bar(
        x=pc,
        y=pc.index,
        title=f"PC{n+1}: Greatest contributors"
    )
    fig.update_layout(
        xaxis_title="Correlation", 
        yaxis_title="Features", 
        yaxis={'dtick': 1, 'categoryorder':'total ascending'},
    )
    fig.show()

    comp_expl_variance = pca_final.explained_variance_[n]
    print(f'Explained variance: {comp_expl_variance:,} ({round((comp_expl_variance / total_variance) * 100, 2)}%)')

# Make df of PCA scores (positional index, same row order as `scaled_data_df`)
pca_component_df = pd.DataFrame(data=pca_component_data_final, columns=PCA_COMPONENT_COLS)
print(pca_component_df.shape)
print(pca_component_df.head().to_string())

# Add PCA scores to original dataframe.
# First drop score columns left by an earlier run of this cell so that re-running
# it does not create '_x' / '_y' duplicates. The prefix must match
# PCA_COMPONENT_COLS ('PCA Component N'); the old check for 'Component' never matched.
stale_pca_cols = [col for col in offense_tendencies.columns if col.startswith('PCA Component')]
offense_tendencies = offense_tendencies.drop(columns=stale_pca_cols)

# The scores carry a positional index, so merge on row position and then
# restore the (posteam, season) index.
offense_tendencies = (
    offense_tendencies
    .reset_index()
    .merge(pca_component_df, left_index=True, right_index=True, how='left')
    .set_index(['posteam', 'season'])
)


print('PCA values')
print(offense_tendencies.head().to_string())

Data set variance: 24.084
PCA explained variance: 17.009 (70.62%)


Explained variance: 3.5761861415223444 (14.85%)


Explained variance: 3.037921589269329 (12.61%)
(288, 8)
   PCA Component 1  PCA Component 2  PCA Component 3  PCA Component 4  PCA Component 5  PCA Component 6  PCA Component 7  PCA Component 8
0         2.630879         0.250048        -3.795316         0.427976         0.216030         0.161758        -2.516539         0.888566
1         2.795776         0.001950        -3.152501         1.846107         0.766897         0.705557        -1.260217         0.290878
2         1.029318        -1.222550        -1.430417        -1.643194        -0.148779        -0.626092        -1.457965         0.021756
3        -2.920131         1.081205         1.300987         4.445241         1.015209         3.836069        -1.283636        -2.083147
4        -2.881399         2.070766         1.064544         2.529379         1.381222         1.482659         0.105243        -0.057758
PCA values
                Games  Drives  Plays  Neutral_Down_Plays  Pass_Plays  Neutral_Down_Pass  Pass_Attempts  Q

# Clustering Experiment

https://scikit-learn.org/stable/modules/clustering.html  
The geometry *appears* to be spherical, which would mean non-flat. That points to the algorithms suited to non-flat geometry:
1. Affinity Propagation
1. Mean-shift
1. Spectral Clustering
1. DBSCAN
1. HDBSCAN
1. OPTICS

Not sure on number of clusters (few vs. many). I don't see many obvious clusters based on PCA / t-SNE visualization

The last three are density-based and suited to very large numbers of samples, which this data set (288 team-seasons) is not.

In [69]:
''' Input '''

# Cluster on the PCA scores; work on a copy so `pca_component_df` stays untouched
cluster_input = pca_component_df.copy()
# cluster_input.index = offense_tendencies.index

# Constants
cluster_n_features = cluster_input.shape[1]
# "<posteam> <season>" label built from each (posteam, season) index entry
teams = [f'{posteam} {season}' for posteam, season in offense_tendencies.index]

print(teams)
print(f'{cluster_n_features = }')
cluster_preview = cluster_input.head()
print(cluster_preview.to_string())

['ARI 2016', 'ARI 2017', 'ARI 2018', 'ARI 2019', 'ARI 2020', 'ARI 2021', 'ARI 2022', 'ARI 2023', 'ARI 2024', 'ATL 2016', 'ATL 2017', 'ATL 2018', 'ATL 2019', 'ATL 2020', 'ATL 2021', 'ATL 2022', 'ATL 2023', 'ATL 2024', 'BAL 2016', 'BAL 2017', 'BAL 2018', 'BAL 2019', 'BAL 2020', 'BAL 2021', 'BAL 2022', 'BAL 2023', 'BAL 2024', 'BUF 2016', 'BUF 2017', 'BUF 2018', 'BUF 2019', 'BUF 2020', 'BUF 2021', 'BUF 2022', 'BUF 2023', 'BUF 2024', 'CAR 2016', 'CAR 2017', 'CAR 2018', 'CAR 2019', 'CAR 2020', 'CAR 2021', 'CAR 2022', 'CAR 2023', 'CAR 2024', 'CHI 2016', 'CHI 2017', 'CHI 2018', 'CHI 2019', 'CHI 2020', 'CHI 2021', 'CHI 2022', 'CHI 2023', 'CHI 2024', 'CIN 2016', 'CIN 2017', 'CIN 2018', 'CIN 2019', 'CIN 2020', 'CIN 2021', 'CIN 2022', 'CIN 2023', 'CIN 2024', 'CLE 2016', 'CLE 2017', 'CLE 2018', 'CLE 2019', 'CLE 2020', 'CLE 2021', 'CLE 2022', 'CLE 2023', 'CLE 2024', 'DAL 2016', 'DAL 2017', 'DAL 2018', 'DAL 2019', 'DAL 2020', 'DAL 2021', 'DAL 2022', 'DAL 2023', 'DAL 2024', 'DEN 2016', 'DEN 2017', 'DE

In [70]:
''' Nearest Neighbors '''
## GOAL: estimate ideal parameter values for epsilon (eps)

# Build the neighbor index on the clustering input, then query it with the same points
k_neighbors_model = NearestNeighbors(n_neighbors=cluster_n_features).fit(cluster_input)
distances, indices = k_neighbors_model.kneighbors(cluster_input)

# Sort each neighbor-rank column ascending and keep column 1 only; look for the "elbow".
# NOTE(review): column 0 is presumably each point's zero distance to itself, which
# would make column 1 the distance to the closest other point - confirm.
distances = np.sort(distances, axis=0)[:, 1]

fig = px.line(data_frame=distances)
fig.update_layout(
    xaxis_title='N Neighbors',
    yaxis_title='Distances (EPS)',
)
fig.show()

In [77]:
''' Experiment '''
# https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
# Fit several clustering algorithms on `cluster_input` and draw each labelling
# on the t-SNE embedding via `tsne_chart`.


## Parameters
# NOTE: "n_neighbors" is not read by any model below.

params = {
    "quantile": 0.3,
    "eps": 3.0,
    "damping": 0.9,
    "preference": -200,
    "n_neighbors": 3,
    "n_clusters": 3,
    "min_samples": 7,
    "xi": 0.05,
    "min_cluster_size": 0.1,
    "allow_single_cluster": True,
    "hdbscan_min_cluster_size": 15,
    "hdbscan_min_samples": 3,
    "random_state": 42,
}

# estimate bandwidth for mean shift
# Must be estimated from the data being clustered; the previous `X` was never
# defined in this notebook and raised NameError on a fresh kernel.
bandwidth = estimate_bandwidth(cluster_input, quantile=params["quantile"])

## Models ##

affinity_propagation = AffinityPropagation(
    damping=params["damping"],
    preference=params["preference"],
    random_state=params["random_state"],
)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)

spectral = SpectralClustering(
    n_clusters=params["n_clusters"],
    eigen_solver="arpack",
    affinity="nearest_neighbors",
    random_state=params["random_state"],
)
dbscan = DBSCAN(eps=params["eps"])

hdbscan = HDBSCAN(
    min_samples=params["hdbscan_min_samples"],
    min_cluster_size=params["hdbscan_min_cluster_size"],
    allow_single_cluster=params["allow_single_cluster"],
    copy=True,
)

optics = OPTICS(
    min_samples=params["min_samples"],
    xi=params["xi"],
    min_cluster_size=params["min_cluster_size"],
)

# (display name, estimator) pairs, evaluated in this order
clustering_algorithms = (
    ("Affinity\nPropagation", affinity_propagation),
    ("MeanShift", ms),
    ("Spectral\nClustering", spectral),
    ("DBSCAN", dbscan),
    ("HDBSCAN", hdbscan),
    ("OPTICS", optics)
)

## Evaluate ##

for name, algorithm in clustering_algorithms:
    print(name)

    # Fit
    algorithm.fit(cluster_input)

    # Labels: use the fitted `labels_` when the estimator exposes it,
    # otherwise fall back to predicting on the same data
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(cluster_input)

    # Number of distinct labels, not counting the -1 label when present
    n_clusters_ = len(set(y_pred)) - (1 if -1 in y_pred else 0)

    # Visualize (labels as strings so plotly colours them as categories)
    fig = tsne_chart(colors=y_pred.astype(str))
    fig.update_layout(
        title=f'{name}<br><sup>{n_clusters_ = }</sup>',
    )
    fig.show()

Affinity
Propagation


MeanShift


Spectral
Clustering


DBSCAN


HDBSCAN


OPTICS


# Spectral

In [78]:
spectral_n_clusters = [4, 6, 8, 10]

# Settings shared by every run; only the number of clusters changes
spectral_settings = dict(
    eigen_solver="arpack",
    affinity="nearest_neighbors",
    random_state=42,
)

for n_clusters in spectral_n_clusters:
    # Build and fit the model for this cluster count
    spectral = SpectralClustering(n_clusters=n_clusters, **spectral_settings).fit(cluster_input)

    # String labels so the chart colours them as categories
    labels = spectral.labels_.astype(str)

    # Draw the labelling on the t-SNE embedding
    fig = tsne_chart(colors=labels)
    fig.update_layout(title=f'Spectral Clustering<br><sup>{n_clusters = }</sup>')
    fig.show()

# HDBSCAN

In [76]:
''' Experiment '''

min_cluster_size_options = [3, 5, 7, 9]
min_samples_options = [3, 5, 7, 9]
cluster_selection_method = 'eom'

# Every (min_cluster_size, min_samples) combination, min_samples varying fastest
parameter_grid = (
    (size, samples)
    for size in min_cluster_size_options
    for samples in min_samples_options
)

for min_cluster_size, min_samples in parameter_grid:
    # Build and fit the model for this combination
    cluster_model = HDBSCAN(
        copy=True,
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_method=cluster_selection_method,
    ).fit(cluster_input)

    # Cluster count excludes the noise label (-1); noise points are counted separately
    labels = cluster_model.labels_
    distinct_labels = set(labels)
    n_clusters_ = len(distinct_labels) - int(-1 in distinct_labels)
    n_noise_ = labels.tolist().count(-1)

    # Draw the labelling on the t-SNE embedding
    fig = tsne_chart(labels.astype(str))
    fig.update_layout(
        title=f'HDBSCAN<br><sup>{n_clusters_ = } | {n_noise_ = } | {min_cluster_size = } | {min_samples = } | {cluster_selection_method = }</sup>'
    )
    fig.show()
