# Route Clustering with DTW and Statistical Analysis (Phase 1)



## 1. Setup & Imports

In [None]:
!pip install tslearn

In [None]:
import os
import sys
import re

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import MDS

from tslearn.metrics import cdist_dtw, dtw
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesResampler

from collections import defaultdict, Counter

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (a remount is forced on every run).
drive.mount('/content/drive', force_remount=True)
# Switch into the project folder on Drive; presumably this is what lets the
# relative paths used later (src/, saved_data/, ...) resolve — TODO confirm.
os.chdir('/content/drive/My Drive/ClusterRoute')
print("Current working directory:", os.getcwd())

In [None]:
from src.final_clean_cluster_utils import (
    load_tracking_data, load_pass_plays, load_players,
    create_display_names, get_receivers_df, get_tracking_receivers,
    get_snap_frames, filter_post_snap,
    clean_route, extract_clean_routes, preprocess_routes,
    compute_total_distance, compute_straightness, compute_curvature_density,
    dtw_distance,
    plot_clusters_with_average, plot_centroids, plot_cluster_distribution,
    average_trajectory
)

In [None]:
# Create the output folders; exist_ok=True makes re-running this cell harmless.
for output_dir in ("cluster_visualizations", "saved_data"):
    os.makedirs(output_dir, exist_ok=True)

## 2. Load & Prepare Tracking Data


In [None]:
# Load the inputs through the project helpers (no arguments are passed, so
# the file locations are decided inside src.final_clean_cluster_utils).
tracking_df = load_tracking_data()
pass_plays_df = load_pass_plays()

# Players table: load it, run it through create_display_names, then derive
# the receivers table from the result.
players_raw = load_players()
players_df = create_display_names(players_raw)
receivers_df = get_receivers_df(players_df)

In [None]:
# Attach each player's PositionAbbr to the tracking rows (left join on
# displayName, so tracking rows without a match are kept) and expose it
# under the shorter column name 'position'.
position_lookup = players_df[['displayName', 'PositionAbbr']]
tracking_df = (
    tracking_df
    .merge(position_lookup, on='displayName', how='left')
    .rename(columns={'PositionAbbr': 'position'})
)

In [None]:
# Narrow the tracking data in three helper-driven steps. The helpers live in
# src.final_clean_cluster_utils; the descriptions below are inferred from
# their names and arguments — TODO confirm against the implementations.
#   1. keep the tracking rows that belong to receivers,
#   2. determine the snap frame(s) from those rows,
#   3. keep only the tracking data after the snap.
tracking_receivers = get_tracking_receivers(tracking_df, receivers_df)
snap_frames = get_snap_frames(tracking_receivers)
tracking_post_snap = filter_post_snap(tracking_receivers, snap_frames)

## 3. Route Path Extraction & Preprocessing

In [None]:
# Group the post-snap tracking rows by (displayName, playId).
# NOTE(review): `grouped_routes` is not referenced by any later cell visible
# in this notebook (route extraction below works on `tracking_post_snap`
# directly) — kept for interactive inspection; confirm whether it is needed.
grouped_routes = tracking_post_snap.groupby(['displayName', 'playId'])


In [None]:
# Step 1: extract the cleaned raw route paths together with their keys
# (each key is later unpacked as a (player, playId) pair).
route_paths, route_keys = extract_clean_routes(tracking_post_snap)

# Step 2: preprocess the routes with a target length of 50. With
# return_raw=True the helper hands back three values; presumably the third
# is the unscaled counterpart of the scaled routes — TODO confirm.
preprocessed = preprocess_routes(
    route_paths,
    route_keys,
    target_len=50,
    return_raw=True,
)
routes_scaled, filtered_keys, filtered_routes = preprocessed


## 4. Feature Engineering (DTW + Statistics)

In [None]:
# Pairwise DTW distance matrix over the second coordinate column (index 1)
# of every filtered route.
# The NumPy global seed is fixed here; later cells that draw from np.random
# (e.g. the random route sampling) inherit this seed.
np.random.seed(42)
data = [route[:, 1] for route in filtered_routes]

n_routes = len(data)
dtw_distances = np.zeros((n_routes, n_routes))

# Only the strict upper triangle is computed; each value is mirrored into the
# lower triangle, and the diagonal stays at zero.
for i, series_i in enumerate(data):
    for j, series_j in enumerate(data[i + 1:], start=i + 1):
        dist = dtw_distance(series_i, series_j)
        dtw_distances[i, j] = dtw_distances[j, i] = dist

In [None]:
# Embed the routes in two dimensions from the precomputed DTW distance
# matrix. `MDS` is already imported in the notebook's import cell, so it is
# not re-imported here. random_state is fixed so the embedding is repeatable.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
dtw_features = mds.fit_transform(dtw_distances)


In [None]:
def _summarize_series(ts):
    """Return [mean, std, min, max, median, 25th pct, 75th pct] of one series."""
    return [
        np.mean(ts), np.std(ts),
        np.min(ts), np.max(ts),
        np.median(ts),
        np.percentile(ts, 25),
        np.percentile(ts, 75),
    ]


# One row of summary statistics per series in `data`, in the same order.
feature_matrix = np.array([_summarize_series(ts) for ts in data])


In [None]:
# Put the 2-D MDS embedding and the per-series statistics side by side, then
# standardise every column so no single feature dominates by scale.
combined_features = np.hstack([dtw_features, feature_matrix])
scaler = StandardScaler().fit(combined_features)
combined_scaled = scaler.transform(combined_features)


## 5. Route Clustering with KMeans

In [None]:
# Fit a 6-cluster KMeans on the standardised features and keep the per-route
# cluster assignment. random_state is fixed for repeatability.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn versions — cluster ids may therefore vary across environments.
kmeans = KMeans(n_clusters=6, random_state=42).fit(combined_scaled)
labels = kmeans.labels_

In [None]:
# Project the standardised features onto their first two principal
# components, purely for a 2-D view of the cluster assignment.
pca = PCA(n_components=2)
pca_proj = pca.fit_transform(combined_scaled)

fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    x=pca_proj[:, 0],
    y=pca_proj[:, 1],
    hue=labels,
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title("Route Type Clustering via DTW + Stats")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.legend(title="Cluster")
fig.tight_layout()
plt.show()

In [None]:
# Names of the nine columns of the combined feature matrix, in order: the two
# MDS coordinates followed by the seven per-series statistics.
feature_names = ['DTW_x', 'DTW_y', 'mean', 'std', 'min', 'max', 'median', '25%', '75%']

# Loadings of each feature on the two principal components. pandas is already
# imported as `pd` in the notebook's import cell, so it is not re-imported.
pca_components = pd.DataFrame(
    pca.components_,
    index=['PCA Component 1', 'PCA Component 2'],
    columns=feature_names,
)
print(pca_components)


## 6. Sample Cluster Routes

In [None]:
# Plot up to three randomly chosen routes per cluster, one grid row per cluster.
cluster_ids = np.unique(labels)
n_examples = 3

# squeeze=False keeps `axes` two-dimensional even when there is only a single
# cluster, so axes[row, col] is always valid.
fig, axes = plt.subplots(
    len(cluster_ids), n_examples,
    figsize=(9, len(cluster_ids) * 3),
    squeeze=False,
)

for row, cluster_id in enumerate(cluster_ids):
    cluster_indices = np.where(labels == cluster_id)[0]
    # Sampling without replacement cannot draw more routes than the cluster
    # holds, so cap the sample size for clusters with fewer than three routes.
    n_pick = min(n_examples, len(cluster_indices))
    selected = np.random.choice(cluster_indices, size=n_pick, replace=False)
    for col in range(n_examples):
        ax = axes[row, col]
        if col >= n_pick:
            # No route left for this slot: leave the panel blank.
            ax.axis('off')
            continue
        i = selected[col]
        route = filtered_routes[i]
        ax.plot(route[:, 0], route[:, 1], linewidth=2)
        ax.set_title(f"Cluster {cluster_id} – Route {i}")
        ax.axis('equal')
        ax.axis('off')
plt.tight_layout()
plt.show()


## 7. Route Summary Table

In [None]:
# Display names for the KMeans cluster ids. Ids missing from this mapping are
# reported as "Unknown".
cluster_name_dict = {
    0: "Extended Zig",
    1: "Angular Stretch",
    2: "Linear Dart",
    3: "Compact Curve",
    4: "Burst Drop",
    5: "Sneak Hook",
}

# NOTE(review): the four values exported per route are the first four columns
# of `feature_matrix`, i.e. the mean, std, min and max of the route's series.
# The column names "Length" / "Curvature" / "Angle" / "Duration" are kept
# because later cells select columns by these names, but they do not describe
# those quantities — confirm the intended metrics.
if 'feature_matrix' in globals():
    route_stats = feature_matrix
else:
    # Use NaN (not a text placeholder) for missing statistics so the numeric
    # aggregations in later cells still work.
    route_stats = np.full((len(labels), 4), np.nan)

route_summary = []
for i, cluster_id in enumerate(labels):
    cluster_label = cluster_name_dict.get(cluster_id, "Unknown")
    player, playId = filtered_keys[i]
    stats = route_stats[i]

    route_summary.append({
        "Player": player,
        "Play ID": playId,
        "Cluster": cluster_id,
        "Route Type": cluster_label,
        "Length": stats[0],
        "Curvature": stats[1],
        "Angle": stats[2],
        "Duration": stats[3]
    })

# pandas is already imported as `pd` in the notebook's import cell.
df_routes = pd.DataFrame(route_summary)
print(df_routes.head())

## 8. Stat Summary by Cluster ("Route Type")

In [None]:
# Mean of each summary column per route type, with "Route Type" returned as a
# regular column rather than as the index.
summary_columns = ["Length", "Curvature", "Angle", "Duration"]
avg_stats = (
    df_routes
    .groupby("Route Type")[summary_columns]
    .mean()
    .reset_index()
)
print(avg_stats)


In [None]:
import importlib

import src.final_clean_cluster_utils

# Re-execute the utilities module so edits made to it since the first import
# are picked up, then import the two plot-saving helpers from the fresh module.
# Names imported from the module in earlier cells are not rebound by reload().
importlib.reload(src.final_clean_cluster_utils)
from src.final_clean_cluster_utils import (
    save_boxplots_by_metric,
    save_barplots_by_metric,
)

In [None]:
# One colour per summary column; `metrics` and `colors` are derived from the
# same mapping so their order always matches.
metric_colors = {
    "Length": "#1f77b4",
    "Curvature": "#ff7f0e",
    "Angle": "#2ca02c",
    "Duration": "#d62728",
}
metrics = list(metric_colors)
colors = list(metric_colors.values())

# Box plots use the per-route table; bar plots use the per-route-type means.
save_boxplots_by_metric(df_routes, metrics)
save_barplots_by_metric(avg_stats, metrics, colors)

## 9. More Cluster Visuals

In [None]:
# Additional cluster plots from the project helpers (defined in
# src.final_clean_cluster_utils; what each one draws is inferred from its
# name and arguments — TODO confirm):
#   - routes per cluster together with an average trajectory,
#   - the fitted KMeans centroids,
#   - the number of routes per cluster.
# NOTE(review): assumes `routes_scaled` is index-aligned with `labels`.
plot_clusters_with_average(routes_scaled, labels)
plot_centroids(kmeans)
plot_cluster_distribution(labels)


## 10. Cluster Quality Evaluation

In [None]:
# Sweep the number of clusters and record three quality measures per k:
# silhouette score, KMeans inertia, and Davies-Bouldin index.
X = combined_scaled

# silhouette_score needs at most n_samples - 1 distinct labels, so cap k for
# small inputs; with 11 or more samples this is the usual range 2..10.
max_k = min(10, len(X) - 1)
k_range = range(2, max_k + 1)

# Rows are collected under their own name so the `metrics` list used by the
# plotting cell above is not overwritten when this cell runs.
metric_rows = []
for k in k_range:
    model = KMeans(n_clusters=k, random_state=42)
    k_labels = model.fit_predict(X)
    metric_rows.append({
        'k': k,
        'silhouette_score': silhouette_score(X, k_labels),
        'inertia': model.inertia_,
        'davies_bouldin_index': davies_bouldin_score(X, k_labels),
    })

df_metrics = pd.DataFrame(
    metric_rows,
    columns=['k', 'silhouette_score', 'inertia', 'davies_bouldin_index'],
)
df_metrics.to_csv('clustering_metrics_table.csv', index=False)
print(df_metrics)


In [None]:
def plot_metric(df, metric, color, marker, title, filename):
    """Plot one clustering metric against k, save the figure, then show it.

    Args:
        df: Table with a 'k' column and a column named by ``metric``.
        metric: Name of the column to plot on the y-axis.
        color: Matplotlib line colour.
        marker: Matplotlib marker style.
        title: Label used for the y-axis and as the prefix of the plot title.
        filename: Path the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(df['k'], df[metric], marker=marker, color=color)
    ax.set_title(f'{title} vs k')
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel(title)
    ax.grid(True)
    fig.tight_layout()
    # Save before showing so the written file contains the rendered figure.
    fig.savefig(filename)
    plt.show()

# (column, colour, marker, axis title, output file) for each quality metric.
metric_plot_specs = (
    ('silhouette_score', 'blue', 'o', 'Silhouette Score', 'silhouette_score_plot.png'),
    ('inertia', 'green', 's', 'Inertia', 'inertia_plot.png'),
    ('davies_bouldin_index', 'red', '^', 'Davies-Bouldin Index', 'davies_bouldin_plot.png'),
)
for plot_spec in metric_plot_specs:
    plot_metric(df_metrics, *plot_spec)


## 11. Save DataFrames

In [None]:
# Write the per-route summary and the cluster-quality table to saved_data/
# (without the index column).
frames_to_save = (
    (df_routes, "saved_data/route_summary.csv"),
    (df_metrics, "saved_data/clustering_metrics_table.csv"),
)
for frame, csv_path in frames_to_save:
    frame.to_csv(csv_path, index=False)