In [31]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import warnings
from functools import reduce
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, AffinityPropagation
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from common import map_labels_to_plays, plot_clusters

In [3]:
# Load up the data
df = pd.read_csv("./data/tracking_week_1.csv")
play_df = pd.read_csv("./data/plays.csv")
players = pd.read_csv("./data/players.csv")

formation_df = df[df["event"] == "ball_snap"]
formation_df = formation_df.merge(
    play_df[["gameId", "playId", "possessionTeam"]], on=["gameId", "playId"], how="left"
)
position_df = formation_df.merge(players[["position", "nflId"]], on="nflId", how="left")
position_df = position_df[(position_df["possessionTeam"] == position_df["club"])]

# # Make sure only one Center per play
center_positions = position_df[position_df["position"] == "C"].groupby(
    ["gameId", "playId"]
).nth(0)[
    ["gameId", "playId", "frameId", "x", "y"]
]
center_positions = center_positions.rename(columns={"x": "center_x", "y": "center_y"})

distance_df = position_df.merge(
    center_positions, on=["gameId", "playId", "frameId"], how="left"
)
distance_df = distance_df[distance_df["displayName"] != "football"]

off_df = distance_df[~(distance_df["displayName"] == "football")]
off_df["distance_from_C"] = np.sqrt(
    (off_df["x"] - off_df["center_x"]) ** 2 + (off_df["y"] - off_df["center_y"]) ** 2
)
off_df["rel_x_c"] = np.abs(off_df["center_x"] - off_df["x"])
off_df["rel_y_c"] = off_df["center_y"] - off_df["y"]
print(off_df.shape)
off_df.drop(
    columns=[
        "nflId",
        "displayName",
        "frameId",
        "frameType",
        "time",
        "jerseyNumber",
        "club",
        "playDirection",
        "o",
        "dir",
        "event",
        "possessionTeam",
        "s",
        "a",
        "dis",
        "center_x",
        "center_y",
    ],
    inplace=True,
)
off_df = off_df.set_index(['gameId', 'playId'])
print(off_df.head())
print(off_df.shape)

(21428, 25)
                       x      y position  distance_from_C  rel_x_c  rel_y_c
gameId     playId                                                          
2022091200 64      37.94  23.86       QB         1.246154     1.23    -0.20
           64      38.52  22.21        G         1.589025     0.65     1.45
           64      39.41  14.39       WR         9.273106     0.24     9.27
           64      39.17  23.66        C         0.000000     0.00     0.00
           64      32.15  23.82       RB         7.021823     7.02    -0.16
(21428, 6)


In [4]:
# Collapse each play into one row with distance for WR1, WR2, WR3, WR4, WR5, QB, RB1, RB2, FB, TE1, TE2, TE3
# Ideally I would be able to use more granular wide receiver positions (x, y, slot, etc.) but I don't have the metadata
# to accomplish that
# def position_agg_sum(series):
#     return reduce(lambda x, y: x + y, series['position'] == 'WR')
game_play_group = off_df.groupby(['gameId', 'playId'])

# I don't like iterating through each row in pandas but this might be the only way to do what I'm trying to accomplish
dicts = []
s_template = {
    'index': 0,
    'wr_count': 0,
    'qb_x': 0,
    'te_count': 0,
    'blocker_count': 0
}
scaled_df = off_df.copy()
scaled_df['rel_x_c'] = (scaled_df['rel_x_c'] /5).round().astype(int) * 5
scaled_df['rel_y_c'] = (scaled_df['rel_y_c'] /3).round().astype(int) * 3
# scaled_df[['rel_x_c', 'rel_y_c']] = StandardScaler().fit_transform(scaled_df[['rel_x_c', 'rel_y_c']])
# print(scaled_df.head())
for (gameId, playId), groups in game_play_group:
    s = s_template.copy()
    s['index'] = (gameId, playId)
    indexed_df = scaled_df.loc[[(gameId, playId)]]
    wr_df = indexed_df[indexed_df['position'] == 'WR'].sort_values(by=['rel_y_c']).head(5)
    s['wr_count'] = len(wr_df)

    te_df = indexed_df[indexed_df['position'] == 'TE'].sort_values(by=['rel_y_c']).head(3)
    s['te_count'] = len(te_df)
    qb_df = indexed_df[indexed_df['position'] == 'QB'].head(1)
    s[f'qb_x'] = qb_df.iloc[0]['rel_x_c']

    blocker_df = indexed_df[(indexed_df['position'].isin(['FB', 'T']))]
    s['blocker_count'] = len(blocker_df)
    dicts.append(s)
model_df = pd.DataFrame(dicts)
model_df = model_df.set_index('index')
print(model_df.head())
print(model_df.shape)



                   wr_count  qb_x  te_count  blocker_count
index                                                     
(2022090800, 56)          3     5         1              2
(2022090800, 80)          3     5         1              2
(2022090800, 101)         2     0         1              3
(2022090800, 122)         3     5         1              2
(2022090800, 167)         3     5         1              2
(1948, 4)


In [29]:
c = 2
silhouette_avg = 0
results = []
unique_clusters = 0
while silhouette_avg < 1 and c < 200:
    with warnings.catch_warnings(record=True) as w:
        clusterer = KMeans(n_clusters=c, random_state=0, n_init="auto", init="k-means++").fit(model_df)
        unique_clusters = len(set(clusterer.labels_))
        if len(w) > 0 and issubclass(w[-1].category, ConvergenceWarning):
            # This means further increase in clusters won't be useful
            c = 201 # will cause loop to stop
            break
        cluster_labels = clusterer.fit_predict(model_df)
        silhouette_avg = silhouette_score(model_df, cluster_labels)
        
        results.append({'n_clusters':c, 'silhouette_avg': silhouette_avg})
        c += 1
pd.DataFrame.from_dict(results).to_csv("./output/kmeans_silhouette.csv")
print(f'Identified {unique_clusters} clusters with a silhouette score of {silhouette_avg}')


Identified 50 clusters with a silhouette score of 0.9928131416837782


In [6]:
clusterer = KMeans(n_clusters=50, random_state=0, n_init="auto", init="random").fit(model_df)
label_map = map_labels_to_plays(cluster_labels, model_df)

# See output/KMeans.png for the plot
plot_clusters(model_df, off_df, label_map, "KMeans")

In [21]:
# Affinity Propogation because it should work for uneven clusters
# I experimented with different preferences and damping values, but no good results came of it.
clustering = AffinityPropagation(preference=-3, random_state=5).fit(model_df[['wr_count', 'qb_x', 'te_count', 'blocker_count']])
cluster_labels = clustering.labels_
silhouette_avg = silhouette_score(model_df, cluster_labels)
cluster_centers_indices = clustering.cluster_centers_indices_

print(f"For n_clusters = {len(cluster_centers_indices)} the average silhouette_score is {silhouette_avg}")

For n_clusters = 1644 the average silhouette_score is 0.023498718660978093




In [8]:
labeled_df = list(zip(cluster_labels, model_df.index.to_list()))
# print(model_df['index'])
print(labeled_df)

[(np.int64(454), (np.int64(2022090800), np.int64(56))), (np.int64(0), (np.int64(2022090800), np.int64(80))), (np.int64(578), (np.int64(2022090800), np.int64(101))), (np.int64(1), (np.int64(2022090800), np.int64(122))), (np.int64(9), (np.int64(2022090800), np.int64(167))), (np.int64(2), (np.int64(2022090800), np.int64(191))), (np.int64(3), (np.int64(2022090800), np.int64(212))), (np.int64(4), (np.int64(2022090800), np.int64(236))), (np.int64(13), (np.int64(2022090800), np.int64(299))), (np.int64(95), (np.int64(2022090800), np.int64(343))), (np.int64(318), (np.int64(2022090800), np.int64(364))), (np.int64(5), (np.int64(2022090800), np.int64(393))), (np.int64(14), (np.int64(2022090800), np.int64(414))), (np.int64(118), (np.int64(2022090800), np.int64(438))), (np.int64(477), (np.int64(2022090800), np.int64(467))), (np.int64(168), (np.int64(2022090800), np.int64(486))), (np.int64(59), (np.int64(2022090800), np.int64(529))), (np.int64(6), (np.int64(2022090800), np.int64(550))), (np.int64(7),

In [None]:
# Let's try DBSCAN clustering
for eps in np.arange(0.8, 1.5, 0.05):
    db = DBSCAN(eps=eps, min_samples=10).fit(model_df)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print(eps)
    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)

    # Get the silhouette score
    if n_clusters_ > 1:
        silhouette_avg = silhouette_score(model_df, labels)
        print("For eps =", eps, "The average silhouette_score is :", silhouette_avg)
    else:
        silhouette_avg = 0

0.8
Estimated number of clusters: 21
Estimated number of noise points: 76
For eps = 0.8 The average silhouette_score is : 0.9323907600119588
0.8500000000000001
Estimated number of clusters: 21
Estimated number of noise points: 76
For eps = 0.8500000000000001 The average silhouette_score is : 0.9323907600119588
0.9000000000000001
Estimated number of clusters: 21
Estimated number of noise points: 76
For eps = 0.9000000000000001 The average silhouette_score is : 0.9323907600119588
0.9500000000000002
Estimated number of clusters: 21
Estimated number of noise points: 76
For eps = 0.9500000000000002 The average silhouette_score is : 0.9323907600119588
1.0000000000000002
Estimated number of clusters: 2
Estimated number of noise points: 1
For eps = 1.0000000000000002 The average silhouette_score is : 0.7155690587768254
1.0500000000000003
Estimated number of clusters: 2
Estimated number of noise points: 1
For eps = 1.0500000000000003 The average silhouette_score is : 0.7155690587768254
1.100000

In [10]:
# Limited number of clusters with Agglomerative clustering for easier visualization
clustering = AgglomerativeClustering(n_clusters=50, linkage='complete', metric='euclidean').fit(model_df)
# Map the zipped list to a map of key=cluster label and values = all game/playids for that label
label_map = map_labels_to_plays(clustering.labels_, model_df)

silhouette_avg = silhouette_score(model_df, clustering.labels_)
print(f"For n_clusters = 50 The average silhouette_score is : {silhouette_avg}")

# See output/agglomerative_clustering.png for the plot
plot_clusters(model_df, off_df, label_map, 'agglomerative_clustering')


For n_clusters = 50 The average silhouette_score is : 0.9928131416837782
