In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from functools import reduce
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, AffinityPropagation
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from common import map_labels_to_plays, plot_clusters

In [3]:
# Load up the data
df = pd.read_csv('./data/tracking_week_1.csv')
play_df = pd.read_csv('./data/plays.csv')
players = pd.read_csv('./data/players.csv')

formation_df = df[df['event'] == 'ball_snap']
formation_df = formation_df.merge(play_df[['gameId', 'playId', 'possessionTeam']], on=['gameId', 'playId'], how='left')
position_df = formation_df.merge(players[['position', 'nflId']], on='nflId', how='left')

center_positions = position_df[position_df['position'] == 'C'][['gameId', 'playId', 'frameId', 'x', 'y']]
center_positions = center_positions.rename(columns={'x': 'center_x', 'y': 'center_y'})

distance_df = position_df.merge(center_positions, on=['gameId', 'playId', 'frameId'], how='left')
distance_df = distance_df[distance_df['displayName'] != 'football']

off_df = distance_df[(distance_df['possessionTeam'] == distance_df['club'])]
off_df = off_df[~(distance_df['displayName']=='football')]
off_df['distance_from_C'] = np.sqrt((off_df['x'] - off_df['center_x'])**2 + (off_df['y'] - off_df['center_y'])**2)
off_df['rel_x_c'] = np.abs(off_df['center_x'] - off_df['x'])
off_df['rel_y_c'] = off_df['center_y'] - off_df['y']
print(off_df.shape)
off_df.drop(columns=['nflId', 'displayName', 'frameId', 'frameType', 'time', 'jerseyNumber', 'club', 'playDirection', 'o', 'dir', 'event', 'possessionTeam', 's', 'a', 'dis', 'center_x', 'center_y'], inplace=True)
off_df = off_df.set_index(['gameId', 'playId'])
print(off_df.head())
print(off_df.shape)



(25608, 25)
                       x      y position  distance_from_C  rel_x_c  rel_y_c
gameId     playId                                                          
2022091200 64      37.94  23.86       QB         1.246154     1.23    -0.20
           64      38.52  22.21        G         1.589025     0.65     1.45
           64      39.41  14.39       WR         9.273106     0.24     9.27
           64      39.17  23.66        C         0.000000     0.00     0.00
           64      32.15  23.82       RB         7.021823     7.02    -0.16
(25608, 6)


  off_df = off_df[~(distance_df['displayName']=='football')]


In [4]:
# Collapse each play into one row with distance for WR1, WR2, WR3, WR4, WR5, QB, RB1, RB2, FB, TE1, TE2, TE3
# Ideally I would be able to use more granular wide receiver positions (x, y, slot, etc.) but I don't have the metadata
# to accomplish that
# def position_agg_sum(series):
#     return reduce(lambda x, y: x + y, series['position'] == 'WR')
game_play_group = off_df.groupby(['gameId', 'playId'])

# I don't like iterating through each row in pandas but this might be the only way to do what I'm trying to accomplish
dicts = []
s_template = {
    'index': 0,
    'wr1_y': 0,
    'wr2_y': 0,
    'wr3_y': 0,
    'wr4_y': 0,
    'wr5_y': 0,
    'qb_x': 0,
    'te1_y': 0,
    'te2_y': 0,
    'te3_y': 0,
    'rb1_x': 0,
    'rb1_y': 0,
    'rb2_x': 0,
    'rb2_y': 0,
    'fb1_x': 0,
}
scaled_df = off_df.copy()
scaled_df['rel_x_c'] = (scaled_df['rel_x_c'] /5).round().astype(int) * 5
scaled_df['rel_y_c'] = (scaled_df['rel_y_c'] /3).round().astype(int) * 3
# scaled_df[['rel_x_c', 'rel_y_c']] = StandardScaler().fit_transform(scaled_df[['rel_x_c', 'rel_y_c']])
print(scaled_df.head())
for (gameId, playId), groups in game_play_group:
    s = s_template.copy()
    s['index'] = (gameId, playId)
    indexed_df = scaled_df.loc[[(gameId, playId)]]
    wr_df = indexed_df[indexed_df['position'] == 'WR'].sort_values(by=['rel_y_c']).head(5)
    i = 0
    for index, row in wr_df.iterrows():
        s[f'wr{i+1}_y'] = row['rel_y_c']
        i+=1
    te_df = indexed_df[indexed_df['position'] == 'TE'].sort_values(by=['rel_y_c']).head(3)
    i = 0
    for index, row in te_df.iterrows():
        s[f'te{i+1}_y'] = row['rel_y_c']
        i+=1

    rb_df = indexed_df[indexed_df['position'] == 'RB'].sort_values(by=['rel_y_c']).head(2)
    i = 0
    for index, row in rb_df.iterrows():
        s[f'rb{i+1}_x'] = row['rel_x_c']
        s[f'rb{i+1}_y'] = row['rel_y_c']
        i+=1
    qb_df = indexed_df[indexed_df['position'] == 'QB'].head(1)
    s[f'qb_x'] = qb_df.iloc[0]['rel_x_c'] * 5

    fb_df = indexed_df[indexed_df['position'] == 'FB'].sort_values(by=['rel_y_c']).head(1)
    if fb_df.size > 0:
        s[f'fb1_x'] = fb_df.iloc[0]['rel_x_c']
    dicts.append(s)
model_df = pd.DataFrame(dicts)
model_df = model_df.set_index('index')
print(model_df.head())
print(model_df.shape)



                       x      y position  distance_from_C  rel_x_c  rel_y_c
gameId     playId                                                          
2022091200 64      37.94  23.86       QB         1.246154        0        0
           64      38.52  22.21        G         1.589025        0        0
           64      39.41  14.39       WR         9.273106        0        9
           64      39.17  23.66        C         0.000000        0        0
           64      32.15  23.82       RB         7.021823        5        0
                   wr1_y  wr2_y  wr3_y  wr4_y  wr5_y  qb_x  te1_y  te2_y  \
index                                                                      
(2022090800, 56)     -15    -12     12     12     21    25     -6     -3   
(2022090800, 80)       9      9     12     15     18    25     -9     -6   
(2022090800, 101)     -9     -9      9     12      0     0     -6     -3   
(2022090800, 122)     -9     -6     12     15     15    25      3      3   
(2022090800,

In [5]:
# KMeans clustering seems to suffer from the curse of dimensionality
for c in range(2, 50):
    clusterer = KMeans(n_clusters=c, random_state=0, n_init="auto", init="random").fit(model_df)
    cluster_labels = clusterer.fit_predict(model_df)
    silhouette_avg = silhouette_score(model_df, cluster_labels)
    print(
        "For n_clusters =",
        c,
        "The average silhouette_score is :",
        silhouette_avg,
    )
    

For n_clusters = 2 The average silhouette_score is : 0.28839320548951275
For n_clusters = 3 The average silhouette_score is : 0.31041450942631993
For n_clusters = 4 The average silhouette_score is : 0.303979368249855
For n_clusters = 5 The average silhouette_score is : 0.2988365629181663
For n_clusters = 6 The average silhouette_score is : 0.27390317322906743
For n_clusters = 7 The average silhouette_score is : 0.2811541464974151
For n_clusters = 8 The average silhouette_score is : 0.2573232646651551
For n_clusters = 9 The average silhouette_score is : 0.2535994179943608
For n_clusters = 10 The average silhouette_score is : 0.26996523127064787
For n_clusters = 11 The average silhouette_score is : 0.2597683742434656
For n_clusters = 12 The average silhouette_score is : 0.25618587084056327
For n_clusters = 13 The average silhouette_score is : 0.26151054210805813
For n_clusters = 14 The average silhouette_score is : 0.2578995358954339
For n_clusters = 15 The average silhouette_score is : 

In [6]:
labeled_df = list(zip(cluster_labels, model_df.index.to_list()))
# print(model_df['index'])
print(labeled_df)

[(np.int32(19), (np.int64(2022090800), np.int64(56))), (np.int32(32), (np.int64(2022090800), np.int64(80))), (np.int32(12), (np.int64(2022090800), np.int64(101))), (np.int32(43), (np.int64(2022090800), np.int64(122))), (np.int32(19), (np.int64(2022090800), np.int64(167))), (np.int32(12), (np.int64(2022090800), np.int64(191))), (np.int32(19), (np.int64(2022090800), np.int64(212))), (np.int32(24), (np.int64(2022090800), np.int64(236))), (np.int32(25), (np.int64(2022090800), np.int64(299))), (np.int32(0), (np.int64(2022090800), np.int64(343))), (np.int32(43), (np.int64(2022090800), np.int64(364))), (np.int32(16), (np.int64(2022090800), np.int64(393))), (np.int32(3), (np.int64(2022090800), np.int64(414))), (np.int32(25), (np.int64(2022090800), np.int64(438))), (np.int32(16), (np.int64(2022090800), np.int64(467))), (np.int32(2), (np.int64(2022090800), np.int64(486))), (np.int32(6), (np.int64(2022090800), np.int64(529))), (np.int32(0), (np.int64(2022090800), np.int64(550))), (np.int32(0), (n

In [10]:
# Let's try DBSCAN clustering
# It's supposed to 
for eps in np.arange(0.8, 1.5, 0.1):
    db = DBSCAN(eps=eps, min_samples=10).fit(model_df)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print(eps)
    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)

    # Get the silhouette score
    if n_clusters_ > 1:
        silhouette_avg = silhouette_score(model_df, labels)
        print("For eps =", eps, "The average silhouette_score is :", silhouette_avg)
    else:
        silhouette_avg = 0

0.8
Estimated number of clusters: 4
Estimated number of noise points: 1906
For eps = 0.8 The average silhouette_score is : -0.24813166470996734
0.9
Estimated number of clusters: 4
Estimated number of noise points: 1906
For eps = 0.9 The average silhouette_score is : -0.24813166470996734
1.0
Estimated number of clusters: 4
Estimated number of noise points: 1906
For eps = 1.0 The average silhouette_score is : -0.24813166470996734
1.1
Estimated number of clusters: 4
Estimated number of noise points: 1906
For eps = 1.1 The average silhouette_score is : -0.24813166470996734
1.2
Estimated number of clusters: 4
Estimated number of noise points: 1906
For eps = 1.2 The average silhouette_score is : -0.24813166470996734
1.2999999999999998
Estimated number of clusters: 4
Estimated number of noise points: 1906
For eps = 1.2999999999999998 The average silhouette_score is : -0.24813166470996734
1.4
Estimated number of clusters: 4
Estimated number of noise points: 1906
For eps = 1.4 The average silho

In [16]:
# Affinity Propogation because it should work for uneven clusters
# I experimented with different preferences and damping values, but no good results came of it.
clustering = AffinityPropagation(preference=-.5, random_state=5).fit(model_df)
cluster_labels = clustering.labels_
silhouette_avg = silhouette_score(model_df, cluster_labels)
cluster_centers_indices = clustering.cluster_centers_indices_

print(f"For n_clusters = {len(cluster_centers_indices)} the average silhouette_score is {silhouette_avg}")

For n_clusters = 1562 the average silhouette_score is 0.2781427379145837




In [8]:
# Using 50 clusters because kmeans clustering with a smaller model identified 50 clusters
clustering = AgglomerativeClustering(n_clusters=50, linkage='ward', metric='euclidean').fit(model_df)

map = map_labels_to_plays(clustering.labels_, model_df)
silhouette_avg = silhouette_score(model_df, clustering.labels_)
print(f"For n_clusters = 50 The average silhouette_score is : {silhouette_avg}")

# See output/ag_large_model.png for the plot
plot_clusters(model_df, off_df, map, "ag_large_model", show_means=False)


For n_clusters = 50 The average silhouette_score is : 0.2751428315008427
