# Statcast Clustering

Objective: Experiment with various clustering techniques (Kmeans/DBscan) to cluster MLB pitch types. 

Data: Statcast data scraped into postgresql database on a weekly basis.

Algorithms: Kmeans/DBScan

In [1]:
import pandas as pd
import numpy as np
from configparser import ConfigParser
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score


In [2]:
# establish sql engine connection
parser = ConfigParser()
parser.read('nb.ini')
conn_string = parser.get('my_db', 'conn_string')
engine = create_engine(conn_string)

In [3]:
## fetch statcast data from postgresql database

def get_sql_data(engine):

    sql1 = '''
        SELECT *
        FROM statcast_2016
    '''
    sc_16 = pd.read_sql_query(sql1, engine)

    sql2 = '''
        SELECT *
        FROM statcast_2017
    '''
    sc_17 = pd.read_sql_query(sql2, engine)

    sql3 = '''
        SELECT *
        FROM statcast_2018
    '''

    sc_18 = pd.read_sql_query(sql3, engine)

    sql4 = '''
        SELECT *
        FROM statcast_2019
    '''

    sc_19 = pd.read_sql_query(sql4, engine)

    sql5 = '''
        SELECT *
        FROM statcast_2020
    '''

    sc_20 = pd.read_sql_query(sql5, engine)

    sql6 = '''
        SELECT *
        FROM statcast_2021
    '''

    sc_21 = pd.read_sql_query(sql6, engine)

    statcast = pd.concat([sc_16, sc_17, sc_18, sc_19, sc_20, sc_21])

    statcast['events']=statcast['events'].fillna('none')
    statcast['launch_speed']=statcast['launch_speed'].fillna(0)
    statcast['launch_angle']=statcast['launch_angle'].fillna(0)

    return statcast

In [4]:
statcast = get_sql_data(engine)

## Data Prep

In [5]:
# filter only relevant columns
cols = ['player_name', 'home_team', 'away_team', 'inning_topbot', 'p_throws', 'pitch_type', 'game_date', 'events', 'pitcher', 
         'batter', 'description', 'launch_speed', 'launch_angle', 'release_speed', 'release_pos_x', 
         'release_pos_y', 'release_pos_z', 'release_spin_rate', 'release_extension', 'pfx_x', 'pfx_z', 
         'plate_x', 'plate_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'effective_speed',
         'pitch_name', 'spin_axis', 'delta_run_exp']

sc_cluster = statcast[cols]

# assign pitcher teams
def pitcher_team(row):

	if row['inning_topbot'] == 'Top':
		return row['home_team']
	
	if row['inning_topbot'] == 'Bot':
		return row['away_team']

sc_cluster['pitcher_team'] = sc_cluster.apply(pitcher_team, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sc_cluster['pitcher_team'] = sc_cluster.apply(pitcher_team, axis=1)


In [6]:
# Fastballs and Four Seam Fastballs are the same thing
# Group pitches into similar moving pitches: Fastballs, Moving Fastballs, Slider/Cutter, Curve and Off Speed

sc_cluster['pitch_type'] = sc_cluster['pitch_type'].replace(['FA'],'FF')

# categorize the pitches according to pitcher handedness and pitch type

conditions = [
    ((sc_cluster['p_throws'] == 'R') & (sc_cluster['pitch_type'] == 'FF')),
    ((sc_cluster['p_throws'] == 'R') & (sc_cluster['pitch_type'] == 'FT') | (sc_cluster['p_throws']=='R') & (sc_cluster['pitch_type']=='SI')),
    ((sc_cluster['p_throws'] == 'R') & (sc_cluster['pitch_type'] == 'SL') | (sc_cluster['p_throws']=='R') & (sc_cluster['pitch_type']=='FC')),
    ((sc_cluster['p_throws'] == 'R') & (sc_cluster['pitch_type'] == 'CU') | (sc_cluster['p_throws']=='R') & (sc_cluster['pitch_type']=='KC')),
    ((sc_cluster['p_throws'] == 'R') & (sc_cluster['pitch_type'] == 'CH') | (sc_cluster['p_throws']=='R') & (sc_cluster['pitch_type']=='FS')),
    ((sc_cluster['p_throws'] == 'L') & (sc_cluster['pitch_type'] == 'FF')),
    ((sc_cluster['p_throws'] == 'L') & (sc_cluster['pitch_type'] == 'FT') | (sc_cluster['p_throws']=='L') & (sc_cluster['pitch_type']=='SI')),
    ((sc_cluster['p_throws'] == 'L') & (sc_cluster['pitch_type'] == 'SL') | (sc_cluster['p_throws']=='L') & (sc_cluster['pitch_type']=='FC')),
    ((sc_cluster['p_throws'] == 'L') & (sc_cluster['pitch_type'] == 'CU') | (sc_cluster['p_throws']=='L') & (sc_cluster['pitch_type']=='KC')),
    ((sc_cluster['p_throws'] == 'L') & (sc_cluster['pitch_type'] == 'CH') | (sc_cluster['p_throws']=='L') & (sc_cluster['pitch_type']=='FS'))
    ]

values = ['rhp_ff', 'rhp_mf', 'rhp_slct', 'rhp_cukc', 'rhp_off', 'lhp_ff', 'lhp_mf', 'lhp_slct', 'lhp_cukc', 'lhp_off']

sc_cluster['cat'] = np.select(conditions, values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sc_cluster['pitch_type'] = sc_cluster['pitch_type'].replace(['FA'],'FF')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sc_cluster['cat'] = np.select(conditions, values)


## Clustering

In [7]:
# create scaler object
scaler = StandardScaler()

# creating a copy to keep original df as is for later
df_clust = sc_cluster.copy()

# features to scale
cols_scale = [
    'release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'spin_axis', 'plate_x', 'plate_z']

# scale the data
scaler = StandardScaler().fit(df_clust[cols_scale])
df_clust[cols_scale] = scaler.transform(df_clust[cols_scale])

In [8]:
# drop nulls

rhp_ff = df_clust.loc[df_clust['cat']=='rhp_ff'].dropna()
rhp_slct = df_clust.loc[df_clust['cat']=='rhp_slct'].dropna()
rhp_off = df_clust.loc[df_clust['cat']=='rhp_off'].dropna()
lhp_ff = df_clust.loc[df_clust['cat']=='lhp_ff'].dropna()
lhp_mf = df_clust.loc[df_clust['cat']=='lhp_mf'].dropna()
lhp_slct = df_clust.loc[df_clust['cat']=='lhp_slct'].dropna()
lhp_cukc = df_clust.loc[df_clust['cat']=='lhp_cukc'].dropna()
lhp_off = df_clust.loc[df_clust['cat']=='lhp_off'].dropna()


df_list = [rhp_ff, rhp_slct, rhp_off, lhp_ff, lhp_mf, lhp_slct, lhp_cukc, lhp_off]

for df in df_list:
    kmeanModel = KMeans(n_clusters=4)
    kmeanModel.fit(df[cols_scale])
    df['cluster_id'] = kmeanModel.labels_
    df['cluster_id'] = df['cluster_id'].astype('str')
    df['cluster_name'] = df['cat'] + '_' + df['cluster_id']

In [9]:
rhp_mf = df_clust.loc[df_clust['cat']=='rhp_mf'].dropna()
rhp_cukc = df_clust.loc[df_clust['cat']=='rhp_cukc'].dropna()

df_list2 = [rhp_mf, rhp_cukc]

for df in df_list2:
    kmeanModel = KMeans(n_clusters=5)
    kmeanModel.fit(df[cols_scale])
    df['cluster_id'] = kmeanModel.labels_
    df['cluster_id'] = df['cluster_id'].astype('str')
    df['cluster_name'] = df['cat'] + '_' + df['cluster_id']

In [10]:
frames = [rhp_mf, rhp_cukc, rhp_ff, rhp_slct, rhp_off, lhp_ff, lhp_mf, lhp_slct, lhp_cukc, lhp_off]

clustering = pd.concat(frames)

In [11]:
# clustering.to_sql('cluster_df', engine, if_exists='replace', 
#                chunksize= 100, method='multi')

In [28]:
from sklearn.decomposition import PCA

pca_df = clustering.copy()
X = pca_df[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'spin_axis', 'plate_x', 'plate_z']]

pca = PCA(n_components=2)
pca.fit(X)

X_trans = pca.transform(X)

print(type(X_trans))
print(X_trans.shape)

<class 'numpy.ndarray'>
(3793167, 2)


In [29]:
pca_df['PCA_0'] = X_trans[:,0]
pca_df['PCA_1'] = X_trans[:,1]

In [30]:
pca_df

Unnamed: 0,player_name,home_team,away_team,inning_topbot,p_throws,pitch_type,game_date,events,pitcher,batter,...,effective_speed,pitch_name,spin_axis,delta_run_exp,pitcher_team,cat,cluster_id,cluster_name,PCA_0,PCA_1
26,"Bauer, Trevor",CLE,CHC,Top,R,FT,2016-11-02 00:00:00,none,545333,518792,...,93.1,2-Seam Fastball,0.881355,0.000,CLE,rhp_mf,2,rhp_mf_2,-1.684733,-0.107517
214,"Hendricks, Kyle",CLE,CHC,Bot,R,SI,2016-11-02 00:00:00,strikeout,543294,547379,...,88.6,Sinker,0.619634,-0.144,CHC,rhp_mf,3,rhp_mf_3,-0.707974,0.715015
215,"Hendricks, Kyle",CLE,CHC,Bot,R,SI,2016-11-02 00:00:00,none,543294,547379,...,88.6,Sinker,0.474234,-0.051,CHC,rhp_mf,0,rhp_mf_0,-1.316211,0.525954
217,"Hendricks, Kyle",CLE,CHC,Bot,R,SI,2016-11-02 00:00:00,none,543294,547379,...,88.2,Sinker,0.692334,-0.033,CHC,rhp_mf,2,rhp_mf_2,-1.513894,0.475072
218,"Hendricks, Kyle",CLE,CHC,Bot,R,SI,2016-11-02 00:00:00,none,543294,547379,...,88.3,Sinker,0.576014,0.028,CHC,rhp_mf,3,rhp_mf_3,-0.937106,0.598650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
720024,"Alexander, Scott",COL,LAD,Bot,L,CH,2021-04-01 00:00:00,none,518397,596115,...,85.0,Changeup,-0.747129,0.000,LAD,lhp_off,0,lhp_off_0,0.726362,-0.157466
720027,"Alexander, Scott",COL,LAD,Bot,L,CH,2021-04-01 00:00:00,none,518397,596115,...,85.8,Changeup,-0.805289,0.037,LAD,lhp_off,2,lhp_off_2,0.981765,0.933933
720029,"Alexander, Scott",COL,LAD,Bot,L,CH,2021-04-01 00:00:00,none,518397,596115,...,85.9,Changeup,-0.790749,0.000,LAD,lhp_off,2,lhp_off_2,1.133615,0.424922
720030,"Alexander, Scott",COL,LAD,Bot,L,CH,2021-04-01 00:00:00,none,518397,596115,...,85.6,Changeup,-0.776209,0.017,LAD,lhp_off,2,lhp_off_2,1.326656,0.792793
