# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the two season CSV files (statcast21.csv, statcast22.csv) into DataFrames.
data2021, data2022 = (
    pd.read_csv(csv_path) for csv_path in ('statcast21.csv', 'statcast22.csv')
)

# Testing: NA Handling

In [3]:
# Column names kept for this analysis (order is preserved when selecting).
features = [
    'delta_run_exp',
    'player_name', 'p_throws', 'stand',
    'pitch_type', 'pitch_name', 'pitch_number',
    'home_team', 'game_date', 'inning',
    'balls', 'strikes', 'outs_when_up',
    'on_3b', 'on_2b', 'on_1b',
    'release_speed', 'release_extension', 'effective_speed', 'release_spin_rate',
    'release_pos_x', 'release_pos_y', 'release_pos_z',
    'spin_axis', 'pfx_x', 'pfx_z',
    'plate_x', 'plate_z', 'zone',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'sz_top', 'sz_bot',
]

# inferred_axis: 180 / pi * atan(pfx_z / pfx_x) + 90, plus 180 degrees where pfx_x < 0
# axis_diff: spin_axis - inferred_axis

In [4]:
# Numeric columns whose missing values get filled with a mean.
fill_cols = [
    'release_speed',
    'release_extension',
    'effective_speed',
    'release_spin_rate',
    'release_pos_x',
    'release_pos_y',
    'release_pos_z',
    'spin_axis',
]

In [5]:
# Fill NaN values in the given columns with that column's mean over the frame passed in.
def fillna_by_pitcher(df, cols):
    """Return a copy of ``df`` with NaNs in ``cols`` replaced by each column's mean.

    The mean is taken over whatever rows ``df`` contains; this function does no
    grouping itself. To fill per group, call it once per group (for example
    through ``groupby(...).apply``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is not modified.
    cols : iterable of column labels
        Numeric columns to fill. A column that is entirely NaN has a NaN mean
        and is therefore left unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller's frame is never mutated.
    filled = df.copy()
    for col in cols:
        # Plain assignment instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary Series and does
        # nothing under pandas Copy-on-Write.
        filled[col] = filled[col].fillna(filled[col].mean())
    return filled

In [6]:
# Restrict the 2021 season frame to the analysis columns, then display it.
test = data2021.loc[:, features]
test

Unnamed: 0,delta_run_exp,player_name,p_throws,stand,pitch_type,pitch_name,pitch_number,home_team,game_date,inning,...,plate_z,zone,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot
0,-0.073,"Smith, Will",L,R,FF,4-Seam Fastball,4,ATL,2021-10-03,9,...,2.83,1.0,-6.833043,-134.166485,-7.361843,9.708393,26.562803,-14.083224,3.32,1.51
1,-0.027,"Smith, Will",L,R,SL,Slider,3,ATL,2021-10-03,9,...,2.62,4.0,-3.700232,-117.430885,-3.266842,-6.531123,19.793390,-27.369114,3.32,1.51
2,-0.020,"Smith, Will",L,R,CU,Curveball,2,ATL,2021-10-03,9,...,2.46,5.0,-1.977183,-109.901781,-1.155694,-4.872924,20.602334,-36.262184,3.32,1.51
3,0.016,"Smith, Will",L,R,CU,Curveball,1,ATL,2021-10-03,9,...,3.89,12.0,2.375830,-109.205830,2.277617,-5.902656,19.427562,-38.284747,3.31,1.48
4,-0.189,"Smith, Will",L,L,FF,4-Seam Fastball,2,ATL,2021-10-03,9,...,2.80,4.0,-5.868477,-132.500539,-6.486796,8.700586,30.117690,-15.941174,3.49,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708615,-0.078,"Márquez, Germán",R,R,FF,4-Seam Fastball,5,COL,2021-04-01,1,...,3.09,2.0,5.561971,-139.787362,-4.509214,-3.312501,24.833759,-20.112617,3.22,1.55
708616,0.113,"Márquez, Germán",R,R,FF,4-Seam Fastball,4,COL,2021-04-01,1,...,3.95,12.0,5.875569,-139.018112,-1.839219,-1.810341,24.856947,-23.067890,3.29,1.58
708617,0.052,"Márquez, Germán",R,R,FF,4-Seam Fastball,3,COL,2021-04-01,1,...,3.65,11.0,3.953292,-140.092280,-3.371124,-6.331036,25.767684,-20.177597,3.34,1.64
708618,-0.049,"Márquez, Germán",R,R,FF,4-Seam Fastball,2,COL,2021-04-01,1,...,2.60,5.0,6.352185,-139.293012,-5.428763,-4.405496,25.441128,-19.766641,3.29,1.49


In [7]:
# Fill NaNs in `fill_cols` with the mean of the same (pitcher, pitch type) group.
# `transform('mean')` returns the group mean for every row on the original index,
# so the result keeps the same rows, columns and index as `test`. This avoids
# `groupby.apply`, whose index layout and handling of the grouping columns
# differ between pandas versions.
# Rows with a missing player_name or pitch_type belong to no group and stay unfilled.
group_means = test.groupby(['player_name', 'pitch_type'])[fill_cols].transform('mean')
test_fill = test.copy()
test_fill[fill_cols] = test[fill_cols].fillna(group_means)
test_fill

Unnamed: 0,delta_run_exp,player_name,p_throws,stand,pitch_type,pitch_name,pitch_number,home_team,game_date,inning,...,plate_z,zone,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot
0,-0.073,"Smith, Will",L,R,FF,4-Seam Fastball,4,ATL,2021-10-03,9,...,2.83,1.0,-6.833043,-134.166485,-7.361843,9.708393,26.562803,-14.083224,3.32,1.51
1,-0.027,"Smith, Will",L,R,SL,Slider,3,ATL,2021-10-03,9,...,2.62,4.0,-3.700232,-117.430885,-3.266842,-6.531123,19.793390,-27.369114,3.32,1.51
2,-0.020,"Smith, Will",L,R,CU,Curveball,2,ATL,2021-10-03,9,...,2.46,5.0,-1.977183,-109.901781,-1.155694,-4.872924,20.602334,-36.262184,3.32,1.51
3,0.016,"Smith, Will",L,R,CU,Curveball,1,ATL,2021-10-03,9,...,3.89,12.0,2.375830,-109.205830,2.277617,-5.902656,19.427562,-38.284747,3.31,1.48
4,-0.189,"Smith, Will",L,L,FF,4-Seam Fastball,2,ATL,2021-10-03,9,...,2.80,4.0,-5.868477,-132.500539,-6.486796,8.700586,30.117690,-15.941174,3.49,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708615,-0.078,"Márquez, Germán",R,R,FF,4-Seam Fastball,5,COL,2021-04-01,1,...,3.09,2.0,5.561971,-139.787362,-4.509214,-3.312501,24.833759,-20.112617,3.22,1.55
708616,0.113,"Márquez, Germán",R,R,FF,4-Seam Fastball,4,COL,2021-04-01,1,...,3.95,12.0,5.875569,-139.018112,-1.839219,-1.810341,24.856947,-23.067890,3.29,1.58
708617,0.052,"Márquez, Germán",R,R,FF,4-Seam Fastball,3,COL,2021-04-01,1,...,3.65,11.0,3.953292,-140.092280,-3.371124,-6.331036,25.767684,-20.177597,3.34,1.64
708618,-0.049,"Márquez, Germán",R,R,FF,4-Seam Fastball,2,COL,2021-04-01,1,...,2.60,5.0,6.352185,-139.293012,-5.428763,-4.405496,25.441128,-19.766641,3.29,1.49


In [8]:
# `test` is the 2021 data before any NA filling.
# Per-pitch-type means of the fill columns for Spencer Strider; these are the
# values the NaN fill is expected to reproduce for his pitches.
# The numeric columns are selected before aggregating because a mean over the
# whole frame would also hit the string columns (player_name, game_date, ...),
# which raises TypeError on pandas >= 2.0.
(
    test.loc[test['player_name'] == 'Strider, Spencer']
    .groupby('pitch_type')[fill_cols]
    .mean()
)

Unnamed: 0_level_0,release_speed,release_extension,effective_speed,release_spin_rate,release_pos_x,release_pos_y,release_pos_z,spin_axis
pitch_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
FF,97.882759,6.613793,98.465517,2356.034483,-1.715667,53.886667,5.860667,200.766667
SL,85.475,6.5,85.95,2107.25,-1.77625,53.98,5.8375,165.375


In [9]:
# One Strider pitch from the unfilled `test` frame (before averages are filled in).
test.loc[
    (test['player_name'] == 'Strider, Spencer')
    & (test['game_date'] == '2021-10-03')
    & (test['delta_run_exp'] == -0.215),
    ['delta_run_exp', 'release_speed', 'release_extension', 'effective_speed',
     'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_axis'],
]

Unnamed: 0,delta_run_exp,release_speed,release_extension,effective_speed,release_spin_rate,release_pos_x,release_pos_y,release_pos_z,spin_axis
115,-0.215,,,,,-1.88,53.93,5.8,205.0


In [10]:
# `test_fill` is the 2021 data with NA values filled.
# Same Strider pitch as selected from `test`, now read from the filled frame.
test_fill.loc[
    (test_fill['player_name'] == 'Strider, Spencer')
    & (test_fill['game_date'] == '2021-10-03')
    & (test_fill['delta_run_exp'] == -0.215),
    ['delta_run_exp', 'release_speed', 'release_extension', 'effective_speed',
     'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_axis'],
]

Unnamed: 0,delta_run_exp,release_speed,release_extension,effective_speed,release_spin_rate,release_pos_x,release_pos_y,release_pos_z,spin_axis
115,-0.215,97.882759,6.613793,98.465517,2356.034483,-1.88,53.93,5.8,205.0
