In [1]:
from lib.data_util import *
from lib.documentation import *
from lib.data_models import *
from lib.predictions import Predictions
from lib.visualizations import Visualizations
import polars as pl

### Pre-process data

In [2]:
# Load the Statcast pitch/swing extract (date range as given in the file name).
statcast_path = "data/statcast_pitch_swing_data_20240402_20240630.arrow"
df = read_data(statcast_path)

# Glossary markdown paths handed to build_description_table; whether they are
# read or written is not visible from this notebook. Plot building is disabled.
glossary_definitions = "glossary/data_definitions.md"
glossary_dictionary = "glossary/data_dictionary.md"
build_description_table(df, glossary_definitions, glossary_dictionary, build_plots=False)

In [3]:
# Build one lookup per player role, pairing each role name with its model class.
# The pairs are evaluated in order, so the pitcher lookup is still built first.
pitcher_lookup, batter_lookup = (
  player_lookup(df, role, model)
  for role, model in (("pitcher", Pitcher), ("batter", Batter))
)

There does not appear to be a strong relationship between the percentage of balls that a player swings at and the percentage of balls that a player hits into play.

### What does a batter's swing profile look like?

#### Drop the batters with very low swing counts (30 swings or fewer)

In [7]:
# Per-batter swing profile: min/max/median of bat speed and swing length, the
# number of swings, the overall hit-into-play percentage, and one
# hit-into-play percentage per pitch type.

# A batter needs strictly more than this many swings to be kept; this drops
# the batters with very low swing counts.
MIN_SWING_COUNT = 30

# Series.unique() makes no ordering promise by default. Keeping first-appearance
# order makes the per-pitch column layout reproducible from run to run.
profile_pitch_types = df["pitch_type"].unique(maintain_order=True)
# NOTE(review): the rendered output has a `hit_into_play_for_` column with no
# pitch code, so pitch_type presumably contains a blank value -- confirm whether
# that value should be excluded here.

hit_into_play_expr = pl.col("description").eq("hit_into_play")
bat_speed_expr = pl.col("bat_speed")
swing_length_expr = pl.col("swing_length")

# Share of a batter's swings at each pitch type that were hit into play.
per_pitch_rates = [
  ((hit_into_play_expr & pl.col("pitch_type").eq(pitch)).sum() * 100 /
    pl.col("pitch_type").eq(pitch).sum()).alias(f"hit_into_play_for_{pitch}")
  for pitch in profile_pitch_types
]

batter_profile_df = (
  df.filter(pl.col("swing_event"))
  # maintain_order=True keeps the row order stable across runs as well.
  .group_by("batter", maintain_order=True)
  .agg([
    bat_speed_expr.min().alias("bat_speed_min"),
    bat_speed_expr.max().alias("bat_speed_max"),
    bat_speed_expr.median().alias("bat_speed_median"),
    swing_length_expr.min().alias("swing_length_min"),
    swing_length_expr.max().alias("swing_length_max"),
    swing_length_expr.median().alias("swing_length_median"),
    pl.len().alias("swing_count"),
    ((hit_into_play_expr.sum() / pl.len()) * 100).alias("hit_into_play_percentage"),
    *per_pitch_rates,
  ])
  .filter(pl.col("swing_count") > MIN_SWING_COUNT)
)
batter_profile_df

batter,bat_speed_min,bat_speed_max,bat_speed_median,swing_length_min,swing_length_max,swing_length_median,swing_count,hit_into_play_percentage,hit_into_play_for_FF,hit_into_play_for_CH,hit_into_play_for_SI,hit_into_play_for_KC,hit_into_play_for_SL,hit_into_play_for_ST,hit_into_play_for_FC,hit_into_play_for_CU,hit_into_play_for_SV,hit_into_play_for_FS,hit_into_play_for_PO,hit_into_play_for_FO,hit_into_play_for_FA,hit_into_play_for_KN,hit_into_play_for_EP,hit_into_play_for_CS,hit_into_play_for_
cat,f64,f64,f64,f64,f64,f64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""514888""",6.678485,83.816981,68.493427,2.19618,8.96172,7.90381,580,38.62069,42.222222,32.5,57.446809,25.0,37.226277,21.621622,36.666667,28.571429,66.666667,14.285714,,,,,,,
"""592273""",32.234789,84.34839,72.095169,5.08812,8.9987,7.29083,186,40.322581,25.0,33.333333,55.813953,,50.0,50.0,33.333333,45.454545,,33.333333,,,,,,,
"""663967""",6.370414,73.822905,67.737487,0.83355,8.26926,7.26559,33,39.393939,50.0,75.0,50.0,,14.285714,,20.0,33.333333,,50.0,,,,,,,
"""677587""",3.926271,82.977541,69.0511,0.47298,8.74293,7.16779,411,36.009732,28.346457,40.0,40.384615,50.0,35.714286,27.777778,41.304348,34.285714,,56.25,,,,,,,
"""693049""",38.253033,81.969048,74.519835,4.36617,9.02815,7.35279,74,31.081081,16.666667,66.666667,14.285714,0.0,17.647059,0.0,50.0,60.0,,42.857143,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""624512""",11.462052,79.437998,68.390135,1.79475,9.85368,8.01341,207,35.2657,28.169014,35.714286,52.380952,33.333333,29.411765,57.142857,45.0,36.363636,,38.461538,,,,0.0,,,
"""608596""",62.97311,81.050145,72.080217,5.99858,9.30278,7.84834,51,27.45098,11.111111,0.0,25.0,50.0,75.0,60.0,75.0,0.0,,,,,0.0,,,,
"""650559""",31.652637,82.567982,71.978135,3.9136,9.81733,7.682045,650,33.230769,27.638191,42.465753,40.206186,33.333333,32.55814,24.444444,38.461538,37.037037,0.0,29.411765,,,,33.333333,,,
"""669208""",6.388012,77.640917,70.051808,0.97914,8.28536,6.60969,103,33.009709,23.529412,14.285714,41.176471,0.0,52.173913,25.0,40.0,100.0,,,,,,,,,


### Is there an optimal swing speed to hit a ball into play?

#### There does appear to be a correlation between a batter's swing speed and the probability that they will hit a ball into play.

In [13]:
# Bat speed against hit-into-play percentage. The string argument is presumably
# where the figure is saved -- confirm in lib.visualizations.
bat_speed_viz = Visualizations(df)
bat_speed_viz.bat_speed_to_hit_into_play_percentage("visualizations/bat_speed")

### Is there an optimal swing length to hit a ball into play?

#### There is a very clear sweet spot.

In [14]:
# Swing length against hit-into-play percentage. The string argument is
# presumably where the figure is saved -- confirm in lib.visualizations.
swing_length_viz = Visualizations(df)
swing_length_viz.swing_length_to_hit_into_play_percentage("visualizations/swing_length")

### Can we predict an outcome based on bat speed and swing length?

In [4]:
# Keep only the rows flagged as swings; the hit rate and the models below both
# work from this subset.
swings = df.filter(pl.col("swing_event"))

# Baseline: share of swings whose hit_into_play flag is set.
hit_swings = swings.filter(pl.col("hit_into_play"))
hit_rate_percent = hit_swings.height * 100 / swings.height
print(f"The percentage of swings that result in a hit: {hit_rate_percent}")

# The open file is passed to Predictions as its `output`; it is closed when the
# with-block exits.
with open("outputs/bat_speed_and_swing_length_hit_into_play_predictions.txt", "w") as predictions_file:
  predictor = Predictions(
    swings,
    ["bat_speed", "swing_length"],
    "hit_into_play",
    random_seed=0,
    output=predictions_file,
  )
  predictor.test_models()

The percentage of swings that result in a hit: 36.56650322183144
