In [None]:
# pip install git+https://github.com/Jensen-holm/statcast-era-pitches.git
import statcast_pitches
import polars as pl

In [None]:
# Load all pitches from 2015 to the present.
# The result is kept lazy (hence the `_lf` suffix); later cells call
# `.collect()` on it to actually execute a query.
pitches_lf = statcast_pitches.load()

In [None]:
# Collect the LazyFrame to execute it and materialize the full data set
# as a DataFrame for inspection.
pitches_df = pitches_lf.collect()

In [None]:
# Show the first rows of the collected frame, followed by the complete
# list of column names (each on its own line of output).
print(pitches_df.head(), pitches_df.columns, sep="\n")

shape: (5, 113)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ pitch_typ ┆ game_date ┆ release_s ┆ release_p ┆ … ┆ api_break ┆ api_break ┆ api_break ┆ arm_angl │
│ e         ┆ ---       ┆ peed      ┆ os_x      ┆   ┆ _z_with_g ┆ _x_arm    ┆ _x_batter ┆ e        │
│ ---       ┆ datetime[ ┆ ---       ┆ ---       ┆   ┆ ravity    ┆ ---       ┆ _in       ┆ ---      │
│ str       ┆ μs]       ┆ f64       ┆ f64       ┆   ┆ ---       ┆ f64       ┆ ---       ┆ f64      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆ f64       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ KC        ┆ 2024-10-3 ┆ 77.5      ┆ -1.11     ┆ … ┆ 5.23      ┆ -1.08     ┆ 1.08      ┆ 53.2     │
│           ┆ 0         ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│           ┆ 00:00:00  ┆           ┆           ┆   ┆           ┆          

In [None]:
# Count rows (pitches) for each calendar year found in 'game_date'.
yearly_counts = (
    pitches_lf
    # Derive the year from the game date so it can serve as the grouping key.
    .with_columns(pl.col("game_date").dt.year().alias("year"))
    .group_by("year")
    # pl.len() is the supported way to count rows per group; calling
    # pl.count() with no arguments is deprecated in favour of it.
    .agg(pl.len().alias("row_count"))
    .sort("year")  # present the years in ascending order
    .collect()  # execute the lazy query and return a DataFrame
)

# Print the yearly counts
print(yearly_counts)

shape: (10, 2)
┌──────┬───────────┐
│ year ┆ row_count │
│ ---  ┆ ---       │
│ i32  ┆ u32       │
╞══════╪═══════════╡
│ 2015 ┆ 712839    │
│ 2016 ┆ 726273    │
│ 2017 ┆ 732477    │
│ 2018 ┆ 731207    │
│ 2019 ┆ 760498    │
│ 2020 ┆ 279660    │
│ 2021 ┆ 763191    │
│ 2022 ┆ 773618    │
│ 2023 ┆ 771057    │
│ 2024 ┆ 757713    │
└──────┴───────────┘


In [None]:
# Restrict the data to pitches that were hit into play and keep only the
# outcome and batted-ball columns used by the following cells.
dataset = (
    pitches_lf
    .filter(pl.col("description").eq("hit_into_play"))
    .select(["events", "description", "bb_type", "launch_speed", "launch_angle"])
    .collect()
)
print(dataset)

shape: (1_242_704, 5)
┌───────────┬───────────────┬─────────────┬──────────────┬──────────────┐
│ events    ┆ description   ┆ bb_type     ┆ launch_speed ┆ launch_angle │
│ ---       ┆ ---           ┆ ---         ┆ ---          ┆ ---          │
│ str       ┆ str           ┆ str         ┆ f64          ┆ f64          │
╞═══════════╪═══════════════╪═════════════╪══════════════╪══════════════╡
│ field_out ┆ hit_into_play ┆ ground_ball ┆ 92.4         ┆ -13.0        │
│ field_out ┆ hit_into_play ┆ ground_ball ┆ 102.7        ┆ 0.0          │
│ field_out ┆ hit_into_play ┆ fly_ball    ┆ 103.3        ┆ 23.0         │
│ single    ┆ hit_into_play ┆ ground_ball ┆ 99.3         ┆ 1.0          │
│ field_out ┆ hit_into_play ┆ fly_ball    ┆ 94.5         ┆ 58.0         │
│ …         ┆ …             ┆ …           ┆ …            ┆ …            │
│ field_out ┆ hit_into_play ┆ ground_ball ┆ null         ┆ null         │
│ double    ┆ hit_into_play ┆ line_drive  ┆ null         ┆ null         │
│ single    ┆ hi

In [None]:
# List the column names currently present in `dataset`.
print(dataset.columns)

['events', 'description', 'bb_type', 'launch_speed', 'launch_angle']


In [None]:
# Print each unique value of the 'events' column, one per line
for event in dataset["events"].unique().to_list():
    print(event)

sac_bunt_double_play
fielders_choice_out
game_advisory
None
grounded_into_double_play
ejection
single
double
double_play
sac_fly
sac_bunt
triple_play
field_error
home_run
triple
force_out
fielders_choice
sac_fly_double_play
catcher_interf
field_out


In [None]:
# Add the 'target' label column, derived from the play outcome ('events')
# and the batted-ball type ('bb_type').
hits = ["single", "double", "triple", "home_run"]

dataset = dataset.with_columns(
    # Hits are labelled by the kind of hit.
    pl.when(pl.col("events") == "single")
        .then(pl.lit("Single"))
      .when(pl.col("events") == "double")
        .then(pl.lit("Double"))
      .when(pl.col("events") == "triple")
        .then(pl.lit("Triple"))
      .when(pl.col("events") == "home_run")
        .then(pl.lit("Homerun"))
      # Anything that is not a hit is labelled by its batted-ball type.
      .when((~pl.col("events").is_in(hits)) & (pl.col("bb_type") == "ground_ball"))
        .then(pl.lit("Groundoutable"))
      .when((~pl.col("events").is_in(hits)) & (pl.col("bb_type") == "fly_ball"))
        .then(pl.lit("Flyoutable"))
      .when((~pl.col("events").is_in(hits)) & (pl.col("bb_type") == "line_drive"))
        .then(pl.lit("Lineoutable"))
      # Every remaining combination falls into a catch-all bucket.
      .otherwise(pl.lit("Other"))
      .alias("target")
)

# Display the modified dataset
print(dataset)

# No `.collect()` here: `dataset` was already collected into an eager
# DataFrame when it was built from `pitches_lf`, and DataFrame has no
# `collect` method (calling it raises AttributeError).

shape: (1_242_704, 6)
┌───────────┬───────────────┬─────────────┬──────────────┬──────────────┬───────────────┐
│ events    ┆ description   ┆ bb_type     ┆ launch_speed ┆ launch_angle ┆ target        │
│ ---       ┆ ---           ┆ ---         ┆ ---          ┆ ---          ┆ ---           │
│ str       ┆ str           ┆ str         ┆ f64          ┆ f64          ┆ str           │
╞═══════════╪═══════════════╪═════════════╪══════════════╪══════════════╪═══════════════╡
│ field_out ┆ hit_into_play ┆ ground_ball ┆ 92.4         ┆ -13.0        ┆ Groundoutable │
│ field_out ┆ hit_into_play ┆ ground_ball ┆ 102.7        ┆ 0.0          ┆ Groundoutable │
│ field_out ┆ hit_into_play ┆ fly_ball    ┆ 103.3        ┆ 23.0         ┆ Flyoutable    │
│ single    ┆ hit_into_play ┆ ground_ball ┆ 99.3         ┆ 1.0          ┆ Single        │
│ field_out ┆ hit_into_play ┆ fly_ball    ┆ 94.5         ┆ 58.0         ┆ Flyoutable    │
│ …         ┆ …             ┆ …           ┆ …            ┆ …            ┆ …   

In [None]:
# Preview the first entries of the newly added 'target' column.
print(dataset["target"].head())

shape: (10,)
Series: 'target' [str]
[
	"Groundoutable"
	"Groundoutable"
	"Flyoutable"
	"Single"
	"Flyoutable"
	"Double"
	"Groundoutable"
	"Flyoutable"
	"Flyoutable"
	"Single"
]


In [None]:
# Build the final modelling table: discard the catch-all "Other" label and
# any row missing either launch measurement, then keep the two features
# plus the label.
dataset = (
    dataset
    .filter(
        (pl.col("target") != "Other")
        & pl.col("launch_speed").is_not_null()
        & pl.col("launch_angle").is_not_null()
    )
    .select(["launch_speed", "launch_angle", "target"])
)
print(dataset)

# Report how many rows remain after filtering.
row_count = dataset.height
print(f"Number of rows: {row_count}")

shape: (1_114_608, 3)
┌──────────────┬──────────────┬───────────────┐
│ launch_speed ┆ launch_angle ┆ target        │
│ ---          ┆ ---          ┆ ---           │
│ f64          ┆ f64          ┆ str           │
╞══════════════╪══════════════╪═══════════════╡
│ 92.4         ┆ -13.0        ┆ Groundoutable │
│ 102.7        ┆ 0.0          ┆ Groundoutable │
│ 103.3        ┆ 23.0         ┆ Flyoutable    │
│ 99.3         ┆ 1.0          ┆ Single        │
│ 94.5         ┆ 58.0         ┆ Flyoutable    │
│ …            ┆ …            ┆ …             │
│ 88.5         ┆ -12.0        ┆ Groundoutable │
│ 76.4         ┆ 50.0         ┆ Double        │
│ 99.2         ┆ 31.0         ┆ Flyoutable    │
│ 85.0         ┆ 25.0         ┆ Double        │
│ 83.6         ┆ 13.0         ┆ Single        │
└──────────────┴──────────────┴───────────────┘
Number of rows: 1114608


In [None]:
# Write the filtered dataset to "dataset.csv" (a relative path).
dataset.write_csv("dataset.csv")