In [4]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pybaseball
from pybaseball import statcast, playerid_lookup, statcast_pitcher, statcast_batter
from datetime import datetime

In [5]:
def calculate_cumulative_batting_average(data):
    """Add per-batter running hit, at-bat and batting-average columns.

    Rows are ordered by ``batter`` and then ``game_date``, and the running
    totals restart at zero for every batter, so each row reflects only that
    batter's history up to and including the row.

    A row counts as an at-bat when its ``events`` value is non-null, and as a
    hit when ``events`` is one of single / double / triple / home_run.
    NOTE(review): every non-null event is counted as an at-bat; if ``events``
    can hold outcomes that are not official at-bats (walks, sacrifices, ...),
    the denominator is closer to plate appearances -- confirm intent.

    Args:
        data: DataFrame with ``game_date``, ``batter`` and ``events`` columns.
            It is not modified.

    Returns:
        A new DataFrame sorted by batter and game date, with ``game_date``
        converted to datetime and three added columns: ``cumulative_hits``,
        ``cumulative_at_bats`` and ``cumulative_batting_average`` (0 while the
        batter has no at-bats yet).
    """
    hit_events = ['single', 'double', 'triple', 'home_run']

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    data = data.assign(game_date=pd.to_datetime(data['game_date']))
    data = data.sort_values(by=['batter', 'game_date'])

    # Build 0/1 flags on a fresh positional index: the incoming index may
    # contain duplicate labels, and positional arrays avoid any realignment.
    flags = pd.DataFrame({
        'batter': data['batter'].to_numpy(),
        'hit': data['events'].isin(hit_events).to_numpy(dtype='int64'),
        'at_bat': data['events'].notna().to_numpy(dtype='int64'),
    })

    # Grouped cumsum restarts the running totals for every batter.
    running = flags.groupby('batter', sort=False)[['hit', 'at_bat']].cumsum()

    # 0 / 0 yields NaN; report 0 until the batter has at least one at-bat.
    average = (running['hit'] / running['at_bat']).where(running['at_bat'] > 0, 0.0)

    # Add the cumulative statistics to the DataFrame (positionally).
    data['cumulative_hits'] = running['hit'].to_numpy()
    data['cumulative_at_bats'] = running['at_bat'].to_numpy()
    data['cumulative_batting_average'] = average.to_numpy()

    return data

def calculate_cumulative_pitcher_metrics(data):
    """Add per-pitcher running hits-allowed and batters-faced columns.

    Rows are ordered by ``pitcher`` and then ``game_date``, and the running
    totals restart at zero for every pitcher, so each row reflects only that
    pitcher's history up to and including the row.

    A row counts as a batter faced when its ``events`` value is non-null, and
    as a hit allowed when ``events`` is one of single / double / triple /
    home_run.

    Args:
        data: DataFrame with ``game_date``, ``pitcher`` and ``events``
            columns. It is not modified.

    Returns:
        A new DataFrame sorted by pitcher and game date, with ``game_date``
        converted to datetime and three added columns:
        ``cumulative_hits_allowed``, ``cumulative_batters_faced`` and
        ``cumulative_pitcher_metric`` (hits allowed / batters faced, 0 while
        the pitcher has faced no batter yet).
    """
    hit_events = ['single', 'double', 'triple', 'home_run']

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    data = data.assign(game_date=pd.to_datetime(data['game_date']))
    data = data.sort_values(by=['pitcher', 'game_date'])

    # Build 0/1 flags on a fresh positional index: the incoming index may
    # contain duplicate labels, and positional arrays avoid any realignment.
    flags = pd.DataFrame({
        'pitcher': data['pitcher'].to_numpy(),
        'hit_allowed': data['events'].isin(hit_events).to_numpy(dtype='int64'),
        'batter_faced': data['events'].notna().to_numpy(dtype='int64'),
    })

    # Grouped cumsum restarts the running totals for every pitcher.
    running = flags.groupby('pitcher', sort=False)[['hit_allowed', 'batter_faced']].cumsum()

    # 0 / 0 yields NaN; report 0 until the pitcher has faced a batter.
    metric = (running['hit_allowed'] / running['batter_faced']).where(
        running['batter_faced'] > 0, 0.0
    )

    # Add the cumulative statistics to the DataFrame (positionally).
    data['cumulative_hits_allowed'] = running['hit_allowed'].to_numpy()
    data['cumulative_batters_faced'] = running['batter_faced'].to_numpy()
    data['cumulative_pitcher_metric'] = metric.to_numpy()

    return data

def fetch_and_save_data(start_date, end_date):
    """Query Statcast for a date range, add cumulative batting stats, save to CSV.

    The result of ``statcast(start_date, end_date)`` is passed through
    ``calculate_cumulative_batting_average`` and written, without the index,
    to ``batting_data/batting_data_with_cumulative_avg.csv``. A short preview
    and the column list are printed afterwards.

    Args:
        start_date: First date of the query, forwarded to ``statcast``.
        end_date: Last date of the query, forwarded to ``statcast``.

    Returns:
        None. The output is the CSV file and the printed summary.
    """
    # Create the output directory; exist_ok avoids the check-then-create race
    # and makes repeated runs of the cell safe.
    folder_name = 'batting_data'
    os.makedirs(folder_name, exist_ok=True)

    # Query Statcast data (with pybaseball's cache switched on first)
    pybaseball.cache.enable()
    data = statcast(start_date, end_date)

    # Convert game_date to datetime
    data['game_date'] = pd.to_datetime(data['game_date'])

    # Calculate cumulative batting averages
    data_with_cumulative_stats = calculate_cumulative_batting_average(data)

    # Define the full path for the CSV file
    file_path = os.path.join(folder_name, "batting_data_with_cumulative_avg.csv")

    # Save to CSV
    data_with_cumulative_stats.to_csv(file_path, index=False)

    print("********************************************")
    print(f"Matchup data with cumulative batting averages saved to {file_path}")
    print("********************************************")
    print(data_with_cumulative_stats.head())
    print("********************************************")
    print(data_with_cumulative_stats.columns)

# Run the fetch from 2024-07-16 through today's date (formatted as YYYY-MM-DD).
fetch_and_save_data('2024-07-16', datetime.now().date().isoformat())

This is a large query, it may take a moment to complete


100%|██████████| 65/65 [00:00<00:00, 72.03it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


********************************************
Matchup data with cumulative batting averages saved to batting_data/batting_data_with_cumulative_avg.csv
********************************************
     pitch_type  game_date  release_speed  release_pos_x  release_pos_z  \
2508         FF 2024-07-19           96.4          -2.41           5.91   
2551         FS 2024-07-19           87.3          -2.81           5.78   
511          CH 2024-07-19           82.8          -2.27           6.04   
527          FF 2024-07-19           93.8           -2.2           6.14   
547          FF 2024-07-19           93.0           -2.3           6.15   

        player_name  batter  pitcher     events    description  ...  \
2508    Smith, Cade  444482   671922  field_out  hit_into_play  ...   
2551    Smith, Cade  444482   671922       None           ball  ...   
511   Bibee, Tanner  444482   676440  field_out  hit_into_play  ...   
527   Bibee, Tanner  444482   676440       None  called_strike  ...   