In [1]:
import time
import pandas as pd
import polars as pl
import numpy as np
import pickle as pkl
import sys
import json
import re
import pyarrow

from scipy import stats
from collections import defaultdict
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from datetime import timedelta
from IPython.display import clear_output
from pathlib import Path
from datetime import datetime as dt

from multimodal_communication import cloud_functions as cf
from mlb_simulation.build_datasets import constants
from mlb_simulation.build_datasets.utils_polars import (_correct_home_away_swap,
                   _get_wind_direction,
                   _convert_wind_direction,
                   _pull_full_weather,
                   _segregate_plays_by_pitbat_combo
)
from mlb_simulation.build_datasets.dataset_builder_polars import DatasetBuilder

In [2]:
# Load the pickled 2017 raw-pitch file into `df`.
# NOTE(review): pickle can execute arbitrary code on load — only open files
# you produced yourself. The deep relative path is machine-specific.
with Path('../../../../../../Documents/MLB-Data/raw_pitches/pitches_2017.pkl').open('rb') as pitches_file:
    df = pkl.load(pitches_file)

In [5]:
class DatasetBuilder():
    """
    Cleans raw pitch data and assembles it into a training dataset.

    NOTE(review): this notebook-local class shadows the ``DatasetBuilder``
    imported from ``mlb_simulation.build_datasets.dataset_builder_polars`` in
    the import cell.

    NOTE(review): ``build_training_dataset`` relies on
    ``build_neutralization_coefficient_dictionaries`` and ``_make_final_dataset``,
    neither of which is defined in this class body yet.
    """

    def __init__(self, rolling_windows=(75, 504), verbose=False,  gcloud_upload=False,
                 gcloud_upload_path='', local_save=False, local_save_dir_path=''):
        """
        Store the configuration used by the build methods.

        Args:
            rolling_windows (iterable): Window sizes; also joined with '_' to
                name saved artifacts. Defaults to 75 and 504.
            verbose (bool): Print progress messages while cleaning.
            gcloud_upload (bool): Upload built artifacts through ``cf.CloudHelper``.
            gcloud_upload_path (str): Stored on the instance; not read by any
                method currently defined in this class.
            local_save (bool): Pickle built artifacts to local disk.
            local_save_dir_path (str): Directory that receives the final dataset
                when ``local_save`` is True.
        """
        # Copy into a fresh list so instances never share (or mutate) the default.
        self.rolling_windows = list(rolling_windows)
        self.verbose = verbose
        self.gcloud_upload = gcloud_upload
        self.gcloud_upload_path = gcloud_upload_path
        self.local_save = local_save
        self.local_save_dir_path = local_save_dir_path

    @staticmethod
    def _save_pickle(obj, dir_path, filename):
        """Pickle ``obj`` to ``dir_path/filename``, creating the directory if needed."""
        target_dir = Path(dir_path)
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(target_dir / filename, 'wb') as f:
            pkl.dump(obj, f)

    def build_training_dataset(self, raw_pitches, save_coefficients=False, coef_save_path=''):
        """
        Cleans raw pitch data, generates neutralization coefficients, and builds a final
        machine readable dataset.

        Args:
            raw_pitches: Raw pitch data, passed unchanged to ``clean_raw_pitches``.
            save_coefficients (bool): Whether to save neutralization coefficients
                (to the cloud and/or locally, depending on the instance flags).
            coef_save_path (str): Directory for the locally saved coefficients.
                Required when both ``save_coefficients`` and ``self.local_save``
                are True.

        Returns:
            Whatever ``_make_final_dataset`` returns for the cleaned data and
            coefficient dictionaries.

        Raises:
            ValueError: If coefficients should be saved locally but no
                ``coef_save_path`` was given.

        FUNCTION CONNECTIONS:
        ----------------------
        Calls On: clean_raw_pitches()
                  build_neutralization_coefficient_dictionaries()
                  _make_final_dataset()
        """

        # Clean raw pitches and return a cleaned pitches DataFrame
        cleaned_data = self.clean_raw_pitches(raw_pitches)

        # Create a neutralization coefficients dictionary
        coef_dicts = self.build_neutralization_coefficient_dictionaries(cleaned_data)

        # Format the windows once, up front: both the coefficient and the
        # final-dataset names use it, and the windows are ints so they must be
        # converted before joining.
        windows = '_'.join(str(window) for window in self.rolling_windows)

        if save_coefficients:
            if self.gcloud_upload:
                cf.CloudHelper(obj=coef_dicts).upload_to_cloud(
                    'simulation_training_data', f"neutralization_coefficients_dict_{windows}")
            if self.local_save:
                if not coef_save_path: # Ensure a path is given for a local save
                    raise ValueError('In order to save the coefficients locally, a path to a directory must be provided')

                self._save_pickle(
                    coef_dicts, coef_save_path,
                    f'neutralization_coefficients_dict_{windows}.pkl')

        # Build the final dataset
        final_dataset = self._make_final_dataset(cleaned_data, coef_dicts)
        if self.gcloud_upload:
            cf.CloudHelper(obj=final_dataset).upload_to_cloud(
                'simulation_training_data', f"Final Datasets/final_dataset_{windows}")
        if self.local_save:
            self._save_pickle(
                final_dataset, self.local_save_dir_path,
                f'daily_stats_df_updated_{dt.today().strftime("%Y-%m-%d")}.pkl')

        return final_dataset

    ######################################################################################
    # Clean Pitch Data
    ######################################################################################
    def clean_raw_pitches(self, raw_pitches_df: "pd.DataFrame") -> "pl.DataFrame":
        """
        Cleans a DataFrame of raw pitch data: keeps regular-season pitches that ended
        in one of the relevant play types, with ``game_date`` formatted as a string.

        Parameters:
            raw_pitches_df (DataFrame): A pandas DataFrame of uncleaned pitch data.
                Assumes ``game_date`` is a datetime column (it is formatted with
                ``.dt.strftime``) — TODO confirm against the loader.

        Returns:
            pl.DataFrame: The collected frame of relevant plays, with every
            original column, sorted by game_date, inning, inning_topbot and
            at_bat_number.

        NOTE(review): the column-trimmed ``final_plays`` pipeline below is built
        but not returned yet; weather/ballpark attachment and the split by
        batter-pitcher handedness are not implemented in this method.
        """

        if self.verbose:
            print("Cleaning Data")

        # Convert the raw_pitches file to a LazyFrame
        raw_pitches_df = pl.from_pandas(raw_pitches_df).lazy()

        # Filter down to only regular season games
        raw_pitches_df = raw_pitches_df.filter(pl.col('game_type') == 'R')

        # NOTE(review): the home/away swap correction (_correct_home_away_swap)
        # is currently disabled.

        # Convert the datetime game_date to a string formatted as YYYY-MM-DD, and sort
        # the df on the column to make sure everything is in order
        raw_pitches_df = raw_pitches_df.with_columns(
            pl.col('game_date').dt.strftime('%Y-%m-%d').alias('game_date')
        ).sort(by=["game_date", "inning", "inning_topbot", "at_bat_number"],
               descending=[False, False, False, False])

        # Filter all pitches to only those with an event
        raw_plays = raw_pitches_df.drop_nulls(subset=['events'])

        # Filter all pitches with an event to only those types we care about
        relevant_plays = raw_plays.filter(
            pl.col('events').is_in(constants.RELEVANT_PLAY_TYPES)
        )

        # Trim to the columns we care about, add the grouped 'play_type' label and a
        # constant 'type_counter' column for later rolling-stat calculations.
        # Built from relevant_plays so the play-type filter is written only once
        # (the previous copy wrapped the play types in an extra list).
        # TODO: this lazy pipeline is never collected or returned yet.
        final_plays = relevant_plays.select(
            constants.RELEVANT_BATTING_COLUMNS
        ).with_columns(
            pl.col('events').replace(constants.PLAY_TYPE_DICT).alias('play_type'),
            pl.lit(1).alias('type_counter'),
        )

        return relevant_plays.collect()



In [6]:
# Instantiate the builder with its default settings (no saving, not verbose).
builder = DatasetBuilder()

# Clean the loaded pitches; as the cell's last expression the result is displayed.
builder.clean_raw_pitches(raw_pitches_df=df)

pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,…,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,str_year,batter_name,year
str,str,f64,f64,f64,str,i64,i64,str,str,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,str,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,f64,f64,str,str,i64
"""FS""","""2017-04-02""",86.6,-1.31,5.56,"""Tanaka, Masahiro""",572816,547888,"""single""","""hit_into_play""",,,,,13,"""Corey Dickerson singles on a s…","""R""","""L""","""R""","""TB""","""NYY""","""X""",8,"""line_drive""",0,1,2017,-1.06,-0.28,-0.13,1.28,,,,0,1,"""Bot""",…,490106,547888,596142,595885,516770,452104,591720,458731,453056,592450,54.89,0.893,0.863,0.9,1,1,0,4,5,2,"""Split-Finger""",0,0,0,0,0,0,0,0,"""Standard""","""Standard""",291,0.036,0.424,"""2017""","""corey dickerson""",2017
"""SI""","""2017-04-02""",88.2,3.07,5.59,"""Lester, Jon""",451594,452657,"""field_out""","""hit_into_play""",,,,,13,"""Dexter Fowler lines out to cen…","""R""","""R""","""L""","""STL""","""CHC""","""X""",8,"""line_drive""",2,0,2017,0.98,0.64,-0.04,1.38,,,,0,1,"""Bot""",…,490099,452657,575929,519203,595879,592178,608365,656941,518792,450314,54.42,0.233,0.229,0.0,1,0,0,3,5,3,"""Sinker""",0,0,0,0,0,0,0,0,"""Standard""","""Standard""",118,-0.021,-0.346,"""2017""","""dexter fowler""",2017
"""SI""","""2017-04-02""",90.7,-1.55,5.13,"""Tanaka, Masahiro""",595281,547888,"""double""","""hit_into_play""",,,,,8,"""Kevin Kiermaier doubles (1) on…","""R""","""L""","""R""","""TB""","""NYY""","""X""",7,"""line_drive""",2,1,2017,-1.75,0.29,-0.15,1.82,,,572816,0,1,"""Bot""",…,490106,547888,596142,595885,516770,452104,591720,458731,453056,592450,54.79,0.893,0.863,1.25,1,1,1,4,6,4,"""Sinker""",0,0,0,0,0,0,0,0,"""Standard""","""Standard""",265,0.103,1.038,"""2017""","""kevin kiermaier""",2017
"""FF""","""2017-04-02""",92.2,2.56,5.63,"""Bumgarner, Madison""",572041,518516,"""field_out""","""hit_into_play""",,,,,5,"""A.J. Pollock pops out to first…","""R""","""R""","""L""","""ARI""","""SF""","""X""",3,"""popup""",0,0,2017,-0.01,1.19,-0.12,2.58,,,,0,1,"""Bot""",…,490110,518516,457763,474832,605412,456488,543063,592620,452655,452254,53.52,0.0,0.0,0.0,1,0,0,3,6,1,"""4-Seam Fastball""",0,0,0,0,0,0,0,0,"""Standard""","""Standard""",181,-0.021,-0.245,"""2017""","""aj pollock""",2017
"""FF""","""2017-04-02""",91.3,3.05,5.63,"""Lester, Jon""",649557,452657,"""double""","""hit_into_play""",,,,,9,"""Aledmys Diaz doubles (1) on a …","""R""","""R""","""L""","""STL""","""CHC""","""X""",9,"""line_drive""",1,0,2017,0.94,1.38,0.41,2.4,,,,1,1,"""Bot""",…,490099,452657,575929,519203,595879,592178,608365,656941,518792,450314,54.23,0.453,0.506,1.25,1,1,1,5,6,2,"""4-Seam Fastball""",0,0,0,0,0,0,0,0,"""Standard""","""Standard""",143,0.04,0.39,"""2017""","""aledmys diaz""",2017
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""CU""","""2017-10-01""",79.9,2.52,5.89,"""Skoglund, Eric""",592273,607215,"""field_out""","""hit_into_play""",,,,,5,"""Brandon Drury flies out to cen…","""R""","""R""","""L""","""KC""","""ARI""","""X""",8,"""fly_ball""",1,2,2017,-0.87,-0.84,0.21,2.06,,,571757,1,9,"""Top""",…,492514,607215,460077,461235,593160,596144,609275,624585,449181,593528,54.14,0.007,0.01,0.0,1,0,0,3,84,5,"""Curveball""",2,14,14,2,14,2,14,2,"""Standard""","""Standard""",318,0.0,-0.218,"""2017""","""brandon drury""",2017
"""FF""","""2017-10-01""",91.7,2.56,5.76,"""Skoglund, Eric""",518614,607215,"""field_out""","""hit_into_play""",,,,,1,"""Daniel Descalso lines out shar…","""R""","""L""","""L""","""KC""","""ARI""","""X""",8,"""line_drive""",2,1,2017,0.25,0.98,-0.41,2.72,,,571757,2,9,"""Top""",…,492514,607215,460077,461235,593160,596144,609275,624585,449181,593528,53.71,0.647,0.739,0.0,1,0,0,4,85,4,"""4-Seam Fastball""",2,14,14,2,14,2,14,2,"""Standard""","""Standard""",164,0.0,-0.242,"""2017""","""daniel descalso""",2017
"""SI""","""2017-10-01""",93.5,-2.21,5.28,"""Albers, Matt""",607471,458006,"""strikeout""","""swinging_strike""",,,,,7,"""Christopher Bostick strikes ou…","""R""","""R""","""R""","""WSH""","""PIT""","""S""",2,,1,2,2017,-1.3,0.74,-0.8,1.76,,,,0,9,"""Top""",…,492525,458006,446653,435062,594694,506703,607208,645302,572191,457477,53.54,,,0.0,1,0,0,,91,4,"""Sinker""",6,11,11,6,11,6,11,6,"""Standard""","""Standard""",244,0.001,-0.173,"""2017""","""christopher bostick""",2017
"""FF""","""2017-10-01""",94.2,-1.85,5.27,"""Albers, Matt""",516782,458006,"""field_out""","""hit_into_play""",,,,,8,"""Starling Marte lines out to sh…","""R""","""R""","""R""","""WSH""","""PIT""","""X""",6,"""line_drive""",1,2,2017,-0.79,0.9,-0.05,1.83,,,,1,9,"""Top""",…,492525,458006,446653,435062,594694,506703,607208,645302,572191,457477,53.48,0.673,0.608,0.0,1,0,0,4,92,4,"""4-Seam Fastball""",6,11,11,6,11,6,11,6,"""Standard""","""Standard""",225,0.0,-0.122,"""2017""","""starling marte""",2017
