In [1]:
import sys
# Only pull in the shared helper notebooks when __main__ has no __file__
# (presumably: running interactively in a notebook rather than as a script).
if not hasattr(sys.modules['__main__'], '__file__'):
    # NOTE(review): absolute, machine-specific paths. These notebooks are expected to
    # supply names used further down (pd, pl, os, baseball_path, events_list,
    # create_pa_inputs, ...) — confirm and consider a configurable base directory.
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"

In [2]:
# Multiplier table read from baseball_path; it is handed to create_pa_inputs for the
# adjusted (adjust=True) build later in this notebook.
multiplier_df = pd.read_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"))

### Data

Create Latest PA Data

In [5]:
def start_data(df):
    """Add per-PA, per-inning and per-game pitcher context columns to a PA frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per plate appearance. The code reads the columns ``b1``, ``b2``,
        ``b3``, ``hr``, ``bb``, ``hbp``, ``rbi``, ``inning``, ``halfInning``,
        ``outs``, ``top``, ``date``, ``gamePk``, ``pitcher``, ``atBatIndex`` and
        every name in the notebook-level ``events_list``.

    Returns
    -------
    pandas.DataFrame
        The input columns plus:

        * ``h``, ``tb``, ``reached``, ``faced``, ``outs_total``, ``outs_pa``
        * ``<stat>_inning`` / ``<stat>_game``: running totals (current row
          included) per (gamePk, pitcher, inning) and per (gamePk, pitcher)
        * ``bottom``, ``atBatIndex_min``, ``first_ab``, ``atBatIndex_max``,
          ``pulled``, ``times_faced``

        Rows come back sorted by (date, gamePk, bottom, atBatIndex).

    Notes
    -----
    NOTE(review): the labels of ``df.index`` are re-applied *positionally* to the
    sorted result. Unless ``df`` already arrives in the final sort order (or has
    a plain 0..n-1 index), a label no longer identifies the row it came from —
    confirm that no caller aligns on this index.
    """
    original_index = df.index  # re-applied positionally at the end (see Notes)

    pl_df = pl.from_pandas(df)

    def as_float(name):
        return pl.col(name).cast(pl.Float64)

    # Hits, total bases, times reached base, a per-row PA counter and the
    # running out count implied by inning number and outs.
    hits = as_float('b1') + as_float('b2') + as_float('b3') + as_float('hr')
    pl_df = pl_df.with_columns([
        hits.alias('h'),
        (pl.col('b1') * 1 + pl.col('b2') * 2 + pl.col('b3') * 3 + pl.col('hr') * 4).alias('tb'),
        (hits + as_float('bb') + as_float('hbp')).alias('reached'),
        pl.lit(1).alias('faced'),
        (((pl.col('inning') - 1) * 3) + pl.col('outs')).alias('outs_total'),
    ])

    # Outs recorded on each PA: difference from the previous PA of the same
    # half-inning; the first PA of a half-inning (null difference) takes `outs`.
    pl_df = pl_df.sort(['gamePk', 'inning', 'halfInning', 'atBatIndex'])
    pl_df = pl_df.with_columns([
        (pl.col('outs_total') - pl.col('outs_total').shift(1)).over(['gamePk', 'inning', 'halfInning']).alias('outs_pa')
    ]).with_columns([
        pl.when(pl.col('outs_pa').is_null()).then(pl.col('outs')).otherwise(pl.col('outs_pa')).alias('outs_pa')
    ])

    # Row order inside each window decides what the running totals mean, so
    # sort before accumulating.
    pl_df = pl_df.sort(['gamePk', 'pitcher', 'inning', 'atBatIndex'])

    # Running totals per inning and per game, built in a single with_columns
    # call instead of one frame rebuild per stat. Names are de-duplicated
    # (order kept) because one call cannot emit the same output column twice.
    running_stats = list(dict.fromkeys(events_list + ['h', 'tb', 'reached', 'faced', 'rbi', 'outs_pa']))
    per_inning = [
        pl.col(stat).cum_sum().over(['gamePk', 'pitcher', 'inning']).alias(f'{stat}_inning')
        for stat in running_stats
    ]
    per_game = [
        pl.col(stat).cum_sum().over(['gamePk', 'pitcher']).alias(f'{stat}_game')
        for stat in running_stats
    ]
    pl_df = pl_df.with_columns(per_inning + per_game)

    # Bottom of the inning flag (1 where top == 0)
    pl_df = pl_df.with_columns([
        (pl.col('top') == 0).cast(pl.Int8).alias('bottom')
    ])

    # Sort to identify starting pitchers
    pl_df = pl_df.sort(['date', 'gamePk', 'bottom', 'atBatIndex'])

    # first_ab marks the lowest atBatIndex within each (gamePk, bottom) half
    pl_df = pl_df.with_columns([
        pl.col('atBatIndex').min().over(['gamePk', 'bottom']).alias('atBatIndex_min')
    ]).with_columns([
        (pl.col('atBatIndex') == pl.col('atBatIndex_min')).cast(pl.Int8).alias('first_ab')
    ])

    # pulled marks each pitcher's last PA of the game (highest atBatIndex);
    # a pitcher who finishes the game is flagged on his final PA as well.
    pl_df = pl_df.with_columns([
        pl.col('atBatIndex').max().over(['gamePk', 'pitcher']).alias('atBatIndex_max')
    ]).with_columns([
        (pl.col('atBatIndex') == pl.col('atBatIndex_max')).cast(pl.Int8).alias('pulled')
    ])

    # Times through the order, from total batters faced in the game.
    # NOTE(review): faced_game already counts the current PA, so the 9th batter
    # faced gets 1 rather than 0 — confirm that boundary is intended.
    pl_df = pl_df.with_columns([
        (pl.col('faced_game') / 9).floor().fill_null(0).alias('times_faced')
    ])

    result = pl_df.to_pandas()
    result.index = original_index  # positional re-labelling (see Notes)

    return result


In [6]:
def rolling_pas(df, pa_num, adjust, events_list=events_list):
    """Compute trailing ``pa_num``-PA rolling stats for batters and pitchers.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per plate appearance. Reads ``date``, ``game_date``, ``gamePk``,
        ``atBatIndex``, ``batter``, ``pitcher``, ``pitchHand``, ``batSide``,
        ``ab``, ``pa``, the ``events_list`` columns and the notebook-level
        ``statcast_list`` / ``max_list`` columns (plus ``events_list_adj`` when
        ``adjust`` is true).
    pa_num : int
        Rolling window length in rows. ``min_periods=1``, so shorter histories
        still produce a value.
    adjust : bool
        When true, the rolling stats are computed from the ``events_list_adj``
        columns in place of the raw event columns; the raw event values are put
        back in the returned frame.
    events_list : list of str
        Event column names; defaults to the notebook-level ``events_list``.

    Returns
    -------
    pandas.DataFrame
        Input columns plus ``<col>_b`` (per batter and pitchHand) and ``<col>_p``
        (per pitcher and batSide) rolling columns, ``imp_b`` / ``imp_p``, and
        ``woba``, ``slg``, ``obp``, ``iso`` for both suffixes, sorted by
        (date, gamePk, atBatIndex).

    Raises
    ------
    ValueError
        If ``adjust`` is true and ``events_list`` and ``events_list_adj`` differ
        in length.

    Notes
    -----
    * The only change made to the caller's ``df`` is the in-place rename of
      ``hit_distance_sc`` -> ``totalDistance`` and ``launch_speed`` ->
      ``launchSpeed``. Event columns of ``df`` are not modified.
    * Each window is trailing and includes the current row's own PA.
    * NOTE(review): newer Polars releases rename ``min_periods`` to
      ``min_samples`` — confirm against the installed version.
    """
    if adjust and len(events_list) != len(events_list_adj):
        raise ValueError("events_list and events_list_adj must have the same length")

    # Kept in place so callers that read the renamed columns from `df` still work.
    df.rename(columns={'hit_distance_sc': 'totalDistance', 'launch_speed': 'launchSpeed'}, inplace=True)

    pl_df = pl.from_pandas(df)

    backup_cols = [f"{event}_copy" for event in events_list]
    if adjust:
        # Swap adjusted values in on the Polars copy only: the caller's frame
        # keeps its raw events, so calling this again on the same frame backs
        # up raw values rather than previously adjusted ones.
        pl_df = pl_df.with_columns(
            [pl.col(event).alias(backup) for event, backup in zip(events_list, backup_cols)]
        ).with_columns(
            [pl.col(adjusted).alias(event) for event, adjusted in zip(events_list, events_list_adj)]
        )

    id_casts = [
        pl.col('gamePk').cast(pl.Int32),
        pl.col('atBatIndex').cast(pl.Int32),
        pl.col('batter').cast(pl.Int32),
        pl.col('pitcher').cast(pl.Int32),
    ]

    # Chronological order is what makes the rolling windows "most recent PAs".
    pl_df = pl_df.with_columns([pl.col('date').cast(pl.Int32)] + id_casts)
    pl_df = pl_df.sort(['date', 'gamePk', 'atBatIndex'])

    def window_exprs(keys, suffix):
        """Rolling mean/max/sum expressions for one (player, opposing hand) split."""
        means = [
            pl.col(col).rolling_mean(window_size=pa_num, min_periods=1).over(keys).alias(col + suffix)
            for col in events_list + statcast_list
        ]
        maxes = [
            pl.col(col).rolling_max(window_size=pa_num, min_periods=1).over(keys).alias(col + suffix)
            for col in max_list
        ]
        sums = [
            pl.col(col).rolling_sum(window_size=pa_num, min_periods=1).over(keys).alias(col + suffix)
            for col in ['ab', 'pa']
        ]
        return means + maxes + sums

    pl_df = pl_df.with_columns(
        window_exprs(['batter', 'pitchHand'], '_b') + window_exprs(['pitcher', 'batSide'], '_p')
    )

    # Flag rows whose window holds fewer than 40 PAs.
    pl_df = pl_df.with_columns([
        (pl.col('pa_b') < 40).cast(pl.Int32).alias('imp_b'),
        (pl.col('pa_p') < 40).cast(pl.Int32).alias('imp_p')
    ])

    # Rebuild `date` from the game_date text (dashes removed) and re-sort.
    pl_df = pl_df.with_columns(
        [pl.col('game_date').str.replace_all('-', '').cast(pl.Int32).alias('date')] + id_casts
    )
    pl_df = pl_df.sort(['date', 'gamePk', 'atBatIndex'])

    # The rolling event columns are per-PA rates, so the summary stats below are
    # linear combinations of them. SLG and ISO are rescaled by
    # 1 / (1 - (bb + hbp)); that factor is not finite when a window holds only
    # walks and hit-by-pitches.
    def rate(event, suffix):
        return pl.col(event + suffix)

    def woba(s):
        return (0.690 * rate('bb', s) + 0.721 * rate('hbp', s) +
                0.885 * rate('b1', s) + 1.262 * rate('b2', s) +
                1.601 * rate('b3', s) + 2.070 * rate('hr', s)).alias('woba' + s)

    def slg(s):
        return ((1 * rate('b1', s) + 2 * rate('b2', s) + 3 * rate('b3', s) + 4 * rate('hr', s)) *
                (1 / (1 - (rate('bb', s) + rate('hbp', s))))).alias('slg' + s)

    def obp(s):
        return (rate('b1', s) + rate('b2', s) + rate('b3', s) + rate('hr', s) +
                rate('bb', s) + rate('hbp', s)).alias('obp' + s)

    def iso(s):
        return ((rate('b2', s) + 2 * rate('b3', s) + 3 * rate('hr', s)) *
                (1 / (1 - (rate('bb', s) + rate('hbp', s))))).alias('iso' + s)

    pl_df = pl_df.with_columns(
        [stat(suffix) for stat in (woba, slg, obp, iso) for suffix in ('_b', '_p')]
    )

    if adjust:
        # Put the raw event values back and drop the temporary backups.
        pl_df = pl_df.with_columns(
            [pl.col(backup).alias(event) for event, backup in zip(events_list, backup_cols)]
        ).drop(backup_cols)

    return pl_df.to_pandas()


In [7]:
def clean_weather(df):
    """Parse the raw ``weather`` and ``wind`` text columns into numeric features.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``weather`` (text such as ``"72 degrees, Sunny"``) and
        ``wind`` (text such as ``"10 mph, Out To CF"``).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with these columns written:

        * ``temperature``: number before ``" degrees"`` (NaN if unparseable)
        * ``weather``: overwritten with the text after the first ``", "``
        * ``windSpeed``: number before ``" mph"`` (0 if unparseable)
        * ``windDirection``: text after the first ``", "`` with periods removed;
          ``'L to R'`` when missing
        * ``y_vect``: wind component, positive for "Out To ..." directions and
          negative for "In From ..."
        * ``x_vect``: wind component, positive for "L To R"

        Rows whose ``weather`` text contains ``Roof`` or ``Dome`` get
        ``temperature`` 70 and both vector components 0.

    Notes
    -----
    Because ``weather`` is overwritten with its second part, the function is
    not idempotent: do not call it twice on the same frame.
    """
    import numpy as np

    def split_pair(column):
        """Split "<first>, <second>" text into two Series aligned with `column`."""
        if pd.api.types.is_numeric_dtype(column):
            # An all-missing column is typically float64, which has no .str accessor.
            column = column.astype(object)
        parts = column.str.split(", ", expand=True)

        def piece(position):
            if position in parts.columns:
                return parts[position]
            # No row supplied this piece, so expand=True never created the column.
            return pd.Series(np.nan, index=column.index, dtype=object)

        return piece(0), piece(1)

    # Split weather into temperature and weather type
    temperature_text, condition = split_pair(df['weather'])
    df['temperature'] = pd.to_numeric(
        temperature_text.str.replace(" degrees", "", regex=False), errors='coerce'
    )
    df['weather'] = condition

    # Split wind into speed and direction
    speed_text, direction_text = split_pair(df['wind'])
    df['windSpeed'] = pd.to_numeric(
        speed_text.str.replace(" mph", "", regex=False), errors='coerce'
    ).fillna(0)
    # NOTE(review): the fill value 'L to R' differs in case from the "L To R"
    # key below, so rows with a missing direction get a zero wind vector.
    # Kept as-is — confirm whether that is intended.
    df['windDirection'] = direction_text.fillna('L to R').str.replace(".", "", regex=False)

    # (x, y) components per unit of wind speed; 45-degree directions contribute
    # sqrt(2)/2 on each axis. Directions missing from the table map to (0, 0).
    diagonal = np.sqrt(2) / 2
    unit_vectors = {
        "Out To CF": (0.0, 1.0),
        "Out To RF": (diagonal, diagonal),
        "L To R": (1.0, 0.0),
        "In From LF": (diagonal, -diagonal),
        "In From CF": (0.0, -1.0),
        "In From RF": (-diagonal, -diagonal),
        "R To L": (-1.0, 0.0),
        "Out To LF": (-diagonal, diagonal),
    }
    x_units = {name: components[0] for name, components in unit_vectors.items()}
    y_units = {name: components[1] for name, components in unit_vectors.items()}

    wind_speed = df['windSpeed'].to_numpy()
    direction = df['windDirection']
    df['y_vect'] = wind_speed * direction.map(y_units).fillna(0.0).astype(float).to_numpy()
    df['x_vect'] = wind_speed * direction.map(x_units).fillna(0.0).astype(float).to_numpy()

    # Overwrite for domes/roofs
    is_dome = df['weather'].str.contains('Roof|Dome', na=False)
    df.loc[is_dome, 'temperature'] = 70
    df.loc[is_dome, ['x_vect', 'y_vect']] = 0

    return df


In [8]:
%%time
# Unadjusted build: no multiplier table (None) and adjust=False.
# create_pa_inputs is defined outside this notebook; short=50 / long=300 are presumably
# the two rolling PA window sizes (the log prints "Short" and "Long" stages) — confirm.
new_df_unadj = create_pa_inputs(None, start_year=2015, end_year=2025, short=50, long=300, adjust=False)

merge_datasets took 16.68 seconds
clean_weather took 11.93 seconds
create_events took 0.16 seconds
create_variables took 7.00 seconds
start_data took 17.75 seconds
Short took 10.69 seconds
Long took 14.01 seconds
CPU times: total: 3min 53s
Wall time: 1min 34s


In [12]:
del new_df_unadj

In [10]:
%%time
new_df_unadj.to_csv(os.path.join(baseball_path, "Complete Dataset - Unadjusted (test).csv"))

CPU times: total: 3min 53s
Wall time: 3min 58s


In [9]:
%%time
# Adjusted build: same years and window arguments as the unadjusted build, but with the
# multiplier table supplied and adjust=True (the log below shows an extra
# park_adjustments stage for this run).
new_df_adj = create_pa_inputs(multiplier_df, start_year=2015, end_year=2025, short=50, long=300, adjust=True)

merge_datasets took 17.75 seconds
clean_weather took 14.53 seconds
create_events took 0.18 seconds
create_variables took 7.25 seconds
park_adjustments took 4.73 seconds
start_data took 28.71 seconds
Short took 21.26 seconds
Long took 39.78 seconds
CPU times: total: 7min 27s
Wall time: 2min 33s


In [11]:
%%time
# Write the adjusted dataset to CSV under baseball_path. No index argument is passed,
# so the DataFrame index is written as the first column (pandas' default).
new_df_adj.to_csv(os.path.join(baseball_path, "Complete Dataset - Adjusted (test).csv"))

CPU times: total: 5min 58s
Wall time: 6min 2s


In [13]:
# Drop the notebook's reference to the adjusted frame now that it has been exported.
del new_df_adj