## UPDATES IN PROGRESS -- PRIOR SAVED VERSIONS OF FINAL DATASETS CAN BE LOADED IN TRAINING MODELS WORKSHEET

In [1]:
### Things to do:
### - SORT VALUES IN THE BATTER AND PITCHER DFS ON GAME STATE TO ENSURE ALL PLAYS ARE IN ORDER
### - Add in current game situation (inning, score diff, runners on)
### - Add in num batters faced for pitcher in game (also maybe a rest metric like days since last pitched if it's easy)
### If we ever start using the daily pulled weather with hourly changes, make sure the weather attachment and cleaning still works in build_plays_by_hand_combo


# Use the below to eventually build a stealing df to help with the simulation
# pitches[pitches.des.str.contains("steal") == True].iloc[40]

### Import Packages and Warnings

In [2]:
# Major Packages
import pandas as pd
import numpy
import sys
import sklearn 
import pickle as pkl
import warnings
import time
import timeit
import random
import datetime
from datetime import datetime as dt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, cross_validate, cross_val_predict, GridSearchCV
import matplotlib.pyplot as plt
from IPython.display import clear_output
import numpy as np
from scipy import stats

# Modules in folder
import constants
from functions import *
# Warnings
warnings.simplefilter("ignore")

# Internal Modules Outside Current Folder
# Change the path so that we can import the local cloud functions stored in a different directory. THE PATH IS DIFFERENT ON MAC AND PI, SO USE TRY EXCEPT FOR BOTH
try: # for mac
    sys.path.insert(1, '/users/jaredzirkes/Desktop/Python/GitHub')
    from google_cloud.cloud_functions import CloudHelper
except ImportError:
    # Only a failed import should trigger the fallback to the Pi checkout; any other
    # error raised while importing the module is a real problem and must surface.
    sys.path.insert(1, "/home/pi/Desktop/Python")
    from google_cloud.cloud_functions import CloudHelper

# Season used to build the download paths for the weather and pitch files below
YEAR = 2016

### Import Prior Data

In [3]:
# Import file of weather data from the given year.
# The "Unnamed: 0" column is dropped right after download (presumably an index column
# written out when the file was saved -- confirm against the stored file).
weather = CloudHelper().download_from_cloud("proreference_weather_data/weather_data_{}".format(YEAR)).drop(columns=("Unnamed: 0"))

# Import file of raw pitches from the given year
all_pitches = CloudHelper().download_from_cloud("yearly_pitches_files/pitches_{}.pkl".format(YEAR))

In [9]:
# Scratch cell: work on a copy so the downloaded all_pitches object is left untouched.
x = all_pitches.copy()
# clean_raw_pitches is not defined in this notebook (presumably it comes from the
# `functions` star import). Judging by the preview cell that follows, its result is
# indexed by hand combo, e.g. x["RR"].
x = clean_raw_pitches(x)
#x = insert_game_play_shares(x)

# NOTE(review): create_weather_regression_dataframes is also external; `y` is not used
# again in the visible part of the notebook.
y = create_weather_regression_dataframes(x)

In [10]:
# Preview the first rows of the "RR" entry of the cleaned pitches
x["RR"].head()

Unnamed: 0,game_date,player_name,batter,pitcher,events,stand,p_throws,home_team,away_team,hit_location,...,temprature,wind_speed,wind_direction,Left to Right,Right to Left,in,out,zero,ballpark,game_play_share
0,2016-04-03,"Vólquez, Edinson",493316,450172,strikeout,R,R,KC,NYM,2.0,...,74,14,Right to Left,0.0,14.0,0.0,0.0,0.0,Kauffman,0.193548
1,2016-04-03,"Stroman, Marcus",519306,573186,field_out,R,R,TB,TOR,6.0,...,72,0,zero,0.0,0.0,0.0,0.0,0.0,Tropicana,0.309524
2,2016-04-03,"Colon, Bartolo",444876,112526,field_out,R,R,KC,NYM,5.0,...,74,14,Right to Left,0.0,14.0,0.0,0.0,0.0,Kauffman,0.419355
3,2016-04-03,"Colon, Bartolo",456715,112526,strikeout,R,R,KC,NYM,2.0,...,74,14,Right to Left,0.0,14.0,0.0,0.0,0.0,Kauffman,0.193548
4,2016-04-03,"Oh, Seunghwan",457705,493200,walk,R,R,PIT,STL,,...,39,14,out,0.0,0.0,0.0,14.0,0.0,PNC,0.137931


# Build Training Dataset of Plays

### Weather Regression

In [6]:
def weather_regress(weather_training_data):
    """
    Fit one linear regression per hand combo and play type that explains `game_play_share` from the weather columns.
    The fitted intercepts and coefficients are used later to neutralize batting stats for weather.

    Parameters
    --------------
    weather_training_data: Dictionary
        Maps each hand combo to a DataFrame with a `play_type` column, a `game_play_share` column and the weather
        columns. The regression features are picked by column POSITION (2-3 and 6-10), so the column order of these
        DataFrames matters.
    -----------------

    Returns: Tuple(Dictionary, Dictionary)
        The unchanged weather_training_data input, for use later on.
        A nested dictionary weather_coefficients[hand combo][play type] holding the keys "intercept", "temprature_sq",
        "wind_ltr", "wind_rtl", "wind_in" and "wind_out".
    """

    # Labels are attached to the fitted coefficients purely by position.
    # NOTE(review): this assumes the selected feature columns are, in order, squared temperature followed by the
    # four wind-direction columns -- confirm against the function that prepares weather_training_data.
    coefficient_labels = ("temprature_sq", "wind_ltr", "wind_rtl", "wind_in", "wind_out")
    excluded_features = ("temprature", "wind_speed")

    weather_coefficients = {}

    for pitbat_combo in hand_combos:
        combo_coefficients = {}
        weather_coefficients[pitbat_combo] = combo_coefficients
        combo_frame = weather_training_data[pitbat_combo]

        for play_type in combo_frame.play_type.unique():
            # Keep only the rows of this play type
            play_rows = combo_frame[combo_frame.play_type == play_type]

            # Drop rows whose game_play_share lies 3 or more standard deviations from the mean
            share_zscores = stats.zscore(play_rows.game_play_share)
            play_rows = play_rows[(np.abs(share_zscores) < 3)]

            # Positional pick of the weather columns, minus raw temperature and wind speed
            candidate_columns = play_rows.columns[np.r_[2:4, 6:11]]
            feature_columns = [name for name in candidate_columns if name != "temprature" and name != "wind_speed"]
            features = play_rows[feature_columns]
            target = play_rows.game_play_share

            model = LinearRegression(fit_intercept = True)
            model.fit(features, target)

            fitted = {"intercept": model.intercept_}
            for position, label in enumerate(coefficient_labels):
                fitted[label] = model.coef_[position]
            combo_coefficients[play_type] = fitted

    return (weather_training_data, weather_coefficients)


### Calculating Park Factors

In [7]:
def calculate_park_factors(all_plays_by_hand_combo):
    """
    Compute, for every hand combo, ballpark and play type, the ratio between the share of plays of that type inside
    the park and the share of plays of that type everywhere else.

    Parameters
    --------------
    all_plays_by_hand_combo: Dictionary
        Maps each hand combo to a DataFrame of plays with `ballpark` and `play_type` columns. The list of ballparks
        and the list of play types are both taken from the "RR" entry and reused for every hand combo.

    -----------------

    Returns: Tuple(Dictionary, Dictionary)
        The unchanged all_plays_by_hand_combo input, for later use.
        A nested dictionary park_factors_dict[hand combo][ballpark][play type]. The value is the string "n/a" when the
        ratio cannot be computed: the hand combo has no plays in the park, no plays outside it, or the play type never
        occurs outside the park.

    """

    park_factors_dict = {}
    print("Calculating Ballpark Factors")

    # Loop-invariant: the same ballparks and play types are used for every hand combo
    ballparks = all_plays_by_hand_combo["RR"].ballpark.unique()
    play_types = all_plays_by_hand_combo["RR"].play_type.unique()

    for pitbat_combo in hand_combos:
        combo_plays = all_plays_by_hand_combo[pitbat_combo]
        park_factors_dict[pitbat_combo] = {}

        # For each ballpark, split the plays into those at the park and those not at the park
        for ballpark in ballparks:
            park_factors_dict[pitbat_combo][ballpark] = {}
            at_park_df = combo_plays[combo_plays.ballpark == ballpark]
            not_at_park_df = combo_plays[combo_plays.ballpark != ballpark]

            for play_type in play_types:
                if len(at_park_df) == 0 or len(not_at_park_df) == 0:
                    # One side of the split is empty, so a rate is undefined. This used to raise
                    # ZeroDivisionError because the rates were computed outside the guarded division.
                    park_factor = "n/a"
                else:
                    at_park_rate = len(at_park_df[at_park_df.play_type == play_type])/len(at_park_df)
                    not_at_park_rate = len(not_at_park_df[not_at_park_df.play_type == play_type])/len(not_at_park_df)
                    park_factor = at_park_rate/not_at_park_rate if not_at_park_rate != 0 else "n/a"

                # Insert the park factors into a dictionary
                park_factors_dict[pitbat_combo][ballpark][play_type] = park_factor

    clear_output(wait=False)

    return (all_plays_by_hand_combo, park_factors_dict)

## Neutralizing Stats

In [8]:
def neutralize_stats(all_plays_by_hand_combo, weather_coefficients, park_factors_dict, is_dump=False):
    """
    Use the weather coefficients and park factors to compute an 'impact' for each individual play, based on the
    weather and ballpark it was actually played in, and a `play_value` equal to 1/impact.

    Parameters
    --------------
    all_plays_by_hand_combo: Dictionary
        Maps each hand combo to a DataFrame of plays that includes the game state columns, `play_type`, `ballpark`
        and the raw weather columns (`temprature`, `wind_speed`, `wind_direction`).

    weather_coefficients: Dictionary
        A nested dictionary of the weather coefficients by hand combo and play type. This is the second element of
        the weather_regress output.

    park_factors_dict: Dictionary
        A nested dictionary of the park factors by hand combo, ballpark and play type. This is the second element of
        the calculate_park_factors output.

    is_dump: Boolean
        Whether to pickle the resulting dictionary once all hand combos are processed.

    -----------------

    Returns: Dictionary
        Maps each hand combo to a DataFrame with the game, weather and game state columns plus `impact` and
        `play_value`.

    """
    print("Neutralizing Batting Stats using Weather/Stadium Coefficients")

    base_columns = ["game_pk", "game_date", "batter", "pitcher",'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', "bat_score", "fld_score", "play_type", "temprature", "wind_speed", "wind_direction", "ballpark"]
    final_columns = ["game_pk", "game_date","ballpark", "temprature", "wind_speed", "wind_direction", "batter", "pitcher", 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', "bat_score", "fld_score", "play_type","impact", "play_value"]

    factored_training_stats = {}
    for pitbat_combo in hand_combos:
        combo_coefficients = weather_coefficients[pitbat_combo]
        combo_park_factors = park_factors_dict[pitbat_combo]

        # Grab the relevant columns and games
        df = all_plays_by_hand_combo[pitbat_combo][base_columns].copy()

        # convert_wind_direction is expected to add the "Left to Right", "Right to Left", "in" and "out" columns
        # that are read below.
        df = convert_wind_direction(df, df.wind_direction)

        def coefficient(name):
            # Per-row coefficient, looked up from each row's play type
            return df.play_type.map(lambda play_type: combo_coefficients[play_type][name])

        temprature_sq_coefficient = coefficient("temprature_sq")
        intercept = coefficient("intercept")

        # Expected share under the actual weather; same terms and order as the regression was labelled with
        df["weather_expectation"] = (df["Left to Right"]*coefficient("wind_ltr") + df["Right to Left"]*coefficient("wind_rtl") +
                                     df["in"]*coefficient("wind_in") + df["out"]*coefficient("wind_out") +
                                     (df["temprature"]**2) * temprature_sq_coefficient + intercept)

        # Expected share under neutral weather: 72 degrees and no wind contribution
        df["neutral_weather_expectation"] = 72**2 * temprature_sq_coefficient + intercept
        df["weather_impact"] = df.weather_expectation/df.neutral_weather_expectation
        df["stadium_impact"] = [combo_park_factors[ballpark][play_type] for ballpark, play_type in zip(df.ballpark, df.play_type)]

        # Multiply the weather and stadium impacts to get the total impact for the specific at-bat result
        df["impact"] = df.weather_impact * df.stadium_impact
        df["play_value"] = 1/df.impact

        # Grab the final df that we will use for rolling stats
        factored_training_stats[pitbat_combo] = df[final_columns]

    if is_dump:
        # The result is a plain dict, which has no to_pickle method, so dump it with the pickle module.
        # Done once, after every hand combo has been processed.
        with open("/Users/jaredzirkes/Documents/GitHub/MLB-Simulation/training_batting_stats_with_factors.pkl", "wb") as dump_file:
            pkl.dump(factored_training_stats, dump_file)

    clear_output(wait=False)

    return factored_training_stats

#### Calculate League Averages Over the Length of Training

In [9]:
def calculate_league_averages(game_play_share_data):
    """
    Compute the share of plays of each tracked play type within each hand combo.

    Parameters
    --------------
    game_play_share_data: Dictionary
        Maps each hand combo to a DataFrame of plays with a `play_type` column.

    -----------------

    Returns: Dictionary
        A nested dictionary league_average_plays_dict[hand combo][play] holding the number of rows of that play type
        divided by the total number of rows for the hand combo.
    """

    def play_share(pitbat_combo, play):
        combo_plays = game_play_share_data[pitbat_combo]
        matching_plays = combo_plays[combo_plays.play_type == play]
        return len(matching_plays)/len(combo_plays)

    return {
        pitbat_combo: {play: play_share(pitbat_combo, play) for play in plays}
        for pitbat_combo in hand_combos
    }

## Roll Stats Daily To Get Final Odds Functions Training Data Sets

In [10]:
def _rolled_play_shares(player_plays, player_column, stat_columns, window, min_periods):
    """Per-player trailing sums of the indicator columns, rescaled so that each row sums to 1 (or is all 0)."""
    # closed="left" keeps the current plate appearance out of its own window.
    rolled = (
        player_plays.groupby(by=player_column)[stat_columns]
        .rolling(window=window, min_periods=min_periods, closed="left")
        .sum()
        .droplevel(0)  # drop the player level so the rows line up with player_plays' own index again
    )

    # Neutralized values do not sum to the number of plays, so re-percentage each row
    totals = rolled.sum(axis=1)
    shares = rolled.div(totals, axis=0)
    shares.loc[~(totals > 0), :] = 0
    return shares


def roll_factored_batting_stats(factored_batting_stats, min_periods = 0, is_dump=False):
    """
    Roll the neutralized play values per batter and per pitcher over a season-sized and a month-sized window of
    plate appearances, and convert the rolled sums into shares across the tracked play types.

    Parameters
    --------------
    factored_batting_stats: Dictionary
        Maps each hand combo to a DataFrame of plays that includes `play_type` and the `play_value` computed from
        the weather/ballpark impact. This is the direct output of the neutralize_stats function. The index of each
        DataFrame must be unique, because the rolled values are matched back to the plays by index label.

    min_periods: Integer
        The minimum number of prior plate appearances required in a window before rolled stats are produced; rows
        with fewer get all-zero shares. This argument used to be ignored, so pandas required a completely full
        window (504 or 75 plate appearances) and every shorter history came out as zeros.

    is_dump: Boolean
        Whether to pickle the rolled pitching and batting dictionaries once they are calculated.

    -----------------

    Returns: Dictionary
        {"pitching_stats": ..., "batting_stats": ...}, each mapping a hand combo to a DataFrame with the game and
        game state columns plus one `season_<play>` and one `month_<play>` column per tracked play.
    """

    # Window sizes in plate appearances
    season_window = 504
    month_window = 75

    season_columns = ["season_{}".format(play) for play in plays]
    month_columns = ["month_{}".format(play) for play in plays]
    output_columns = ["game_pk", "game_date", "ballpark","temprature", "wind_speed", "wind_direction", "batter", "pitcher", "pitbat",'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', "bat_score", "fld_score"] + season_columns + month_columns

    # (output dictionary, grouping column, progress label) for the two views of the same plays
    rolling_factored_batting_stats = {}
    rolling_factored_pitching_stats = {}
    player_views = (
        (rolling_factored_batting_stats, "batter", "Batting"),
        (rolling_factored_pitching_stats, "pitcher", "Pitching"),
    )

    for pitbat_combo in hand_combos:
        print("Rolling Batting and Pitching Stats {}".format(pitbat_combo))
        clear_output(wait=True)

        # Sort by game date for rolling; a stable sort keeps the incoming order of plays within a date
        combo_plays = factored_batting_stats[pitbat_combo].sort_values(by = "game_date", ascending = True, kind = "mergesort")
        combo_plays["pitbat"] = pitbat_combo

        # One indicator column per play: the play's neutralized value where that play happened, 0 elsewhere.
        # Batters and pitchers start from the same indicators, so they are built once.
        for play, season_column, month_column in zip(plays, season_columns, month_columns):
            weighted_outcome = combo_plays["play_value"].where(combo_plays["play_type"] == play, 0)
            combo_plays[season_column] = weighted_outcome
            combo_plays[month_column] = weighted_outcome

        for storage, player_column, label in player_views:
            print("Repercentaging Rolled {} Stats {}".format(label, pitbat_combo))
            clear_output(wait=True)

            season_shares = _rolled_play_shares(combo_plays, player_column, season_columns, season_window, min_periods)
            month_shares = _rolled_play_shares(combo_plays, player_column, month_columns, month_window, min_periods)

            # Series assignment aligns on the index, replacing the indicators with the rolled shares
            player_plays = combo_plays.copy()
            for column in season_columns:
                player_plays[column] = season_shares[column]
            for column in month_columns:
                player_plays[column] = month_shares[column]

            storage[pitbat_combo] = player_plays[output_columns]

    if is_dump:
        # Both results are plain dicts, which have no to_pickle method, so dump them with the pickle module
        with open("rolling_factored_pitching_stats.pkl", "wb") as dump_file:
            pkl.dump(rolling_factored_pitching_stats, dump_file)
        with open("rolling_factored_batting_stats.pkl", "wb") as dump_file:
            pkl.dump(rolling_factored_batting_stats, dump_file)

    clear_output(wait=False)

    return {"pitching_stats":rolling_factored_pitching_stats, "batting_stats":rolling_factored_batting_stats}

In [11]:
# Pull all the rolled individual player DataFrames out of the dictionary and into a large DataFrame that will be used for final training
def stitch_individual_stats(rolling_factored_stats):
    """
    Concatenate the per-hand-combo rolled DataFrames into one batting and one pitching DataFrame.

    Parameters
    --------------
    rolling_factored_stats: Dictionary
        {"batting_stats": ..., "pitching_stats": ...}, each mapping a hand combo to a DataFrame. This is the direct
        output of the roll_factored_batting_stats function.

    -----------------

    Returns: Dictionary
        {"batting_stats": DataFrame, "pitching_stats": DataFrame}, with the hand combos stacked in the order given
        by hand_combos. The original index labels are kept, so they may repeat across hand combos.
    """
    return {
        role: pd.concat([rolling_factored_stats[role][pitbat_combo] for pitbat_combo in hand_combos])
        for role in ("batting_stats", "pitching_stats")
    }

In [12]:
# Attach the pitching probability vector to the training set by "joining" on the pitbat combo, year, and pitcher name, where the date is just less than the given PA.
# Then reattatch the weather and ballpark info for that game
def finalize_dataset(stitched_dataset, rolling_factored_stats, factored_batting_stats, stats_with_weather):
    """
    Attach the pitcher probability vectors, the actual play, the game's weather columns and trailing league averages
    to the stitched batting DataFrame.

    Parameters
    --------------
    stitched_dataset: Dictionary
        {"batting_stats": DataFrame, "pitching_stats": DataFrame}, the output of stitch_individual_stats. It is
        modified in place: the pitching columns get a "pitcher_" prefix and the batting DataFrame gains columns.
        `game_date` must hold ISO "YYYY-MM-DD" strings, which are compared as text.

    rolling_factored_stats: Dictionary
        Not used; kept so existing callers do not break.

    factored_batting_stats: Dictionary
        Maps each hand combo to the DataFrame produced by neutralize_stats; the `play_type` of each row is looked
        up by hand combo and index label.

    stats_with_weather: Dictionary
        Maps each hand combo to a DataFrame with `game_pk` and the weather columns "temprature_squared",
        "Left to Right", "Right to Left", "in", "out" and "zero".

    -----------------

    Returns: Dictionary
        The same stitched_dataset object, with the batting DataFrame extended.
    """
    pitching = stitched_dataset["pitching_stats"]
    batting = stitched_dataset["batting_stats"]

    # Attach the pitcher's rolled shares; the assignment relies on both frames sharing the same index
    pitching.columns = ["pitcher_" + col for col in pitching.columns]
    pitching_columns_to_add = ["pitcher_season_{}".format(play) for play in plays] + ["pitcher_month_{}".format(play) for play in plays]
    batting[pitching_columns_to_add] = pitching[pitching_columns_to_add]

    # Add in a column for the actual play, to be used for comparison against our prediction vector.
    # The hand combo is part of the lookup because index labels may repeat across hand combos.
    batting["play"] = batting.apply(lambda x: factored_batting_stats[x.pitbat].loc[x.name].play_type, axis=1)

    # Attatch the weather information # THIS MAY HAVE TO CHANGE WITH WEATHER CODING UPDATES
    print("Attatching Original Weather Information to Final Dataset")

    weather_columns = ["temprature_squared", "Left to Right", "Right to Left", "in", "out", "zero"]

    # Index the first weather row of every game once per hand combo instead of filtering the whole frame per play
    weather_by_game = {
        pitbat_combo: stats_with_weather[pitbat_combo].drop_duplicates(subset="game_pk").set_index("game_pk")[weather_columns]
        for pitbat_combo in batting.pitbat.unique()
    }
    missing_weather = pd.Series({column: None for column in weather_columns})

    def game_weather(row):
        games = weather_by_game[row.pitbat]
        return games.loc[row.game_pk] if row.game_pk in games.index else missing_weather

    batting[weather_columns] = batting.apply(game_weather, axis=1)
    batting["is_on_base"] = batting.play.apply(lambda x: 1 if x in ["single", "double", "triple", "home_run", "walk", "intent_walk"] else 0)

    # Attatch the League Average Information
    print("Attatching League Average Information")
    league_averages = {}
    for pitbat_combo in hand_combos:
        league_averages[pitbat_combo] = {}
        pitbat_df = batting[batting.pitbat == pitbat_combo]
        for date in pitbat_df.game_date.unique():
            league_averages[pitbat_combo][date] = {"season":{}, "month":{}}

            # Window starts as ISO strings, so they compare correctly against the game_date strings.
            # Building them by string concatenation lost the dashes and zero padding, which left the
            # month window permanently empty.
            game_day = pd.Timestamp(date)
            season_ago = (game_day - pd.DateOffset(years=1)).strftime("%Y-%m-%d")
            month_ago = (game_day - pd.DateOffset(months=1)).strftime("%Y-%m-%d")

            # Only plays from strictly earlier dates count towards the averages
            earlier_plays = pitbat_df[pitbat_df.game_date < date]
            season_pitbat_date_df = earlier_plays[earlier_plays.game_date > season_ago]
            month_pitbat_date_df = earlier_plays[earlier_plays.game_date > month_ago]

            # Find league average from the month and full seasons worth of time
            for play in plays:
                season_play_average = len(season_pitbat_date_df[season_pitbat_date_df.play == play])/len(season_pitbat_date_df) if len(season_pitbat_date_df) > 0 else None
                month_play_average = len(month_pitbat_date_df[month_pitbat_date_df.play == play])/len(month_pitbat_date_df) if len(month_pitbat_date_df) > 0 else None

                league_averages[pitbat_combo][date]["season"][play] = season_play_average
                league_averages[pitbat_combo][date]["month"][play] = month_play_average

    for play in plays:
        batting["season_league_average_{}".format(play)] = batting.apply(lambda x: league_averages[x.pitbat][x.game_date]["season"][play], axis=1)
        batting["month_league_average_{}".format(play)] = batting.apply(lambda x: league_averages[x.pitbat][x.game_date]["month"][play], axis=1)

    clear_output(wait=False)

    return stitched_dataset

In [13]:
def process_pipeline(final_dataset):
    """
    Encode, filter and scale the final batting DataFrame so it can be fed to the training algorithms.

    The input is copied first, so the caller's DataFrame is not modified and the function can be re-run safely.

    Parameters
    --------------
    final_dataset: DataFrame
        The finalized batting DataFrame. It must contain `on_3b`, `on_2b`, `on_1b`, `inning_topbot`, `game_date`
        (as "YYYY-MM-DD" strings), `play`, `is_on_base`, `ballpark` and `pitbat`; every remaining column is treated
        as a numeric feature.

    -----------------

    Returns: Dictionary
        "features": the output of the fitted preprocessing pipeline (scaled numeric and one-hot encoded categorical
            features)
        "play": the actual play for each retained row
        "is_on_base": the on-base indicator for each retained row
        "pipeline": the fitted preprocessing Pipeline, so the same transformation can be applied to new data
    """
    model_frame = final_dataset.copy()

    # Base runners are present when the column is not missing
    for col in ["on_3b", "on_2b", "on_1b"]:
        model_frame[col] = model_frame[col].notna().astype(int)

    # Encode from the inning_topbot column itself: 1 for "Top", 0 otherwise
    model_frame["inning_topbot"] = (model_frame["inning_topbot"] == "Top").astype(int)

    # Drop identifier and raw weather columns, then any row with a missing value
    excluded_columns = ["game_pk", "batter", "pitcher", "temprature", "wind_speed", "wind_direction", "year"]
    ml_full_df = model_frame[[col for col in model_frame.columns if col not in excluded_columns]].dropna()

    # Keep only games from May onwards
    game_month = ml_full_df.game_date.apply(lambda x: int(x.split("-")[1]))
    ml_full_df = ml_full_df[game_month >= 5].reset_index(drop=True)

    # Split off the targets before fitting the preprocessing
    ml_full_y_play = ml_full_df.play
    ml_full_y_on_base = ml_full_df.is_on_base
    ml_full_df = ml_full_df.drop(columns = ["game_date", "play", "is_on_base"])

    categorical_features = ["ballpark", "pitbat"]
    numeric_features = [col for col in ml_full_df if col not in categorical_features]

    numeric_transformer = Pipeline(
        steps=[("scaler", StandardScaler())]
    )
    categorical_transformer = Pipeline(
        steps=[
            ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    ml_pipe = Pipeline(
        steps=[("preprocessor", preprocessor)]
    )

    features = ml_pipe.fit_transform(ml_full_df)

    return {"features": features, "play": ml_full_y_play, "is_on_base": ml_full_y_on_base, "pipeline": ml_pipe}
    

In [14]:
def build_dataset(pitches, pickle):
    """Run the end-to-end dataset build on raw pitch data, including the ML preprocessing step.

    NOTE(review): a later cell in this notebook defines ``build_dataset`` again
    (same chain without the ``process_pipeline`` call). When both cells are run,
    that later definition replaces this one.

    Parameters
    ----------
    pitches
        Raw pitch data, handed directly to ``build_plays_by_hand_combo``.
    pickle
        Not referenced anywhere in this function body.

    Returns
    -------
    dict
        ``{"stats": <result of finalize_dataset>, "league_averages": <result of calculate_league_averages>}``
    """
    # Clean the raw pitches, group them by hand combo, then attach ballpark info
    plays_by_hand = attach_ballpark_info(build_plays_by_hand_combo(pitches))

    # Per-game share of each play type
    play_shares = calculate_game_play_shares(plays_by_hand)

    # League-wide averages over time for each play
    league_averages = calculate_league_averages(play_shares)

    # Prepare the weather regression inputs and run it; element [1] is used as the coefficients
    weather_inputs = prepare_weather_regression(play_shares)
    weather_coefficients = weather_regress(weather_inputs)[1]

    # Park-specific impact; element [1] of the result is used as the park factors
    park_factors = calculate_park_factors(play_shares)[1]

    # "Neutralize" the raw events for stadium and weather effects
    neutralized = neutralize_stats(play_shares, weather_coefficients, park_factors)

    # Roll each player's neutralized stats over a month and a season
    rolled_by_hand = roll_factored_batting_stats(neutralized)

    # Stitch the per-hand-combo frames back into one large frame
    stitched = stitch_individual_stats(rolled_by_hand)

    # Join batting and pitching so each day carries both the batter's and pitcher's stats
    final_dataset = finalize_dataset(stitched, rolled_by_hand, neutralized, weather_inputs)

    # Standard preprocessing pass for the training algorithms.
    # NOTE(review): the result is discarded, and the lookup goes through
    # final_dataset["stats"]["batting_stats"] — confirm finalize_dataset really
    # returns a mapping with a "stats" level.
    process_pipeline(final_dataset["stats"]["batting_stats"])

    return {"stats": final_dataset, "league_averages": league_averages}

In [15]:
def build_dataset(pitches, pickle):
    """Build the final stats dataset and league averages from raw pitch data.

    NOTE(review): an earlier cell in this notebook also defines ``build_dataset``;
    this definition shadows it once this cell has been run.

    Parameters
    ----------
    pitches
        Raw pitch data, passed straight into ``build_plays_by_hand_combo``.
    pickle
        Accepted but never read inside this function.

    Returns
    -------
    dict
        Two entries: ``"stats"`` holds the output of ``finalize_dataset`` and
        ``"league_averages"`` holds the output of ``calculate_league_averages``.
    """
    # Stage 1: clean raw pitch data, sort by hand combo, add ballpark info
    hand_combo_plays = build_plays_by_hand_combo(pitches)
    hand_combo_plays = attach_ballpark_info(hand_combo_plays)

    # Stage 2: share of each play within each game
    share_data = calculate_game_play_shares(hand_combo_plays)

    # Stage 3: league averages over time for each play
    averages = calculate_league_averages(share_data)

    # Stage 4: weather regression (second element of the result is taken as the coefficients)
    regression_input = prepare_weather_regression(share_data)
    regression_output = weather_regress(regression_input)
    weather_coefs = regression_output[1]

    # Stage 5: impact of specific parks (second element of the result)
    park_output = calculate_park_factors(share_data)
    parks = park_output[1]

    # Stage 6: "neutralize" raw events for stadium and weather impact
    neutral = neutralize_stats(share_data, weather_coefs, parks)

    # Stage 7: roll each player's neutralized stats over a month and a season
    rolled = roll_factored_batting_stats(neutral)

    # Stage 8: recombine the per-hand-combo dataframes into one
    stitched = stitch_individual_stats(rolled)

    # Stage 9: join batting and pitching so any given day has both players' stats
    result = {
        "stats": finalize_dataset(stitched, rolled, neutral, regression_input),
        "league_averages": averages,
    }
    return result

In [None]:
# Persist the model-building checkpoints (feature matrix and the two targets).
# Each file is opened in a `with` block so the handle is flushed and closed even
# if pickling raises; the on-disk paths are unchanged.
# NOTE(review): ml_full_df / ml_full_y_play / ml_full_y_on_base must already exist
# at notebook scope when this cell runs — confirm which cell defines them.
# NOTE(review): the directory is an absolute, machine-specific path.
checkpoint_dir = "/Users/jaredzirkes/Desktop/Python/Non-Github/Project - MLB Simulation/Checkpoint Files/Model Building"

with open(checkpoint_dir + "/ML X Dataset", "wb") as checkpoint_file:
    pkl.dump(ml_full_df, checkpoint_file)

with open(checkpoint_dir + "/ML Y Dataset(Plays)", "wb") as checkpoint_file:
    pkl.dump(ml_full_y_play, checkpoint_file)

with open(checkpoint_dir + "/ML Y Dataset (On Base)", "wb") as checkpoint_file:
    pkl.dump(ml_full_y_on_base, checkpoint_file)