In [None]:
### Things to do:
### - Add in current game situation (inning, score diff, runners on)
### - Add in num batters faced for pitcher in game (also maybe a rest metric like days since last pitched if it's easy)
### - Expand y vector to full set of plays


# Use the below to eventually build a stealing df to help with the simulation
# pitches[pitches.des.str.contains("steal") == True].iloc[40]

In [93]:
import pandas as pd
import numpy
import sklearn 
import pickle as pkl
import warnings
import time
import random
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, cross_validate, cross_val_predict, GridSearchCV
import matplotlib.pyplot as plt
from IPython.display import clear_output


import numpy as np
from scipy import stats

# Suppress every warning for the rest of the session. This also hides pandas
# SettingWithCopy and deprecation warnings raised by the cells below.
warnings.simplefilter("ignore")

# Two-letter handedness codes used as dictionary keys throughout the notebook
# (iterated as `pitbat_combo`).
# NOTE(review): which letter is the pitcher and which the batter is not shown here -- confirm.
hand_combos = ["RR", "RL", "LR", "LL"]
# Seasons, as strings, used to index `all_plays_by_hand_combo`.
training_years = ["2012", "2013", "2014"]
# Plate-appearance outcome labels; matched against `play_type` and used to build
# the "b_", "p_", "season_" and "month_" column/key names.
plays = ['strikeout', 'fly_out', 'double', 'out', 'fielders_choice',
       'error', 'walk', 'home_run', 'single', 'sacrifice', 'double_play',
       'intent_walk', 'triple']
# Subset of outcome labels; presumably the ones that put the batter on base -- TODO confirm usage.
on_bases = ["single", "double", "triple", "home_run", "walk"]

In [43]:
# Stadium lookup table: the spreadsheet's third row (header=2) holds the column names, and only
# the Stadium / Team / Start Date / End Date columns are kept. Read by combine_raw_pitches.
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere unchanged.
ballpark_info = pd.read_excel("/Users/jaredzirkes/Desktop/Python/MLB BETTING/Ballpark Info.xlsx", header=2)[["Stadium", "Team", "Start Date", "End Date"]]

## Functions

In [44]:
def convert_wind_direction(df, wind_column):
    """Turn a categorical wind direction into one wind-speed column per direction.

    Rows with a wind speed of 0 are relabelled with the direction "zero", the
    direction is one-hot encoded, and each indicator column is multiplied by the
    row's wind speed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``wind_speed`` and ``wind_direction`` columns. Its
        ``wind_direction`` column is relabelled in place for calm rows, as it
        always has been.
    wind_column : pandas.Series
        The direction values to encode, row-aligned with ``df`` (callers pass
        ``df.wind_direction``).

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra column per distinct direction, holding the wind
        speed blowing in that direction (0 elsewhere).
    """
    # When wind speed is 0, the direction is automatically listed as "in" --> convert it to "zero" to differentiate
    calm = df["wind_speed"] == 0
    df.loc[calm, "wind_direction"] = "zero"

    # Relabel the series being encoded explicitly. Previously this only happened
    # when `wind_column` shared memory with `df`, which is not guaranteed.
    wind_column = wind_column.mask(calm.values, "zero")

    # One-hot encode, then scale each indicator by the wind speed. Only the dummy
    # columns are scaled; the old `columns[-5:]` slice also hit unrelated columns
    # whenever fewer than five distinct directions were present.
    wind_dummies = pd.get_dummies(wind_column)
    wind_dummies = wind_dummies.mul(df["wind_speed"], axis=0)

    return pd.concat([df, wind_dummies], axis=1)

In [45]:
def convert_stadium_column(df, stadium_column):
    """Append one-hot indicator columns for the stadium to a copy of ``df``.

    ``stadium_column`` is encoded with ``pd.get_dummies`` and the resulting
    indicator columns are concatenated to the right of ``df``. The input frame
    itself is not modified.
    """
    stadium_indicators = pd.get_dummies(stadium_column, columns=["categorical_column", ])
    return pd.concat([df, stadium_indicators], axis=1)

In [46]:
def log5(pB, pP, pL):
    """Combine batter, pitcher and league probabilities with the log5 equation.

    pB, pP and pL are the probabilities of one PA outcome for the batter, the
    pitcher and the league overall; the return value is the estimated
    probability of that outcome in this matchup.
    NOTE: DO NOT USE RIGHT NOW
    """
    outcome_term = (pB*pP)/pL
    complement_term = ((1-pB)*(1-pP))/(1-pL)

    return outcome_term/(outcome_term + complement_term)


def morey_z(pB, pP, pL):
    """Combine batter, pitcher and league probabilities with the Morey Z equation.

    Each of the batter (pB) and pitcher (pP) probabilities is expressed as a
    z-score against the league probability (pL), using the binomial standard
    deviation sqrt(pL * (1 - pL)). The two z-scores are summed, divided by
    sqrt(2), rescaled by the same deviation and shifted back by pL.
    """
    league_sd = np.sqrt(pL*(1-pL))
    batter_z = (pB-pL)/league_sd
    pitcher_z = (pP-pL)/league_sd
    combined_z = (batter_z + pitcher_z)/np.sqrt(2)
    return (combined_z * league_sd) + pL

In [47]:
def ab_play_percentages(batting_percentages, pitching_percentages, league_percentages, pitbat_combo, function):
    """Estimate the probability of every PA outcome for one plate appearance.

    Parameters
    ----------
    batting_percentages : mapping
        Batter probabilities keyed by "b_<play>".
    pitching_percentages : mapping
        Pitcher probabilities keyed by "p_<play>".
    league_percentages : mapping
        League probabilities indexed as ``league_percentages[pitbat_combo][play]``.
    pitbat_combo : str
        Handedness combination used to pick the league probabilities.
    function : str
        "morey z" / "Morey Z" or "log5" / "Log5". Any other value triggers an
        interactive prompt until an accepted name is entered.

    Returns
    -------
    dict
        Maps each entry of ``plays`` to its predicted probability.
    """
    accepted_functions = ["morey z", "Morey Z", "log5", "Log5"]

    # Validate once, before any work. The original check referenced a misspelled
    # name (`funtion`) and raised NameError instead of prompting.
    while function not in accepted_functions:
        function = input("Acceptable Functions are Morey Z and Log5. Please input one.")

    combine = morey_z if function in ("morey z", "Morey Z") else log5

    ab_percentages = {}
    for play in plays:
        # Look up the three inputs for this play type and combine them
        ab_percentages[play] = combine(
            batting_percentages["b_" + play],
            pitching_percentages["p_" + play],
            league_percentages[pitbat_combo][play],
        )

    return ab_percentages
        

# Build Data Sets for Odds Functions

In [48]:
# Raw plays, indexed as all_plays_by_hand_combo[year][pitbat_combo] by the functions below.
# NOTE: unpickling executes arbitrary code -- only load pickle files you produced yourself.
# The context manager closes the file handle, which the bare open() call never did.
with open("/users/jaredzirkes/Desktop/Python/MLB BETTING/all_plays_by_hand_combo.pkl", "rb") as plays_file:
    all_plays_by_hand_combo = pkl.load(plays_file)

In [144]:
# Combine our first three years of data (maintaining hand combo separation) to be the full initial data
def combine_raw_pitches(training_years):
    """Stack the raw plays of several seasons per hand combo and attach the ballpark.

    For every year in ``training_years`` and every entry of ``hand_combos`` the
    plays in ``all_plays_by_hand_combo[year][combo]`` are sorted by ``game_date``,
    given a ``type_counter`` column of 1 and a ``ballpark`` column looked up in
    ``ballpark_info``. The ballpark is the first row whose ``Team`` equals the
    play's ``home_team`` and whose ``End Date`` is greater than the game's year.

    Parameters
    ----------
    training_years : iterable of str
        Keys of ``all_plays_by_hand_combo`` to combine, in order.

    Returns
    -------
    dict
        Maps each hand combo to one DataFrame with a fresh 0..n-1 index
        (an empty DataFrame when ``training_years`` is empty).

    Raises
    ------
    IndexError
        If no ``ballpark_info`` row matches a play's home team and year.
    """
    stadium_by_team_year = {}

    def lookup_stadium(home_team, game_date):
        # Cache per (team, year): the lookup table is tiny, the play log is not.
        key = (home_team, int(game_date.split("-")[0]))
        if key not in stadium_by_team_year:
            candidates = ballpark_info[(ballpark_info.Team == key[0]) & (ballpark_info["End Date"] > key[1])]
            stadium_by_team_year[key] = candidates.Stadium.iloc[0]
        return stadium_by_team_year[key]

    season_frames = {combo: [] for combo in hand_combos}
    for year in training_years:
        for pitbat_combo in hand_combos:
            print("Combining Pitch Data Across Years and Attatching Ballpark Info", year, pitbat_combo)

            # sort_values returns a new frame, so the raw data is left untouched
            season_df = all_plays_by_hand_combo[year][pitbat_combo].sort_values(by="game_date", ascending=True)
            season_df["type_counter"] = 1
            # Derive the ballpark once per season frame. It used to be recomputed
            # over the whole accumulated frame for every additional year.
            season_df["ballpark"] = [
                lookup_stadium(home_team, game_date)
                for home_team, game_date in zip(season_df.home_team, season_df.game_date)
            ]

            season_frames[pitbat_combo].append(season_df)
            clear_output(wait=True)

    # A single concat per combo replaces the repeated DataFrame.append calls
    # (append was removed in pandas 2.0 and copied the frame on every call).
    all_training_data = {}
    for pitbat_combo in hand_combos:
        frames = season_frames[pitbat_combo]
        if frames:
            all_training_data[pitbat_combo] = pd.concat(frames).reset_index(drop=True)
        else:
            all_training_data[pitbat_combo] = pd.DataFrame()

    clear_output(wait=False)

    return all_training_data

In [9]:
# For each game, calculate within the game (and pitbat_combo), the share of the plays that were each play type
def calculate_game_play_shares(all_training_data):
    """Add a ``game_play_share`` column to every hand combo's play log.

    For each play, ``game_play_share`` is the fraction of plays in the same game
    (same ``game_pk``, within the same hand combo) that share its ``play_type``.

    Parameters
    ----------
    all_training_data : dict
        Maps each entry of ``hand_combos`` to a DataFrame with ``game_pk`` and
        ``play_type`` columns. The frames are modified in place.

    Returns
    -------
    dict
        The same ``all_training_data`` object, with the new column added.
    """
    # Two grouped counts give the same numbers as the earlier per-game filtering
    # and per-row lookup, in one pass and without relying on `type_counter`.
    for pitbat_combo in hand_combos:
        print("Inserting Data From Game by Game Probability Vectors To the Larger Data Set. There are {} Pitbat Combos Remaining".format(len(hand_combos) - hand_combos.index(pitbat_combo)))
        clear_output(wait = True)

        combo_df = all_training_data[pitbat_combo]

        # Number of plays of this play type in this game ...
        same_type_in_game = combo_df.groupby(["game_pk", "play_type"])["game_pk"].transform("count")
        # ... divided by the total number of plays in this game
        plays_in_game = combo_df.groupby("game_pk")["game_pk"].transform("count")

        combo_df["game_play_share"] = same_type_in_game / plays_in_game

    clear_output(wait = False)

    return all_training_data
    

In [10]:
# # For every play, insert the % of all plays in the game it occured in that were of the same play type into all_training from game_play_shares df
# for pitbat_combo in hand_combos:
#     print("Inserting Data From Game by Game Probability Vectors To the Larger Data Set. There are {} Pitbat Combos Remaining".format(len(hand_combos) - hand_combos.index(pitbat_combo)))
#     clear_output(wait = True)
    
#     all_training_data[pitbat_combo]["game_play_share"] = all_training_data[pitbat_combo].apply(lambda x: game_play_shares[pitbat_combo]["games"][x.game_pk].loc[x.play_type].play_share, axis = 1)
#     #all_training_data[pitbat_combo]["batter_play_share"] = all_training_data[pitbat_combo].apply(lambda x: game_play_shares[pitbat_combo]["players"][x.batter].loc[x.play_type].play_share, axis = 1)
    
# # # Now that we have the MLB eod % of plays by play type for every day and the % of plays that are each play in every game,
# # # calculate/insert the difference between the individual game and the MLB eod values for every play
# # for pitbat_combo in hand_combos:
# #     all_training_data[pitbat_combo]["game_share_delta"] = all_training_data[pitbat_combo].game_play_share / all_training_data[pitbat_combo].eod_play_share
# #     #all_training_data[pitbat_combo]["batter_share_delta"] = all_training_data[pitbat_combo].batter_play_share - all_training_data[pitbat_combo].eod_play_share

## Calculating Batting Stats Factors!

#### Cleaning for Weather Regression

In [145]:
def clean_for_weather_regression(all_training_data):
    """Build the per-game, per-play-type data set used by the weather regression.

    Steps, per hand combo:
      1. Drop the games that are among the first 300 ``game_pk`` values of any
         training season (to let the rolling stats normalize).
      2. Keep one row per (game, play type) with the game's weather and the
         play type's ``game_play_share``.
      3. Add a row with ``game_play_share`` 0 for every tracked play type that
         did not occur in a game.
      4. Add ``temprature_squared`` and the per-direction wind-speed columns.

    Parameters
    ----------
    all_training_data : dict
        Maps each hand combo to a play-level DataFrame that already has a
        ``game_play_share`` column.

    Returns
    -------
    tuple
        ``(weather_training_data, first_games_list)``: the dict of regression
        frames, and the array of excluded ``game_pk`` values.
    """
    weather_columns = ["game_pk", "game_date", "play_type", "temprature", "wind_speed", "wind_direction", "game_play_share"]
    # Play types that get a zero-share row when absent from a game.
    # NOTE(review): this is a subset of the module-level `plays` list -- confirm that is intended.
    play_types = ['out', 'single', 'strikeout', 'double', 'walk', 'home_run', 'triple']

    weather_training_data = {x: {} for x in hand_combos}
    # Accumulates across hand combos, so later combos exclude the early games
    # found for earlier combos as well (unchanged behaviour).
    first_games = []

    for pitbat_combo in hand_combos:
        for year in training_years:
            # Find the game_ids for the first 300 games of each season
            first_game_pks = all_plays_by_hand_combo[year][pitbat_combo].game_pk.unique()[:300]
            first_games.append(list(first_game_pks))

        first_games_list = np.concatenate(first_games).ravel()

        combo_df = all_training_data[pitbat_combo]
        combo_df = combo_df[~combo_df.game_pk.isin(first_games_list)]

        # Group by game and play type to get one game_play_share row for each play type in each game
        weather_training_data[pitbat_combo] = combo_df[weather_columns].groupby(by=["game_pk", "play_type"]).last().reset_index()

    # Only play types that happened in a game are present so far; add the missing
    # tracked play types for each game with a game_play_share of 0.
    total_games = sum(len(weather_training_data[x].game_pk.unique()) for x in hand_combos)
    n = 0
    for pitbat_combo in hand_combos:
        combo_df = weather_training_data[pitbat_combo]
        filler_rows = []

        for game, game_rows in combo_df.groupby("game_pk", sort=False):
            n += 1
            if n % 500 == 0:
                print("Filling in the Missing Values for Probability Vectors. There are {}K Instances Remaining".format(round((total_games - n) / 1000)))
                clear_output(wait=True)

            # Test for missing play types directly. The old `len(df) < len(play_types)`
            # guard skipped games that had enough *other* play types to reach the count.
            present = set(game_rows.play_type.values)
            first_row = game_rows.iloc[0]
            for play in play_types:
                if play not in present:
                    filler_rows.append({
                        "game_pk": game,
                        "game_date": first_row.game_date,
                        "play_type": play,
                        "temprature": first_row.temprature,
                        "wind_speed": first_row.wind_speed,
                        "wind_direction": first_row.wind_direction,
                        "game_play_share": 0,
                    })

        # Append all filler rows at once instead of growing the frame row by row
        if filler_rows:
            weather_training_data[pitbat_combo] = pd.concat([combo_df, pd.DataFrame(filler_rows)], ignore_index=True)
    clear_output(wait=False)

    for pitbat_combo in hand_combos:
        # Filter down to only the relevant columns for the weather regression
        regression_df = weather_training_data[pitbat_combo][["game_pk", "play_type", "temprature", "wind_speed", "wind_direction", "game_play_share"]].copy()

        # Square temprature to use in the regression because I believe it behaves this way
        regression_df["temprature_squared"] = regression_df["temprature"] ** 2

        # Encode the wind directions and calculate final wind speeds in the direction
        weather_training_data[pitbat_combo] = convert_wind_direction(regression_df, regression_df.wind_direction)

    return (weather_training_data, first_games_list)

#### Weather Regression

In [146]:
def weather_regress(weather_training_data):
    """Fit one linear weather model per hand combo and play type.

    For every play type in a hand combo's frame, ``game_play_share`` is
    regressed on squared temperature and the four directional wind speeds.
    Rows whose ``game_play_share`` z-score is 3 or more in absolute value are
    dropped first.

    Parameters
    ----------
    weather_training_data : dict
        Maps each hand combo to a frame with ``play_type``, ``game_play_share``,
        ``temprature_squared`` and the "Left to Right", "Right to Left", "in"
        and "out" wind columns.

    Returns
    -------
    tuple
        ``(weather_training_data, weather_coefficients)`` where
        ``weather_coefficients[combo][play_type]`` is a dict with the keys
        "intercept", "temprature_sq", "wind_ltr", "wind_rtl", "wind_in" and
        "wind_out".

    Raises
    ------
    KeyError
        If one of the feature columns is missing from a frame.
    """
    # Select features by name so each coefficient is guaranteed to belong to the
    # column it is labelled with. Positional slicing mislabelled them whenever a
    # column was missing or reordered.
    feature_columns = ["temprature_squared", "Left to Right", "Right to Left", "in", "out"]
    coefficient_names = ["temprature_sq", "wind_ltr", "wind_rtl", "wind_in", "wind_out"]

    weather_coefficients = {}

    for pitbat_combo in hand_combos:
        combo_df = weather_training_data[pitbat_combo]
        weather_coefficients[pitbat_combo] = {}

        for play_type in combo_df.play_type.unique():
            PAs = combo_df[combo_df.play_type == play_type]

            # Remove game_play_share outliers, most of which are caused by low pitbat_combo sample sizes in games
            PAs = PAs[(np.abs(stats.zscore(PAs.game_play_share)) < 3)]

            # Regress game_play_share on squared temprature and directional wind speed
            lin_sq = LinearRegression(fit_intercept = True)
            lin_sq.fit(PAs[feature_columns], PAs.game_play_share)

            coefficients = {"intercept": lin_sq.intercept_}
            coefficients.update(zip(coefficient_names, lin_sq.coef_))
            weather_coefficients[pitbat_combo][play_type] = coefficients

    return (weather_training_data, weather_coefficients)


#### Calculating Park Factors

In [147]:
def calculate_park_factors(all_training_data):
    """Compute a park factor for every hand combo, ballpark and play type.

    The park factor is the rate of a play type at the ballpark divided by its
    rate at all other ballparks, within one hand combo. Ballparks and play
    types are taken from the "RR" frame.

    Parameters
    ----------
    all_training_data : dict
        Maps each hand combo to a play-level frame with ``ballpark`` and
        ``play_type`` columns.

    Returns
    -------
    tuple
        ``(all_training_data, park_factors_dict)`` where
        ``park_factors_dict[combo][ballpark][play_type]`` is the factor as a
        float. It is NaN when it cannot be computed (no plays at the park, no
        plays elsewhere, or the play type never occurred elsewhere).
    """
    park_factors_dict = {}
    print("Calculating Ballpark Factors")

    ballparks = all_training_data["RR"].ballpark.unique()
    play_types = all_training_data["RR"].play_type.unique()

    for pitbat_combo in hand_combos:
        combo_df = all_training_data[pitbat_combo]
        park_factors_dict[pitbat_combo] = {}

        for ballpark in ballparks:
            at_park = combo_df.ballpark == ballpark
            at_park_total = int(at_park.sum())
            elsewhere_total = len(combo_df) - at_park_total

            # Count every play type once per park instead of filtering per play type
            at_park_counts = combo_df.play_type[at_park].value_counts()
            elsewhere_counts = combo_df.play_type[~at_park].value_counts()

            factors = {}
            for play_type in play_types:
                at_park_count = int(at_park_counts.get(play_type, 0))
                elsewhere_count = int(elsewhere_counts.get(play_type, 0))

                # Guard every zero denominator explicitly. The earlier try/except
                # assigned to a misspelled name (`part_factor`), so a failure
                # reused the previous play type's factor or raised NameError.
                if at_park_total == 0 or elsewhere_total == 0 or elsewhere_count == 0:
                    factors[play_type] = np.nan
                else:
                    at_park_rate = at_park_count / at_park_total
                    not_at_park_rate = elsewhere_count / elsewhere_total
                    factors[play_type] = at_park_rate / not_at_park_rate

            park_factors_dict[pitbat_combo][ballpark] = factors

    clear_output(wait=False)

    return (all_training_data, park_factors_dict)

## Adjusting Stats for Batting Factors

In [148]:
def neutralize_stats(all_training_data, weather_coefficients, park_factors_dict, first_games_list, is_dump):
    """Weight every play by the inverse of its weather and ballpark impact.

    For each play the fitted weather model gives the expected share of its play
    type under the game's actual weather and under neutral weather (72 degrees,
    no wind). Their ratio is the weather impact. It is multiplied by the park
    factor to give ``impact``, and ``play_value`` is ``1 / impact``.

    Parameters
    ----------
    all_training_data : dict
        Maps each hand combo to the play-level frame.
    weather_coefficients : dict
        ``weather_coefficients[combo][play_type]`` with the keys "intercept",
        "temprature_sq", "wind_ltr", "wind_rtl", "wind_in", "wind_out".
    park_factors_dict : dict
        ``park_factors_dict[combo][ballpark][play_type]``.
    first_games_list : array-like
        Currently unused; kept so existing calls keep working.
    is_dump : bool
        When True, pickle the result to the training-stats file.

    Returns
    -------
    dict
        Maps each hand combo to a frame of game/situation columns plus
        ``impact`` and ``play_value``.
    """
    print("Neutralizing Batting Stats using Weather/Stadium Coefficients")

    input_columns = ["game_pk", "game_date", "batter", "pitcher",'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', "bat_score", "fld_score", "play_type", "temprature", "wind_speed", "wind_direction", "ballpark"]
    output_columns = ["game_pk", "game_date","ballpark", "temprature", "wind_speed", "wind_direction", "batter", "pitcher", 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', "bat_score", "fld_score", "play_type","impact", "play_value"]

    # New dictionary to hold the edited training stats
    factored_training_stats = {}
    for pitbat_combo in hand_combos:
        combo_coefficients = weather_coefficients[pitbat_combo]
        combo_park_factors = park_factors_dict[pitbat_combo]

        def weather_expectation(row):
            # Expected share of this play type under the game's actual weather
            c = combo_coefficients[row.play_type]
            return (row["Left to Right"]*c["wind_ltr"] + row["Right to Left"]*c["wind_rtl"] +
                    row["in"]*c["wind_in"] + row["out"]*c["wind_out"] +
                    (row["temprature"]**2) * c["temprature_sq"] + c["intercept"])

        def neutral_weather_expectation(row):
            # Expected share at 72 degrees with no wind
            c = combo_coefficients[row.play_type]
            return 72**2 * c["temprature_sq"] + c["intercept"]

        # Grab the relevant columns (the first games of each season are removed later on)
        df = all_training_data[pitbat_combo][input_columns].copy()

        # Add information for the actual weather and stadium impacts for each game
        df = convert_wind_direction(df, df.wind_direction)
        df["weather_expectation"] = df.apply(weather_expectation, axis=1)
        df["neutral_weather_expectation"] = df.apply(neutral_weather_expectation, axis=1)
        df["weather_impact"] = df.weather_expectation/df.neutral_weather_expectation
        df["stadium_impact"] = df.apply(lambda x: combo_park_factors[x.ballpark][x.play_type], axis=1)

        # Multiply the weather and stadium impacts to get the total impact for the specific at-bat result
        df["impact"] = df.weather_impact * df.stadium_impact
        df["play_value"] = 1/df.impact

        factored_training_stats[pitbat_combo] = df[output_columns]

    # Dump once, after every hand combo is done, and close the file. The dump used
    # to run inside the loop, rewriting a partial dict each time and leaking handles.
    if is_dump == True:
        with open("/Users/jaredzirkes/Documents/GitHub/MLB-Simulation/training_batting_stats_with_factors.pkl", "wb") as dump_file:
            pkl.dump(factored_training_stats, dump_file)

    clear_output(wait=False)

    return factored_training_stats

#### Calculate League Averages Over the Length of Training

In [149]:
def calculate_league_averages(game_play_share_data):
    """Return the league-wide share of each play type for every hand combo.

    The result is indexed as ``result[pitbat_combo][play]`` and holds the
    number of rows with that ``play_type`` divided by the total number of rows
    in the hand combo's frame.
    """
    def share_of(frame, play):
        return len(frame[frame.play_type == play])/len(frame)

    return {
        pitbat_combo: {
            play: share_of(game_play_share_data[pitbat_combo], play)
            for play in plays
        }
        for pitbat_combo in hand_combos
    }

## Roll Stats Daily To Get Final Odds Functions Training Data Sets

### Rolling with Breaks for Years

In [150]:
def _roll_player_stats(player_df, pitbat_combo, with_year_breaks, rolling_period, min_periods):
    """Build normalized rolling outcome shares for one player's plate appearances.

    ``player_df`` holds one player's plays in date order. For every entry of
    ``plays`` a "season_<play>" and a "month_<play>" column is produced: the
    rolling sum of ``play_value`` over plays of that type divided by the rolling
    number of plate appearances, then rescaled so the season (and month) shares
    of a row sum to 1. Rows whose shares sum to 0 or are undefined get all zeros.
    """
    id_columns = ["game_pk", "game_date", "ballpark","temprature", "wind_speed", "wind_direction", "batter", "pitcher", "pitbat",'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', "bat_score", "fld_score"]
    season_columns = ["season_{}".format(play) for play in plays]
    month_columns = ["month_{}".format(play) for play in plays]

    # With year breaks the season window covers the player's whole season;
    # otherwise it is the caller's rolling period. The month window is 100 PAs.
    season_window = len(player_df) if with_year_breaks == True else rolling_period
    month_window = 100

    # Work on a copy so the caller's frame (a slice of the season frame) is untouched
    player_df = player_df.copy()
    player_df["pitbat"] = pitbat_combo

    # Rolling count of plate appearances in each window
    one_per_pa = pd.Series(1, index=player_df.index)
    season_pa = one_per_pa.rolling(season_window, min_periods = min_periods).sum()
    month_pa = one_per_pa.rolling(month_window, min_periods = min_periods).sum()

    for play in plays:
        # play_value on rows of this play type, 0 elsewhere
        weighted = player_df["play_value"].where(player_df["play_type"] == play, 0)
        player_df["season_{}".format(play)] = weighted.rolling(season_window, min_periods = min_periods).sum()/season_pa
        player_df["month_{}".format(play)] = weighted.rolling(month_window, min_periods = min_periods).sum()/month_pa

    rolled = player_df[id_columns + season_columns + month_columns].copy()

    # Repercentage the factored stats so each row's shares sum to 1
    for share_columns in (season_columns, month_columns):
        row_totals = rolled[share_columns].sum(axis=1)
        shares = rolled[share_columns].div(row_totals, axis=0)
        shares.loc[~(row_totals > 0), :] = 0
        rolled[share_columns] = shares

    return rolled


def roll_factored_batting_stats(factored_batting_stats, with_year_breaks, rolling_period, min_periods, is_dump):
    """Create rolling outcome shares for every batter and pitcher.

    Parameters
    ----------
    factored_batting_stats : dict
        Maps each hand combo to the frame produced by ``neutralize_stats``.
        A ``year`` column is added to each frame in place.
    with_year_breaks : bool
        When True, players are rolled separately per season and the season
        window spans the player's whole season; otherwise all years are rolled
        together with a season window of ``rolling_period``.
    rolling_period : int
        Season window, in plate appearances, when ``with_year_breaks`` is not True.
    min_periods : int
        Minimum observations in a window before a value is produced.
    is_dump : bool
        When True, pickle both result dictionaries to the working directory.

    Returns
    -------
    dict
        ``{"pitching_stats": ..., "batting_stats": ...}``, each indexed as
        ``[pitbat_combo][year][player]`` and holding that player's rolled frame.
    """
    rolling_factored_batting_stats = {}
    rolling_factored_pitching_stats = {}

    for pitbat_combo in hand_combos:
        if with_year_breaks == True:
            factored_batting_stats[pitbat_combo]["year"] = factored_batting_stats[pitbat_combo].game_date.apply(lambda x: x.split("-")[0])
        else:
            factored_batting_stats[pitbat_combo]["year"] = factored_batting_stats[pitbat_combo].game_date.apply(lambda x: "All Years")

    for pitbat_combo in hand_combos:
        combos_remaining = len(hand_combos) - hand_combos.index(pitbat_combo) - 1
        # Counts the current combo plus those still to come (the stray "+ 4" is gone)
        print("Rolling Batting Stats Daily. There are {} Hand Combos Left".format(combos_remaining + 1))
        clear_output(wait=True)

        combo_df = factored_batting_stats[pitbat_combo]
        rolling_factored_batting_stats[pitbat_combo] = {}
        rolling_factored_pitching_stats[pitbat_combo] = {}

        for year in combo_df.year.unique():
            rolling_factored_batting_stats[pitbat_combo][year] = {}
            rolling_factored_pitching_stats[pitbat_combo][year] = {}

            # Filter down to the stats for just the relevant year, in date order
            df = combo_df[combo_df.year == str(year)].sort_values(by = "game_date", ascending = True)

            # Build rolling stats for batters
            batters = df.batter.unique()
            for b, batter in enumerate(batters):
                clear_output(wait = True)

                rolling_factored_batting_stats[pitbat_combo][year][batter] = _roll_player_stats(
                    df[df.batter == batter], pitbat_combo, with_year_breaks, rolling_period, min_periods)

                if b%10 == 0:
                    if with_year_breaks == True:
                        print("Rolling Batting Stats. There are {} batters remaining in {}, with {} years remaining. Then {} Hand Combos Remaining".format(len(batters) - b, year, len(training_years) - training_years.index(year)-1, combos_remaining))
                    else:
                        print("Rolling Batting Stats. There are {} batters remaining. Then {} Hand Combos Remaining".format(len(batters) - b, combos_remaining))

            # Build the rolling stats for pitchers. They go through the same helper
            # as batters, so the PA counts land on the pitcher's own frame and the
            # month window is 100 PAs in both modes (the old pitcher branch
            # assigned the counts to the last batter's frame).
            pitchers = df.pitcher.unique()
            for p, pitcher in enumerate(pitchers):
                clear_output(wait=True)

                rolling_factored_pitching_stats[pitbat_combo][year][pitcher] = _roll_player_stats(
                    df[df.pitcher == pitcher], pitbat_combo, with_year_breaks, rolling_period, min_periods)

                if p%5 == 0:
                    if with_year_breaks == True:
                        print("Rolling Pitching Stats. There are {} pitchers remaining in {}, with {} years remaining. Then {} Hand Combos Remaining".format(len(pitchers) - p, year, len(training_years) - training_years.index(year)-1, combos_remaining))
                    else:
                        # Uses the pitcher counter (this branch used the batter counter before)
                        print("Rolling Pitching Stats. There are {} pitchers remaining. Then {} Hand Combos Remaining".format(len(pitchers) - p, combos_remaining))

    clear_output(wait=False)

    if is_dump == True:
        # Context managers make sure both files are flushed and closed
        with open("rolling_factored_pitching_stats.pkl","wb") as pitching_file:
            pkl.dump(rolling_factored_pitching_stats, pitching_file)
        with open("rolling_factored_batting_stats.pkl","wb") as batting_file:
            pkl.dump(rolling_factored_batting_stats, batting_file)

    return {"pitching_stats":rolling_factored_pitching_stats, "batting_stats":rolling_factored_batting_stats}

In [151]:
# # Create a rolling percentage for each play outcome for each batter and pitcher for each year 
# rolling_factored_batting_stats = {}
# rolling_factored_pitching_stats = {}

# for pitbat_combo in hand_combos:
#     factored_batting_stats[pitbat_combo]["year"] = factored_batting_stats[pitbat_combo].game_date.apply(lambda x: x.split("-")[0])

# for pitbat_combo in hand_combos:
#     print("Rolling Batting Stats Daily. There are {} Hand Combos Left".format(len(hand_combos) - (hand_combos.index(pitbat_combo)) + 4))
#     clear_output(wait=True)
    
    
#     rolling_factored_batting_stats[pitbat_combo] = {}
#     rolling_factored_pitching_stats[pitbat_combo] = {}
#     for year in training_years:
#         rolling_factored_batting_stats[pitbat_combo][year] = {}
#         rolling_factored_pitching_stats[pitbat_combo][year] = {}
        
#         # Filter down to the stats for just the relevant year
#         df = factored_batting_stats[pitbat_combo][factored_batting_stats[pitbat_combo].year == str(year)]
        
#         # Build rolling stats for pitchers
#         b = 0
#         for batter in df.batter.unique():
#             clear_output(wait = True)
            
            
#             batter_df = df[df.batter == batter]
#             batter_df["at_bat_num"] = 1
            
#             # Make a rolling count for the at bats for each batter
#             batter_df["at_bat_num"] = batter_df.at_bat_num.rolling(len(batter_df), min_periods = 1).sum()
            
#             for play in plays:
#                 # Multiply the situation impact by a binary vector for play outcomes with a 1 for the correct play
#                 batter_df[play] = batter_df.apply(lambda x: 1*x.play_value if x.play_type==play else 0, axis = 1)
#                 batter_df[play] = batter_df[play].rolling(len(batter_df), min_periods = 1).sum()
#                 batter_df[play] = batter_df[play]/batter_df.at_bat_num
#                 batter_df["pitbat"] = pitbat_combo
#             rolling_factored_batting_stats[pitbat_combo][year][batter] = batter_df[["game_pk", "game_date", "batter", "pitcher", "pitbat", "at_bat_num", "out", "strikeout", "walk", "single", "double", "triple", "home_run"]]
                        
#             if b%5 ==0:
#                 print("Rolling Batting Stats. There are {} batters remaining in {}, with {} years remaining".format(len(df.batter.unique()) - b, year, len(training_years) - training_years.index(year)-1))
#             b+=1

#             # Repercentage factored batting stats to sum to 1
#             rolling_factored_batting_stats[pitbat_combo][year][batter][plays] = rolling_factored_batting_stats[pitbat_combo][year][batter].apply(lambda x: pd.Series([x[plays][p]/x[plays].sum() for p in plays]), axis=1)
        
        
#         # Build the rolling stats for pitchers        
#         p=0
#         for pitcher in df.pitcher.unique():
#             clear_output(wait=True)           
#             pitcher_df = df[df.pitcher == pitcher]
#             pitcher_df["at_bat_num"] = 1
#             pitcher_df["at_bat_num"] = pitcher_df.at_bat_num.rolling(len(pitcher_df), min_periods = 1).sum()
            
#             for play in plays:
#                 # Multiply the situation impact by a binary vector for play outcomes with a 1 for the correct play
#                 pitcher_df[play] = pitcher_df.apply(lambda x: 1*x.play_value if x.play_type==play else 0, axis = 1)
#                 pitcher_df[play] = pitcher_df[play].rolling(len(pitcher_df), min_periods = 1).sum()
#                 pitcher_df[play] = pitcher_df[play]/pitcher_df.at_bat_num
#                 pitcher_df["pitbat"] = pitbat_combo

#             rolling_factored_pitching_stats[pitbat_combo][year][pitcher] = pitcher_df[["game_pk", "game_date", "batter", "pitcher", "pitbat", "at_bat_num", "out", "strikeout", "walk", "single", "double", "triple", "home_run"]]

#             if p%5 ==0:
#                 print("Rolling Pitching Stats. There are {} batters remaining in {}, with {} years remaining".format(len(df.batter.unique()) - p, year, len(training_years) - training_years.index(year)-1))
#             p+=1
            
#             # Repercentage factored pitching stats to sum to 1
#             rolling_factored_pitching_stats[pitbat_combo][year][pitcher][plays] = rolling_factored_pitching_stats[pitbat_combo][year][pitcher].apply(lambda x: pd.Series([x[plays][p]/x[plays].sum() for p in plays]), axis=1)
            
# clear_output(wait=False)

In [166]:
# rolling_factored_pitching_stats = pkl.load(open("rolling_factored_pitching_stats.pkl","rb"))
# Reload the rolled batting stats from the pickle that roll_factored_batting_stats writes when is_dump is True.
# NOTE: unpickling executes arbitrary code -- only load files you produced yourself.
# The context manager closes the file handle, which the bare open() call never did.
with open("rolling_factored_batting_stats.pkl","rb") as batting_stats_file:
    rolling_factored_batting_stats = pkl.load(batting_stats_file)

In [153]:
# Pull all the rolled individual player DFs out of the dictionary and into a large DF 
# that will be used for final training
def stitch_individual_stats(rolling_factored_stats, with_year_breaks):
    """Flatten the nested per-player rolling-stat frames into two large DataFrames.

    Parameters
    ----------
    rolling_factored_stats : dict
        Nested mapping of the form
        ``{"batting_stats" | "pitching_stats": {pitbat_combo: {year: {player: DataFrame}}}}``.
        Every player frame needs a ``game_date`` column holding "-"-separated
        date strings.
    with_year_breaks : bool
        When truthy, the added ``year`` column holds the text before the first
        "-" of ``game_date``; otherwise every row is labelled ``"All Years"``.

    Returns
    -------
    dict
        ``{"pitching_stats": DataFrame, "batting_stats": DataFrame}`` where each
        frame is the row-wise concatenation of the individual player frames
        (original index labels are kept).
    """
    # Collect the frames and concatenate once at the end: growing a DataFrame
    # with ``append`` inside a loop copies it every time (quadratic) and
    # ``DataFrame.append`` no longer exists in pandas 2.x.
    batter_frames = []
    pitcher_frames = []

    for combo_position, pitbat_combo in enumerate(hand_combos):
        batting_years = list(rolling_factored_stats["batting_stats"][pitbat_combo])
        for year_position, year in enumerate(batting_years):
            print("Stitching Together Individual Stats Into the Odds Functions Data Set. There are {} Pitbats Left and {} Years Remaining in the Current Pitbat".format(len(hand_combos) - combo_position, len(batting_years) - year_position - 1))
            clear_output(wait=True)

            for batter in rolling_factored_stats["batting_stats"][pitbat_combo][year]:
                # One frame per unique (pitbat combo, year, batter).
                df_b = rolling_factored_stats["batting_stats"][pitbat_combo][year][batter]

                # Looking up stats dated before a player's first game would fail,
                # and stats built on ~1 game are not useful anyway, so the first
                # 3 PAs of every batter are cut off.
                if len(df_b) > 3:
                    batter_frames.append(df_b.iloc[3:])

            # Pitchers keep their first 3 PAs: they are only ever joined onto the
            # batter rows, so the unmatched PAs fall away at that point.
            for pitcher in rolling_factored_stats["pitching_stats"][pitbat_combo][year]:
                pitcher_frames.append(rolling_factored_stats["pitching_stats"][pitbat_combo][year][pitcher])

        clear_output(wait=False)

    # ``pd.concat`` rejects an empty list, so fall back to an empty frame.
    training_stats = pd.concat(batter_frames, sort=False) if batter_frames else pd.DataFrame()
    pitching_holder = pd.concat(pitcher_frames, sort=False) if pitcher_frames else pd.DataFrame()

    if with_year_breaks:
        training_stats["year"] = training_stats.game_date.apply(lambda game_date: game_date.split("-")[0])
        pitching_holder["year"] = pitching_holder.game_date.apply(lambda game_date: game_date.split("-")[0])
    else:
        training_stats["year"] = "All Years"
        pitching_holder["year"] = "All Years"

    clear_output(wait=False)

    return {"pitching_stats": pitching_holder, "batting_stats": training_stats}

In [154]:
def _latest_prior_stat(player_frames, pa, player, column):
    """Return ``column`` from the player's last row dated strictly before ``pa.game_date``, or None if there is none."""
    player_df = player_frames[pa.pitbat][pa.year][player]
    earlier = player_df[player_df.game_date < pa.game_date]
    return earlier.iloc[-1][column] if len(earlier) > 0 else None


def _game_weather(stats_with_weather, pa, weather_columns):
    """Return the weather columns of the first row matching ``pa.game_pk``, or an all-None Series when the game is absent."""
    pitbat_games = stats_with_weather[pa.pitbat]
    same_game = pitbat_games[pitbat_games.game_pk == pa.game_pk]
    if len(same_game) > 0:
        return same_game.iloc[0][weather_columns]
    return pd.Series({column: None for column in weather_columns})


def _play_shares(window_df):
    """Return ``{play: share of rows in window_df with that play}``; every share is None for an empty window."""
    total = len(window_df)
    if total == 0:
        return {play: None for play in plays}
    counts = window_df.play.value_counts()
    return {play: counts.get(play, 0) / total for play in plays}


def finalize_odds_dataset(stitched_dataset, rolling_factored_stats, factored_batting_stats, stats_with_weather,
                          season_window=504, month_window=100):
    """Attach player, weather and league-average features to the stitched batting rows.

    For every row of ``stitched_dataset["batting_stats"]`` this adds:

    * ``b_season_<play>`` / ``b_month_<play>``: the batter's ``season_<play>`` /
      ``month_<play>`` value from their most recent row dated strictly before
      the row's ``game_date`` (None when no earlier row exists);
    * ``p_season_<play>`` / ``p_month_<play>``: the same for the pitcher;
    * ``play``: the ``play_type`` found in ``factored_batting_stats`` under the
      row's pitbat combo and index label;
    * the weather columns of the row's game, taken from ``stats_with_weather``;
    * ``is_on_base``: 1 when ``play`` is listed in ``on_bases``, else 0;
    * ``season_league_average_<play>`` / ``month_league_average_<play>``: the
      share of each play among the last ``season_window`` / ``month_window``
      rows of the same pitbat combo dated strictly before the row's date.

    Parameters
    ----------
    stitched_dataset : dict
        Must contain a ``"batting_stats"`` DataFrame; it is modified in place.
    rolling_factored_stats : dict
        ``{"batting_stats" | "pitching_stats": {pitbat: {year: {player: DataFrame}}}}``.
    factored_batting_stats : mapping
        ``{pitbat: DataFrame}`` with a ``play_type`` column, addressable by the
        index labels of the stitched batting rows.
    stats_with_weather : mapping
        ``{pitbat: DataFrame}`` with ``game_pk`` and the weather columns.
    season_window, month_window : int, optional
        Number of prior rows used for the league averages (defaults 504 / 100).

    Returns
    -------
    dict
        The same ``stitched_dataset`` object that was passed in.
    """
    batting = stitched_dataset["batting_stats"]
    batter_frames = rolling_factored_stats["batting_stats"]
    pitcher_frames = rolling_factored_stats["pitching_stats"]

    for plays_done, play in enumerate(plays):
        plays_remaining = len(plays) - plays_done
        season_column = "season_{}".format(play)
        month_column = "month_{}".format(play)

        print("Attaching the Batter Probability Vectors to the Odds Functions Data Set. There are {} Plays Remaining".format(plays_remaining))
        batting["b_season_" + play] = batting.apply(lambda x: _latest_prior_stat(batter_frames, x, x.batter, season_column), axis=1)
        batting["b_month_" + play] = batting.apply(lambda x: _latest_prior_stat(batter_frames, x, x.batter, month_column), axis=1)
        clear_output(wait=True)

        print("Attaching the Pitcher Probability Vectors to the Odds Functions Data Set. There are {} Plays Remaining".format(plays_remaining))
        batting["p_season_" + play] = batting.apply(lambda x: _latest_prior_stat(pitcher_frames, x, x.pitcher, season_column), axis=1)
        batting["p_month_" + play] = batting.apply(lambda x: _latest_prior_stat(pitcher_frames, x, x.pitcher, month_column), axis=1)
        clear_output(wait=True)

    # Add the play that actually happened, for comparison against the prediction vector.
    batting["play"] = batting.apply(lambda x: factored_batting_stats[x.pitbat].loc[x.name].play_type, axis=1)

    # Attach the weather information of each row's game.
    print("Attatching Original Weather Information to Final Dataset")

    weather_columns = ["temprature_squared", "Left to Right", "Right to Left", "in", "out", "zero"]
    batting[weather_columns] = batting.apply(lambda x: _game_weather(stats_with_weather, x, weather_columns), axis=1)
    batting["is_on_base"] = batting.play.apply(lambda x: 1 if x in on_bases else 0)

    # Attach rolling league averages per pitbat combo and game date.
    print("Attatching League Average Information")
    league_averages = {}
    for pitbat_combo in hand_combos:
        league_averages[pitbat_combo] = {}
        # The stitched rows are concatenated player by player, so they are not in
        # date order. Sort (stably) by date first so that the tail of the
        # "earlier" rows really is the most recent window of plate appearances.
        pitbat_df = batting[batting.pitbat == pitbat_combo].sort_values("game_date", kind="mergesort")
        for date in pitbat_df.game_date.unique():
            earlier = pitbat_df[pitbat_df.game_date < date]
            # The month entry previously reused the season value; each window
            # now gets its own averages.
            league_averages[pitbat_combo][date] = {
                "season": _play_shares(earlier.tail(season_window)),
                "month": _play_shares(earlier.tail(month_window)),
            }

    for play in plays:
        batting["season_league_average_{}".format(play)] = batting.apply(lambda x: league_averages[x.pitbat][x.game_date]["season"][play], axis=1)
        batting["month_league_average_{}".format(play)] = batting.apply(lambda x: league_averages[x.pitbat][x.game_date]["month"][play], axis=1)

    clear_output(wait=False)

    return stitched_dataset

In [155]:
def build_odds_dataset(training_years, pickle, rolling_period):
    """Run the whole preparation chain and return the odds dataset with league averages.

    Parameters
    ----------
    training_years
        Passed unchanged to ``combine_raw_pitches``.
    pickle
        Flag forwarded to ``neutralize_stats`` and ``roll_factored_batting_stats``
        (presumably controls caching of intermediates to disk; confirm against
        those helpers).
    rolling_period
        Forwarded to ``roll_factored_batting_stats``.

    Returns
    -------
    dict
        ``{"odds_dataset": <"batting_stats" frame from finalize_odds_dataset>,
        "league_averages": <result of calculate_league_averages>}``
    """
    # The call order below is kept exactly as before: the helpers all receive the
    # same play-share frame and may depend on each other's side effects.
    raw_pitches = combine_raw_pitches(training_years)
    play_shares = calculate_game_play_shares(raw_pitches)
    league_averages = calculate_league_averages(play_shares)

    # Element 0 is the frame used for the regression, element 1 the first-games list.
    weather_inputs = clean_for_weather_regression(play_shares)
    weather_frame = weather_inputs[0]
    first_games = weather_inputs[1]

    weather_coefficients = weather_regress(weather_frame)[1]
    park_factors = calculate_park_factors(play_shares)[1]

    neutralized = neutralize_stats(play_shares, weather_coefficients, park_factors, first_games, pickle)
    per_player_rolled = roll_factored_batting_stats(neutralized, False, rolling_period, 0, pickle)
    stitched = stitch_individual_stats(per_player_rolled, False)
    finalized = finalize_odds_dataset(stitched, per_player_rolled, neutralized, weather_frame)

    return {"odds_dataset": finalized["batting_stats"], "league_averages": league_averages}

In [123]:
# training_years, pickle, rolling_period = ["2012"], False, 504

# all_training_data = combine_raw_pitches(training_years)

# game_play_share_data = calculate_game_play_shares(all_training_data)
# league_averages = calculate_league_averages(game_play_share_data)

# data_for_weather_regression_set = clean_for_weather_regression(game_play_share_data)
# data_for_weather_regression = data_for_weather_regression_set[0]

# first_games_list = data_for_weather_regression_set[1]

# weather_regression_set = weather_regress(data_for_weather_regression)
# weather_coefficients = weather_regression_set[1]

# park_factors = calculate_park_factors(game_play_share_data)[1]

# neutralized_stats = neutralize_stats(game_play_share_data, weather_coefficients, park_factors, first_games_list, pickle)

# individual_rolled_factored_stats = roll_factored_batting_stats(neutralized_stats, False, 5, 0, pickle)

# rolled_factored_stats = stitch_individual_stats(individual_rolled_factored_stats, False)

# final_odds_dataset = finalize_odds_dataset(rolled_factored_stats, individual_rolled_factored_stats, neutralized_stats, data_for_weather_regression)["batting_stats"]


In [168]:
# Build the full 2012-2014 dataset. The result is a dict with the keys
# "odds_dataset" and "league_averages", and it is cached to disk as-is.
odds_dataset = build_odds_dataset(["2012", "2013", "2014"], True, 504)
with open("dataset", "wb") as dataset_file:
    pkl.dump(odds_dataset, dataset_file)


In [170]:
# `odds_dataset` is still the dict returned by build_odds_dataset at this point,
# so the DataFrame lives under its "odds_dataset" key.
odds_dataset["odds_dataset"].columns

# Continued Cleaning/Pipeline for ML Datasets

In [178]:
# Reload the cached build and keep only the DataFrame part of the saved dict.
with open("dataset", "rb") as dataset_file:
    odds_dataset = pkl.load(dataset_file)["odds_dataset"]


In [179]:
# Turn the runner columns into 0/1 occupancy flags (missing value -> 0).
# The extra `!= 0` keeps the cell idempotent: re-running it on already-encoded
# columns leaves them unchanged instead of turning every flag into 1.
# NOTE(review): assumes a raw runner id is never 0 - confirm against the source data.
for col in ["on_3b", "on_2b", "on_1b"]:
    base_occupied = odds_dataset[col].notna() & (odds_dataset[col] != 0)
    odds_dataset[col] = base_occupied.astype(int)

# Encode the half inning as 1 for "Top" and 0 otherwise. Accepting an existing 1
# as well means a second run does not reset every row to 0.
for col in ["inning_topbot"]:
    odds_dataset[col] = odds_dataset[col].isin(["Top", 1]).astype(int)

In [180]:
#ml_full_df = odds_dataset["odds_dataset"][[col for col in odds_dataset["odds_dataset"].columns if col not in ["game_date","game_pk", "batter", "pitcher", "temprature", "wind_speed", "wind_direction", "out", "strikeout", "walk", "single", "double", "triple", "home_run", "year", "ballpark", "pitbat"]] + ["ballpark", "pitbat"]].dropna()
# Identifier, raw-weather and same-row rolling-stat columns are left out of the
# feature matrix; "ballpark" and "pitbat" are removed here only so they can be
# re-appended as the last two columns.
dropped_columns = (
    ["game_pk", "batter", "pitcher", "temprature", "wind_speed", "wind_direction", "year", "ballpark", "pitbat"]
    + ["season_{}".format(p) for p in plays]
    + ["month_{}".format(p) for p in plays]
)
kept_columns = [col for col in odds_dataset.columns if col not in dropped_columns] + ["ballpark", "pitbat"]
ml_full_df = odds_dataset[kept_columns].dropna()

# Keep only rows whose month (second "-"-separated field of game_date) is 5 or
# later, then discard the date itself.
game_month = ml_full_df.game_date.apply(lambda x: int(x.split("-")[1]))
ml_full_df = ml_full_df[game_month >= 5].reset_index(drop=True)
ml_full_df = ml_full_df.drop(columns=["game_date"])

# Split the two target vectors off before removing them from the features.
ml_full_y_play = ml_full_df.play
ml_full_y_on_base = ml_full_df.is_on_base
ml_full_df = ml_full_df.drop(columns=["play", "is_on_base"])

In [182]:
# Show which feature columns remain in the model input frame.
ml_full_df.columns

Index(['on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot',
       'bat_score', 'fld_score', 'b_season_strikeout', 'b_month_strikeout',
       'p_season_strikeout', 'p_month_strikeout', 'b_season_fly_out',
       'b_month_fly_out', 'p_season_fly_out', 'p_month_fly_out',
       'b_season_double', 'b_month_double', 'p_season_double',
       'p_month_double', 'b_season_out', 'b_month_out', 'p_season_out',
       'p_month_out', 'b_season_fielders_choice', 'b_month_fielders_choice',
       'p_season_fielders_choice', 'p_month_fielders_choice', 'b_season_error',
       'b_month_error', 'p_season_error', 'p_month_error', 'b_season_walk',
       'b_month_walk', 'p_season_walk', 'p_month_walk', 'b_season_home_run',
       'b_month_home_run', 'p_season_home_run', 'p_month_home_run',
       'b_season_single', 'b_month_single', 'p_season_single',
       'p_month_single', 'b_season_sacrifice', 'b_month_sacrifice',
       'p_season_sacrifice', 'p_month_sacrifice', 'b_season_double_p

In [174]:
# The two string-valued columns are one-hot encoded; every other column of
# ml_full_df is treated as numeric and standardised.
categorical_features = ["ballpark", "pitbat"]
numeric_features = [col for col in ml_full_df if col not in categorical_features]

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# the keyword is left as-is to match the version this notebook was run with.
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))]
)

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [175]:
# Wrap the column preprocessor in a pipeline and fit it on the feature frame.
ml_pipe = Pipeline(steps=[("preprocessor", preprocessor)])

# NOTE: `ml_full_df` is rebound to the transformed output here, so this cell
# cannot be re-run without first rebuilding the original DataFrame.
ml_full_df = ml_pipe.fit_transform(ml_full_df)

In [176]:
# Persist the transformed feature matrix and both target vectors. Each file is
# opened in a context manager so the handle is flushed and closed right away.
for dump_path, dump_payload in [
    ("ML X Dataset", ml_full_df),
    ("ML Y Dataset (Plays)", ml_full_y_play),
    ("ML Y Dataset (On Base)", ml_full_y_on_base),
]:
    with open(dump_path, "wb") as dump_file:
        pkl.dump(dump_payload, dump_file)