In [21]:
import pandas as pd
import numpy
import sklearn 
import pickle as pkl
import warnings
import time
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from IPython.display import clear_output


import numpy as np
from scipy import stats

warnings.simplefilter("ignore")

# Handedness pairings used as dictionary keys throughout the notebook.
# NOTE(review): presumably pitcher hand followed by batter hand ("pitbat") — TODO confirm the order.
hand_combos = ["RR", "RL", "LR", "LL"]
# Seasons (as strings) whose plays are combined into the training data.
training_years = ["2012", "2013", "2014"]
# Plate-appearance outcome labels; also used as column names and as the suffix of the "b_"/"p_" keys.
plays = ["out", "strikeout", "walk", "single", "double", "triple", "home_run"]

In [2]:
# Stadium lookup table. header=2 makes pandas take the column names from the third row of the sheet;
# only the four listed columns are kept.
# NOTE(review): absolute, machine-specific path — the notebook cannot run elsewhere without editing it.
ballpark_info = pd.read_excel("/Users/jaredzirkes/Desktop/Python/MLB BETTING/Ballpark Info.xlsx", header=2)[["Stadium", "Team", "Start Date", "End Date"]]

## Functions

In [3]:
def convert_wind_direction(df, wind_column):
    """Expand a categorical wind direction into one wind-speed column per direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``wind_speed`` column (and normally a
        ``wind_direction`` column). It is not modified.
    wind_column : pandas.Series
        Wind direction label for every row of ``df`` (same index).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``wind_direction`` set to ``"zero"`` wherever
        ``wind_speed`` is 0, followed by one column per direction label that
        holds the row's wind speed when the wind blew in that direction and
        0 otherwise.
    """
    # When wind speed is 0 the direction is listed as "in" by default; relabel
    # those rows "zero" so that calm games are not counted as wind blowing in.
    calm = df["wind_speed"] == 0

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    result.loc[calm, "wind_direction"] = "zero"

    # Apply the relabelling to the direction series itself rather than relying
    # on ``wind_column`` being a live view of ``df``.
    directions = wind_column.where(~calm, "zero")

    # One indicator column per direction label actually present.
    indicators = pd.get_dummies(directions)

    # Scale only the indicator columns by the wind speed. Selecting "the last
    # five columns" of the combined frame would hit unrelated columns whenever
    # fewer than five directions occur in the data.
    directional_speed = indicators.mul(result["wind_speed"], axis=0)

    return pd.concat([result, directional_speed], axis=1)

In [4]:
def convert_stadium_column(df, stadium_column):
    """Return ``df`` with one indicator column per distinct value of ``stadium_column`` appended on the right."""
    stadium_indicators = pd.get_dummies(stadium_column, columns=["categorical_column"])
    return pd.concat([df, stadium_indicators], axis=1)

In [5]:
def log5(pB, pP, pL):
    """Combine batter, pitcher and league probabilities of an outcome with the log5 equation.

    pB, pP and pL are the probabilities of the outcome for the batter, the
    pitcher and the league overall. Returns the probability of the outcome
    for this specific plate appearance.

    NOTE: marked by the author as not to be used right now.
    """
    outcome_term = (pB * pP) / pL
    complement_term = ((1 - pB) * (1 - pP)) / (1 - pL)
    total = outcome_term + complement_term
    return outcome_term / total


def morey_z(pB, pP, pL):
    """Combine batter, pitcher and league probabilities of an outcome with the Morey Z equation.

    The batter's and pitcher's deviations from the league rate are expressed
    in units of sqrt(pL * (1 - pL)), summed, divided by sqrt(2), converted
    back to a probability offset and added to the league rate.
    """
    league_spread = np.sqrt(pL * (1 - pL))
    batter_z = (pB - pL) / league_spread
    pitcher_z = (pP - pL) / league_spread
    combined_z = (batter_z + pitcher_z) / np.sqrt(2)
    return (combined_z * league_spread) + pL

In [60]:
def ab_play_percentages(batting_percentages, pitching_percentages, league_percentages, pitbat_combo, function):
    """Predict the probability of every plate-appearance outcome for one matchup.

    Parameters
    ----------
    batting_percentages : mapping
        Batter probabilities keyed by ``"b_" + play`` for every play in ``plays``.
    pitching_percentages : mapping
        Pitcher probabilities keyed by ``"p_" + play`` for every play in ``plays``.
    league_percentages : mapping
        League probabilities indexed as ``league_percentages[pitbat_combo][play]``.
    pitbat_combo : str
        Hand-combination key used to pick the league probabilities.
    function : str
        Prediction function to use: "Morey Z" or "Log5" (case-insensitive).
        If anything else is passed the user is prompted until a valid name
        is entered.

    Returns
    -------
    dict
        ``{play: predicted probability}`` for every play in ``plays``.
    """
    # Validate the function name once, up front. The original check ran inside
    # the per-play loop and referenced a misspelled variable, so an invalid
    # name raised NameError instead of prompting.
    while str(function).strip().lower() not in ("morey z", "log5"):
        function = input("Acceptable Functions are Morey Z and Log5. Please input one.")

    predictor = morey_z if str(function).strip().lower() == "morey z" else log5

    ab_percentages = {}
    for play in plays:
        batting_percent = batting_percentages["b_" + play]
        pitching_percent = pitching_percentages["p_" + play]
        league_percent = league_percentages[pitbat_combo][play]

        # Predicted probability of this play type for the plate appearance
        ab_percentages[play] = predictor(batting_percent, pitching_percent, league_percent)

    return ab_percentages
        

# Build Data Sets for Odds Functions

In [7]:
# Per-year, per-hand-combo play data (indexed below as all_plays_by_hand_combo[year][pitbat_combo]).
# NOTE(review): absolute, machine-specific path; pickle.load executes arbitrary code, so only load trusted files.
# The context manager closes the file handle, which the bare open() call never did.
with open("/users/jaredzirkes/Desktop/Python/MLB BETTING/all_plays_by_hand_combo.pkl", "rb") as plays_file:
    all_plays_by_hand_combo = pkl.load(plays_file)

In [8]:
# Combine our first three years of data (maintaining hand combo separation) to be the full initial data

all_training_data = {}
for pitbat_combo in hand_combos:
    print("Combining Pitch Data Across Years and Attatching Ballpark Info", pitbat_combo)

    # Concatenate all seasons in one step. DataFrame.append was removed in
    # pandas 2.0, and appending year by year forced the expensive row-wise
    # ballpark lookup below to be recomputed over the whole accumulated frame
    # once per year.
    combined = pd.concat([all_plays_by_hand_combo[year][pitbat_combo] for year in training_years]).reset_index(drop=True)

    # Constant 1 per plate appearance; summed later to count plays of each type
    combined["type_counter"] = 1

    # Home stadium for each play: the first ballpark_info row for the home team
    # whose "End Date" is later than the year of the game.
    # NOTE(review): .iloc[0] raises IndexError if no ballpark_info row matches.
    combined["ballpark"] = combined.apply(lambda x: ballpark_info[(ballpark_info.Team == x.home_team) & (ballpark_info["End Date"] > int(x.game_date.split("-")[0]))].Stadium.iloc[0], axis=1)

    all_training_data[pitbat_combo] = combined
    clear_output(wait = True)

clear_output(wait = False)

In [9]:
# # Group all plays by the date and play type to get play_type_share showing the cumulative share of the play type at eod every day
# eod_play_shares = all_training_data.copy()
# for pitbat_combo in eod_play_shares:
#     eod_play_shares[pitbat_combo] = eod_play_shares[pitbat_combo].groupby(by = ["play_type"]).last()
    

# # Place the eod_play_share value for each play type into all training data pulling from the eod_play_share df
# for pitbat_combo in all_training_data:
#     all_training_data[pitbat_combo]["eod_play_share"] = all_training_data[pitbat_combo].apply(lambda x: eod_play_shares[pitbat_combo].loc[x.play_type].cum_play_type_share, axis = 1)

In [10]:
# For each game, calculate within the game (and pitbat_combo), the share of the plays that were each play type
game_play_shares = {x:{"games":{}, "players":{}} for x in hand_combos}
n = 0

# Number of (pitbat_combo, game) pairs to process; loop-invariant, so computed once for the progress message
total_games = sum(len(all_training_data[x].game_pk.unique()) for x in hand_combos)

for pitbat_combo in all_training_data:
    full_df = all_training_data[pitbat_combo].copy()

    # One pass over the frame with groupby instead of re-filtering the whole
    # frame for every game; sort=False keeps first-appearance order.
    for game, game_rows in full_df.groupby("game_pk", sort=False):
        clear_output(wait = True)
        game_df = game_rows.copy()

        # Running count of each play type within the game. Only type_counter is
        # accumulated, so non-numeric columns are never passed to cumsum.
        game_df["type_counter"] = game_df.groupby(by = "play_type").type_counter.cumsum()

        total = len(game_df)

        # One row per play type; the max of the running count is that type's total for the game
        game_df = game_df.groupby(by = "play_type").max()

        game_df["play_share"] = game_df.type_counter/total #divide by the total number of plays, getting the play share for the game
        game_df["count"] = total

        game_play_shares[pitbat_combo]["games"][game] = game_df

        if n%1000 == 0:
            # The stray second argument previously passed to format() was unused and has been removed
            print("Calculating Probability Vectors for Each Game. There are {}K Instances Remaining".format(round((total_games - n)/1000)))
        n += 1

# NOTE: the "players" entries are never populated; the per-batter variant of this loop was disabled.

clear_output(wait = False)

In [11]:
# Tag every play with the share of plays in its game (within the same pitbat combo) that had the same play type,
# looked up from the per-game frames stored in game_play_shares
for combo_position, pitbat_combo in enumerate(hand_combos):
    combos_remaining = len(hand_combos) - combo_position
    print("Inserting Data From Game by Game Probability Vectors To the Larger Data Set. There are {} Pitbat Combos Remaining".format(combos_remaining))
    clear_output(wait = True)

    shares_by_game = game_play_shares[pitbat_combo]["games"]
    combo_plays = all_training_data[pitbat_combo]
    combo_plays["game_play_share"] = combo_plays.apply(lambda x: shares_by_game[x.game_pk].loc[x.play_type].play_share, axis = 1)
    #all_training_data[pitbat_combo]["batter_play_share"] = all_training_data[pitbat_combo].apply(lambda x: game_play_shares[pitbat_combo]["players"][x.batter].loc[x.play_type].play_share, axis = 1)
    
# # Now that we have the MLB eod % of plays by play type for every day and the % of plays that are each play in every game,
# # calculate/insert the difference between the individual game and the MLB eod values for every play
# for pitbat_combo in hand_combos:
#     all_training_data[pitbat_combo]["game_share_delta"] = all_training_data[pitbat_combo].game_play_share / all_training_data[pitbat_combo].eod_play_share
#     #all_training_data[pitbat_combo]["batter_share_delta"] = all_training_data[pitbat_combo].batter_play_share - all_training_data[pitbat_combo].eod_play_share

Applying Game Play Shares To the Larger Data Set. There are 1 Pitbat Combos Remaining


## Calculating Batting Stats Factors!

#### Cleaning for Weather Regression

In [1]:
def weather_regression(all_training_data):
    """Placeholder: hand back ``all_training_data`` exactly as it was passed in."""
    unchanged = all_training_data
    return unchanged

In [12]:
# Remove the first 300 (~10 days) games from each season to let the rolling stats normalize
GAMES_TO_SKIP_PER_SEASON = 300
weather_training_data = {}

for pitbat_combo in hand_combos:
    # Game ids of the first games of every season for THIS hand combo. The
    # collection is rebuilt per combo: the previous shared list kept growing
    # across combos, so which games were excluded depended on loop order.
    early_game_pks = []
    for year in training_years:
        early_game_pks.extend(all_plays_by_hand_combo[year][pitbat_combo].game_pk.unique()[:GAMES_TO_SKIP_PER_SEASON])

    combo_df = all_training_data[pitbat_combo]

    # Keep only the plays from games that are not among the early-season games
    later_games_df = combo_df[~combo_df.game_pk.isin(early_game_pks)]

    # .copy() so later cells can add columns without writing to a slice of all_training_data
    weather_training_data[pitbat_combo] = later_games_df[["game_pk","game_date", "play_type", "temprature", "wind_speed", "wind_direction", "game_play_share"]].copy()

In [13]:
# Collapse the weather training data to one row per (game, play type), keeping the last row of each
# group; game_pk and play_type are turned back into ordinary columns afterwards
for pitbat_combo in hand_combos:
    last_row_per_game_play = weather_training_data[pitbat_combo].groupby(by = ["game_pk", "play_type"]).last()
    weather_training_data[pitbat_combo] = last_row_per_game_play.reset_index()

In [14]:
# Because the only plays currently in our data are play types that happened in games, fill in all the missing play types for
# each game with a game_play_share of 0 for that play type
play_types = ['out', 'single', 'strikeout', 'double', 'walk', 'home_run','triple']
n = 0

# Number of (pitbat_combo, game) pairs; adding filler rows does not change it, so compute it once
total_games = sum(len(weather_training_data[x].game_pk.unique()) for x in hand_combos)

for pitbat_combo in hand_combos:
    combo_df = weather_training_data[pitbat_combo]

    # Collect the filler rows and concatenate once at the end. Appending row by
    # row copied the whole frame every time, and DataFrame.append was removed
    # in pandas 2.0.
    filler_rows = []
    for game, game_df in combo_df.groupby("game_pk", sort=False):
        n += 1
        if n%500 == 0:
            print("Filling in the Missing Values for Probability Vectors. There are {}K Instances Remaining".format(round((total_games - n)/1000)))
        clear_output(wait = True)

        if len(game_df) < len(play_types):
            # Game-level values are copied from the game's first row
            first_row = game_df.iloc[0]
            present_plays = set(game_df.play_type.values)
            for play in play_types:
                if play not in present_plays:
                    filler_rows.append({"game_pk":game, "game_date":first_row.game_date, "play_type":play, "temprature":first_row.temprature, "wind_speed":first_row.wind_speed, "wind_direction":first_row.wind_direction, "game_play_share":0})

    if filler_rows:
        weather_training_data[pitbat_combo] = pd.concat([combo_df, pd.DataFrame(filler_rows)], ignore_index=True)
clear_output(wait = False)

In [15]:
# Columns kept for the weather regression
weather_regression_columns = ["game_pk", "play_type", "temprature", "wind_speed", "wind_direction", "game_play_share"]

for pitbat_combo in hand_combos:
    combo_weather = weather_training_data[pitbat_combo][weather_regression_columns]

    # Temprature enters the regression squared (the author's modelling assumption)
    combo_weather["temprature_squared"] = combo_weather["temprature"] ** 2

    # Replace the wind direction label with per-direction wind speed columns
    weather_training_data[pitbat_combo] = convert_wind_direction(combo_weather, combo_weather.wind_direction)

#### Weather Regression

In [17]:
weather_coefficients = {}

# Regressors are selected by NAME, paired one-to-one with the key each fitted
# coefficient is stored under. The earlier positional selection silently
# mislabelled coefficients whenever the column layout changed (for example
# when a wind direction never occurred in the data).
weather_features = ["temprature_squared", "Left to Right", "Right to Left", "in", "out"]
coefficient_keys = ["temprature_sq", "wind_ltr", "wind_rtl", "wind_in", "wind_out"]

for pitbat_combo in hand_combos:
    combo_df = weather_training_data[pitbat_combo]
    weather_coefficients[pitbat_combo] = {}
    for play_type in combo_df.play_type.unique():
        PAs = combo_df[combo_df.play_type == play_type]

        # Remove outliers in game_play_share (more than 3 standard deviations from the mean),
        # most of which are caused by low pitbat_combo sample sizes in games
        PAs = PAs[(np.abs(stats.zscore(PAs.game_play_share)) < 3)]

        # A wind direction missing from the data becomes an all-zero regressor
        x_sq = PAs.reindex(columns=weather_features, fill_value=0)
        y = PAs.game_play_share

        # Regress game_play_share on squared temprature and the directional wind speeds
        lin_sq = LinearRegression(fit_intercept = True)
        lin_sq.fit(x_sq, y)

        coefficients = {"intercept":lin_sq.intercept_}
        coefficients.update(zip(coefficient_keys, lin_sq.coef_))
        weather_coefficients[pitbat_combo][play_type] = coefficients

#### Calculating Park Factors

In [18]:
park_factors_dict = {}
print("Calculating Ballpark Factors")

for pitbat_combo in hand_combos:
    combo_df = all_training_data[pitbat_combo]
    park_factors_dict[pitbat_combo] = {}

    # The ballpark list is taken from the "RR" data for every combo, so all combos share the same keys
    for ballpark in all_training_data["RR"].ballpark.unique():
        park_factors_dict[pitbat_combo][ballpark] = {}
        is_at_park = combo_df.ballpark == ballpark
        at_park_df = combo_df[is_at_park]
        not_at_park_df = combo_df[~is_at_park]

        for play_type in ["out", "strikeout", "double", "walk", "single", "home_run", "triple"]:
            # Park factor = rate of the play type at this park / rate everywhere else.
            # All three divisions are guarded: a combo can have no plays at a
            # park, or a play type can be absent away from it.
            try:
                at_park_rate = len(at_park_df[at_park_df.play_type == play_type])/len(at_park_df)
                not_at_park_rate = len(not_at_park_df[not_at_park_df.play_type == play_type])/len(not_at_park_df)
                park_factor = at_park_rate/not_at_park_rate
            except ZeroDivisionError:
                # The old fallback assigned to a misspelled name, so it never
                # took effect. NaN is used instead of the string "n/a" so the
                # value stays usable in the arithmetic applied to park factors.
                park_factor = np.nan

            park_factors_dict[pitbat_combo][ballpark][play_type] = park_factor

clear_output(wait=False)

## Adjusting Stats for Batting Factors

In [19]:
# Build, per hand combo, the frame of plays with their weather/stadium impact and neutralised play value
print("Neutralizing Batting Stats using Weather/Stadium Coefficients")

factored_training_stats = {}
for pitbat_combo in hand_combos:
    combo_coefficients = weather_coefficients[pitbat_combo]
    combo_park_factors = park_factors_dict[pitbat_combo]

    def actual_weather_expectation(row):
        # Regression prediction for the play's type under the game's actual wind and temprature
        coef = combo_coefficients[row.play_type]
        return (row["Left to Right"]*coef["wind_ltr"] + row["Right to Left"]*coef["wind_rtl"] +
                row["in"]*coef["wind_in"] + row["out"]*coef["wind_out"] +
                (row["temprature"]**2) * coef["temprature_sq"] + coef["intercept"])

    def neutral_weather_expectation(row):
        # Regression prediction with no wind terms and a temprature of 72
        coef = combo_coefficients[row.play_type]
        return 72**2 * coef["temprature_sq"] + coef["intercept"]

    def stadium_factor(row):
        return combo_park_factors[row.ballpark][row.play_type]

    # Relevant columns, restricted to the games kept for the weather regression (not first ~10 days)
    play_stats = all_training_data[pitbat_combo][["game_pk", "game_date", "batter", "pitcher", "play_type", "temprature", "wind_speed", "wind_direction", "ballpark"]].copy()
    play_stats = play_stats[play_stats.game_pk.isin(weather_training_data[pitbat_combo].game_pk)]

    # Per-direction wind speeds, then the weather and stadium effects for every play
    play_stats = convert_wind_direction(play_stats, play_stats.wind_direction)
    play_stats["weather_expectation"] = play_stats.apply(actual_weather_expectation, axis=1)
    play_stats["neutral_weather_expectation"] = play_stats.apply(neutral_weather_expectation, axis=1)
    play_stats["weather_impact"] = play_stats.weather_expectation/play_stats.neutral_weather_expectation
    play_stats["stadium_impact"] = play_stats.apply(stadium_factor, axis=1)

    # Total impact is the product of both effects; the play's value is its reciprocal
    play_stats["play_value"] = 1
    play_stats["impact"] = play_stats.play_value * play_stats.weather_impact * play_stats.stadium_impact
    play_stats["play_value"] = 1/play_stats.impact

    factored_training_stats[pitbat_combo] = play_stats[["game_pk", "game_date", "batter", "pitcher", "play_type","impact", "play_value"]]

clear_output(wait=False)

In [7]:
#pkl.dump(factored_training_stats, open("/Users/jaredzirkes/Documents/GitHub/MLB-Simulation/training_batting_stats_with_factors.pkl", "wb"))
# Reload the previously saved factored stats under the name the following cells use.
# NOTE(review): absolute, machine-specific path; only unpickle files you trust.
# The context manager closes the file handle, which the bare open() call never did.
with open("/Users/jaredzirkes/Documents/GitHub/MLB-Simulation/training_batting_stats_with_factors.pkl", "rb") as factored_file:
    factored_batting_stats = pkl.load(factored_file)

#### Calculate League Averages Over the Length of Training

In [53]:
# Share of all plays in the factored stats that were each play type, per hand combo
league_average_plays_dict = {}
for pitbat_combo in hand_combos:
    # Looked up once per combo rather than once per play type
    df = factored_batting_stats[pitbat_combo]
    total_plays = len(df)

    league_average_plays_dict[pitbat_combo] = {}
    for play in plays:
        league_average_plays_dict[pitbat_combo][play] = len(df[df.play_type == play])/total_plays

# Context manager so the output file is flushed and closed deterministically
with open("league_averages.pkl", "wb") as league_file:
    pkl.dump(league_average_plays_dict, league_file)

## Roll Stats Daily To Get Final Odds Functions Training Data Sets

### Rolling with Breaks for Years

In [35]:
def _roll_player_stats(player_df, pitbat_combo, window):
    """Return one player's rolling, renormalised outcome shares (one row per plate appearance)."""
    # Copy first: player_df is a slice of a larger frame and must not be written to
    rolled = player_df.copy()

    # Number of plate appearances inside the rolling window at each row
    rolled["at_bat_num"] = 1
    rolled["at_bat_num"] = rolled.at_bat_num.rolling(window, min_periods = 1).sum()

    for play in plays:
        # play_value on rows whose outcome is `play`, 0 elsewhere, then its rolling
        # sum divided by the number of plate appearances in the window
        rolled[play] = rolled.play_value.where(rolled.play_type == play, 0)
        rolled[play] = rolled[play].rolling(window, min_periods = 1).sum()
        rolled[play] = rolled[play]/rolled.at_bat_num
    rolled["pitbat"] = pitbat_combo

    rolled = rolled[["game_pk", "game_date", "batter", "pitcher", "pitbat", "at_bat_num"] + list(plays)].copy()

    # Repercentage so the outcome shares of every row sum to 1
    rolled[plays] = rolled[plays].div(rolled[plays].sum(axis=1), axis=0)
    return rolled


def roll_factored_batting_stats(factored_batting_stats, with_year_breaks, rolling_period):
    """Build rolling outcome shares for every batter and every pitcher.

    Parameters
    ----------
    factored_batting_stats : dict
        ``{pitbat_combo: DataFrame}`` with at least the columns game_pk,
        game_date, batter, pitcher, play_type and play_value. A "year" column
        is added to (or overwritten on) each frame in place.
    with_year_breaks : bool
        If true, stats are rolled separately per season (the year is taken from
        game_date) over each player's whole season to date. Otherwise all rows
        are grouped under the single key "All Years".
    rolling_period : int
        Maximum window, in plate appearances, used for batters when
        ``with_year_breaks`` is false.

    Returns
    -------
    dict
        ``{"pitching_stats": ..., "batting_stats": ...}``, each indexed as
        ``[pitbat_combo][year][player]`` and holding that player's rolled frame.
    """
    rolling_factored_batting_stats = {}
    rolling_factored_pitching_stats = {}
    use_year_breaks = bool(with_year_breaks)

    for pitbat_combo in hand_combos:
        combo_df = factored_batting_stats[pitbat_combo]
        if use_year_breaks:
            combo_df["year"] = combo_df.game_date.apply(lambda x: x.split("-")[0])
        else:
            combo_df["year"] = "All Years"

    for pitbat_combo in hand_combos:
        print("Rolling Batting Stats Daily. There are {} Hand Combos Left".format(len(hand_combos) - (hand_combos.index(pitbat_combo)) + 4))
        clear_output(wait=True)

        combos_after_this = len(hand_combos) - hand_combos.index(pitbat_combo) - 1
        combo_df = factored_batting_stats[pitbat_combo]

        rolling_factored_batting_stats[pitbat_combo] = {}
        rolling_factored_pitching_stats[pitbat_combo] = {}
        for year in combo_df.year.unique():
            rolling_factored_batting_stats[pitbat_combo][year] = {}
            rolling_factored_pitching_stats[pitbat_combo][year] = {}

            # Filter down to the stats for just the relevant year
            df = combo_df[combo_df.year == str(year)]

            # Build rolling stats for batters
            batter_count = len(df.batter.unique())
            for b, (batter, batter_df) in enumerate(df.groupby("batter", sort=False)):
                clear_output(wait = True)

                # With year breaks the window spans the batter's whole season to
                # date; otherwise it is capped at rolling_period plate appearances
                if use_year_breaks:
                    window = len(batter_df)
                else:
                    window = min(len(batter_df), rolling_period)

                rolling_factored_batting_stats[pitbat_combo][year][batter] = _roll_player_stats(batter_df, pitbat_combo, window)

                if b%10 == 0:
                    if use_year_breaks:
                        print("Rolling Batting Stats. There are {} batters remaining in {}, with {} years remaining. Then {} Hand Combos Remaining".format(batter_count - b, year, len(training_years) - training_years.index(year)-1, combos_after_this))
                    else:
                        print("Rolling Batting Stats. There are {} batters remaining. Then {} Hand Combos Remaining".format(batter_count - b, combos_after_this))

            # Build the rolling stats for pitchers
            # NOTE(review): pitchers always use their full history as the window;
            # rolling_period is applied to batters only — confirm this is intended.
            pitcher_count = len(df.pitcher.unique())
            for p, (pitcher, pitcher_df) in enumerate(df.groupby("pitcher", sort=False)):
                clear_output(wait=True)

                rolling_factored_pitching_stats[pitbat_combo][year][pitcher] = _roll_player_stats(pitcher_df, pitbat_combo, len(pitcher_df))

                if p%5 == 0:
                    if use_year_breaks:
                        print("Rolling Pitching Stats. There are {} pitchers remaining in {}, with {} years remaining. Then {} Hand Combos Remaining".format(pitcher_count - p, year, len(training_years) - training_years.index(year)-1, combos_after_this))
                    else:
                        # Uses the pitcher counter; the batter counter was used here by mistake
                        print("Rolling Pitching Stats. There are {} pitchers remaining. Then {} Hand Combos Remaining".format(pitcher_count - p, combos_after_this))

    clear_output(wait=False)

    return {"pitching_stats":rolling_factored_pitching_stats, "batting_stats":rolling_factored_batting_stats}

In [33]:
# # Create a rolling percentage for each play outcome for each batter and pitcher for each year 
# rolling_factored_batting_stats = {}
# rolling_factored_pitching_stats = {}

# for pitbat_combo in hand_combos:
#     factored_batting_stats[pitbat_combo]["year"] = factored_batting_stats[pitbat_combo].game_date.apply(lambda x: x.split("-")[0])

# for pitbat_combo in hand_combos:
#     print("Rolling Batting Stats Daily. There are {} Hand Combos Left".format(len(hand_combos) - (hand_combos.index(pitbat_combo)) + 4))
#     clear_output(wait=True)
    
    
#     rolling_factored_batting_stats[pitbat_combo] = {}
#     rolling_factored_pitching_stats[pitbat_combo] = {}
#     for year in training_years:
#         rolling_factored_batting_stats[pitbat_combo][year] = {}
#         rolling_factored_pitching_stats[pitbat_combo][year] = {}
        
#         # Filter down to the stats for just the relevant year
#         df = factored_batting_stats[pitbat_combo][factored_batting_stats[pitbat_combo].year == str(year)]
        
#         # Build rolling stats for pitchers
#         b = 0
#         for batter in df.batter.unique():
#             clear_output(wait = True)
            
            
#             batter_df = df[df.batter == batter]
#             batter_df["at_bat_num"] = 1
            
#             # Make a rolling count for the at bats for each batter
#             batter_df["at_bat_num"] = batter_df.at_bat_num.rolling(len(batter_df), min_periods = 1).sum()
            
#             for play in plays:
#                 # Multiply the situation impact by a binary vector for play outcomes with a 1 for the correct play
#                 batter_df[play] = batter_df.apply(lambda x: 1*x.play_value if x.play_type==play else 0, axis = 1)
#                 batter_df[play] = batter_df[play].rolling(len(batter_df), min_periods = 1).sum()
#                 batter_df[play] = batter_df[play]/batter_df.at_bat_num
#                 batter_df["pitbat"] = pitbat_combo
#             rolling_factored_batting_stats[pitbat_combo][year][batter] = batter_df[["game_pk", "game_date", "batter", "pitcher", "pitbat", "at_bat_num", "out", "strikeout", "walk", "single", "double", "triple", "home_run"]]
                        
#             if b%5 ==0:
#                 print("Rolling Batting Stats. There are {} batters remaining in {}, with {} years remaining".format(len(df.batter.unique()) - b, year, len(training_years) - training_years.index(year)-1))
#             b+=1

#             # Repercentage factored batting stats to sum to 1
#             rolling_factored_batting_stats[pitbat_combo][year][batter][plays] = rolling_factored_batting_stats[pitbat_combo][year][batter].apply(lambda x: pd.Series([x[plays][p]/x[plays].sum() for p in plays]), axis=1)
        
        
#         # Build the rolling stats for pitchers        
#         p=0
#         for pitcher in df.pitcher.unique():
#             clear_output(wait=True)           
#             pitcher_df = df[df.pitcher == pitcher]
#             pitcher_df["at_bat_num"] = 1
#             pitcher_df["at_bat_num"] = pitcher_df.at_bat_num.rolling(len(pitcher_df), min_periods = 1).sum()
            
#             for play in plays:
#                 # Multiply the situation impact by a binary vector for play outcomes with a 1 for the correct play
#                 pitcher_df[play] = pitcher_df.apply(lambda x: 1*x.play_value if x.play_type==play else 0, axis = 1)
#                 pitcher_df[play] = pitcher_df[play].rolling(len(pitcher_df), min_periods = 1).sum()
#                 pitcher_df[play] = pitcher_df[play]/pitcher_df.at_bat_num
#                 pitcher_df["pitbat"] = pitbat_combo

#             rolling_factored_pitching_stats[pitbat_combo][year][pitcher] = pitcher_df[["game_pk", "game_date", "batter", "pitcher", "pitbat", "at_bat_num", "out", "strikeout", "walk", "single", "double", "triple", "home_run"]]

#             if p%5 ==0:
#                 print("Rolling Pitching Stats. There are {} batters remaining in {}, with {} years remaining".format(len(df.batter.unique()) - p, year, len(training_years) - training_years.index(year)-1))
#             p+=1
            
#             # Repercentage factored pitching stats to sum to 1
#             rolling_factored_pitching_stats[pitbat_combo][year][pitcher][plays] = rolling_factored_pitching_stats[pitbat_combo][year][pitcher].apply(lambda x: pd.Series([x[plays][p]/x[plays].sum() for p in plays]), axis=1)
            
# clear_output(wait=False)


KeyboardInterrupt



In [34]:
# pkl.dump(rolling_factored_pitching_stats, open("rolling_factored_pitching_stats.pkl","wb"))
# pkl.dump(rolling_factored_batting_stats, open("rolling_factored_batting_stats.pkl","wb"))

# Reload the previously saved rolled stats. Context managers close the file
# handles, which the bare open() calls never did. Only unpickle files you trust.
with open("rolling_factored_pitching_stats.pkl","rb") as pitching_file:
    rolling_factored_pitching_stats = pkl.load(pitching_file)
with open("rolling_factored_batting_stats.pkl","rb") as batting_file:
    rolling_factored_batting_stats = pkl.load(batting_file)

In [93]:
# Pull all the rolled individual player DFs out of the dictionary and into a large DF 
# that will be used for final training
def stitch_individual_stats(rolling_factored_stats, with_year_breaks, skip_first_pas=35):
    """Concatenate the per-player rolled frames into one batting and one pitching frame.

    Parameters
    ----------
    rolling_factored_stats : dict
        ``{"batting_stats": ..., "pitching_stats": ...}``, each indexed as
        ``[pitbat_combo][year][player]`` and holding a DataFrame.
    with_year_breaks : bool
        If true the "year" column is the year part of game_date; otherwise it
        is the constant "All Years".
    skip_first_pas : int, default 35
        Number of leading plate appearances dropped from every batter; batters
        with no more rows than this are left out entirely.

    Returns
    -------
    dict
        ``{"pitching_stats": DataFrame, "batting_stats": DataFrame}``. The
        original index labels of the per-player frames are preserved.
    """
    # Collect the pieces and concatenate once at the end: appending to a
    # growing frame copies it every time, and DataFrame.append was removed in
    # pandas 2.0.
    batting_frames = []
    pitching_frames = []

    for pitbat_combo in hand_combos:
        years = list(rolling_factored_stats["batting_stats"][pitbat_combo])
        for year_position, year in enumerate(years):
            print("Stitching Together Individual Stats Into the Odds Functions Data Set. There are {} Pitbats Left and {} Years Remaining in the Current Pitbat".format(len(hand_combos)-hand_combos.index(pitbat_combo), len(years) - year_position))
            clear_output(wait = True)

            # Looking up games dated before a batter's first game would fail,
            # and the first few plate appearances carry little information, so
            # each batter's leading rows are cut off.
            for df_b in rolling_factored_stats["batting_stats"][pitbat_combo][year].values():
                if len(df_b) > skip_first_pas:
                    batting_frames.append(df_b.iloc[skip_first_pas:])

            # Pitchers keep all of their rows
            pitching_frames.extend(rolling_factored_stats["pitching_stats"][pitbat_combo][year].values())

        clear_output(wait=False)

    training_stats = pd.concat(batting_frames) if batting_frames else pd.DataFrame()
    pitching_holder = pd.concat(pitching_frames) if pitching_frames else pd.DataFrame()

    for frame in (training_stats, pitching_holder):
        if with_year_breaks == True and "game_date" in frame.columns:
            frame["year"] = frame.game_date.apply(lambda x: x.split("-")[0])
        else:
            # Also reached for an empty frame, which has no game_date column to split
            frame["year"] = "All Years"

    clear_output(wait=False)

    return {"pitching_stats":pitching_holder, "batting_stats":training_stats}

In [94]:
# NOTE(review): stitch_individual_stats indexes its first argument with "batting_stats" and
# "pitching_stats", i.e. it needs the dict returned by roll_factored_batting_stats. No visible cell
# assigns `x` a value of that shape, so this presumably relies on a cell that was removed — confirm,
# and define it explicitly so the notebook survives Restart & Run All.
y = stitch_individual_stats(x, False)

In [100]:
# Attach the pitching probability vector to the training set by "joining" on the pitbat combo, year, and pitcher name, where the date is just less than the given PA
def _latest_prior_value(player_history, game_date, play):
    """Return ``play`` from the last row of ``player_history`` dated strictly before ``game_date``, or None if there is none."""
    earlier_rows = player_history[player_history.game_date < game_date]
    if len(earlier_rows) > 0:
        return earlier_rows.iloc[-1][play]
    return None


def build_odds_dataset(stitched_dataset, rolling_factored_stats, factored_batting_stats):
    """Attach prior-game probability vectors and the actual outcome to the stitched data.

    For every play type, a ``"b_" + play`` column is added to the batting frame
    and a ``"p_" + play`` column to the pitching frame. Each holds the player's
    rolled share from the most recent row dated before the row's game (None
    when the player has no earlier row). The batting frame then loses its raw
    play columns and gains a "play" column with the actual outcome, looked up
    in ``factored_batting_stats`` by the row's index label.

    Parameters
    ----------
    stitched_dataset : dict
        Output of ``stitch_individual_stats``; modified in place.
    rolling_factored_stats : dict
        ``{"batting_stats": ..., "pitching_stats": ...}`` indexed as
        ``[pitbat_combo][year][player]``.
    factored_batting_stats : dict
        ``{pitbat_combo: DataFrame}`` with a play_type column.
        NOTE(review): the lookup assumes the stitched rows still carry the
        index labels of these frames — confirm.

    Returns
    -------
    dict
        The updated ``stitched_dataset``. The batting and pitching frames are
        NOT merged with each other here.
    """
    batting = stitched_dataset["batting_stats"]
    pitching = stitched_dataset["pitching_stats"]
    batting_history = rolling_factored_stats["batting_stats"]
    pitching_history = rolling_factored_stats["pitching_stats"]

    for play in plays:
        print("Attaching the Batter Probability Vectors to the Odds Functions Data Set. There are {} Plays Remaining".format(len(plays) - plays.index(play)))
        batting["b_" + play] = batting.apply(lambda x: _latest_prior_value(batting_history[x.pitbat][x.year][x.batter], x.game_date, play), axis = 1)
        clear_output(wait=True)

        print("Attaching the Pitcher Probability Vectors to the Odds Functions Data Set. There are {} Plays Remaining".format(len(plays) - plays.index(play)))
        pitching["p_" + play] = pitching.apply(lambda x: _latest_prior_value(pitching_history[x.pitbat][x.year][x.pitcher], x.game_date, play), axis = 1)
        clear_output(wait=True)

    # `plays` is already a list of column names; wrapping it in another list made drop() fail
    batting = batting.drop(columns = list(plays))

    # Add in a column for the actual play, to be used for comparison against our prediction vector
    batting["play"] = batting.apply(lambda x: factored_batting_stats[x.pitbat].loc[x.name].play_type, axis=1)
    stitched_dataset["batting_stats"] = batting

    # Returned explicitly: callers store the result of this function
    return stitched_dataset

In [None]:
# NOTE(review): `y` is the output of stitch_individual_stats; `x` must be the matching dict of rolled
# per-player stats (keys "batting_stats"/"pitching_stats"), which no visible cell defines — confirm.
z = build_odds_dataset(y, x, factored_batting_stats)

Attaching the Batter Probability Vectors to the Odds Functions Data Set. There are 6 Plays Remaining


In [59]:
# NOTE(review): `training_stats` is not assigned by any visible notebook-level cell (it only exists as a
# local inside stitch_individual_stats), so this cell fails on a fresh kernel — confirm which object
# should be saved here.
# Context manager so the output file is flushed and closed deterministically.
with open("odds_functions_data_set.pkl", "wb") as odds_file:
    pkl.dump(training_stats, odds_file)