In [19]:
import pandas as pd
import numpy as np
import pickle as pkl
import random
import time
import pybaseball
# Fixed module name: the package is `pybaseball` (was misspelled `pybbaseball`,
# which fails with ModuleNotFoundError on a fresh kernel).
from pybaseball import playerid_lookup, playerid_reverse_lookup
import warnings
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Two-letter handedness pairings; used as the top-level keys of the stats dictionaries.
hand_combos = ["RR", "RL", "LR", "LL"]
# Seasons (as strings) used as the second-level keys of the rolling stats dictionaries.
training_years = ["2012", "2013", "2014"]

# Play outcome labels; used as column names and dictionary keys throughout the notebook.
plays = ["out", "strikeout", "walk", "single", "double", "triple", "home_run"]

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [188]:
def log5 (pB, pP, pL):
    """Combine a batter rate, a pitcher rate and a league rate with the log5 formula.

    Parameters
    ----------
    pB : batter's rate for the outcome.
    pP : pitcher's rate for the outcome.
    pL : league-wide rate for the outcome.

    Returns
    -------
    event_term / (event_term + complement_term), where event_term is
    pB*pP/pL and complement_term is (1-pB)*(1-pP)/(1-pL).
    """
    event_term = (pB * pP) / pL
    complement_term = ((1 - pB) * (1 - pP)) / (1 - pL)
    denominator = event_term + complement_term

    return event_term / denominator


def morey_z(pB, pP, pL):
    """Combine a batter rate, a pitcher rate and a league rate via summed z-scores.

    Both rates are expressed as z-scores against the league rate using the
    spread sqrt(pL*(1-pL)), the two z-scores are added and divided by sqrt(2),
    and the result is scaled back by the same spread and shifted by pL.

    Parameters
    ----------
    pB : batter's rate for the outcome.
    pP : pitcher's rate for the outcome.
    pL : league-wide rate for the outcome.
    """
    league_spread = np.sqrt(pL*(1-pL))
    batter_z = (pB-pL)/league_spread
    pitcher_z = (pP-pL)/league_spread
    combined_z = (batter_z + pitcher_z)/np.sqrt(2)
    return (combined_z * league_spread) + pL



def ab_play_percentages(batting_percentages, pitching_percentages, league_percentages, pitbat_combo, function):
    """Compute the expected rate of every play outcome for one batter/pitcher matchup.

    Parameters
    ----------
    batting_percentages : mapping indexed by each entry of `plays`.
    pitching_percentages : mapping indexed by "p_" + each entry of `plays`.
    league_percentages : nested mapping indexed as [pitbat_combo][play].
    pitbat_combo : key selecting the league averages to use.
    function : which combiner to apply; one of "morey z", "Morey Z",
        "log5" or "Log5". Any other value triggers an interactive prompt
        that repeats until an accepted name is entered.

    Returns
    -------
    dict mapping each play in `plays` to the combined expected rate.
    """
    morey_names = ("morey z", "Morey Z")
    log5_names = ("log5", "Log5")

    # Validate once, before the loop. The original re-checked on every play and
    # misspelled the variable (`funtion`), so an invalid name raised NameError
    # instead of prompting.
    while function not in morey_names + log5_names:
        function = input("Acceptable Functions are Morey Z and Log5. Please input one.")

    combine = morey_z if function in morey_names else log5

    ab_percentages = {}
    for play in plays:
        ab_percentages[play] = combine(
            batting_percentages[play],
            pitching_percentages["p_" + play],
            league_percentages[pitbat_combo][play],
        )

    return ab_percentages
        

In [4]:
# Load the pickled batting stats; indexed by handedness combo elsewhere in the notebook.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# The context manager closes the file handle (the bare open() call never did).
with open("training_batting_stats_with_factors.pkl", "rb") as batting_stats_file:
    factored_batting_stats = pkl.load(batting_stats_file)

### Calculate League Averages For Each Play Type

In [5]:
# For every handedness combo, record the share of rows whose play_type equals
# each play outcome: league_average_plays_dict[combo][play] -> fraction of rows.
league_average_plays_dict = {}
for pitbat_combo in hand_combos:
    combo_df = factored_batting_stats[pitbat_combo]
    total_rows = len(combo_df)
    league_average_plays_dict[pitbat_combo] = {
        play: len(combo_df[combo_df.play_type == play]) / total_rows
        for play in plays
    }

### Create Rolling AB Percentage Splits for Players for Each Year

In [62]:
# Create a rolling percentage for each play outcome for each batter and pitcher for each year 
# rolling_factored_batting_stats = {}
# rolling_factored_pitching_stats = {}

# for pitbat_combo in hand_combos:
#     factored_batting_stats[pitbat_combo]["year"] = factored_batting_stats[pitbat_combo].game_date.apply(lambda x: x.split("-")[0])

# for pitbat_combo in hand_combos:
#     rolling_factored_batting_stats[pitbat_combo] = {}
#     rolling_factored_pitching_stats[pitbat_combo] = {}
#     for year in training_years:
#         rolling_factored_batting_stats[pitbat_combo][year] = {}
#         rolling_factored_pitching_stats[pitbat_combo][year] = {}
        
#         # Filter down to the stats for just the relevant year
#         df = factored_batting_stats[pitbat_combo][factored_batting_stats[pitbat_combo].year == str(year)]
        
#         # Build rolling stats for batters
#         for batter in df.batter.unique():
#             print("Batter", pitbat_combo, year, batter)
#             clear_output(wait = True)
            
            
#             batter_df = df[df.batter == batter]
#             batter_df["at_bat_num"] = 1
            
#             # Make a rolling count for the at bats for each batter
#             batter_df["at_bat_num"] = batter_df.at_bat_num.rolling(len(batter_df), min_periods = 1).sum()
            
#             for play in plays:
#                 # Multiply the situation impact by a binary vector for play outcomes with a 1 for the correct play
#                 batter_df[play] = batter_df.apply(lambda x: 1*x.play_value if x.play_type==play else 0, axis = 1)
#                 batter_df[play] = batter_df[play].rolling(len(batter_df), min_periods = 1).sum()
#                 batter_df[play] = batter_df[play]/batter_df.at_bat_num
#                 batter_df["pitbat"] = pitbat_combo
#
#             rolling_factored_batting_stats[pitbat_combo][year][batter] = batter_df[["game_pk", "game_date", "batter", "pitcher", "at_bat_num", "out", "strikeout", "walk", "single", "double", "triple", "home_run"]]
            
#             # Repercentage factored batting stats to sum to 1
#             rolling_factored_batting_stats[pitbat_combo][year][batter][plays] = rolling_factored_batting_stats[pitbat_combo][year][batter].apply(lambda x: pd.Series([x[plays][p]/x[plays].sum() for p in plays]), axis=1)
        
#         # Build the rolling stats for pitchers
#         for pitcher in df.pitcher.unique():
#             print("Pitcher", pitbat_combo, year, batter)
#             clear_output(wait = True)
            
            
#             pitcher_df = df[df.pitcher == pitcher]
#             pitcher_df["at_bat_num"] = 1
            
#             pitcher_df["at_bat_num"] = pitcher_df.at_bat_num.rolling(len(pitcher_df), min_periods = 1).sum()
#             for play in plays:
#                 # Multiply the situation impact by a binary vector for play outcomes with a 1 for the correct play
#                 pitcher_df[play] = pitcher_df.apply(lambda x: 1*x.play_value if x.play_type==play else 0, axis = 1)
#                 pitcher_df[play] = pitcher_df[play].rolling(len(pitcher_df), min_periods = 1).sum()
#                 pitcher_df[play] = pitcher_df[play]/pitcher_df.at_bat_num
#                 pitcher_df["pitbat"] = pitbat_combo

#             rolling_factored_pitching_stats[pitbat_combo][year][pitcher] = pitcher_df[["game_pk", "game_date", "batter", "pitcher", "at_bat_num", "out", "strikeout", "walk", "single", "double", "triple", "home_run"]]
#             rolling_factored_pitching_stats[pitbat_combo][year][pitcher][plays] = rolling_factored_pitching_stats[pitbat_combo][year][pitcher].apply(lambda x: pd.Series([x[plays][p]/x[plays].sum() for p in plays]), axis=1)

In [14]:
# pkl.dump(rolling_factored_pitching_stats, open("rolling_factored_pitching_stats.pkl","wb"))
# pkl.dump(rolling_factored_batting_stats, open("rolling_factored_batting_stats.pkl","wb"))

# Load the cached rolling stats instead of rebuilding them.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# Context managers close the file handles (the bare open() calls never did).
with open("rolling_factored_pitching_stats.pkl","rb") as pitching_stats_file:
    rolling_factored_pitching_stats = pkl.load(pitching_stats_file)
with open("rolling_factored_batting_stats.pkl","rb") as batting_stats_file:
    rolling_factored_batting_stats = pkl.load(batting_stats_file)

In [84]:
# for pitbat_combo in hand_combos:
#     for year in training_years:
#         for batter in rolling_factored_batting_stats[pitbat_combo][year]:
#             clear_output(wait = True)
#             df = rolling_factored_batting_stats[pitbat_combo][year][batter]
#             df["pitcher"] = df.apply(lambda x: factored_batting_stats[pitbat_combo].loc[x.name].pitcher, axis = 1)

# for pitbat_combo in hand_combos:
#     for year in training_years:
#         for batter in rolling_factored_batting_stats[pitbat_combo][year]:
#             rolling_factored_batting_stats[pitbat_combo][year][batter]["pitbat"] = pitbat_combo
#         for pitcher in rolling_factored_pitching_stats[pitbat_combo][year]:
#             rolling_factored_pitching_stats[pitbat_combo][year][pitcher]["pitbat"] = pitbat_combo
            


In [142]:
# Pull all the DFs out of the dictionary and into a large DF that will be used for final training.
# The per-player frames are collected in lists and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on every call, which made the original loop quadratic.

batting_frames = []
pitching_frames = []

for pitbat_combo in hand_combos:
    print(pitbat_combo)
    for year in training_years:
        # Same order as before: every batter (then pitcher) frame of this combo/year.
        batting_frames.extend(rolling_factored_batting_stats[pitbat_combo][year].values())
        pitching_frames.extend(rolling_factored_pitching_stats[pitbat_combo][year].values())
    clear_output(wait=True)

clear_output(wait=False)

# pd.concat raises on an empty list, so fall back to an empty frame like the original start value.
training_stats = pd.concat(batting_frames) if batting_frames else pd.DataFrame()
pitching_holder = pd.concat(pitching_frames) if pitching_frames else pd.DataFrame()

# game_date is split on "-" and the first piece is kept as the year string.
training_stats["year"] = training_stats.game_date.apply(lambda x: x.split("-")[0])
pitching_holder["year"] = pitching_holder.game_date.apply(lambda x: x.split("-")[0])

# NOTE(review): if the pitcher frames already carry a "pitcher" column (the
# commented-out builder cell selects both "batter" and "pitcher"), this rename
# yields two columns named "pitcher" - confirm this is intended.
pitching_holder = pitching_holder.rename(columns = {"batter":'pitcher'})

In [168]:
def _pitcher_rates_before(row):
    """Return the pitcher's play rates from the last row dated before this at-bat.

    Looks up the pitcher's frame in rolling_factored_pitching_stats by the
    row's pitbat, year and pitcher, keeps the rows whose game_date is strictly
    earlier than the row's game_date, and returns the values of the last such
    row for every entry of `plays`, relabelled with a "p_" prefix. When no
    earlier row exists, every value is NaN.
    """
    rate_columns = ["p_" + play for play in plays]
    pitcher_df = rolling_factored_pitching_stats[row.pitbat][row.year][row.pitcher]
    earlier = pitcher_df[pitcher_df.game_date < row.game_date]
    if len(earlier) == 0:
        return pd.Series(np.nan, index=rate_columns)
    return pd.Series(earlier.iloc[-1][plays].values, index=rate_columns)


# One lookup per row covers all plays; the original filtered the pitcher frame
# twice per row and repeated the whole pass once per play.
prior_pitching = training_stats.apply(_pitcher_rates_before, axis=1)
for play in plays:
    # Cast to float so missing values are NaN rather than None.
    training_stats["p_" + play] = prior_pitching["p_" + play].astype(float)

We now have each pitcher's stats as of the end of their most recent earlier game date (not necessarily yesterday), but we still need to do the same for the batting stats.

In [178]:
# Attach the play_type of each at-bat by looking the row's index label up in
# the factored batting stats frame for the row's handedness combo.
training_stats["play"] = training_stats.apply(
    lambda row: factored_batting_stats[row.pitbat].loc[row.name].play_type,
    axis=1,
)

In [201]:
# For every row, combine the batter rates, the "p_" pitcher rates and the
# league averages with the Morey Z combiner; each cell of "prediction" holds
# the dict returned by ab_play_percentages.
training_stats["prediction"] = training_stats.apply(
    lambda row: ab_play_percentages(
        row[plays],
        row[["p_" + outcome for outcome in plays]],
        league_average_plays_dict,
        row.pitbat,
        "morey z",
    ),
    axis=1,
)

In [203]:
# Spread each prediction dict into one "f_<play>" column per play outcome.
for play in plays:
    training_stats["f_" + play] = training_stats["prediction"].apply(
        lambda forecast: forecast[play]
    )

In [220]:
# Mean predicted walk rate over every row of training_stats.
training_stats["f_walk"].mean()

0.0881061847960437

In [221]:
# Mean predicted walk rate over only the rows whose recorded play is a walk.
training_stats.loc[training_stats["play"] == "walk", "f_walk"].mean()

0.10500879067914749

In [222]:
x = 1