In [56]:
import pandas as pd
import numpy as np
import pickle as pkl
import random
import time
import math
import pybaseball
import sklearn
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer
from pybaseball import playerid_lookup, playerid_reverse_lookup
import warnings
import matplotlib.pyplot as plt
from IPython.display import clear_output

hand_combos = ["RR", "RL", "LR", "LL"]
training_years = ["2012", "2013", "2014"]

plays = ["out", "strikeout", "walk", "single", "double", "triple", "home_run"]

warnings.simplefilter("ignore")

### Odds Functions

In [57]:
def log5 (pB, pP, pL):
    """ Given the probability of a PA outcome for the pitcher, the batter, and the overall league, calculate the
    probability in that given at bat using the log5 equation. NOTE: DO NOT USE RIGHT NOW""" 
    one = (pB*pP)/pL
    two = ((1-pB)*(1-pP))/(1-pL)
    
    return one/(one + two)


def morey_z(pB, pP, pL):
    """ Given the probability of a PA outcome for the pitcher, the batter, and the overall league, calculate the
    probability in that given at bat using the Morey Z equation"""
    one = (pB-pL)/np.sqrt(pL*(1-pL))
    two = (pP-pL)/np.sqrt(pL*(1-pL))
    three = np.sqrt(pL*(1-pL))
    return ((one + two)/np.sqrt(2) * three) +pL

def ab_play_percentages(batting_percentages, pitching_percentages, league_percentages, pitbat_combo, function):
    """ Given a list of probabilities for all PA outcomes for the batter, the pitcher, and the league, along with
    the pitbat combo, and the desired probability funtion, return a list of the probabilities for all PA outcomes 
    for the specific PA"""
    
    ab_percentages = {}
    
    # Get the specific percentages for each play type
    for play in plays:
        batting_percent = batting_percentages["b_" + play]
        pitching_percent = pitching_percentages["p_" + play]
        league_percent = league_percentages[pitbat_combo][play]
        
        # Ensure we are using one of the two acceptable prediction functions
        if function not in ["morey z", "Morey Z", "log5", "Log5"]:
            while funtion not in ["morey z", "Morey Z", "log5", "Log5"]:
                function = input("Acceptable Functions are Morey Z and Log5. Please input one.")
        
        # Calculate the predicted percentage for the specific play for the PA
        if function == "morey z" or function == "Morey Z":
            expected_percent = max(morey_z(batting_percent, pitching_percent, league_percent), 0.000001)
        else:
            expected_percent = log5(batting_percent, pitching_percent, league_percent)
    
        # Insert the predicted percentage for the play type into our dictionary for delivery
        ab_percentages[play] = expected_percent
        
        # Get rid of negative and zero numbers and repercenage slightly if numbers are reset
        ab_percentages = {key: value/sum(list(ab_percentages.values())) for key, value in ab_percentages.items()}
    
    return ab_percentages

# League Average Guesser
def average_guesser(batting_percentages, pitching_percentages, league_percentages, pitbat_combo):
    ab_percentages = {}
    
    for play in plays:
        league_percent = league_percentages[pitbat_combo][play]
        ab_percentages[play] = league_percent
        
    ab_percentages = {key: value/sum(list(ab_percentages.values())) for key, value in ab_percentages.items()}
    
    return ab_percentages  
        

In [63]:
def log_loss(probabilities, actuals):
    """ Given a list of probabilities and acuals for a series of instances, calculate and return the average log loss"""
    log_loss = 0
    yhat_probabilities = []
    for instance in range(len(probabilities)):
        yhat_probabilities.append(max([n for n in np.array(probabilities.iloc[instance])*np.array(actuals.iloc[instance])]))
    log_loss -= sum([np.log10(x)for x in yhat_probabilities])

    return yhat_probabilities
    return log_loss/len(probabilities)

In [59]:
odds_dataset = pkl.load(open("odds_functions_data_set", "rb"))
odds_dataset = pkl.load(open("odds_functions_dataset_without_yearbreaks.pkl", "rb"))["batting_stats"]

league_averages = pkl.load(open("league_averages.pkl", "rb"))

## Make Predictions on Neutral Training Data

In [60]:
odds_dataset["morey_prediction"] = odds_dataset.apply(lambda x: ab_play_percentages(x[["b_" + play for play in plays]], x[["p_" + play for play in plays]], league_averages, x.pitbat, "morey z"), axis = 1)
odds_dataset["morey_prediction_list"] = odds_dataset.morey_prediction.apply(lambda x: list(x.values()))

odds_dataset["la_prediction"] = odds_dataset.apply(lambda x: average_guesser(x[["b_" + play for play in plays]], x[["p_" + play for play in plays]], league_averages, x.pitbat), axis = 1)
odds_dataset["la_prediction_list"] = odds_dataset.la_prediction.apply(lambda x: list(x.values()))

odds_dataset["actuals"] = odds_dataset.apply(lambda x: list(x.la_prediction.keys()).index(x.play),axis=1)

odds_dataset['yhat'] = odds_dataset.actuals.apply(lambda x: [0]*len(plays))
odds_dataset['yhat'] = odds_dataset.apply(lambda x: x.yhat[0:x.actuals] +[1] +  x.yhat[x.actuals+1:], axis=1)

odds_dataset = odds_dataset.drop(columns = ["morey_prediction", "la_prediction"])

# Remove some instances with NA from pitchers who only make 1 app
odds_dataset.dropna(inplace=True)

In [64]:
crappy_log_loss = log_loss(odds_dataset.la_prediction_list, odds_dataset.yhat)
morey_log_loss = log_loss(odds_dataset.morey_prediction_list, odds_dataset.yhat)

print("Crappy Log Loss: {}".format(crappy_log_loss))
print("Morey Log Loss: {}".format(morey_log_loss))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [65]:
for play in plays:
    df = odds_dataset[odds_dataset.play == play].copy()
    play_percent = len(df)/len(odds_dataset)
    
    crappy_predictions = log_loss(df.la_prediction_list, df.yhat)
    morey_predictions = log_loss(df.morey_prediction_list, df.yhat)
    
    print(str(play) + "'s average value is {}".format(play_percent))
    print(str(play) + "'s crappy average prediction value is: {}".format(sum(crappy_predictions)/len(crappy_predictions)))
    print(str(play) + "'s morey average prediction value is: {}".format(sum(morey_predictions)/len(morey_predictions)))
    print("\n")
    
    

out's average value is 0.4821648345592577
out's crappy average prediction value is: 0.4822227813370794
out's morey average prediction value is: 0.6564536916713998


strikeout's average value is 0.19416227337019415
strikeout's crappy average prediction value is: 0.2017325250721229
strikeout's morey average prediction value is: 0.12347866440535127


walk's average value is 0.08925061602370149
walk's crappy average prediction value is: 0.08771363875109711
walk's morey average prediction value is: 0.0638860064167906


single's average value is 0.15794873948036203
single's crappy average prediction value is: 0.1553348468574084
single's morey average prediction value is: 0.11297797622883227


double's average value is 0.045776006172045774
double's crappy average prediction value is: 0.04495534493252304
double's morey average prediction value is: 0.038202234414635085


triple's average value is 0.004650027684984242
triple's crappy average prediction value is: 0.004788239065214969
triple's mor

In [None]:
sum([np.log10(x) for x in morey_log_loss])