In [237]:
import pandas as pd
import numpy
import sklearn 
import pickle as pkl
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from IPython.display import clear_output


import numpy as np
from scipy import stats

warnings.simplefilter("ignore")

hand_combos = ["RR", "RL", "LR", "LL"]
training_years = ["2012", "2013", "2014"]

In [140]:
# Stadium lookup table: keep only the columns used to map a home team and season to a ballpark.
# NOTE(review): this is an absolute, machine-specific path; a configurable data directory would
# make the notebook runnable elsewhere.
ballpark_info = pd.read_excel(
    "/Users/jaredzirkes/Desktop/Python/MLB BETTING/Ballpark Info.xlsx",
    header=2,
).loc[:, ["Stadium", "Team", "Start Date", "End Date"]]

## Functions

In [51]:
def convert_wind_direction(df, wind_column):
    """One-hot encode the wind direction and scale each indicator by the wind speed.

    Rows whose ``wind_speed`` is 0 are relabelled ``"zero"`` before encoding, because the
    source data lists calm conditions as blowing "in".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a ``wind_speed`` column. It is not modified; the
        function works on a copy.
    wind_column : pandas.Series
        Wind direction labels aligned with ``df``'s index (typically ``df.wind_direction``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (with ``wind_direction`` set to ``"zero"`` on calm rows) followed by
        one column per wind direction. Each direction column holds the wind speed on rows
        blowing in that direction and 0 elsewhere.
    """
    result = df.copy()
    calm = result["wind_speed"] == 0

    # Relabel on the labels that are actually encoded, so the "zero" category never depends
    # on whether ``wind_column`` shares memory with ``df``.
    directions = pd.Series(wind_column).mask(calm, "zero")
    result.loc[calm, "wind_direction"] = "zero"

    # Scale only the freshly created indicator columns (selected by construction, not by
    # position), so unrelated trailing columns are never multiplied by the wind speed.
    direction_speeds = pd.get_dummies(directions).mul(result["wind_speed"], axis=0)

    return pd.concat([result, direction_speeds], axis=1)

In [196]:
def convert_stadium_column(df, stadium_column):
    """Return ``df`` with one indicator column per distinct value of ``stadium_column``.

    The indicator columns are appended to the right of the existing columns; ``df`` itself
    is left untouched because ``pd.concat`` builds a new frame.
    """
    stadium_indicators = pd.get_dummies(stadium_column, columns=["categorical_column", ])
    return pd.concat([df, stadium_indicators], axis=1)
    

# Begin Code

In [171]:
# Load the play-by-play frames; later cells index this object as [season][hand combo].
# The context manager closes the file handle once loading finishes.
# NOTE(review): unpickling can execute arbitrary code, so only load files you produced yourself.
with open("all_plays_by_hand_combo.pkl", "rb") as plays_file:
    all_plays_by_hand_combo = pkl.load(plays_file)

In [178]:
# Combine the training seasons (keeping the hand combos separate) into the full initial training set.

def _stadium_for(team, season):
    """Return the stadium listed in ``ballpark_info`` for ``team`` during ``season``.

    A row matches when its Team equals ``team`` and ``Start Date <= season < End Date``.
    The first matching row wins. Raises LookupError when no row matches.
    NOTE(review): the End Date comparison is kept strict, as in the original filter;
    confirm that End Date is meant to be exclusive.
    """
    in_use = ballpark_info[
        (ballpark_info.Team == team)
        & (ballpark_info["Start Date"] <= season)
        & (ballpark_info["End Date"] > season)
    ]
    if in_use.empty:
        raise LookupError("no stadium listed for team {!r} in season {}".format(team, season))
    return in_use.Stadium.iloc[0]


all_training_data = {}
for pitbat_combo in hand_combos:
    # Concatenate once per hand combo instead of growing the frame season by season
    # (DataFrame.append no longer exists in pandas 2.x).
    season_frames = [all_plays_by_hand_combo[year][pitbat_combo] for year in training_years]
    combined = pd.concat(season_frames, ignore_index=True)

    # Every row counts as one play; later cells accumulate this per play type.
    combined["type_counter"] = 1

    # game_date is split on "-" and its first piece is read as the season year.
    seasons = combined.game_date.str.split("-").str[0].astype(int)

    # Resolve each distinct (home team, season) pair once rather than filtering
    # ballpark_info again for every single play.
    stadium_by_team_season = {}
    for key in zip(combined.home_team, seasons):
        if key not in stadium_by_team_season:
            stadium_by_team_season[key] = _stadium_for(*key)
    combined["ballpark"] = [stadium_by_team_season[key] for key in zip(combined.home_team, seasons)]

    all_training_data[pitbat_combo] = combined



2012
RR
RL
LR
LL
2013
RR
RL
LR
LL
2014
RR
RL
LR
LL


In [181]:
# For every hand combo, keep the last recorded row of each play type. Its cum_play_type_share
# is the value copied onto every play of that type below.
# NOTE(review): the grouping is by play_type only, not by date, so each play receives the
# final cumulative share in the data rather than the share at the end of its own day. Confirm
# this is intended.
eod_play_shares = {
    pitbat_combo: plays.groupby(by=["play_type"]).last()
    for pitbat_combo, plays in all_training_data.items()
}

# Copy the share for each play's type into all_training_data. A vectorised map replaces the
# row-by-row .loc lookup and yields the same values.
for pitbat_combo in all_training_data:
    final_share_by_type = eod_play_shares[pitbat_combo].cum_play_type_share
    all_training_data[pitbat_combo]["eod_play_share"] = all_training_data[pitbat_combo].play_type.map(final_share_by_type)

In [182]:
# For each game and, separately, each batter, compute within a hand combo the share of the
# plays that were of each play type.

def _play_type_shares(plays):
    """Summarise ``plays`` per play type and attach each type's share of all the plays.

    Returns ``(summary, total)``. ``summary`` is indexed by play_type and holds the per-type
    maximum of every column plus a ``play_share`` column (type count / ``total``).
    ``total`` is the number of rows in ``plays``.
    """
    plays = plays.copy()
    # Running count per play type; the per-type maximum taken below is that type's total.
    # Only type_counter is accumulated, so non-numeric columns never reach cumsum.
    plays["type_counter"] = plays.groupby(by="play_type").type_counter.cumsum()
    total = len(plays)
    summary = plays.groupby(by="play_type").max()
    summary["play_share"] = summary.type_counter / total
    return summary, total


game_play_shares = {x: {"games": {}, "players": {}} for x in hand_combos}
n = 0
n1 = 0

for pitbat_combo in all_training_data:
    full_df = all_training_data[pitbat_combo].copy()

    # For each game: a single groupby pass replaces re-filtering the whole frame per game.
    # sort=False keeps the games in order of first appearance.
    for game, game_rows in full_df.groupby("game_pk", sort=False):
        game_df, total = _play_type_shares(game_rows)
        game_df["count"] = total
        game_play_shares[pitbat_combo]["games"][game] = game_df

        n += 1
        if n % 1000 == 0:
            clear_output(wait=True)
            print("game ", n)
        # Note -- there are ~28,000 games in this 2012-2014 training set

    # For each player
    clear_output(wait=True)
    print("Player")
    # Group on batter: the previous filter compared game_pk with the batter id, so every
    # per-player frame came out empty.
    for player, player_rows in full_df.groupby("batter", sort=False):
        player_df, total = _play_type_shares(player_rows)
        game_play_shares[pitbat_combo]["players"][player] = player_df

        # Progress update
        n1 += 1
        if n1 % 1000 == 0:
            clear_output(wait=True)
            print("Player ", n1)

clear_output(wait=False)

In [183]:
# For every play, look up the share of plays in its game that had the same play type
# (from game_play_shares) and relate it to the league-wide eod_play_share.
# Note: despite its name, game_share_delta is stored as the RATIO game share / eod share.
# TODO: the per-batter equivalents (batter_play_share / batter_share_delta) are not computed here.
for pitbat_combo in hand_combos:
    plays = all_training_data[pitbat_combo]
    shares_by_game = game_play_shares[pitbat_combo]["games"]

    plays["game_play_share"] = plays.apply(
        lambda play: shares_by_game[play.game_pk].loc[play.play_type].play_share,
        axis=1,
    )
    plays["game_share_delta"] = plays.game_play_share / plays.eod_play_share

# Regressions!

#### Cleaning for Weather Regression

In [184]:
# Remove the first 100 games of each season (per hand combo) to let the rolling stats normalize.
weather_training_data = {x: {} for x in hand_combos}

for pitbat_combo in hand_combos:
    weather_training_df = all_training_data[pitbat_combo]

    # Rebuilt for every hand combo: a list shared across combos would also exclude the
    # early games collected for the combos processed before this one.
    first_games = [
        all_plays_by_hand_combo[year][pitbat_combo].game_pk.unique()[:100]  # first 100 game ids of the season
        for year in training_years
    ]
    first_games_list = np.concatenate(first_games)

    # Keep only the plays from games that are not among the early-season games
    weather_training_df = weather_training_df[~weather_training_df.game_pk.isin(first_games_list)]
    weather_training_data[pitbat_combo] = weather_training_df[["game_pk", "game_date", "play_type", "temprature", "wind_speed", "wind_direction", "game_play_share"]].copy()

In [185]:
# Collapse the weather training data to one row per (game, play type), keeping the last
# recorded value of each column; game_play_share is then that play type's share in the game.
for pitbat_combo in hand_combos:
    last_row_per_game_and_type = weather_training_data[pitbat_combo].groupby(by=["game_pk", "play_type"]).last()
    weather_training_data[pitbat_combo] = last_row_per_game_and_type.reset_index()

In [186]:
# Only play types that actually happened in a game have a row so far. Add a row with a
# game_play_share of 0 for every play type a game is missing, copying the game's weather
# from its first existing row.
play_types = ['out', 'single', 'strikeout', 'double', 'walk', 'home_run', 'triple']

for pitbat_combo in hand_combos:
    combo_df = weather_training_data[pitbat_combo]

    # Collect all filler rows first and concatenate once: DataFrame.append no longer exists
    # in pandas 2.x, and growing a frame row by row is quadratic.
    filler_rows = []
    for game, game_rows in combo_df.groupby("game_pk", sort=False):
        seen_play_types = set(game_rows.play_type)
        first_row = game_rows.iloc[0]
        for play in play_types:
            if play not in seen_play_types:
                filler_rows.append({"game_pk": game, "game_date": first_row.game_date, "play_type": play,
                                    "temprature": first_row.temprature, "wind_speed": first_row.wind_speed,
                                    "wind_direction": first_row.wind_direction, "game_play_share": 0})

    if filler_rows:
        weather_training_data[pitbat_combo] = pd.concat([combo_df, pd.DataFrame(filler_rows)], ignore_index=True)

In [187]:
for pitbat_combo in hand_combos:

    # Keep just the columns the weather regression works from
    weather_frame = weather_training_data[pitbat_combo][["game_pk", "play_type", "temprature", "wind_speed", "wind_direction", "game_play_share"]]

    # Temperature enters the regression squared (modelling assumption of the author)
    weather_frame["temprature_squared"] = weather_frame["temprature"] ** 2

    # One-hot encode the wind direction; convert_wind_direction also scales each direction
    # indicator by the wind speed, so no separate interaction step is needed here.
    weather_training_data[pitbat_combo] = convert_wind_direction(weather_frame, weather_frame.wind_direction)

#### Weather Regression

In [188]:
# Fit, per hand combo and play type, a linear regression of game_play_share on squared
# temperature and the wind-speed-by-direction columns.
weather_coefficients = {}

# Regression inputs selected by NAME, paired with the key each coefficient is stored under.
# Positional selection would silently mislabel coefficients whenever a wind direction is
# absent from a hand combo or the column order changes.
WEATHER_FEATURE_COLUMNS = ["temprature_squared", "Left to Right", "Right to Left", "in", "out"]
WEATHER_COEFFICIENT_KEYS = ["temprature_sq", "wind_ltr", "wind_rtl", "wind_in", "wind_out"]

for pitbat_combo in hand_combos:
    weather_coefficients[pitbat_combo] = {}
    combo_df = weather_training_data[pitbat_combo]

    for play_type in combo_df.play_type.unique():
        plays = combo_df[combo_df.play_type == play_type]

        # Remove game_play_share outliers (|z| >= 3), most of which are caused by low
        # pitbat_combo sample sizes in games. Written as "not >= 3" so that a zero-variance
        # group (z-scores are NaN) keeps its rows instead of losing all of them.
        z_scores = np.abs(stats.zscore(plays.game_play_share))
        plays = plays[~(z_scores >= 3)]

        # A wind direction that never occurs for this hand combo becomes an all-zero column.
        x_sq = plays.reindex(columns=WEATHER_FEATURE_COLUMNS, fill_value=0)
        y = plays.game_play_share

        # Regress game_play_share on the squared-temperature feature set
        lin_sq = LinearRegression(fit_intercept=True)
        lin_sq.fit(x_sq, y)

        coefficients = {"intercept": lin_sq.intercept_}
        coefficients.update(zip(WEATHER_COEFFICIENT_KEYS, lin_sq.coef_))
        weather_coefficients[pitbat_combo][play_type] = coefficients

#### Cleaning for Stadium Regression

In [301]:
# Display the stadium lookup table (Stadium, Team, Start Date, End Date) for reference.
ballpark_info

Unnamed: 0,Stadium,Team,Start Date,End Date
0,Fenway,BOS,1900,2023
1,Wrigley,CHC,1900,2023
2,Dodger,LAD,1900,2023
3,Coliseum,OAK,1900,2023
4,Angel,LAA,1900,2023
5,Kauffman,KC,1900,2023
6,Rodgers,TOR,1900,2023
7,Tropicana,TB,1900,2023
8,Guaranteed Rate,CWS,1900,2023
9,Camden,BAL,1900,2023


In [340]:
# Park factors: for every hand combo, team and play type, the rate of that play type in the
# team's "home" rows divided by its rate in the team's "away" rows.
factors_dict = {}
for pitbat_combo in hand_combos:
    factors_dict[pitbat_combo] = {}
    combo_df = all_training_data[pitbat_combo]

    for team in all_training_data["RR"].home_team.unique():
        factors_dict[pitbat_combo][team] = {}
        # Rows where the team is the home side in "Bot" half-innings vs. the away side in
        # "Top" half-innings -- presumably the team's own plate appearances at home and on
        # the road (TODO confirm).
        home_df = combo_df[(combo_df.home_team == team) & (combo_df.inning_topbot == "Bot")]
        away_df = combo_df[(combo_df.away_team == team) & (combo_df.inning_topbot == "Top")]

        for play_type in ["out", "strikeout", "double", "walk", "single", "home_run", "triple"]:
            home_rate = away_rate = None
            try:
                home_rate = len(home_df[home_df.play_type == play_type]) / len(home_df)
                away_rate = len(away_df[away_df.play_type == play_type]) / len(away_df)
                park_factor = home_rate / away_rate
            except ZeroDivisionError:
                # No home or away plays at all, or the play type never occurred on the road.
                park_factor = "n/a"

            # Spot-check output for one known case
            if pitbat_combo == "LL" and team == "NYY" and play_type == "home_run":
                print(home_rate, away_rate, park_factor)

            factors_dict[pitbat_combo][team][play_type] = park_factor

0.03389830508474576 0.015246636771300448 2.22333000997009


In [335]:
# Spot check: share of home runs among the LL plays where NYY is the away team in "Top" half-innings.
x = all_training_data["LL"]
x = x[(x.away_team == "NYY") & (x.inning_topbot == "Top")]
len(x[x.play_type == "home_run"])/len(x)

0.015246636771300448

In [306]:
# List the distinct play types present in the RR training data.
all_training_data["RR"].play_type.unique()

array(['out', 'strikeout', 'double', 'walk', 'single', 'home_run',
       'triple'], dtype=object)

In [189]:
# Remove the first 100 games of each season (per hand combo) to let the rolling stats normalize.
stadium_training_data = {x: {} for x in hand_combos}

for pitbat_combo in hand_combos:
    stadium_training_df = all_training_data[pitbat_combo]

    # Rebuilt for every hand combo: a list shared across combos would also exclude the
    # early games collected for the combos processed before this one.
    first_games = [
        all_plays_by_hand_combo[year][pitbat_combo].game_pk.unique()[:100]  # first 100 game ids of the season
        for year in training_years
    ]
    first_games_list = np.concatenate(first_games)

    # Keep only the plays from games that are not among the early-season games
    stadium_training_df = stadium_training_df[~stadium_training_df.game_pk.isin(first_games_list)]
    stadium_training_data[pitbat_combo] = stadium_training_df[["game_pk", "game_date", "play_type", "ballpark", "game_play_share"]].copy()

In [192]:
# Collapse the stadium training data to one row per (game, play type), keeping the last
# recorded value of each column; game_play_share is then that play type's share in the game.
for pitbat_combo in hand_combos:
    last_row_per_game_and_type = stadium_training_data[pitbat_combo].groupby(by=["game_pk", "play_type"]).last()
    stadium_training_data[pitbat_combo] = last_row_per_game_and_type.reset_index()

In [193]:
# Only play types that actually happened in a game have a row so far. Add a row with a
# game_play_share of 0 for every play type a game is missing, copying the game's date and
# ballpark from its first existing row.
play_types = ['out', 'single', 'strikeout', 'double', 'walk', 'home_run', 'triple']

for pitbat_combo in hand_combos:
    combo_df = stadium_training_data[pitbat_combo]

    # Collect all filler rows first and concatenate once: DataFrame.append no longer exists
    # in pandas 2.x, and growing a frame row by row is quadratic.
    filler_rows = []
    for game, game_rows in combo_df.groupby("game_pk", sort=False):
        seen_play_types = set(game_rows.play_type)
        first_row = game_rows.iloc[0]
        for play in play_types:
            if play not in seen_play_types:
                filler_rows.append({"game_pk": game, "game_date": first_row.game_date, "play_type": play,
                                    "ballpark": first_row.ballpark, "game_play_share": 0})

    if filler_rows:
        stadium_training_data[pitbat_combo] = pd.concat([combo_df, pd.DataFrame(filler_rows)], ignore_index=True)

In [206]:
for pitbat_combo in hand_combos:

    # Keep just the columns the stadium regression works from
    stadium_frame = stadium_training_data[pitbat_combo][["game_pk", "play_type", "ballpark", "game_play_share"]]

    # One-hot encode the ballpark: one indicator column per stadium, appended on the right
    stadium_training_data[pitbat_combo] = convert_stadium_column(stadium_frame, stadium_frame.ballpark)

#### Ballpark Regression

In [208]:
# Inspect the encoded stadium training frame for the RR hand combo.
stadium_training_data["RR"]

Unnamed: 0,game_pk,play_type,ballpark,game_play_share,American Family,Angel,Busch,Camden,Chase,Citizens Bank,...,PNC,Petco,Progressive,Rodgers,T-Mobile,Target,Tropicana,Turner,Wrigley,Yankee
0,317795.0,double,Progressive,0.105263,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,317795.0,out,Progressive,0.473684,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,317795.0,single,Progressive,0.105263,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,317795.0,strikeout,Progressive,0.263158,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,317795.0,walk,Progressive,0.052632,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48911,382968.0,home_run,Progressive,0.000000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
48912,382968.0,triple,Progressive,0.000000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
48913,385292.0,single,Globe Life,0.000000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48914,385292.0,home_run,Globe Life,0.000000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [232]:
# Fit, per hand combo and play type, a linear regression of game_play_share on the one-hot
# stadium columns, and store one coefficient per stadium plus the intercept.
stadium_coefficients = {}

for pitbat_combo in hand_combos:
    combo_df = stadium_training_data[pitbat_combo]
    stadium_coefficients[pitbat_combo] = coefficients_by_play_type = {}

    for play_type in combo_df.play_type.unique():
        plays = combo_df[combo_df.play_type == play_type]

        # Drop game_play_share outliers (|z| >= 3), most of which are caused by low
        # pitbat_combo sample sizes in games
        within_three_sigma = np.abs(stats.zscore(plays.game_play_share)) < 3
        plays = plays[within_three_sigma]

        # Every column that is not an identifier or the target is a stadium indicator
        x = plays[[col for col in plays.columns if col not in ["game_pk", "play_type", "game_date", "ballpark", "game_play_share"]]]
        y = plays.game_play_share

        stadium_lin_reg = LinearRegression(fit_intercept=True)
        stadium_lin_reg.fit(x, y)

        # The stadium indicator columns start at position 4, after game_pk, play_type,
        # ballpark and game_play_share; pair each with its fitted coefficient.
        fitted = {
            stadium: stadium_lin_reg.coef_[position]
            for position, stadium in enumerate(combo_df.columns[4:])
        }
        fitted["intercept"] = stadium_lin_reg.intercept_
        coefficients_by_play_type[play_type] = fitted

# Adjusting Batting Stats with Weather and Stadium Coefficients

In [276]:
# Build, per hand combo, a play-level frame carrying the fitted weather and stadium effects.
training_stats = {}
for pitbat_combo in hand_combos:
    weather_coefs = weather_coefficients[pitbat_combo]
    stadium_coefs = stadium_coefficients[pitbat_combo]

    # Relevant columns only, restricted to the games kept for the weather regression
    # (i.e. without the early-season games)
    df = all_training_data[pitbat_combo][["game_pk", "game_date", "batter", "play_type", "temprature", "wind_speed", "wind_direction", "ballpark"]]
    df = df[df.game_pk.isin(weather_training_data[pitbat_combo].game_pk)]

    df = convert_wind_direction(df, df.wind_direction)

    df["play_value"] = 1

    def weather_expectation(play):
        """Fitted game share for this play's type under the play's actual wind and temperature."""
        coefs = weather_coefs[play.play_type]
        return (play["Left to Right"]*coefs["wind_ltr"] + play["Right to Left"]*coefs["wind_rtl"] +
                play["in"]*coefs["wind_in"] + play["out"]*coefs["wind_out"] +
                (play["temprature"]**2) * coefs["temprature_sq"] + coefs["intercept"])

    def neutral_weather_expectation(play):
        """Fitted game share for this play's type at 72 degrees with no wind terms."""
        coefs = weather_coefs[play.play_type]
        return 72**2 * coefs["temprature_sq"] + coefs["intercept"]

    def stadium_impact(play):
        """Stadium coefficient for the play's ballpark plus the stadium regression intercept."""
        coefs = stadium_coefs[play.play_type]
        return coefs[play.ballpark] + coefs["intercept"]

    df["weather_expectation"] = df.apply(weather_expectation, axis=1)
    df["neutral_weather_expectation"] = df.apply(neutral_weather_expectation, axis=1)
    df["weather_impact"] = df.weather_expectation-df.neutral_weather_expectation

    df["stadium_impact"] = df.apply(stadium_impact, axis=1)

    training_stats[pitbat_combo] = df

In [295]:
# Column means of the RR training stats. `.weather_` is not an attribute of the frame and
# raised AttributeError; the recorded output below is the mean of every numeric column.
x = training_stats["RR"].mean(numeric_only=True)
x

# Optional check of the per-play-type stadium impact for one ballpark:
#rr = training_stats["RR"]; rr[rr.ballpark=="Target"].groupby(by="play_type").last().stadium_impact.sum()

game_pk                        350069.440041
batter                         446499.897562
temprature                         73.998269
wind_speed                          7.383785
Left to Right                       1.329781
Right to Left                       1.338929
in                                  1.927505
out                                 2.787570
zero                                0.000000
play_value                          1.000000
weather_expectation                 0.311324
neutral_weather_expectation         0.311269
weather_impact                      0.000055
stadium_impact                      0.311893
dtype: float64