In [1]:
import ast
import datetime
import dateutil.parser
import distutils.dir_util
import glob
import IPython.display
import json
import math
import numpy as np
import os
import pandas as pd
import pathlib
import pickle
import pyautogui
import pytz
import re
import requests
import selenium
import shutil
import statsapi
import statsmodels.formula.api as smf
import time
import unidecode
import warnings
import webbrowser
import xlrd
import random
import urllib
from urllib.request import urlopen, Request
import zipfile

from bs4 import BeautifulSoup
from datetime import date
from IPython.display import display, Javascript
from joblib import Parallel, delayed
from pathlib import Path
from scipy import stats
from sklearn.preprocessing import StandardScaler
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from openpyxl import load_workbook
from functools import partial

from statsapi import get
from pydfs_lineup_optimizer import get_optimizer, Site, Sport, Player, TeamStack, PlayerFilter, RandomFantasyPointsStrategy

os.chdir(r"C:\Users\james\Documents\MLB\Code")

# from Utilities import *
# from Classes import *
# from simulation_functions_three import *

import smtplib
import ssl
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders

# Silence every warning for the rest of the session ("ignore" applies to all categories, not just the first occurrence)
warnings.simplefilter(action="ignore")

# Lift the pandas display limits so DataFrames are rendered without row/column/cell-width truncation
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# Set paths (absolute, machine-specific locations for models, data, and browser downloads)
model_path = r"C:\Users\james\Documents\MLB\Code\Models"
baseball_path = r"C:\Users\james\Documents\MLB\Data2"
download_path = r"C:\Users\james\Downloads"

In [2]:
# Team lookup table: team names, codes, and the shorthand MLB uses in its URLs.
# Only the identifier columns listed below are kept; any other columns in the CSV are discarded.
team_map = pd.read_csv(os.path.join(baseball_path, "Utilities", "Team Map.csv")).loc[
    :,
    ['FULLNAME', 'BBREFTEAM', 'MLBURL', 'FANGRAPHSTEAM', 'VENUE_ID', 'SFBBTEAM', 'DKTEAM', 'ROTOWIRETEAM', 'FANPROSTEAM'],
]

In [3]:
# Read in API data (Stats API or Statcast)
def read_api(directory, base_dir=r"C:\Users\james\Documents\MLB\Data2\3. MLB API"):
    """Load every CSV in one API sub-directory into a single regular-season DataFrame.

    Parameters
    ----------
    directory : str
        Sub-directory name under ``base_dir`` (the callers in this notebook pass
        'Stats API' or 'Statcast').
    base_dir : str, optional
        Folder that contains the API sub-directories. Defaults to the location
        this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        All CSV rows concatenated (files taken in sorted filename order), restricted
        to rows whose ``game_type`` is "R". The index is the concatenated row number
        of the surviving rows; it is not reset after filtering.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` files.
    """
    directory_path = os.path.join(base_dir, directory)

    # Sorted so the concatenation order does not depend on the OS directory listing
    csv_paths = [
        os.path.join(directory_path, filename)
        for filename in sorted(os.listdir(directory_path))
        if filename.endswith('.csv')
    ]
    if not csv_paths:
        # pd.concat would raise a less helpful "No objects to concatenate"
        raise ValueError("No .csv files found in {}".format(directory_path))

    # Concatenate all files into a single DataFrame
    df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

    # Keep only regular season games
    df = df[df['game_type'] == "R"]

    return df

In [4]:
# Create dataset of raw data, merging Stats API and Statcast
def raw_dataset():
    """Build the raw dataset by joining Stats API rows to Statcast rows.

    Both sources are loaded with ``read_api`` and left-joined on
    ``gamePk``/``atBatIndex`` (the ``_merge`` indicator column is kept). The result
    is sorted by ``game_date``, ``gamePk``, ``atBatIndex`` and reduced to a single
    row per (``gamePk``, ``atBatIndex``), keeping the last row of each duplicate set.

    Returns
    -------
    pandas.DataFrame
    """
    stats_api_rows = read_api('Stats API')
    statcast_rows = read_api('Statcast')

    # Left join: every Stats API row survives even without a Statcast match
    merged = pd.merge(stats_api_rows, statcast_rows, on=['gamePk', 'atBatIndex'], how='left', indicator=True)

    # Order the rows, then keep exactly one observation per at bat
    ordered = merged.sort_values(['game_date', 'gamePk', 'atBatIndex'])
    return ordered.drop_duplicates(['gamePk', 'atBatIndex'], keep='last')

In [5]:
# Calculate wind vectors
# Note: 2 is to centerfield, 6 is from centerfield, clockwise
# Note: y vector is positive to centerfield, negative from centerfield
# Note: x vector is positive from left to right, negative from right to left
# Assumption is wind is blowing in 8 cardinal directions, so we can use simple right isosceles triangles
def y_vect(df):
    """Return the wind component along the centerfield axis for a single row.

    ``df`` is one record (anything indexable by 'windSpeed' and 'windDirection',
    e.g. the row passed by ``DataFrame.apply(..., axis=1)``). The result is positive
    when the wind blows out toward centerfield, negative when it blows in, and 0 for
    cross winds or any direction label not listed below. Diagonal directions
    contribute ``windSpeed / 2 * sqrt(2)``.
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)

    # (direction label, y component) pairs
    components = (
        ("Out To CF", speed),
        ("Out To RF", diagonal),
        ("L To R", 0),
        ("In From LF", diagonal * -1),
        ("In From CF", speed * - 1),
        ("In From RF", diagonal * -1),
        ("R To L", 0),
        ("Out To LF", diagonal),
    )

    direction = df['windDirection']
    for label, component in components:
        if direction == label:
            return component

    # Unrecognised direction: treat as no wind along this axis
    return 0

def x_vect(df):
    """Return the left-to-right wind component for a single row.

    ``df`` is one record (anything indexable by 'windSpeed' and 'windDirection',
    e.g. the row passed by ``DataFrame.apply(..., axis=1)``). The result is positive
    for wind moving from left to right, negative for right to left, and 0 for winds
    straight in/out or any direction label not listed below. Diagonal directions
    contribute ``windSpeed / 2 * sqrt(2)``.
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)

    # (direction label, x component) pairs
    components = (
        ("L To R", speed),
        ("In From LF", diagonal),
        ("In From CF", 0),
        ("In From RF", diagonal * -1),
        ("R To L", speed * - 1),
        ("Out To LF", diagonal * -1),
        ("Out To CF", 0),
        ("Out To RF", diagonal),
    )

    direction = df['windDirection']
    for label, component in components:
        if direction == label:
            return component

    # Unrecognised direction: treat as no wind along this axis
    return 0

In [6]:
def clean_weather(df):
    """Parse the raw 'weather' and 'wind' strings into numeric/categorical columns.

    The frame is modified in place and also returned. Columns written:

    * ``temperature`` (int) and ``weather`` (text after the first ", ") from the
      original ``weather`` string, e.g. "72 degrees, Cloudy".
    * ``windSpeed`` (numeric, 0 when missing/unparseable) and ``windDirection``
      (periods removed) from ``wind``, e.g. "5 mph, Out To CF".
    * ``x_vect`` / ``y_vect`` wind components computed row by row with
      ``x_vect`` and ``y_vect``.
    """
    # Separate weather into temperature and weather type.
    # n=1 keeps the assignment two columns wide even if the description itself contains ", ".
    df[['temperature', 'weather']] = df['weather'].str.split(", ", n=1, expand=True)
    df['temperature'] = df['temperature'].str.replace(" degrees", "", regex=False).astype('int')

    # Separate wind into speed and direction
    df[['windSpeed', 'windDirection']] = df['wind'].str.split(", ", n=1, expand=True)

    # Assign results back instead of calling fillna(inplace=True) on a column selection:
    # the chained in-place form is deprecated and is a no-op under pandas Copy-on-Write.
    wind_speed = df['windSpeed'].fillna("0 mph").str.replace(" mph", "", regex=False)
    df['windSpeed'] = pd.to_numeric(wind_speed, errors='coerce').fillna(0)

    # NOTE: the 'L to R' placeholder (lower-case "to") does not match any label handled by
    # x_vect/y_vect, so rows with a missing direction get zero wind vectors.
    # regex=False makes "." a literal period regardless of the pandas version's default.
    df['windDirection'] = df['windDirection'].fillna('L to R').str.replace(".", "", regex=False)

    # Calculate vectors
    df['x_vect'] = df.apply(x_vect, axis=1)
    df['y_vect'] = df.apply(y_vect, axis=1)

    return df

In [7]:
# df2 = clean_weather(df)

In [8]:
def create_events(df):
    """Add an ``eventsModel`` column that collapses raw ``event`` labels into outcome codes.

    Each listed event maps to one of: so, go, lo, fo, po, hbp, bb, b1, b2, b3, hr.
    Any event that is not listed (including missing values) becomes 'Cut'.
    The frame is modified in place and returned.
    """
    # Outcome code -> raw event labels that roll up into it
    events_by_code = {
        'so': ['Strikeout', 'Strikeout Double Play'],
        'go': ['Groundout', 'Fielders Choice', 'Double Play', 'Grounded Into DP',
               'Triple Play', 'Field Error', 'Forceout'],
        'lo': ['Lineout', 'Bunt Lineout'],
        'fo': ['Flyout', 'Sac Fly', 'Sac Fly Double Play'],
        'po': ['Pop Out', 'Bunt Pop Out'],
        'hbp': ['Hit By Pitch'],
        'bb': ['Walk', 'Intent Walk'],
        'b1': ['Single'],
        'b2': ['Double'],
        'b3': ['Triple'],
        'hr': ['Home Run'],
    }
    code_for_event = {
        event: code
        for code, events in events_by_code.items()
        for event in events
    }

    mapped = df['event'].map(code_for_event)
    df['eventsModel'] = mapped.fillna('Cut')
    return df

In [9]:
# df3 = create_events(df2)

In [10]:
# This turns several variables, including events, venues, hands, and bases into dummies
def create_dummies(df):
    """Expand events, venues, handedness and years into dummy columns and add base/PA flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``eventsModel``, ``venue_id``, ``pitchHand``, ``batSide``, ``game_date``
        (string starting with the four-digit year), ``gamePk``, ``inning``,
        ``halfInning`` and ``postOnFirst``/``postOnSecond``/``postOnThird``.
        A ``year`` column is added to this input frame as a side effect.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame with the dummy columns appended plus ``preOn*``, ``onFirst``/
        ``onSecond``/``onThird``, ``top``, ``pa`` and ``ab``; and the list of
        venue and year dummy column names.
    """
    # Events
    event_dummies = pd.get_dummies(df['eventsModel'])
    # Guarantee a column for every outcome code. Without this, a slice of data that happens
    # to lack an outcome (e.g. no 'hbp', or no 'Cut' rows) raises KeyError below and in
    # later steps that reference these columns by name.
    for code in ('Cut', 'b1', 'b2', 'b3', 'bb', 'fo', 'go', 'hbp', 'hr', 'lo', 'po', 'so'):
        if code not in event_dummies.columns:
            event_dummies[code] = 0
    # Venues
    venue_dummies = pd.get_dummies(df['venue_id'], prefix='venue')
    # Hands
    pitcher_dummies = pd.get_dummies(df['pitchHand'], prefix='p')
    batter_dummies = pd.get_dummies(df['batSide'], prefix='b')
    # Years
    df['year'] = df['game_date'].str[:4]
    year_dummies = pd.get_dummies(df['year'], prefix='year')

    # Create lists of dummies
    venue_list = venue_dummies.columns.tolist()
    year_list = year_dummies.columns.tolist()
    dummy_list = venue_list + year_list

    # Add dummies to dataframe
    df = pd.concat([df, event_dummies, venue_dummies, pitcher_dummies, batter_dummies, year_dummies], axis=1)

    # Runner state before each PA = the post-PA state of the previous row in the same
    # half inning (relies on rows already being in play order within each half inning)
    half_inning_keys = ['gamePk', 'inning', 'halfInning']
    for base in ('First', 'Second', 'Third'):
        df['preOn' + base] = df.groupby(half_inning_keys)['postOn' + base].shift(1)

    # 1 when a runner value is present on that base before the PA, else 0
    for base in ('First', 'Second', 'Third'):
        df['on' + base] = df['preOn' + base].notnull().astype('int')

    # Top of the inning dummy
    df['top'] = np.where(df['halfInning'] == "top", 1, 0)

    # Calculate PAs and ABs ('Cut' rows are not counted as plate appearances)
    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df['hbp'] - df['bb']

    return df, dummy_list

In [11]:
# df4, dummy_list = create_dummies(df3)

In [12]:
def find_max(lst):
    """Return the largest element of ``lst``, or 0 when ``lst`` is empty/falsy."""
    if not lst:
        return 0
    return max(lst)

def clean_statcast(df):
    """Derive contact-quality and spray-direction flags from Statcast columns.

    Adds, in place (the same frame is returned):

    * ``hard_hit``  - 1 when ``launch_speed`` is at least 95
    * ``barrel``    - 1 when ``launch_speed_angle`` equals 6
    * ``spray_angle`` - arctan of (hc_x - 125.42) / (198.27 - hc_y), in degrees, scaled by 0.75
    * ``to_left`` / ``to_middle`` / ``to_right`` - 1 when the spray angle is below -15,
      within [-15, 15], or above 15 respectively

    Rows with missing inputs get 0 for every flag, because comparisons against NaN are False.
    """
    # Contact quality flags
    df['hard_hit'] = df['launch_speed'].ge(95).astype('int')
    df['barrel'] = df['launch_speed_angle'].eq(6).astype('int')

    # Spray angle from the hit coordinates
    horizontal_offset = df['hc_x'] - 125.42
    depth = 198.27 - df['hc_y']
    df['spray_angle'] = np.arctan(horizontal_offset / depth) * 180 / np.pi * 0.75

    # Bucket the spray angle into three directions
    spray = df['spray_angle']
    df['to_left'] = spray.lt(-15).astype('int')
    df['to_middle'] = spray.between(-15, 15).astype('int')
    df['to_right'] = spray.gt(15).astype('int')

    return df

In [13]:
# df5 = clean_statcast(df4)

In [14]:
def read_park_factors(team_map):
    """Load Statcast park factors for left- and right-handed batters, keyed by venue.

    Reads sheets 'L' and 'R' of the park-factor workbook, tags each with the matching
    ``batSide``, stacks them, and attaches ``venue_id`` by inner-joining the stripped
    ``Team`` name to ``team_map['FANGRAPHSTEAM']``. Factor columns are divided by 100.

    Parameters
    ----------
    team_map : pandas.DataFrame
        Must provide ``FANGRAPHSTEAM`` and ``VENUE_ID`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns: venue_id, batSide, Park Factor, 1B, 2B, 3B, HR, BB, SO.
    """
    workbook = r"C:\Users\james\Documents\MLB\Data2\Utilities\Statcast Park Factors.xlsx"

    # One sheet per batter handedness; the sheet name doubles as the batSide label
    sheets = []
    for side in ("L", "R"):
        sheet = pd.read_excel(workbook, sheet_name=side)
        sheet['batSide'] = side
        sheets.append(sheet)

    park_factors = pd.concat(sheets, axis=0)
    park_factors['Team'] = park_factors['Team'].str.strip()

    # Bring in the venue ID via the team map; teams without a match are dropped (inner join)
    venue_lookup = team_map[['FANGRAPHSTEAM', 'VENUE_ID']]
    park_factors = park_factors.merge(venue_lookup, left_on='Team', right_on='FANGRAPHSTEAM', how='inner')
    park_factors = park_factors.rename(columns={'VENUE_ID': 'venue_id'})

    # Keep relevant variables
    factor_list = ['Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO']
    park_factors = park_factors[['venue_id', 'batSide'] + factor_list]

    # Rescale so a neutral park is 1 rather than 100
    for factor in factor_list:
        park_factors[factor] = park_factors[factor].div(100)

    return park_factors

park_factors = read_park_factors(team_map)

In [15]:
def park_adjustments(df, park_factors):
    """Attach park factors and divide the outcome columns by the matching factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``venue_id``, ``batSide`` and the outcome columns
        b1, b2, b3, hr, bb, so.
    park_factors : pandas.DataFrame
        One row per (``venue_id``, ``batSide``) with columns
        Park Factor, 1B, 2B, 3B, HR, BB, SO.

    Returns
    -------
    pandas.DataFrame
        A new frame (left merge, so the index is reset) in which venue/side
        combinations without park factors use a neutral factor of 1 and each
        outcome column has been divided by its factor.
    """
    # Merge with park factors
    df = df.merge(park_factors, on=['venue_id', 'batSide'], how='left')

    # Old/other parks get all 1s.
    # Assign the filled column back rather than calling fillna(inplace=True) on a column
    # selection: the chained in-place form is deprecated and does nothing under pandas
    # Copy-on-Write, which would leave NaN factors and NaN-adjusted stats.
    for factor in ('Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO'):
        df[factor] = df[factor].fillna(1)

    # Adjust stats by park factor
    for stat, factor in (('b1', '1B'), ('b2', '2B'), ('b3', '3B'),
                         ('hr', 'HR'), ('bb', 'BB'), ('so', 'SO')):
        df[stat] = df[stat] / df[factor]

    return df

In [16]:
# df6 = park_adjustments(df5, park_factors)

In [17]:
# This will return a dataframe that can eventually be used as the model input. Has pitcher vs hitter stats, specific to hand
def rolling_pas(df, pa_num):
    """Add trailing batter and pitcher statistics over the previous ``pa_num`` rows.

    For every row, statistics are computed from up to ``pa_num`` earlier rows of the
    same batter against the same pitcher hand (``*_b`` columns) and of the same
    pitcher against the same batter side (``*_p`` columns). The current row is
    excluded by shifting before rolling. Count-type stats are rolling sums that are
    then divided by the rolling PA count (``pa_b``/``pa_p``); ``slg`` is divided by
    the rolling AB count instead; the four "max" stats are rolling maxima.

    NOTE: ``df`` is modified in place (columns renamed, new columns added, rows
    sorted by ``pa_num``) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
    pa_num : int
        Size of the trailing window, in rows.

    Returns
    -------
    (pandas.DataFrame, list of str, list of str)
        The frame, the batter stat column names, and the pitcher stat column names
        (summed stats first, then the max stats).
    """
    # Rename for compatibility purposes
    df.rename(columns={'hit_distance_sc':'totalDistance', 'launch_speed':'launchSpeed'}, inplace=True)

    # Stats aggregated with a rolling sum
    sum_stats = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo',
                 'hard_hit', 'barrel', 'to_left', 'to_middle', 'to_right',
                 'estimated_woba_using_speedangle',
                 'pa', 'ab']
    # Stats aggregated with a rolling maximum
    peak_stats = ['totalDistance', 'maxSpeed', 'maxSpin', 'launchSpeed']

    # Remember the incoming row order (the index) so it can be restored by sorting
    df['pa_num'] = df.index

    batter_stats = [stat + "_b" for stat in sum_stats]
    pitcher_stats = [stat + "_p" for stat in sum_stats]
    batter_peaks = [stat + "_b" for stat in peak_stats]
    pitcher_peaks = [stat + "_p" for stat in peak_stats]

    def prior_sum(values):
        # shift() drops the current row from its own window
        return values.shift().rolling(pa_num, min_periods=1).sum()

    def prior_max(values):
        return values.shift().rolling(pa_num, min_periods=1).max()

    df[batter_stats] = df.groupby(['batter', 'pitchHand'])[sum_stats].transform(prior_sum)
    df[batter_peaks] = df.groupby(['batter', 'pitchHand'])[peak_stats].transform(prior_max)

    df[pitcher_stats] = df.groupby(['pitcher', 'batSide'])[sum_stats].transform(prior_sum)
    df[pitcher_peaks] = df.groupby(['pitcher', 'batSide'])[peak_stats].transform(prior_max)

    df.sort_values(['pa_num'], axis=0, ascending=True, inplace=True)

    sides = ('b', 'p')

    def weighted_total(side, weights):
        # Left-to-right weighted sum of the rolling totals for one side
        total = None
        for stat, weight in weights:
            term = weight * df[stat + "_" + side]
            total = term if total is None else total + term
        return total

    # wOBA - using 2022 values throughout
    woba_weights = (('bb', 0.690), ('hbp', 0.721), ('b1', 0.885),
                    ('b2', 1.262), ('b3', 1.601), ('hr', 2.070))
    for side in sides:
        df['woba_' + side] = weighted_total(side, woba_weights)

    # Slugging (total bases per AB)
    total_base_weights = (('b1', 1), ('b2', 2), ('b3', 3), ('hr', 4))
    for side in sides:
        df['slg_' + side] = weighted_total(side, total_base_weights) / df['ab_' + side]

    # OBP numerator: times on base
    on_base_stats = ['b1', 'b2', 'b3', 'hr', 'bb', 'hbp']
    for side in sides:
        df['obp_' + side] = df[[stat + "_" + side for stat in on_base_stats]].sum(axis=1)

    # ISO numerator: extra bases (converted to a per-PA rate below)
    extra_base_weights = (('b2', 1), ('b3', 2), ('hr', 3))
    for side in sides:
        df['iso_' + side] = weighted_total(side, extra_base_weights)

    # Convert totals to per-PA rates
    rate_stats = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo',
                  'estimated_woba_using_speedangle', 'woba', 'obp', 'iso', 'hard_hit', 'barrel',
                  'to_left', 'to_middle', 'to_right']
    for stat in rate_stats:
        for side in sides:
            column = stat + "_" + side
            df[column] = df[column] / df['pa_' + side]

    df.sort_values('pa_num', inplace=True)

    return df, batter_stats + batter_peaks, pitcher_stats + pitcher_peaks

In [18]:
# df7, batter_stats, pitcher_stats = rolling_pas(df6, 50)

In [19]:
# df7['game_date'].head()

In [20]:
# This converts raw data to clean model input
def create_model_input(date):
    """Convert the raw data into the model-input frame for everything before ``date``.

    Pipeline: ``raw_dataset`` -> ``clean_weather`` -> ``create_events`` ->
    ``create_dummies`` -> ``clean_statcast`` -> ``park_adjustments`` (using the
    module-level ``park_factors``), then rows on/after ``date`` and 'Cut' rows are
    dropped, 100-PA and 300-PA rolling stats are computed (the 300-PA versions get a
    ``_long`` suffix), and game-state / pitcher-usage columns are added.

    Parameters
    ----------
    date : str
        Cut-off in "YYYYMMDD" form; only rows with an earlier game date are kept
        (string comparison against ``game_date`` with the dashes removed).

    Returns
    -------
    pandas.DataFrame
        One row per retained plate appearance.
    """
    keep_list = ['so_b', 'b1_b', 'b2_b', 'b3_b', 'hr_b', 'bb_b', 'hbp_b', 'lo_b', 'po_b', 'go_b', 'fo_b', 'pa_b', 'ab_b',
                 'estimated_woba_using_speedangle_b', 'woba_b', 'slg_b', 'obp_b', 'iso_b', 'hard_hit_b', 'barrel_b',
                 'to_left_b', 'to_middle_b', 'to_right_b',
                 'totalDistance_b', 'maxSpeed_b', 'maxSpin_b', 'launchSpeed_b',
                 'so_p', 'b1_p', 'b2_p', 'b3_p', 'hr_p', 'bb_p', 'hbp_p', 'lo_p', 'po_p', 'go_p', 'fo_p', 'pa_p', 'ab_p',
                 'estimated_woba_using_speedangle_p', 'woba_p', 'slg_p', 'obp_p', 'iso_p', 'hard_hit_p', 'barrel_p',
                 'to_left_p', 'to_middle_p', 'to_right_p',
                 'totalDistance_p', 'maxSpeed_p', 'maxSpin_p', 'launchSpeed_p']

    # Read in raw data
    df = raw_dataset()
    # Clean weather variables
    df2 = clean_weather(df)
    # Create events
    df3 = create_events(df2)
    # Make dummies
    df4, dummy_list = create_dummies(df3)
    # Create Statcast stats
    df5 = clean_statcast(df4)
    # Make park adjustments
    df6 = park_adjustments(df5, park_factors)

    # Restrict on date (both sides are "YYYYMMDD" strings, so string comparison orders correctly)
    df6['date'] = df6['game_date'].str.replace("-", "", regex=False)
    df6 = df6[df6['date'] < date]
    # Cut if event isn't one we care about (usually these are weird base running things)
    df6 = df6[df6['Cut'] != 1]

    # Calculate short time frame rolling stats (100 PAs for now).
    # rolling_pas works in place and returns the same object, so the short-window result
    # has to be copied before the long-window call overwrites those columns.
    dfshort, batter_stats, pitcher_stats = rolling_pas(df6, 100)
    dfmain = dfshort.copy()
    # Calculate long time frame rolling stats (300 PAs for now)
    dflong, batter_stats, pitcher_stats = rolling_pas(df6, 300)
    dflong = dflong[keep_list]
    dflong = dflong.add_suffix("_long")
    # Concatenate them together
    sample = pd.concat([dfmain, dflong], axis=1)

    # Delete intermediate DFs
    del dfmain, dfshort, dflong, df, df2, df3, df4, df5, df6

    # Determine score before PA: the score recorded on the previous row of the same half inning.
    # The first row of a half inning has no previous row and falls back to its own recorded score
    # (NOTE(review): if awayScore/homeScore are post-PA values this overstates the pre-PA score
    # whenever the leadoff PA scores a run - confirm against the data source).
    # fillna results are assigned back; the chained inplace form is a no-op under Copy-on-Write.
    half_inning_keys = ['gamePk', 'inning', 'halfInning']
    sample['preAwayScore'] = sample.groupby(half_inning_keys)['awayScore'].shift(1).fillna(sample['awayScore'])
    sample['preHomeScore'] = sample.groupby(half_inning_keys)['homeScore'].shift(1).fillna(sample['homeScore'])

    # Calculate score differential from the batting team's point of view
    sample['score_diff'] = np.where(sample['top'] == 1, sample['preAwayScore'] - sample['preHomeScore'], sample['preHomeScore'] - sample['preAwayScore'])

    # Start dummy (=1 for first batter for each starter)
    sample['start'] = 0

    # First row of the top and of the bottom halves of each game
    top_first_idx = sample[sample['halfInning'] == 'top'].groupby('gamePk').head(1).index
    bottom_first_idx = sample[sample['halfInning'] == 'bottom'].groupby('gamePk').head(1).index

    # Update 'start' column based on the first occurrences
    sample.loc[top_first_idx, 'start'] = 1
    sample.loc[bottom_first_idx, 'start'] = 1

    # Running count of starts per pitcher; rows dated on/before 20190330 are left as NaN
    sample['starts'] = sample[sample['date'] > "20190330"].groupby(['pitcher'])['start'].cumsum()

    # Group by 'gamePk' and 'pitcher', then identify the index of the last observation
    last_observation_idx = sample.groupby(['gamePk', 'pitcher']).tail(1).index

    # Pulled dummy (=1 for last batter for each pitcher)
    sample['pulled'] = 0

    # Update 'pulled' column for the last observations
    sample.loc[last_observation_idx, 'pulled'] = 1

    # Batters faced per pitcher per game.
    # sort=False keeps games in the order they appear in `sample` (raw_dataset sorts by
    # game_date) instead of ordering them by gamePk.
    sample['faced'] = 1
    games = sample.groupby(['pitcher', 'gamePk'], sort=False)['faced'].sum().reset_index()

    # Average over each pitcher's previous (up to) 30 games, excluding the current game.
    # The shift happens inside each pitcher's group: shifting the concatenated groupby-rolling
    # result instead would hand every pitcher's first game the previous pitcher's last average.
    games['avgFaced'] = games.groupby('pitcher')['faced'].transform(
        lambda faced: faced.shift().rolling(30, min_periods=1).mean()
    )

    # Merge to get avgFaced back ('faced' exists on both sides, so it comes back as faced_x/faced_y)
    sample = sample.merge(games, on=['pitcher', 'gamePk'], how='left')

    return sample

In [21]:
# sample = create_model_input("20220818")
# sample.tail(5)

In [22]:
# Create batter inputs
def create_batter_df(df, date):
    """Export each batter's most recent rolling stats, split by pitcher hand, to CSV.

    For every batter the last row against left-handed pitchers (``p_L == 1``) and the
    last row against right-handed pitchers (``p_L == 0``) are placed side by side
    (column suffixes ``_l`` / ``_r``), joined to the module-level ``chadwick`` register
    on ``key_mlbam``, and written to
    ``<baseball_path>/4. Dataset/Batters/Batters<date>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Model-input frame containing the batter columns listed below.
    date : str
        Date string used in the output file name.

    Returns
    -------
    None
    """
    # Stats of interest
    batter_list = ['batter',  'batterName', 'batSide', 'p_L',
     'so_b', 'b1_b', 'b2_b', 'b3_b', 'hr_b', 'bb_b', 'hbp_b', 'lo_b', 'po_b', 'go_b', 'fo_b',
     'pa_b', 'ab_b', 'estimated_woba_using_speedangle_b', 'woba_b', 'slg_b', 'obp_b', 'iso_b',
     'hard_hit_b', 'barrel_b', 'to_left_b', 'to_middle_b', 'to_right_b',
     'totalDistance_b', 'maxSpeed_b', 'maxSpin_b', 'launchSpeed_b',
     'so_b_long', 'b1_b_long', 'b2_b_long', 'b3_b_long', 'hr_b_long', 'bb_b_long', 'hbp_b_long', 'lo_b_long', 'po_b_long', 'go_b_long', 'fo_b_long',
     'pa_b_long', 'ab_b_long', 'estimated_woba_using_speedangle_b_long', 'woba_b_long', 'slg_b_long', 'obp_b_long', 'iso_b_long',
     'hard_hit_b_long', 'barrel_b_long', 'to_left_b_long', 'to_middle_b_long', 'to_right_b_long',
     'totalDistance_b_long', 'maxSpeed_b_long', 'maxSpin_b_long', 'launchSpeed_b_long']

    # Only care about most recent stats of each batter before PA.
    # (Non-inplace: avoids mutating a column slice of the caller's frame.)
    batters = df[batter_list].drop_duplicates(subset=['batter', 'p_L'], keep='last')

    # Create separate dataframes for vs RHP and LHP
    vs_r = batters.query('p_L == 0')
    vs_l = batters.query('p_L == 1')

    # Merge them together
    batters = vs_l.merge(vs_r, on='batter', how='outer', suffixes=('_l', '_r'))

    # A batter with no rows against LHP has a missing batterName_l after the outer merge;
    # fall back to the vs-RHP name so the name is not lost when batterName_r is dropped.
    batters['batterName_l'] = batters['batterName_l'].fillna(batters['batterName_r'])

    # Drop duplicate columns
    batters = batters.drop(columns=['batterName_r', 'p_L_l', 'p_L_r'])
    # Only need this once
    batters = batters.rename(columns={'batterName_l': 'batterName'})

    # Merge with Chadwick (module-level `chadwick` must already be loaded)
    batters = batters.merge(chadwick, left_on='batter', right_on='key_mlbam', how='left')

    # Export
    batters.to_csv(os.path.join(baseball_path, "4. Dataset", "Batters", "Batters" + date + ".csv"))

In [23]:
# Create pitcher inputs
def create_pitcher_df(df, date):
    """Export each pitcher's most recent rolling stats, split by batter side, to CSV.

    For every pitcher the last row against left-handed batters (``b_L == 1``) and the
    last row against right-handed batters (``b_L == 0``) are placed side by side
    (column suffixes ``_l`` / ``_r``). Per-pitcher ``outs`` (mean over games of the
    summed per-inning ``outs`` values) and ``starter`` (number of games with a row in
    inning 1), both computed from rows dated after 2019-04-01, are added. The result is
    joined to the module-level ``chadwick`` register on ``key_mlbam`` and written to
    ``<baseball_path>/4. Dataset/Pitchers/Pitchers<date>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Model-input frame containing the pitcher columns listed below.
    date : str
        Date string used in the output file name.

    Returns
    -------
    None
    """
    # Stats of interest
    pitcher_list =  ['pitcher',  'pitcherName', 'pitchHand', 'b_L',
                     'b1_p', 'b2_p', 'b3_p', 'hr_p', 'bb_p', 'hbp_p',
                     'so_p', 'lo_p', 'po_p', 'go_p', 'fo_p',
                     'estimated_woba_using_speedangle_p', 'woba_p', 'slg_p', 'obp_p', 'iso_p',
                     'to_left_p', 'to_middle_p', 'to_right_p',
                     'hard_hit_p', 'barrel_p', 'maxSpeed_p', 'maxSpin_p', 'totalDistance_p', 'launchSpeed_p',
                     'pa_p', 'ab_p',

                     'b1_p_long', 'b2_p_long', 'b3_p_long', 'hr_p_long', 'bb_p_long', 'hbp_p_long',
                     'so_p_long', 'lo_p_long', 'po_p_long', 'go_p_long', 'fo_p_long',
                     'estimated_woba_using_speedangle_p_long', 'woba_p_long', 'slg_p_long', 'obp_p_long', 'iso_p_long',
                     'to_left_p_long', 'to_middle_p_long', 'to_right_p_long',
                     'hard_hit_p_long', 'barrel_p_long', 'maxSpeed_p_long', 'maxSpin_p_long', 'totalDistance_p_long', 'launchSpeed_p_long',
                     'pa_p_long', 'ab_p_long',
                     'avgFaced', 'starts',
                     'inning', 'outs', 'gamePk', 'eventsModel', 'game_date']

    # Only keep relevant stats
    pitchers = df[pitcher_list]

    # Calculate average outs
    # Only look at PAs since 2019, one row (the last) per pitcher/game/inning
    pitchers_cut = pitchers[pitchers['game_date'] > '2019-04-01'].drop_duplicates(
        subset=['pitcher', 'gamePk', 'inning'], keep='last'
    )
    # Identify if they're a starter (appeared in the first inning)
    pitchers_cut['starter'] = (pitchers_cut['inning'] == 1).astype('int')
    # Add up outs and starts per game.
    # Double brackets: selecting several groupby columns with a bare tuple raises in pandas >= 2.0.
    pitchers_cut = pitchers_cut.groupby(['pitcher', 'gamePk'])[['outs', 'starter']].sum().reset_index()
    # Calculate mean outs and sum of starts (string aggregators instead of deprecated np.mean/np.sum)
    pitchers_cut = pitchers_cut.groupby('pitcher').agg({'outs': 'mean', 'starter': 'sum'}).reset_index()

    # Only care about most recent stats of each pitcher before PA
    pitchers = pitchers.drop_duplicates(subset=['pitcher', 'b_L'], keep='last')

    # Create separate dataframes for vs RHB and LHB
    vs_r = pitchers.query('b_L == 0')
    vs_l = pitchers.query('b_L == 1')

    # Merge them together
    pitchers = vs_l.merge(vs_r, on='pitcher', how='outer', suffixes=('_l', '_r'))
    # And add outs/starts
    pitchers = pitchers.merge(pitchers_cut, on='pitcher', how='left')

    # After the outer merge, a pitcher seen against only one batter side has NaN on the other
    # side. Fill the columns that are kept from the side that is about to be dropped so the
    # name, start count and average batters faced are not lost.
    pitchers['pitcherName_l'] = pitchers['pitcherName_l'].fillna(pitchers['pitcherName_r'])
    pitchers['starts_r'] = pitchers['starts_r'].fillna(pitchers['starts_l'])
    pitchers['avgFaced_r'] = pitchers['avgFaced_r'].fillna(pitchers['avgFaced_l'])

    # Drop duplicate columns
    pitchers = pitchers.drop(columns=['pitcherName_r', 'b_L_l', 'b_L_r', 'inning_r', 'outs_r', 'gamePk_r', 'eventsModel_r', 'game_date_r',  'inning_l',
                                      'outs_l', 'gamePk_l', 'eventsModel_l', 'game_date_l', 'starts_l', 'avgFaced_l'])
    # Only need this once
    pitchers = pitchers.rename(columns={'pitcherName_l': 'pitcherName', 'starts_r': 'start', 'avgFaced_r':'avgFaced'})

    # Merge with Chadwick (module-level `chadwick` must already be loaded)
    pitchers = pitchers.merge(chadwick, left_on='pitcher', right_on='key_mlbam', how='left')

    # Export
    pitchers.to_csv(os.path.join(baseball_path, "4. Dataset", "Pitchers", "Pitchers" + date + ".csv"))

In [24]:
# This creates inputs on a given date
def create_datasets(date):
    """Build the model input for ``date`` and write the batter and pitcher CSV files.

    Returns the model-input frame produced by ``create_model_input``.
    """
    model_input = create_model_input(date)

    # Batter file first, then pitcher file; both are derived from the same frame
    for export in (create_batter_df, create_pitcher_df):
        export(model_input, date)

    return model_input

In [25]:
# %run "Utilities.ipynb"
# # This reads in Chadwick register with player codes.
# keep_list = ['key_mlbam', 'key_fangraphs', 'key_bbref_minors', 'key_bbref', 'name_first', 'name_last']
# chadwick = read_chadwick(keep_list)

In [27]:
# for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data2\4. Dataset\Batters"): 
#     # 2023 
#     if filename.startswith("Batters2023"):
#         # Pull out date
#         date = filename[7:15]
#         print(date)
#         sample = create_datasets(date)

# # date_list = 
# # import os

# # directory = r'C:\Users\james\Documents\MLB\Data2\4. Dataset\Batters'
# # file_list = []

# # for filename in os.listdir(directory):
# #     if os.path.isfile(os.path.join(directory, filename)):
# #         extracted_name = filename[7:15]
# #         file_list.append(extracted_name)

# # list_2023 = [file for file in file_list if file.startswith("2023")]

# # # Can't do max/near max because it'll take up too much memory
# # # Note: There are better ways to do this! Just create dataset once and then work backwards to create daily 
# # Parallel(n_jobs=4, verbose=5)(delayed(create_datasets)(date) for date in list_2023)

# # print(list_2023)


20230330
20230331
20230401
20230402
20230403
20230404
20230405
20230406
20230407
20230408
20230409
20230410
20230411
20230412
20230413
20230414
20230415
20230416
20230417
20230418
20230419
20230420
20230421
20230422
20230423
20230424
20230425
20230426
20230427
20230428
20230429
20230430
20230501
20230502
20230503
20230504
20230505
20230506
20230507
20230508
20230509
20230510
20230511
20230512
20230513
20230514
20230515
20230516
20230517
20230518
20230519
20230520
20230521
20230522
20230523
20230524
20230525
20230526
20230527
20230528
20230529
20230530
20230531
20230601
20230602
20230603
20230604
20230605
20230606
20230607
20230608
20230609
20230610
20230611
20230612
20230613
20230614
20230615
20230616
20230617
20230618
20230619
20230620
20230621
20230622
20230623
20230624
20230625
20230626
20230627
20230628
20230629
20230630
20230701
20230702
20230703
20230704
20230705
20230706
20230707
20230708
20230709
20230714
20230715
20230716
20230717
20230718
20230719
20230720
20230721
20230722
2