# A3. Raw API
Source: MLB Stats API <br>

Description: 
This reads in plate appearance data from the MLB Stats API <br>
On a daily basis, it will take the current year's file and update it with data since the last update  <br>
It can also create the entire dataset from scratch

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import regex as re

import statsapi
from statsapi import get

# Root folder for the project's data; the per-year play CSVs are read from and
# written to its "A3. Raw API" subfolder by the functions below
baseball_path = r"C:\Users\james\Documents\MLB\Data"

# Box Score

In [2]:
# Creates box score variables
def _box_value(box, label, default):
    """Return the "value" of the single box-score row labelled *label*, else *default*."""
    try:
        # .item() raises ValueError unless exactly one row matches;
        # KeyError covers a frame without 'label'/'value' columns
        return box.loc[box['label'] == label, "value"].item()
    except (KeyError, ValueError):
        return default


def create_box(gamePk):
    """Read a game's box score info and return (weather, wind, venue, date).

    Parameters
    ----------
    gamePk
        Game identifier passed straight to ``statsapi.boxscore_data``.

    Returns
    -------
    tuple
        ``(weather, wind, venue, date)``. Each of the first three is the
        "value" of the row with the matching label. ``date`` is the *label*
        of the last row (the date is expected to be the final entry).
        Whenever an item cannot be found a placeholder is returned instead:
        "75 degrees, Clear.", "0 mph, L To R.", "Missing Park." and
        "November 30, 1993" respectively.
    """
    # Read in boxscore as json and flatten the 'gameBoxInfo' records
    box = pd.json_normalize(statsapi.boxscore_data(gamePk, timecode=None), record_path='gameBoxInfo')

    # Take value where label = variable of interest
    weather = _box_value(box, "Weather", "75 degrees, Clear.")
    wind = _box_value(box, "Wind", "0 mph, L To R.")
    venue = _box_value(box, "Venue", "Missing Park.")

    # Date should be last label
    try:
        date = box.iloc[-1, box.columns.get_loc('label')]
    except (KeyError, IndexError):
        # No 'label' column or no rows at all
        date = "November 30, 1993"

    return weather, wind, venue, date

# Game Events

In [3]:
# Create dataframe of game events
# Observations are runners, including the batter
def _dig(data, *keys, default=None):
    """Follow *keys* into nested containers; return *default* if any step is missing."""
    try:
        for key in keys:
            data = data[key]
    except (KeyError, IndexError, TypeError):
        # Missing key, or an intermediate value that cannot be subscripted (e.g. None)
        return default
    return data


def create_game(gamePk):
    """Build a DataFrame of play-by-play events for one game.

    Every entry in a play's ``runners`` list becomes one row (per the original
    note, the batter is included among the runners), carrying the play-level
    fields alongside the runner's own movement fields.

    Parameters
    ----------
    gamePk
        Game identifier passed to the ``game_playByPlay`` endpoint and to
        ``create_box``; also stored in the ``gamePk`` column.

    Returns
    -------
    pandas.DataFrame
        Columns: atBatIndex, inning, halfInning, outs, type, id, event,
        eventType, description, rbi, awayScore, homeScore, batter, batterName,
        batSide, pitcher, pitcherName, pitchHand, postOnFirst, postOnSecond,
        postOnThird, start, end, movementReason, gamePk, weather, wind, venue,
        date.

    Raises
    ------
    KeyError
        If a play lacks the required ``about``/``count`` fields or a runner
        lacks its ``details``/``movement`` fields. Optional fields fall back
        to the defaults noted inline instead of raising.
    """
    columns = ['atBatIndex', 'inning', 'halfInning', 'outs', 'type', 'id', 'event', 'eventType', 'description',
               'rbi', 'awayScore', 'homeScore',
               'batter', 'batterName', 'batSide', 'pitcher', 'pitcherName', 'pitchHand',
               'postOnFirst', 'postOnSecond', 'postOnThird',
               'start', 'end', 'movementReason']
    rows = []

    # Read in game data
    game = statsapi.get('game_playByPlay', {'gamePk': gamePk})

    # Loop over all plays
    for play in game['allPlays']:
        about = play['about']

        # Play-level fields that precede the runner id in the column order
        leading = [
            about['atBatIndex'],
            about['inning'],
            about['halfInning'],
            play['count']['outs'],
            _dig(play, 'result', 'type'),
        ]

        # Play-level fields that follow the runner id; optional ones get defaults
        trailing = [
            _dig(play, 'result', 'event'),
            _dig(play, 'result', 'eventType'),
            _dig(play, 'result', 'description'),
            _dig(play, 'result', 'rbi', default=0),
            _dig(play, 'result', 'awayScore', default=0),
            _dig(play, 'result', 'homeScore', default=0),
            _dig(play, 'matchup', 'batter', 'id', default=999999),
            _dig(play, 'matchup', 'batter', 'fullName', default="Missing Name"),
            _dig(play, 'matchup', 'batSide', 'code', default="R"),
            _dig(play, 'matchup', 'pitcher', 'id', default=999999),
            _dig(play, 'matchup', 'pitcher', 'fullName', default="Missing Name"),
            _dig(play, 'matchup', 'pitchHand', 'code', default="R"),
            # Base occupancy after the play; None when the base is empty/absent
            _dig(play, 'matchup', 'postOnFirst', 'id'),
            _dig(play, 'matchup', 'postOnSecond', 'id'),
            _dig(play, 'matchup', 'postOnThird', 'id'),
        ]

        # Loop over runner data: one output row per runner entry
        for runner in play['runners']:
            details = runner['details']
            movement = runner['movement']
            rows.append(
                leading
                + [details['runner']['id']]
                + trailing
                + [movement['start'], movement['end'], details['movementReason']]
            )

    df = pd.DataFrame(rows, columns=columns)

    df['gamePk'] = gamePk

    # Get information from box score and create variables (same value on every row)
    weather, wind, venue, date = create_box(gamePk)
    df['weather'] = weather
    df['wind'] = wind
    df['venue'] = venue
    df['date'] = date

    return df

# Dataset

In [5]:
# Creates dataset of all games between dates
# Note1: Only do one year at a time - gets weird otherwise
# Note2: There may be duplicate game_id values. To avoid counting twice, I keep a set of ids and only create game df if I haven't already
def create_dataset(start_date, end_date):
    """Build one DataFrame covering every scheduled game in a date range.

    Parameters
    ----------
    start_date, end_date
        Range bounds handed to ``statsapi.schedule``; the callers in this
        file pass "MM/DD/YYYY" strings. Keep both dates within one year.

    Returns
    -------
    pandas.DataFrame
        The ``create_game`` frames of all distinct games stacked row-wise
        (original per-game indices are kept), each extended with the
        schedule fields away_name, home_name, game_date, venue_id, game_type
        and gamePk. An empty DataFrame is returned when the schedule holds
        no games, instead of letting ``pd.concat`` fail on an empty list.
    """
    # Pick days
    games = statsapi.schedule(start_date=start_date, end_date=end_date)

    # game_id values that have already been processed (set gives O(1) lookups)
    seen_games = set()
    # List of all game dfs
    play_list = []

    # Loop over games
    for x in games:
        game_id = x['game_id']
        # The schedule can list a game more than once; only process it the first time
        if game_id in seen_games:
            continue
        seen_games.add(game_id)

        # Create game df
        game_df = create_game(game_id)

        # Add information from the schedule
        game_df['away_name'] = x['away_name']
        game_df['home_name'] = x['home_name']
        game_df['game_date'] = x['game_date']
        game_df['venue_id'] = x['venue_id']
        game_df['game_type'] = x['game_type']
        game_df['gamePk'] = game_id

        # Add it to list of dataframes
        play_list.append(game_df)

    # Nothing scheduled in the range: pd.concat would raise on an empty list
    if not play_list:
        return pd.DataFrame()

    # Create dataframe from all games
    return pd.concat(play_list, axis=0)

# Update

In [6]:
# Takes existing dataframe and adds missing dates
def update_current_year(year):
    """Append games played since the last saved date to a year's play file.

    Reads ``Play<year>.csv`` from the "A3. Raw API" folder under
    ``baseball_path``, scrapes every game from the day after the latest
    ``game_date`` in that file through yesterday, and rewrites the file with
    the new rows appended. The file is left untouched when there is nothing
    new to add.

    Parameters
    ----------
    year
        Year identifying the file to update (used only to build the file name).

    Returns
    -------
    None
    """
    # Read in dataset to update
    playfile = "Play" + str(year) + ".csv"
    play_path = os.path.join(baseball_path, "A3. Raw API", playfile)
    old_play_df = pd.read_csv(play_path)

    # Convert game dates to date format
    old_play_df['game_date'] = pd.to_datetime(old_play_df['game_date'])
    # Find the last date in the sample
    last_date = old_play_df['game_date'].max()
    print("Last date in sample: " + str(last_date))

    # Scrape starts the day after the last saved date (date only, not the time)
    first_day = (last_date + datetime.timedelta(days=1)).date()
    start_date = first_day.strftime("%m/%d/%Y")
    print("Start date in scrape: " + start_date)

    # Scrape ends yesterday
    last_day = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = last_day.strftime("%m/%d/%Y")
    print("End date in scrape: " + yesterday)

    # File already covers everything through yesterday: avoid an inverted date range
    if first_day > last_day:
        print("Nothing to update.")
        return

    new_play_df = create_dataset(start_date, yesterday)

    # No games were played in the range: keep the existing file as it is
    if new_play_df.empty:
        print("Nothing to update.")
        return

    # Parse the new dates too so the combined column has a single dtype
    # (the old rows were converted with pd.to_datetime above)
    if 'game_date' in new_play_df.columns:
        new_play_df['game_date'] = pd.to_datetime(new_play_df['game_date'])

    play_df = pd.concat([old_play_df, new_play_df], axis=0)

    # Replace any saved 'index' column with a fresh one; tolerate files that lack it
    play_df.drop(columns=['index'], errors='ignore', inplace=True)
    play_df.reset_index(inplace=True)

    # Drop leftover unnamed index columns from earlier saves
    play_df = play_df.loc[:, ~play_df.columns.str.startswith('Unnamed')]

    # index=False matches create_complete_datasets and stops new 'Unnamed' columns appearing
    play_df.to_csv(play_path, index=False)

# From Scratch

In [7]:
# Rebuilds full-year datasets from scratch
# Use this when recreating one year or several years
def create_complete_datasets(start_year, end_year):
    """Scrape and save one complete play file per year.

    Years run from ``start_year`` up to but NOT including ``end_year``
    (e.g. ``create_complete_datasets(2023, 2024)`` builds 2023 only). Each
    year is scraped from March 26 through November 15 and written to
    ``Play<year>.csv`` in the "A3. Raw API" folder under ``baseball_path``,
    replacing any existing file.
    """
    for season in range(start_year, end_year):
        print(season)

        # Scrape the fixed window for this year
        season_df = create_dataset(f"03/26/{season}", f"11/15/{season}")
        # Move the stacked per-game index into an 'index' column
        season_df.reset_index(inplace=True)

        # Save as csv (without the DataFrame index)
        target = os.path.join(baseball_path, "A3. Raw API", f"Play{season}.csv")
        season_df.to_csv(target, index=False)

In [8]:
# # Run this if you want to create the datasets from scratch
# create_complete_datasets(2023, 2024)

In [9]:
# Run this if you want to update the existing year
# Reads Play2023.csv, scrapes the games played since its last date (through yesterday) and rewrites the file
update_current_year(2023)

Last date in sample: 2023-04-04 00:00:00
Start date in scrape: 04/05/2023
End date in scrape: 04/05/2023


Notes: <br>
Date ranges that span more than one year seem to be problematic, so only scrape one year at a time. <br>
The schedule may list the same game more than once within a year (reason unknown). This is handled by keeping track of the games already processed and skipping repeats, but it is good to keep in mind. <br>

Wishlist: <br>
May want to add more stats: launch angles, launch rate, babip, estimated_woba_using_speedangle, iso_value, etc...
https://baseballsavant.mlb.com/csv-docs

In [10]:
# Record the date and time of this run in the notebook output
print("Code was last run on: {} at {}.".format(datetime.date.today(), datetime.datetime.now().strftime("%H:%M:%S")))

Code was last run on: 2023-04-06 at 08:39:43.
