Skip to content

Commit

Permalink
Statcast Hotfix (#155)
Browse files Browse the repository at this point in the history
  • Loading branch information
schorrm committed Oct 15, 2020
1 parent e69adb3 commit 06fac38
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 155 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -1,5 +1,9 @@
# pybaseball

**2.1.1 Hotfix: October 15, 2020**

- Fix statcast perf regression

**2.1.0 Release: October 14, 2020**

## Release Notes
Expand Down
327 changes: 173 additions & 154 deletions pybaseball/statcast.py
@@ -1,157 +1,176 @@
from datetime import date, timedelta
from typing import List, Optional, Union

import numpy as np
import pandas as pd

import pybaseball.datasources.statcast as statcast_ds

from .utils import sanitize_date_range

_SC_SINGLE_GAME_REQUEST = "/statcast_search/csv?all=true&type=details&game_pk={game_pk}"
# pylint: disable=line-too-long
_SC_SMALL_REQUEST = "/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={start_dt}&game_date_lt={end_dt}&team={team}&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&"


def _chunk_small_request(start_date: date, end_date: date, team: Optional[str], verbose: bool,
max_retries: int = 3) -> Optional[pd.DataFrame]:
data = small_request(start_date, end_date, team=team)

# Append to list of dataframes if not empty or failed
# (failed requests have one row saying "Error: Query Timeout")
if not data.empty:
return data

# If it failed, retry up to three times
retry_success = False

# Count failed requests. If > 2, break
error_counter = 0
# A flag for passing over the success message of requests are failing
unsuccessful_msg_flag = False
dataframe_list: List[pd.DataFrame] = []
while not retry_success:
data = small_request(start_date, end_date, team=team)

if not data.empty:
return data

error_counter += 1

if error_counter < max_retries:
continue

# This request is probably too large. Cut a day off of this request
# and make that its own separate request. For each, append to
# dataframe list if successful, skip and print error message if failed.
tmp_end = end_date - timedelta(days=1)
smaller_data_1 = small_request(start_date, tmp_end, team=team)
smaller_data_2 = small_request(end_date, end_date, team=team)

if not smaller_data_1.empty:
dataframe_list.append(smaller_data_1)
print(f"Completed sub-query from {start_date} to {tmp_end}")
retry_success = True
else:
print(f"Query unsuccessful for data from {end_date} to {tmp_end}. Skipping these dates.")

if not smaller_data_2.empty:
dataframe_list.append(smaller_data_2)
print(f"Completed sub-query from {end_date} to {end_date}")
retry_success = True
else:
print(f"Query unsuccessful for data from {end_date} to {end_date}. Skipping these dates.")

# Flag for passing over the success message since this request failed
unsuccessful_msg_flag = not retry_success

# Reset counter
error_counter = 0
break

if verbose and not unsuccessful_msg_flag:
print(f"Completed sub-query from {start_date} to {end_date}")

if dataframe_list:
return pd.concat(dataframe_list, axis=0)

return None


def small_request(start_dt: date, end_dt: date, team: Optional[str] = None) -> pd.DataFrame:
data = statcast_ds.get_statcast_data_from_csv_url(
_SC_SMALL_REQUEST.format(start_dt=str(start_dt), end_dt=str(end_dt), team=team if team else '')
)
if data is not None and not data.empty:
data = data.sort_values(
['game_date', 'game_pk', 'at_bat_number', 'pitch_number'],
ascending=False
)

import requests
import datetime
import warnings
import io

def validate_datestring(date_text):
try:
datetime.datetime.strptime(date_text, '%Y-%m-%d')
except ValueError:
raise ValueError("Incorrect data format, should be YYYY-MM-DD")

def sanitize_input(start_dt, end_dt):
# if no dates are supplied, assume they want yesterday's data
# send a warning in case they wanted to specify
if start_dt is None and end_dt is None:
today = datetime.datetime.today()
start_dt = (today - datetime.timedelta(1)).strftime("%Y-%m-%d")
end_dt = today.strftime("%Y-%m-%d")
print("Warning: no date range supplied. Returning yesterday's Statcast data. For a different date range, try get_statcast(start_dt, end_dt).")
#if only one date is supplied, assume they only want that day's stats
#query in this case is from date 1 to date 1
if start_dt is None:
start_dt = end_dt
if end_dt is None:
end_dt = start_dt
# now that both dates are not None, make sure they are valid date strings
validate_datestring(start_dt)
validate_datestring(end_dt)
return start_dt, end_dt

def single_game_request(game_pk):

url = "https://baseballsavant.mlb.com/statcast_search/csv?all=true&type=details&game_pk={game_pk}".format(game_pk=game_pk)
s=requests.get(url, timeout=None).content
data = pd.read_csv(io.StringIO(s.decode('utf-8')))#, error_bad_lines=False) # skips 'bad lines' breaking scrapes. still testing this.
return data

def small_request(start_dt,end_dt):
url = "https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={}&game_date_lt={}&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&".format(start_dt, end_dt)
s=requests.get(url, timeout=None).content
data = pd.read_csv(io.StringIO(s.decode('utf-8')))#, error_bad_lines=False) # skips 'bad lines' breaking scrapes. still testing this.
return data

def large_request(start_date: date, end_date: date, step: int, verbose: bool,
team: Optional[str] = None) -> pd.DataFrame:
def large_request(start_dt,end_dt,d1,d2,step,verbose):
"""
Break start and end date into smaller increments, collecting all data in small chunks
and appending all results to a common dataframe start_date_date and end_date_date
are date objects for first and last day of query, for doing date math a third date
object (d) will be used to increment over time for the several intermediate queries.
break start and end date into smaller increments, collecting all data in small chunks and appending all results to a common dataframe
end_dt is the date strings for the final day of the query
d1 and d2 are datetime objects for first and last day of query, for doing date math
a third datetime object (d) will be used to increment over time for the several intermediate queries
"""

dataframe_list = []

subq_start = start_date

error_counter = 0 # count failed requests. If > X, break
no_success_msg_flag = False # a flag for passing over the success message of requests are failing
print("This is a large query, it may take a moment to complete")

# While intermediate query end_dt <= global query end_dt, keep looping
subq_end = subq_start + timedelta(days=step)
while subq_end <= end_date:
# Dates before 3/15 and after 11/15 will always be offseason.
# If these dates are detected, check if the next season is within the user's query.
# If yes, fast-forward to the next season to avoid empty requests
# If no, break the loop. all useful data has been pulled.
if (subq_end.month == 3 and subq_end.day < 15) or subq_end.month <= 2:
print('Skipping offseason dates')
subq_start = subq_start.replace(month=3, day=15)
subq_end = subq_start + timedelta(days=step+1)
elif (subq_start.month == 11 and subq_start.day > 14) or subq_start.month > 11:
if end_date.year > subq_end.year:
dataframe_list = []
#step = 3 # number of days per mini-query (test this later to see how large I can make this without losing data)
d = d1 + datetime.timedelta(days=step)
while d <= d2: #while intermediate query end_dt <= global query end_dt, keep looping
# dates before 3/15 and after 11/15 will always be offseason
# if these dates are detected, check if the next season is within the user's query
# if yes, fast-forward to the next season to avoid empty requests
# if no, break the loop. all useful data has been pulled.
if ((d.month < 4 and d.day < 15) or (d1.month > 10 and d1.day > 14)):
if d2.year > d.year:
print('Skipping offseason dates')
subq_start = subq_start.replace(month=3, day=15, year=subq_start.year+1)
subq_end = subq_start + timedelta(days=step+1)
d1 = d1.replace(month=3,day=15,year=d1.year+1)
d = d1 + datetime.timedelta(days=step+1)
else:
break

data = _chunk_small_request(subq_start, subq_end, team=team, verbose=verbose)

# Append to list of dataframes if not empty or failed
# (failed requests have one row saying "Error: Query Timeout")
if data is not None and not data.empty:
start_dt = d1.strftime('%Y-%m-%d')
intermediate_end_dt = d.strftime('%Y-%m-%d')
data = small_request(start_dt,intermediate_end_dt)
# append to list of dataframes if not empty or failed (failed requests have one row saying "Error: Query Timeout")
if data.shape[0] > 1:
dataframe_list.append(data)
# if it failed, retry up to three times
else:
success = 0
while success == 0:
data = small_request(start_dt,intermediate_end_dt)
if data.shape[0] > 1:
dataframe_list.append(data)
success = 1
else:
error_counter += 1
if error_counter > 2:
# this request is probably too large. Cut a day off of this request and make that its own separate request.
# For each, append to dataframe list if successful, skip and print error message if failed
tmp_end = d - datetime.timedelta(days=1)
tmp_end = tmp_end.strftime('%Y-%m-%d')
smaller_data_1 = small_request(start_dt, tmp_end)
smaller_data_2 = small_request(intermediate_end_dt,intermediate_end_dt)
if smaller_data_1.shape[0] > 1:
dataframe_list.append(smaller_data_1)
print("Completed sub-query from {} to {}".format(start_dt,tmp_end))
else:
print("Query unsuccessful for data from {} to {}. Skipping these dates.".format(start_dt,tmp_end))
if smaller_data_2.shape[0] > 1:
dataframe_list.append(smaller_data_2)
print("Completed sub-query from {} to {}".format(intermediate_end_dt,intermediate_end_dt))
else:
print("Query unsuccessful for data from {} to {}. Skipping these dates.".format(intermediate_end_dt,intermediate_end_dt))

no_success_msg_flag = True # flag for passing over the success message since this request failed
error_counter = 0 # reset counter
break


# Increment dates
subq_start = subq_end + timedelta(days=1)
subq_end = subq_end + timedelta(days=step+1)

# If start date > end date after being incremented,
# the loop captured each date's data
if subq_start <= end_date:
# If start date <= end date, then there are a few leftover dates to grab data for.
# start_dt from the earlier loop will work,
# but instead of d we now want the original end_dt
data = small_request(subq_start, end_date, team=team)
if verbose:
if no_success_msg_flag is False:
print("Completed sub-query from {} to {}".format(start_dt,intermediate_end_dt))
else:
no_success_msg_flag = False # if failed, reset this flag so message will send again next iteration
# increment dates
d1 = d + datetime.timedelta(days=1)
d = d + datetime.timedelta(days=step+1)

# if start date > end date after being incremented, the loop captured each date's data
if d1 > d2:
pass
# if start date <= end date, then there are a few leftover dates to grab data for.
else:
# start_dt from the earlier loop will work, but instead of d we now want the original end_dt
start_dt = d1.strftime('%Y-%m-%d')
data = small_request(start_dt,end_dt)
dataframe_list.append(data)
if verbose:
print(f"Completed sub-query from {subq_start} to {end_date}")
print("Completed sub-query from {} to {}".format(start_dt,end_dt))

# Concatenate all dataframes into final result set
# concatenate all dataframes into final result set
final_data = pd.concat(dataframe_list, axis=0)
return final_data

def postprocessing(data, team):
#replace empty entries and 'null' strings with np.NaN
data.replace(r'^\s*$', np.nan, regex=True, inplace = True)
data.replace(r'^null$', np.nan, regex=True, inplace = True)

# convert columns to numeric
not_numeric = ['sv_id', 'umpire', 'type', 'inning_topbot', 'bb_type', 'away_team', 'home_team', 'p_throws',
'stand', 'game_type', 'des', 'description', 'events', 'player_name', 'game_date', 'pitch_type', 'pitch_name']

numeric_cols = ['release_speed','release_pos_x','release_pos_z','batter','pitcher','zone','hit_location','balls',
'strikes','game_year','pfx_x','pfx_z','plate_x','plate_z','on_3b','on_2b','on_1b','outs_when_up','inning',
'hc_x','hc_y','fielder_2','vx0','vy0','vz0','ax','ay','az','sz_top','sz_bot',
'hit_distance_sc','launch_speed','launch_angle','effective_speed','release_spin_rate','release_extension',
'game_pk','pitcher.1','fielder_2.1','fielder_3','fielder_4','fielder_5',
'fielder_6','fielder_7','fielder_8','fielder_9','release_pos_y',
'estimated_ba_using_speedangle','estimated_woba_using_speedangle','woba_value','woba_denom','babip_value',
'iso_value','launch_speed_angle','at_bat_number','pitch_number','home_score','away_score','bat_score',
'fld_score','post_away_score','post_home_score','post_bat_score','post_fld_score']

data[numeric_cols] = data[numeric_cols].astype(float)

# convert date col to datetime data type and sort so that this returns in an order that makes sense (by date and game)
data['game_date'] = pd.to_datetime(data['game_date'], format='%Y-%m-%d')
data = data.sort_values(['game_date', 'game_pk', 'at_bat_number', 'pitch_number'], ascending=False)

#select only pitches from a particular team
valid_teams = ['MIN', 'PHI', 'BAL', 'NYY', 'LAD', 'OAK', 'SEA', 'TB', 'MIL', 'MIA',
'KC', 'TEX', 'CHC', 'ATL', 'COL', 'HOU', 'CIN', 'LAA', 'DET', 'TOR',
'PIT', 'NYM', 'CLE', 'CWS', 'STL', 'WSH', 'SF', 'SD', 'BOS','ARI','ANA','WAS']

if(team in valid_teams):
data = data.loc[(data['home_team']==team)|(data['away_team']==team)]
elif(team != None):
raise ValueError('Error: invalid team abbreviation. Valid team names are: {}'.format(valid_teams))
data = data.reset_index()
return data

def statcast(start_dt: str = None, end_dt: str = None, team: str = None, verbose: bool = True) -> pd.DataFrame:
def statcast(start_dt=None, end_dt=None, team=None, verbose=True):
"""
Pulls statcast play-level data from Baseball Savant for a given date range.
Expand All @@ -160,38 +179,38 @@ def statcast(start_dt: str = None, end_dt: str = None, team: str = None, verbose
end_dt: YYYY-MM-DD : the last date for which you want statcast data
team: optional (defaults to None) : city abbreviation of the team you want data for (e.g. SEA or BOS)
If no arguments are provided, this will return yesterday's statcast data.
If one date is provided, it will return that date's statcast data.
If no arguments are provided, this will return yesterday's statcast data. If one date is provided, it will return that date's statcast data.
"""

start_dt_date, end_dt_date = sanitize_date_range(start_dt, end_dt)

# 5 days or less -> a quick one-shot request.
# Greater than 5 days -> break it into multiple smaller queries
start_dt, end_dt = sanitize_input(start_dt, end_dt)
# 3 days or less -> a quick one-shot request. Greater than 3 days -> break it into multiple smaller queries
small_query_threshold = 5
# inputs are valid if either both or zero dates are supplied. Not valid of only one given.

# How many days worth of data are needed?
days_in_query = (end_dt_date - start_dt_date).days
if days_in_query <= small_query_threshold:
return small_request(start_dt_date, end_dt_date, team=team)

return large_request(start_dt_date, end_dt_date, step=small_query_threshold, verbose=verbose, team=team)
if start_dt and end_dt:
# how many days worth of data are needed?
date_format = "%Y-%m-%d"
d1 = datetime.datetime.strptime(start_dt, date_format)
d2 = datetime.datetime.strptime(end_dt, date_format)
days_in_query = (d2 - d1).days
if days_in_query <= small_query_threshold:
data = small_request(start_dt,end_dt)
else:
data = large_request(start_dt,end_dt,d1,d2,step=small_query_threshold,verbose=verbose)

data = postprocessing(data, team)
return data

def statcast_single_game(game_pk: Union[str, int]) -> pd.DataFrame:
def statcast_single_game(game_pk, team=None):
"""
Pulls statcast play-level data from Baseball Savant for a single game,
identified by its MLB game ID (game_pk in statcast data)
INPUTS:
game_pk : 6-digit integer MLB game ID to retrieve
"""

data = statcast_ds.get_statcast_data_from_csv_url(
_SC_SINGLE_GAME_REQUEST.format(game_pk=game_pk)
)

return data.sort_values(
['game_date', 'game_pk', 'at_bat_number', 'pitch_number'],
ascending=False
)
data = single_game_request(game_pk)
data = postprocessing(data, team)
return data
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -18,7 +18,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='2.1.0',
version='2.1.1',

description='Retrieve baseball data in Python',
long_description=long_description,
Expand Down

0 comments on commit 06fac38

Please sign in to comment.