In [1]:
from pybaseball import statcast, pitching_stats, pitching_stats_range
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
# Create lists of NL and AL team names i.e Braves, Mets, Phillies, Marlins, Nationals
NL = ['Braves', 'Mets', 'Phillies', 'Marlins', 'Nationals', 'Cubs', 'Reds', 'Brewers', 'Pirates', 'Cardinals', 'Diamondbacks', 'Rockies', 'Dodgers', 'Padres', 'Giants']
AL = ['Yankees', 'Rays', 'Blue Jays', 'Orioles', 'Red Sox', 'Indians', 'Tigers', 'Royals', 'Twins', 'White Sox', 'Astros', 'Athletics', 'Mariners', 'Angels', 'Rangers']

# create a function team to league to return the league of a team
def team_to_league(team):
    if team in NL:
        return 0
    elif team in AL:
        return 1
    else:
        return 2

# create a function to return the average batters faced per game for a pitcher
def batters_faced_per_game(df, pitcher):
    temp_df = df[df['pitcher'] == pitcher]

    #drop duplicates of subset ['game_pk', 'inning', 'batter'] and count number of batters faced in each game
    temp_df = temp_df.drop_duplicates(subset=['game_pk', 'inning', 'batter'])
    total_batters_faced = temp_df.shape[0]

    # count number of games pitched in
    games_pitched = temp_df['game_pk'].nunique()

    # return average batters faced per game
    return round(total_batters_faced / games_pitched, 1)

# create a function to get speed location data with start and end date
def get_speed_location_data(start_date, end_date):
    #get raw pitch data from statcast
    df = statcast(start_dt=start_date, end_dt=end_date)

    # check columns for int type and convert them if not
    for col in df.columns:
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int64')

    # get the data for K rate and to compute the pitcher's strike percentage
    strike_df = pitching_stats(start_date[:4])
    strike_df = strike_df[['Season', 'Name', 'Team', 'SO', 'TBF', 'Pitches', 'Strikes']]
    strike_df.columns = ['game_year', 'player_name', 'team', 'SO', 'TBF', 'Pitches', 'Strikes']
    strike_df['K_rate'] = strike_df['SO'] / strike_df['TBF']
    strike_df['strike_pct'] = strike_df['Strikes'] / strike_df['Pitches']
    strike_df['league'] = strike_df['team'].apply(team_to_league)
    strike_df.drop(['team', 'SO', 'TBF', 'Pitches', 'Strikes'], axis=1, inplace=True)
    strike_df['game_year'] = strike_df['game_year'].astype('int64')

    # merge the strike_df with the statcast df
    df = df.merge(strike_df, how='inner', on=['player_name', 'game_year'])

    #select only the columns we need
    cols_to_keep = ['game_pk', 'inning', 'game_year', 'player_name', 'league', 'pitcher', 'batter', 'release_speed', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'strike_pct', 'K_rate']
    df = df[cols_to_keep]

    # drop any rows that have a null game_pk
    df.dropna(subset=['game_pk'], inplace=True)

    # make an index for observations
    df['obs_index'] = df['game_year'].astype(str) + "_" + df['pitcher'].astype(str)

    # get a count of pitchers and number of pitches thrown
    pitcher_count = dict(Counter(df['pitcher']))

    # list of pitchers with at least 1000 pitches thrown
    pitchers = [pitcher for pitcher in pitcher_count if pitcher_count[pitcher] >= 1000]

    #subset dataframe to only include pitchers with at least 1000 pitches thrown
    df = df[df['pitcher'].isin(pitchers)]

    # get the list of pitchers with at least 1000 pitches thrown and the average batters faced per game greater than 10
    pitchers = [pitcher for pitcher in pitchers if batters_faced_per_game(df, pitcher) >= 10]

    #subset dataframe to only include pitchers with at least 1000 pitches thrown and the average batters faced per game greater than 10
    df = df[df['pitcher'].isin(pitchers)]

    # keep only the columns we need
    cols_to_keep = ['obs_index', 'pitcher', 'player_name', 'league', 'release_speed', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'strike_pct', 'K_rate']
    df = df[cols_to_keep]

    return df


In [5]:
#get the data using pybaseball
# create training data dates list
train_dates = [('2019-03-20', '2019-09-29'), ('2021-04-01', '2021-10-03')]

# create train data list
train_data = []
for dates in train_dates:
    train_data.append(get_speed_location_data(dates[0], dates[1]))

# concat the train data list
train_data = pd.concat(train_data)
# save the train data to a csv
train_data.to_csv('train_data.csv', index=False)
# print the shape of the train data, the number of unique pitchers, and the number of unique observations
print(f"Shape: {train_data.shape}")
print(f"Unique pitchers: {train_data['pitcher'].nunique()}")
print(f"Unique observations: {train_data['obs_index'].nunique()}")


This is a large query, it may take a moment to complete


100%|██████████| 194/194 [05:01<00:00,  1.56s/it]


This is a large query, it may take a moment to complete


100%|██████████| 186/186 [05:32<00:00,  1.79s/it]


Shape: (0, 11)
Unique pitchers: 0
Unique observations: 0


In [6]:
train_data

Unnamed: 0,obs_index,pitcher,player_name,league,release_speed,pfx_x,pfx_z,plate_x,plate_z,strike_pct,K_rate
