In [41]:
import pybaseball as pyb
import pandas as pd

In [57]:
# Define the date range for which we want to gather data
start_date = '2024-05-13'
end_date = '2024-05-27'

# Retrieve batting statistics for the specified date range
batting_data = pyb.batting_stats_range(start_dt=start_date, end_dt=end_date)

# Retrieve Statcast data for the specified date range
statcast_data = pyb.statcast(start_dt=start_date, end_dt=end_date)

# Filter relevant columns from batting data
batting_relevant_columns = ['Name', 'mlbID', 'HR', 'PA']
batting_data = batting_data[batting_relevant_columns]

# Rename columns for consistency
batting_data = batting_data.rename(columns={
    'Name': 'Player',
    'mlbID': 'Batter',
    'HR': 'HR'
})

# Filter relevant columns from Statcast data
statcast_relevant_columns = ['batter', 'events', 'launch_speed', 'launch_angle', 'hit_distance_sc', 'plate_x', 'plate_z']
statcast_data = statcast_data[statcast_relevant_columns]

# Rename columns for consistency
statcast_data = statcast_data.rename(columns={
    'batter': 'Batter'
})

# Merge datasets on Batter ID
merged_data = pd.merge(batting_data, statcast_data, on='Batter', how='inner')

# Handle missing values by filling with a default value, e.g., 0
merged_data = merged_data.fillna(0)

# Calculate additional metrics
merged_data['xISO'] = merged_data['launch_speed'] * merged_data['launch_angle'] / 1000  # Example calculation
merged_data['Hard Hit %'] = merged_data['launch_speed'].apply(lambda x: 1 if x >= 95 else 0)
merged_data['Out of Zone Swing %'] = merged_data['plate_x'].apply(lambda x: 1 if x < -0.5 or x > 0.5 else 0)
merged_data['In Zone Contact %'] = merged_data['plate_x'].apply(lambda x: 1 if x >= -0.5 and x <= 0.5 else 0)
merged_data['xwOBA'] = merged_data['launch_angle'] * merged_data['launch_speed'] / 1000  # Example calculation

# Summarize advanced metrics for each player, but keep HR from batting data
advanced_metrics = merged_data.groupby(['Batter', 'Player']).agg({
    'xwOBA': 'mean',
    'xISO': 'mean',
    'Hard Hit %': 'mean',
    'Out of Zone Swing %': 'mean',
    'In Zone Contact %': 'mean'
}).reset_index()

# Add HR and Year columns from batting data
advanced_batting_data = pd.merge(batting_data[['Batter', 'Player', 'HR']], advanced_metrics, on=['Batter', 'Player'])
advanced_batting_data['Year'] = start_date[:4]

# Reorder columns
advanced_batting_data = advanced_batting_data[['Player', 'Year', 'HR', 'xwOBA', 'xISO', 'Hard Hit %', 'Out of Zone Swing %', 'In Zone Contact %']]

# Sort by HR in descending order and add a ranking column
advanced_batting_data = advanced_batting_data.sort_values(by='HR', ascending=False).reset_index(drop=True)
advanced_batting_data['Rank'] = advanced_batting_data['HR'].rank(method='dense', ascending=False).astype(int)

# Reorder columns to place Rank at the front
advanced_batting_data = advanced_batting_data[['Rank', 'Player', 'Year', 'HR', 'xwOBA', 'xISO', 'Hard Hit %', 'Out of Zone Swing %', 'In Zone Contact %']]

# Display the final DataFrame
print(advanced_batting_data.head(10))  # Display top 10 players

This is a large query, it may take a moment to complete


100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 18.48it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


   Rank                      Player  Year  HR     xwOBA      xISO  Hard Hit %  \
0     1                 Aaron Judge  2024   7  0.529442  0.529442    0.111628   
1     1                Corey Seager  2024   7  0.777914  0.777914    0.156069   
2     2               Rafael Devers  2024   6  0.701574  0.701574    0.106280   
3     2  Jos\xc3\xa9 Ram\xc3\xadrez  2024   6  0.704588  0.704588    0.143541   
4     3                   Juan Soto  2024   5  0.303954  0.303954    0.092593   
5     3                Ryan McMahon  2024   5  0.525849  0.525849    0.097345   
6     3            Gunnar Henderson  2024   5  0.187105  0.187105    0.109091   
7     3                 Kyle Tucker  2024   5  0.602085  0.602085    0.063291   
8     3           Giancarlo Stanton  2024   5   0.57442   0.57442    0.100592   
9     4                  Matt Olson  2024   4  0.706547  0.706547    0.090129   

   Out of Zone Swing %  In Zone Contact %  
0             0.530233           0.469767  
1             0.5664

In [96]:
# Define the date range for which we want to gather data
start_date = '2024-05-13'
end_date = '2024-05-27'

# Retrieve pitching statistics for the specified date range
pitching_data = pyb.pitching_stats_range(start_dt=start_date, end_dt=end_date)

# Filter for Games Started (GS) greater than 0
pitching_data = pitching_data[pitching_data['GS'] > 0]

# Retrieve Statcast data for the specified date range
statcast_data = pyb.statcast(start_dt=start_date, end_dt=end_date)

# Display columns to understand the structure (optional)
print("Pitching Data Columns:", pitching_data.columns)
print("Statcast Data Columns:", statcast_data.columns)

# Filter relevant columns from pitching data
pitching_relevant_columns = ['Name', 'mlbID','GS', 'HR', 'ERA', 'WHIP', 'SO9', 'BAbip', 'BB', 'SO', 'IP', 'Tm']
pitching_data = pitching_data[pitching_relevant_columns]

# Rename columns for consistency
pitching_data = pitching_data.rename(columns={
    'Name': 'Player',
    'mlbID': 'Pitcher',
    'WHIP': 'WHIP',
    'SO9': 'K/9',
    'BAbip': 'BABIP',
    'Tm': 'Team',
    'HR': 'HR',
    'IP': 'IP',
    'GS': 'GS'
})

# Filter relevant columns from Statcast data
statcast_relevant_columns = ['player_name', 'pitcher', 'events', 'launch_speed', 'launch_angle', 'hit_distance_sc', 'plate_x', 'plate_z', 'description']
statcast_data = statcast_data[statcast_relevant_columns]

# Rename columns for consistency
statcast_data = statcast_data.rename(columns={
    'pitcher': 'Pitcher',
    'player_name': 'Player'
})

# Ensure the 'Pitcher' columns in both dataframes are of the same type
pitching_data['Pitcher'] = pitching_data['Pitcher'].astype(str)
statcast_data['Pitcher'] = statcast_data['Pitcher'].astype(str)

# Merge datasets on Pitcher ID
merged_data = pd.merge(pitching_data, statcast_data, on='Pitcher', how='inner')

# Verify the columns of the merged data
print("Merged Data Columns:", merged_data.columns)

# Use the correct Player column
merged_data['Player'] = merged_data['Player_x'].combine_first(merged_data['Player_y'])

# Handle missing values by filling with a default value, e.g., 0
merged_data = merged_data.fillna(0)

# Define contact events
contact_events = ['hit_into_play', 'hit_into_play_no_out', 'hit_into_play_score', 'foul', 'foul_tip']

# Calculate additional metrics per player
hard_hit = merged_data.groupby('Player').apply(lambda x: (x['launch_speed'] >= 95).sum() / len(x[x['launch_speed'] > 0])).reset_index()
hard_hit.columns = ['Player', 'Hard Hit %']

# Calculate Out of Zone Swing %: swings at pitches out of the zone / total swings
out_of_zone_swing = merged_data.groupby('Player').apply(lambda x: (x['description'].str.contains('swinging_strike')).sum() / len(x[(x['plate_x'] < -0.5) | (x['plate_x'] > 0.5)])).reset_index()
out_of_zone_swing.columns = ['Player', 'Out of Zone Swing %']

# Calculate In Zone Contact %: contacts on pitches in the zone / total pitches in the zone
in_zone_contact = merged_data.groupby('Player').apply(lambda x: (x['description'].str.contains('|'.join(contact_events))).sum() / len(x[(x['plate_x'] >= -0.5) & (x['plate_x'] <= 0.5)])).reset_index()
in_zone_contact.columns = ['Player', 'In Zone Contact %']

# Merge the calculated metrics back into the merged data
metrics = pd.merge(hard_hit, out_of_zone_swing, on='Player')
metrics = pd.merge(metrics, in_zone_contact, on='Player')

merged_data = pd.merge(merged_data, metrics, on='Player')

# Calculate wOBA, K/BB, HR/9, HR/FB, FIP, and xFIP using only pitching data HR values
merged_data['wOBA'] = merged_data['launch_angle'] * merged_data['launch_speed'] / 1000  # Example calculation
merged_data['K/BB'] = merged_data['SO'] / merged_data['BB']

FIP_constant = 3.1  # Example FIP constant; this should be adjusted based on the league-average ERA
merged_data['HR/9'] = merged_data['HR'] / (merged_data['IP'] / 9)
merged_data['HR/FB'] = merged_data['HR'] / (merged_data['events'].apply(lambda x: 1 if x == 'home_run' else 0).sum() + 1)  # Avoid division by zero
merged_data['FIP'] = ((13 * merged_data['HR'] + 3 * merged_data['BB'] - 2 * merged_data['SO']) / merged_data['IP']) + FIP_constant
merged_data['xFIP'] = merged_data['FIP']  # Placeholder for xFIP, as exact calculation would need more data

# Summarize advanced metrics for each player
advanced_metrics = merged_data.groupby(['Pitcher', 'Player', 'Team', 'HR', 'IP', 'GS']).agg({
    'ERA': 'mean',
    'wOBA': 'mean',
    'Hard Hit %': 'mean',
    'Out of Zone Swing %': 'mean',
    'In Zone Contact %': 'mean',
    'K/9': 'mean',
    'BABIP': 'mean',
    'WHIP': 'mean',
    'K/BB': 'mean',
    'HR/9': 'mean',
    'HR/FB': 'mean',
    'FIP': 'mean',
    'xFIP': 'mean'
}).reset_index()

# Add Year column
advanced_metrics['Year'] = start_date[:4]

# Rename columns for clarity
advanced_metrics = advanced_metrics.rename(columns={'ERA': 'ERA'})

# Reorder columns
final_data = advanced_metrics[['Player', 'Year', 'GS', 'IP', 'HR', 'ERA', 'wOBA', 'Hard Hit %', 'Out of Zone Swing %', 'In Zone Contact %', 'Team', 'K/9', 'HR/9', 'BABIP', 'HR/FB', 'FIP', 'xFIP']]

# Sort by HR in descending order and add a ranking column
final_data = final_data.sort_values(by='IP', ascending=False).reset_index(drop=True)
final_data['Rank'] = final_data['IP'].rank(method='dense', ascending=False).astype(int)
final_data = final_data[['Rank', 'Player', 'Year', 'GS', 'IP', 'HR', 'ERA', 'wOBA', 'Hard Hit %', 'Out of Zone Swing %', 'In Zone Contact %', 'Team', 'K/9', 'HR/9', 'BABIP', 'HR/FB', 'FIP', 'xFIP']]

# Display the final DataFrame
print(final_data.head(10))  # Display top 10 players


This is a large query, it may take a moment to complete


100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 17.22it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Pitching Data Columns: Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W', 'mlbID'],
      dtype='object')
Statcast Data Columns: Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',


In [84]:
pitching_data

Unnamed: 0,Player,Pitcher,GS,HR,ERA,WHIP,K/9,BABIP,BB,SO,IP,Team
1,Andrew Abbott,671096,2,1,0.64,0.786,3.9,0.171,3,6,14.0,Cincinnati
2,Bryan Abreu,650556,0,1,2.35,1.304,11.7,0.375,3,10,7.2,Houston
3,Jason Adam,592094,0,0,3.18,1.588,9.5,0.333,5,6,5.2,Tampa Bay
4,Austin Adams,613534,0,1,7.36,1.636,17.2,0.429,2,7,3.2,Oakland
5,Keegan Akin,669211,0,1,5.14,1.143,6.4,0.167,4,5,7.0,Baltimore
...,...,...,...,...,...,...,...,...,...,...,...,...
469,Simeon Woods Richardson,680573,2,0,0.90,0.800,2.7,0.219,1,3,10.0,Minnesota
470,Yoshinobu Yamamoto,808967,3,1,5.29,1.353,11.6,0.370,5,22,17.0,Los Angeles
471,Ryan Yarbrough,642232,0,1,5.79,2.571,1.9,0.389,4,1,4.2,Los Angeles
472,Kirby Yates,489446,0,0,3.00,1.667,18.0,0.400,3,6,3.0,Texas


In [97]:
statcast_data = pyb.statcast(start_dt=start_date, end_dt=end_date)
statcast_data

This is a large query, it may take a moment to complete


100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 15.29it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length
2650,SL,2024-05-26,88.4,2.69,5.09,"Scott, Tanner",553993,656945,strikeout,swinging_strike,...,1,1,3,Standard,Standard,279,-0.014,-0.121,71.368221,8.98095
2807,FF,2024-05-26,96.3,2.55,5.13,"Scott, Tanner",553993,656945,,foul,...,1,1,3,Standard,Standard,147,0.0,-0.037,71.264883,7.43312
2881,SL,2024-05-26,87.0,2.69,5.14,"Scott, Tanner",553993,656945,,ball,...,1,1,3,Standard,Standard,282,0.0,0.046,,
2980,SL,2024-05-26,87.4,2.58,5.19,"Scott, Tanner",553993,656945,,called_strike,...,1,1,3,Standard,Standard,285,0.0,-0.021,,
3044,FF,2024-05-26,97.6,2.5,5.3,"Scott, Tanner",553993,656945,,ball,...,1,1,3,Standard,Standard,150,0.0,0.025,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3446,ST,2024-05-13,83.6,-1.18,5.65,"Lorenzen, Michael",671289,547179,,foul,...,0,0,0,Standard,Standard,50,0.0,0.0,74.239045,7.74598
3530,FF,2024-05-13,96.1,-1.22,5.75,"Lorenzen, Michael",671289,547179,,foul,...,0,0,0,Standard,Standard,225,0.0,0.0,72.09437,6.61324
3674,SL,2024-05-13,86.4,-1.17,5.73,"Lorenzen, Michael",671289,547179,,ball,...,0,0,0,Standard,Standard,51,0.0,0.021,,
3853,SI,2024-05-13,95.2,-1.28,5.84,"Lorenzen, Michael",671289,547179,,foul,...,0,0,0,Standard,Standard,225,0.0,-0.047,66.053625,5.59145
