# Data Collection (_pybaseball_)

The goal of this notebook is to collect batting and pitching data from the _pybaseball_ library for building an at-bat predcition model.

In [21]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pybaseball import statcast, playerid_lookup
from datetime import datetime

### Establishing the Batter and Pitcher Matchup: ###

The model will ask for input on who the batter and pitcher matchup is.

The model covers the 2024 MLB season. However, this can easily be modified so that it applies to a different year by swapping the values for the two variables below.

In [22]:
# Defining the start date as the opening day of the 2024 MLB season and the end date as today
start_date = '2024-04-01'
end_date = datetime.now().strftime('%Y-%m-%d')

# Define the batter
batter_name = 'Shohei Ohtani' # input("Enter the batter's name: ")

# Define the pitcher
pitcher_name = 'Yoshinobu Yamamoto'  # input("Enter pitcher's name: ")

# Separate into first and last names
def name_splitter(name):
    name_parts = name.split()
    return name_parts[0], ' '.join(name_parts[1:])

batter_first_name, batter_last_name = name_splitter(batter_name)
pitcher_first_name, pitcher_last_name = name_splitter(pitcher_name)

# Get player IDs
def get_player_id(first_name, last_name):
    try:
        player_df = playerid_lookup(last_name, first_name)
        if not player_df.empty:
            print(player_df)
            return player_df['key_mlbam'].values[0]
        else:
            raise ValueError(f"Player '{first_name} {last_name}' not found.")
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

batter_id = get_player_id(batter_first_name, batter_last_name)
pitcher_id = get_player_id(pitcher_first_name, pitcher_last_name)


  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0    ohtani     shohei     660271  ohtas001  ohtansh01          19755   

   mlb_played_first  mlb_played_last  
0            2018.0           2024.0  
  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0  yamamoto  yoshinobu     808967  yamay001  yamamyo01          33825   

   mlb_played_first  mlb_played_last  
0            2024.0           2024.0  


### Fetch Batting Stats:

In [23]:
# Fetch statcast data for the batter
def fetch_and_save_data(start_date, end_date, player_id, player_name):
    # Create directory if it does not exist
    folder_name = 'batter_data'
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    # Query Statcast data
    data = statcast(start_date, end_date)

    print(data.columns)
    
    # Define the columns to keep
    columns_to_keep = [
        'description',  # Output variable
        'events', 'pitch_type', 'release_speed', 'player_name', 'pitcher', 'p_throws', 'stand', 
        'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'outs_when_up', 'inning', 'umpire', 
        'hit_distance_sc', 'launch_speed', 'launch_angle', 'at_bat_number', 'pitch_number', 'bat_score', 
        'fld_score', 'if_fielding_alignment', 'of_fielding_alignment', 'bat_speed', 'swing_length',
        'release_spin_rate', 'effective_speed', 'plate_x', 'plate_z'
    ]
    
    # Filter the data to include only the relevant columns
    filtered_data = data[columns_to_keep]
    
    # Renaming column titles for easier understanding
    filtered_data = filtered_data.rename(columns={'player_name': 'pitcher_name', 'pitcher': 'pitcher_id'})

    # Preparing the player name for the filename
    formatted_player_name = '_'.join(player_name.split()).lower()
    
    # Define the full path for the CSV file
    file_path = os.path.join(folder_name, f"{formatted_player_name}_batting_data.csv")
    
    # Save to CSV
    filtered_data.to_csv(file_path, index=False)
    
    print(f"Data for {player_id} saved to {file_path}")
    print(filtered_data.head())

fetch_and_save_data(start_date, end_date, batter_id, batter_name)


This is a large query, it may take a moment to complete


That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[col

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

# Data Wrangling

In [27]:
# Load the data
file_path = 'batter_data/shohei_ohtani_batting_data.csv'
data = pd.read_csv(file_path)

# Feature Engineering
# Count-based Feature
data['count'] = data['balls'] + data['strikes']

# Calculate Batting Average
# Assuming 'description' contains the outcomes like 'hit', 'strikeout', 'walk', etc.
# You might need to adjust the criteria for what constitutes a 'hit'.
hit_outcomes = ['single', 'double', 'triple', 'home_run']  # Add all hit types
data['is_hit'] = data['description'].apply(lambda x: 1 if x in hit_outcomes else 0)

# Calculate batting average
total_pitches = len(data)
total_hits = data['is_hit'].sum()
batting_avg = total_hits / total_pitches if total_pitches > 0 else 0

# Add batting average as a constant value to the DataFrame (or handle as a separate variable if needed)
data['batting_avg'] = batting_avg

# Pitcher and Batter Features
data['release_speed_avg'] = data.groupby('pitcher_id')['release_speed'].transform('mean')

# Encoding Categorical Variables
data = pd.get_dummies(data, columns=['pitch_type', 'stand', 'p_throws'])

# Dropping unnecessary columns for feature selection and EDA
data = data.drop(columns=['pitcher_name', 'pitcher_id', 'umpire', 'hit_location', 'type', 'bat_score', 'fld_score', 'balls', 'strikes'])

# Save the cleaned data
data.to_csv('batter_data/shohei_ohtani_cleaned_batting_data.csv', index=False)

print(f"Cleaned data saved to 'batter_data/shohei_ohtani_cleaned_batting_data.csv'")
print(data.head())

Cleaned data saved to 'batter_data/shohei_ohtani_cleaned_batting_data.csv'
     description     events  release_speed      bb_type  outs_when_up  inning  \
0  hit_into_play  field_out           86.5  ground_ball             2       9   
1   blocked_ball        NaN           79.2          NaN             2       9   
2           foul        NaN           85.8          NaN             2       9   
3  called_strike        NaN           93.8          NaN             2       9   
4  hit_into_play  field_out           87.3  ground_ball             1       9   

   hit_distance_sc  launch_speed  launch_angle  at_bat_number  ...  \
0              5.0          65.6         -18.0             73  ...   
1              NaN           NaN           NaN             73  ...   
2              NaN           NaN           NaN             73  ...   
3              NaN           NaN           NaN             73  ...   
4              4.0          45.9         -19.0             72  ...   

   pitch_type_PO 

# Feature Selection and Exploratory Data Analysis (EDA)

In [None]:
# Perform EDA
print(data.describe())
print(data.info())

# Visualize distributions and relationships
sns.pairplot(data, hue='description')
plt.show()
plt.clf()

# Correlation matrix
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
plt.clf()

       release_speed     pitcher_id   hit_location          balls  \
count  641022.000000  641318.000000  144630.000000  641318.000000   
mean       89.132020  637175.785008       5.134094       0.872433   
std         5.985334   53920.422475       2.633275       0.965154   
min        31.900000  434378.000000       1.000000       0.000000   
25%        84.800000  607259.000000       2.000000       0.000000   
50%        90.000000  657277.000000       5.000000       1.000000   
75%        94.000000  671212.000000       8.000000       1.000000   
max       105.500000  814005.000000       9.000000       4.000000   

             strikes   outs_when_up         inning  umpire  hit_distance_sc  \
count  641318.000000  641318.000000  641318.000000     0.0    216226.000000   
mean        0.906968       0.982985       4.951709     NaN       156.023133   
std         0.828826       0.817782       2.595236     NaN       120.164138   
min         0.000000       0.000000       1.000000     NaN    

KeyboardInterrupt: 

### Random Forest Classifier

Next, we will use the Random Forest Classifier to classify how important each dataframe column is to our model. This is a popular machine learning algorithm used for classification tasks. It’s an ensemble learning method that combines multiple decision trees to improve the overall performance of the model. Here’s a detailed breakdown of how it works and what it does:

1. Basics of Decision Trees

A decision tree is a model that splits the data into subsets based on the values of input features. It makes predictions by following the tree from the root to a leaf node, where each node represents a decision based on a feature. The tree’s structure allows it to handle both numerical and categorical data and make decisions based on feature values.

2. Random Forest Algorithm

Random Forest is an ensemble method that constructs a multitude of decision trees and merges their results to produce a more accurate and stable prediction. Here’s how it works:

    a. Bootstrap Aggregating (Bagging):

        - Sampling: Random Forest uses a technique called bootstrap aggregating or bagging. It generates multiple subsets of the original dataset by sampling with replacement.
        - Training: Each subset is used to train a separate decision tree. Because these subsets are different, each tree will be slightly different from the others.

    b. Feature Randomness:

        - Feature Subsets: When splitting nodes in each tree, Random Forest does not consider all features. Instead, it randomly selects a subset of features to find the best split. This process increases diversity among the trees and helps to prevent overfitting.

    c. Voting/Averaging:

        - Classification: For classification tasks, each decision tree in the forest casts a vote for the class label. The class with the majority vote across all trees becomes the final prediction.
        - Regression: For regression tasks, the Random Forest algorithm averages the predictions from all trees to produce the final output.

3. Advantages of Random Forest

    - Accuracy: Combining multiple trees reduces the risk of overfitting and generally improves predictive accuracy compared to individual decision trees.
    - Robustness: It is less sensitive to noise and overfitting due to the averaging of multiple trees.
    - Feature Importance: Random Forest provides estimates of feature importance, helping to identify which features contribute most to the prediction.

In [None]:
# Feature importance analysis
from sklearn.ensemble import RandomForestClassifier

# Establishing the dataframe's features and target
X = data.drop(columns=['description'])
Y = data['description']

# Initialize and train the model
model = RandomForestClassifier()
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_
feature_names = X.columns

# Combine the feature names and their importances into a list of tuples
feature_importances = sorted(list(zip(feature_names, importances)), key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
for feature, importance in feature_importances:
    print(f"Feature: {feature}, Importance: {importance}")