# Data Collection (_pybaseball_)

The goal of this notebook is to collect batting and pitching data from the _pybaseball_ library for building an at-bat predcition model.

In [17]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pybaseball
from pybaseball import statcast, playerid_lookup, statcast_pitcher, statcast_batter
from datetime import datetime

### Establishing the Batter and Pitcher Matchup: ###

The model will ask for input on who the batter and pitcher matchup is.

The model covers the 2024 MLB season. However, this can easily be modified so that it applies to a different year by swapping the values for the two variables below.

In [18]:
# Defining the start date as the opening day of the 2024 MLB season and the end date as today
start_date = '2024-07-16'
end_date = datetime.now().strftime('%Y-%m-%d')

# Define the batter
batter_name = 'Shohei Ohtani' # input("Enter the batter's name: ")

# Define the pitcher
pitcher_name = 'Yoshinobu Yamamoto'  # input("Enter pitcher's name: ")

# Separate into first and last names
def name_splitter(name):
    name_parts = name.split()
    return name_parts[0], ' '.join(name_parts[1:])

batter_first_name, batter_last_name = name_splitter(batter_name)
pitcher_first_name, pitcher_last_name = name_splitter(pitcher_name)

# Get player IDs
def get_player_id(first_name, last_name):
    try:
        player_df = playerid_lookup(last_name, first_name)
        if not player_df.empty:
            print(player_df)
            return player_df['key_mlbam'].values[0]
        else:
            raise ValueError(f"Player '{first_name} {last_name}' not found.")
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

batter_id = get_player_id(batter_first_name, batter_last_name)
pitcher_id = get_player_id(pitcher_first_name, pitcher_last_name)


  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0    ohtani     shohei     660271  ohtas001  ohtansh01          19755   

   mlb_played_first  mlb_played_last  
0            2018.0           2024.0  
  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0  yamamoto  yoshinobu     808967  yamay001  yamamyo01          33825   

   mlb_played_first  mlb_played_last  
0            2024.0           2024.0  


### Fetch Batting Stats:

In [19]:
# Fetch statcast data for the batter
def fetch_and_save_data(start_date, end_date, player_id, player_name, pitcher=False):
    # Create directory if it does not exist
    folder_name = 'batter_data'
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    if pitcher:
        player_data = statcast_pitcher(start_date, end_date, player_id)
    else:
        player_data = statcast_batter(start_date, end_date, player_id)
    
    # Filter the data to include only the relevant columns
    filtered_data = player_data #[columns_to_keep]
    

    # Preparing the player name for the filename
    formatted_player_name = '_'.join(player_name.split()).lower()
    
    # Define the full path for the CSV file
    file_path = os.path.join(folder_name, f"{formatted_player_name}_batting_data.csv")
    
    # Save to CSV
    filtered_data.to_csv(file_path, index=False)
    
    print("********************************************")
    print(f"Data for {player_id} saved to {file_path}")
    print("********************************************")
    print(filtered_data.head())
    print("********************************************")
    print(filtered_data.columns)

# Query Statcast data
pybaseball.cache.enable()
data = statcast(start_date, end_date)

fetch_and_save_data(start_date, end_date, batter_id, batter_name, False)
fetch_and_save_data(start_date, end_date, pitcher_id, pitcher_name, True)


This is a large query, it may take a moment to complete


100%|██████████| 65/65 [00:00<00:00, 84.30it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Gathering Player Data
********************************************
Data for 660271 saved to batter_data/shohei_ohtani_batting_data.csv
********************************************
  pitch_type   game_date  release_speed  release_pos_x  release_pos_z  \
0         ST  2024-09-17           82.2          -2.61           5.08   
1         SI  2024-09-17           90.4          -2.43           5.37   
2         ST  2024-09-17           81.8          -2.69           5.08   
3         CH  2024-09-17           87.1          -1.40           5.45   
4         CH  2024-09-17           87.3          -1.45           5.33   

      player_name  batter  pitcher     events      description  ...  \
0  Ohtani, Shohei  660271   670766   home_run    hit_into_play  ...   
1  Ohtani, Shohei  660271   670766        NaN    called_strike  ...   
2  Ohtani, Shohei  660271   670766        NaN             ball  ...   
3  Ohtani, Shohei  660271   669199  strikeout  swinging_strike  ...   
4  Ohtani, Shohei  660271 

# Data Wrangling

In [20]:
import pandas as pd

# Load the data
file_path = 'batter_data/shohei_ohtani_batting_data.csv'
data = pd.read_csv(file_path)

# Feature Engineering
# Count-based Feature
data['count'] = data['balls'] + data['strikes']

# Calculate Batting Average
# Assuming 'description' contains the outcomes like 'hit', 'strikeout', 'walk', etc.
# You might need to adjust the criteria for what constitutes a 'hit'.
hit_outcomes = ['single', 'double', 'triple', 'home_run']  # Add all hit types
data['is_hit'] = data['description'].apply(lambda x: 1 if x in hit_outcomes else 0)

# Calculate batting average
total_pitches = len(data)
total_hits = data['is_hit'].sum()
batting_avg = total_hits / total_pitches if total_pitches > 0 else 0

# Add batting average as a constant value to the DataFrame (or handle as a separate variable if needed)
data['batting_avg'] = batting_avg

# Pitcher and Batter Features
data['release_speed_avg'] = data.groupby('pitcher_id')['release_speed'].transform('mean')

# Encoding Categorical Variables
data = pd.get_dummies(data, columns=['pitch_type', 'stand', 'p_throws'])

# Dropping unnecessary columns for feature selection and EDA
data = data.drop(columns=['pitcher_name', 'pitcher_id', 'umpire', 'hit_location', 'type', 'bat_score', 'fld_score', 'balls', 'strikes'])

# Save the cleaned data
data.to_csv('batter_data/shohei_ohtani_cleaned_batting_data.csv', index=False)

print(f"Cleaned data saved to 'batter_data/shohei_ohtani_cleaned_batting_data.csv'")
print(data.head())

KeyError: 'pitcher_id'

# Feature Selection and Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns

# Perform EDA
print(data.describe())
print(data.info())

# Sample the data for easier visualization
data_sample = data.sample(frac=0.05, random_state=42)

# Visualize distributions and relationships
sns.pairplot(data_sample, hue='description')
plt.show()
plt.clf()

# Correlation matrix
correlation_matrix = data_sample.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
plt.clf()

       release_speed  release_pos_x  release_pos_z         batter  spin_dir  \
count  240244.000000  240244.000000  240244.000000  240287.000000       0.0   
mean       89.214868      -0.838886       5.752583  645268.671318       NaN   
std         6.050425       1.878182       0.521332   50603.287690       NaN   
min        32.300000      -4.730000       0.780000  444482.000000       NaN   
25%        85.000000      -2.160000       5.480000  624413.000000       NaN   
50%        90.100000      -1.530000       5.790000  664983.000000       NaN   
75%        94.100000       0.780000       6.070000  673962.000000       NaN   
max       105.500000       4.600000       8.060000  807799.000000       NaN   

       spin_rate_deprecated  break_angle_deprecated  break_length_deprecated  \
count                   0.0                     0.0                      0.0   
mean                    NaN                     NaN                      NaN   
std                     NaN                     

KeyboardInterrupt: 

### Random Forest Classifier

Next, we will use the Random Forest Classifier to classify how important each dataframe column is to our model. This is a popular machine learning algorithm used for classification tasks. It’s an ensemble learning method that combines multiple decision trees to improve the overall performance of the model. Here’s a detailed breakdown of how it works and what it does:

1. Basics of Decision Trees

A decision tree is a model that splits the data into subsets based on the values of input features. It makes predictions by following the tree from the root to a leaf node, where each node represents a decision based on a feature. The tree’s structure allows it to handle both numerical and categorical data and make decisions based on feature values.

2. Random Forest Algorithm

Random Forest is an ensemble method that constructs a multitude of decision trees and merges their results to produce a more accurate and stable prediction. Here’s how it works:

    a. Bootstrap Aggregating (Bagging):

        - Sampling: Random Forest uses a technique called bootstrap aggregating or bagging. It generates multiple subsets of the original dataset by sampling with replacement.
        - Training: Each subset is used to train a separate decision tree. Because these subsets are different, each tree will be slightly different from the others.

    b. Feature Randomness:

        - Feature Subsets: When splitting nodes in each tree, Random Forest does not consider all features. Instead, it randomly selects a subset of features to find the best split. This process increases diversity among the trees and helps to prevent overfitting.

    c. Voting/Averaging:

        - Classification: For classification tasks, each decision tree in the forest casts a vote for the class label. The class with the majority vote across all trees becomes the final prediction.
        - Regression: For regression tasks, the Random Forest algorithm averages the predictions from all trees to produce the final output.

3. Advantages of Random Forest

    - Accuracy: Combining multiple trees reduces the risk of overfitting and generally improves predictive accuracy compared to individual decision trees.
    - Robustness: It is less sensitive to noise and overfitting due to the averaging of multiple trees.
    - Feature Importance: Random Forest provides estimates of feature importance, helping to identify which features contribute most to the prediction.

In [None]:
# Feature importance analysis
from sklearn.ensemble import RandomForestClassifier

# Establishing the dataframe's features and target
X = data.drop(columns=['description'])
Y = data['description']

# Initialize and train the model
model = RandomForestClassifier()
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_
feature_names = X.columns

# Combine the feature names and their importances into a list of tuples
feature_importances = sorted(list(zip(feature_names, importances)), key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
for feature, importance in feature_importances:
    print(f"Feature: {feature}, Importance: {importance}")