# MLB Pitcher Performance: Feature Engineering

This notebook performs comprehensive feature engineering on the MLB pitcher performance dataset to prepare it for predictive modeling of strikeouts and ERA.

## 1. Data Loading and Initial Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
import datetime as dt
import warnings

# Silence every warning category for the remainder of the notebook
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn whitegrid theme and a (12, 8) default figure size
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [2]:
# Load the dataset
def load_data(csv_path='pitcher_game_level_data.csv'):
    """
    Load the pitcher game-level dataset.

    The CSV at ``csv_path`` is tried first. If that file does not exist, the
    function falls back to the project database (through
    ``src.data.db.get_db_connection``) and runs a query that joins
    ``game_stats`` with ``pitchers``, ``traditional_stats`` and
    ``prediction_features``.

    Parameters
    ----------
    csv_path : str or path-like, default 'pitcher_game_level_data.csv'
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data. ``None`` is returned when the CSV is missing and the
        database fallback fails for any reason; the error is printed.
    """
    try:
        # Try to read the CSV directly
        df = pd.read_csv(csv_path)
        print(f"Successfully loaded data from CSV: {df.shape[0]} rows, {df.shape[1]} columns")
        return df
    except FileNotFoundError:
        # If CSV not found, try connecting to the database
        try:
            # Imported lazily so the notebook still works from the CSV alone
            # when the project package is not importable.
            from src.data.db import get_db_connection

            conn = get_db_connection()
            print("Connected to database successfully")

            # Create a query that joins all necessary tables
            query = """
            SELECT 
                p.pitcher_id,
                p.player_name,
                g.game_id,
                g.game_date,
                g.season,
                g.strikeouts,
                g.hits,
                g.walks,
                g.home_runs,
                g.release_speed_mean,
                g.release_speed_max,
                g.release_spin_rate_mean,
                g.swinging_strike_pct,
                g.called_strike_pct,
                g.zone_rate,
                t.team,
                t.era,
                t.k_per_9,
                t.bb_per_9,
                t.k_bb_ratio,
                t.whip,
                t.babip,
                t.lob_pct,
                t.fip,
                t.xfip,
                t.war,
                f.last_3_games_strikeouts_avg,
                f.last_5_games_strikeouts_avg,
                f.last_3_games_k9_avg,
                f.last_5_games_k9_avg,
                f.last_3_games_era_avg,
                f.last_5_games_era_avg,
                f.last_3_games_fip_avg,
                f.last_5_games_fip_avg,
                f.last_3_games_velo_avg,
                f.last_5_games_velo_avg,
                f.last_3_games_swinging_strike_pct_avg,
                f.last_5_games_swinging_strike_pct_avg,
                f.days_rest,
                f.team_changed
            FROM 
                game_stats g
            JOIN 
                pitchers p ON g.pitcher_id = p.pitcher_id
            LEFT JOIN 
                traditional_stats t ON g.pitcher_id = t.pitcher_id AND g.season = t.season
            LEFT JOIN
                prediction_features f ON g.pitcher_id = f.pitcher_id AND g.game_id = f.game_id
            """

            try:
                df = pd.read_sql_query(query, conn)
            finally:
                # Always release the connection, even when the query fails;
                # previously a failing query left the connection open.
                conn.close()
            print(f"Successfully loaded data from database: {df.shape[0]} rows, {df.shape[1]} columns")
            return df
        except Exception as e:
            # Best-effort fallback: report the problem and signal failure with None
            print(f"Error accessing database: {e}")
            return None

# Load the data
df = load_data()

# load_data() signals failure by returning None. Stop here with a clear
# message instead of letting later cells fail with an AttributeError on None.
if df is None:
    raise RuntimeError(
        "Could not load pitcher data: 'pitcher_game_level_data.csv' was not found "
        "and the database fallback failed (see the error printed above)."
    )

Connected to database successfully
Successfully loaded data from database: 104453 rows, 40 columns


In [3]:
# Structure overview: column dtypes and non-null counts
print("\nDataset Information:")
df.info()

# Missing-value audit: null count per column and its share of all rows (in %)
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)
missing_data = (
    missing_values
    .to_frame(name='Missing Values')
    .assign(Percentage=missing_percentage)
)

# Report only the columns that actually contain nulls, worst offenders first
print("\nMissing values summary:")
print(
    missing_data
    .loc[lambda frame: frame['Missing Values'] > 0]
    .sort_values(by='Percentage', ascending=False)
)

# Preview the first rows; as the last expression it is rendered by the notebook
print("\nSample of the dataset:")
df.head(n=5)


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104453 entries, 0 to 104452
Data columns (total 40 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   pitcher_id                            104453 non-null  int64  
 1   player_name                           104453 non-null  object 
 2   game_id                               104453 non-null  object 
 3   game_date                             104453 non-null  object 
 4   season                                104453 non-null  int64  
 5   strikeouts                            104453 non-null  int64  
 6   hits                                  104453 non-null  int64  
 7   walks                                 104453 non-null  int64  
 8   home_runs                             104453 non-null  int64  
 9   release_speed_mean                    104453 non-null  float64
 10  release_speed_max                     104453 n

Unnamed: 0,pitcher_id,player_name,game_id,game_date,season,strikeouts,hits,walks,home_runs,release_speed_mean,...,last_3_games_era_avg,last_5_games_era_avg,last_3_games_fip_avg,last_5_games_fip_avg,last_3_games_velo_avg,last_5_games_velo_avg,last_3_games_swinging_strike_pct_avg,last_5_games_swinging_strike_pct_avg,days_rest,team_changed
0,4666,"Sabathia, CC",564747,2019-04-24,2019,0,0,0,0,84.2,...,0.0,0.0,0.0,0.0,85.551369,85.551369,0.0,0.0,5,1
1,4666,"Sabathia, CC",564845,2019-04-30,2019,0,0,0,0,85.168966,...,0.0,0.0,0.0,0.0,85.100913,85.100913,0.0,0.0,6,1
2,4666,"Sabathia, CC",564954,2019-05-22,2019,0,0,0,0,84.903409,...,0.0,0.0,0.0,0.0,85.828952,85.371164,0.0,0.0,5,1
3,4666,"Sabathia, CC",565078,2019-07-27,2019,0,0,0,0,84.681538,...,0.0,0.0,0.0,0.0,85.181053,84.91004,0.0,0.0,5,1
4,4666,"Sabathia, CC",565349,2019-06-08,2019,0,0,0,0,85.195652,...,0.0,0.0,0.0,0.0,85.229784,85.241908,0.0,0.0,6,1


## 2. Data Cleaning and Type Conversion