# Batting Stance vs Pitch Effectiveness (Neural Network)
This notebook builds a model to predict how effective different batting stances are against pitch types using Statcast + batting stance data.

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from datetime import datetime
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
import os


In [None]:
# Load datasets
data_dir = "../data"

# Use the small sample for testing
pitch_df = pd.read_csv(os.path.join(data_dir, "sample_statcast.csv"))
stance_df = pd.read_csv(os.path.join(data_dir, "batting-stance.csv"))


# Normalize player names in pitch data (e.g., 'Matt Olson') → 'Olson, Matt'
pitch_df['name'] = pitch_df['player_name'].apply(lambda x: ', '.join(x.split()[::-1]) if isinstance(x, str) else x)

# Extract year and month from pitch date
pitch_df['game_date'] = pd.to_datetime(pitch_df['game_date'])
pitch_df['year'] = pitch_df['game_date'].dt.year
pitch_df['month'] = pitch_df['game_date'].dt.month

stance_df.columns


Index(['id', 'name', 'year', 'api_game_date_month_mm', 'bat_side', 'side',
       'avg_batter_y_position', 'avg_batter_x_position', 'avg_foot_sep',
       'avg_stance_angle', 'avg_intercept_y_vs_batter',
       'avg_intercept_y_vs_plate'],
      dtype='object')

In [39]:
# 1. Create 'stance_date' in stance_df
stance_df['stance_date'] = pd.to_datetime(
    stance_df['year'].astype(str) + '-' + 
    stance_df['api_game_date_month_mm'].astype(str).str.zfill(2) + '-01'
)

# 2. Reset pitch_df index for merging and keep pitch_index
pitch_df_reset = pitch_df.reset_index().rename(columns={'index': 'pitch_index'})

# 3. Merge on 'name' with suffixes to distinguish columns
merged = pitch_df_reset.merge(
    stance_df,
    on='name',
    how='left',
    suffixes=('_pitch', '_stance')
)

# 4. Drop rows where date columns are missing
merged = merged.dropna(subset=['stance_date', 'game_date'])

# 5. Calculate absolute date difference
merged['date_diff'] = (merged['stance_date'] - merged['game_date']).abs()

# 6. Find index of closest stance_date per pitch_index
idx = merged.groupby('pitch_index')['date_diff'].idxmin()

# 7. Select closest stance per pitch
closest_stance = merged.loc[idx].set_index('pitch_index')

# 8. Columns from stance_df to keep (with _stance suffix now)
# Use the correct column names matching closest_stance.columns
stance_cols = [
    'avg_batter_y_position',
    'avg_batter_x_position',
    'avg_foot_sep',
    'avg_stance_angle',
    'avg_intercept_y_vs_batter',
    'avg_intercept_y_vs_plate',
    'bat_side',
    'stance_date'
]

# Drop any columns that already exist in pitch_df to avoid duplicates
cols_to_drop = [col for col in stance_cols if col in pitch_df.columns]
pitch_df = pitch_df.drop(columns=cols_to_drop, errors='ignore')

# Join stance columns back
pitch_df = pitch_df.reset_index(drop=True).join(
    closest_stance[stance_cols]
)
print(closest_stance.columns.tolist())

# Now pitch_df has stance features joined with proper names, and no duplicates

# 11. Sanity check
assert not any(pitch_df.columns.duplicated()), "Duplicate columns detected!"

# Preview result
print(pitch_df[stance_cols].head())


['pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'player_name', 'batter', 'pitcher', 'events', 'description', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk', 'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'babip_value', 'iso_value', 'launch_speed_a

In [40]:
# Assuming pitch_df already has stance features merged (with missing values)

# List stance features to impute
stance_features = [
    'avg_batter_y_position', 'avg_batter_x_position', 'avg_foot_sep',
    'avg_stance_angle', 'avg_intercept_y_vs_batter', 'avg_intercept_y_vs_plate'
]

# Impute numeric stance features by player average
for col in stance_features:
    # Group by player name and fill missing with group mean
    pitch_df[col] = pitch_df.groupby('name')[col].transform(lambda g: g.fillna(g.mean()))
    # If still missing (e.g., all NaN in group), fill with overall mean
    pitch_df[col] = pitch_df[col].fillna(pitch_df[col].mean())

# For categorical 'bat_side', fill missing by most common per player (mode)
def fill_mode(series):
    mode = series.mode()
    if not mode.empty:
        return series.fillna(mode.iloc[0])
    else:
        return series.fillna('Unknown')  # or some default

pitch_df['bat_side'] = pitch_df.groupby('name')['bat_side'].transform(fill_mode)



In [41]:
# List stance features again
stance_features = [
    'avg_batter_y_position', 'avg_batter_x_position', 'avg_foot_sep',
    'avg_stance_angle', 'avg_intercept_y_vs_batter', 'avg_intercept_y_vs_plate',
    'bat_side'
]

print("Missing values BEFORE imputation:")
print(pitch_df[stance_features].isna().sum())

# Perform imputation (same as before)
for col in stance_features[:-1]:  # all numeric stance features
    pitch_df[col] = pitch_df.groupby('name')[col].transform(lambda g: g.fillna(g.mean()))
    pitch_df[col] = pitch_df[col].fillna(pitch_df[col].mean())

def fill_mode(series):
    mode = series.mode()
    if not mode.empty:
        return series.fillna(mode.iloc[0])
    else:
        return series.fillna('Unknown')

pitch_df['bat_side'] = pitch_df.groupby('name')['bat_side'].transform(fill_mode)

print("\nMissing values AFTER imputation:")
print(pitch_df[stance_features].isna().sum())

# Also check total missing in all model columns before dropna
model_cols = [
    'pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
    'plate_x', 'plate_z', 'pfx_x', 'pfx_z',
    'zone', 'release_spin_rate', 'release_extension', 'stand',
] + stance_features + ['description']

print("\nMissing values in all model columns BEFORE dropna:")
print(pitch_df[model_cols].isna().sum())

# Now create model_df and drop rows missing pitch or target data
model_df = pitch_df[model_cols].dropna()

print("\nRemaining missing values in model_df after dropna:")
print(model_df.isna().sum())


Missing values BEFORE imputation:
avg_batter_y_position        0
avg_batter_x_position        0
avg_foot_sep                 0
avg_stance_angle             0
avg_intercept_y_vs_batter    0
avg_intercept_y_vs_plate     0
bat_side                     0
dtype: int64

Missing values AFTER imputation:
avg_batter_y_position        0
avg_batter_x_position        0
avg_foot_sep                 0
avg_stance_angle             0
avg_intercept_y_vs_batter    0
avg_intercept_y_vs_plate     0
bat_side                     0
dtype: int64

Missing values in all model columns BEFORE dropna:
pitch_type                   19267
release_speed                19683
release_pos_x                19682
release_pos_z                19682
plate_x                      19682
plate_z                      19682
pfx_x                        19760
pfx_z                        19683
zone                         19682
release_spin_rate            22764
release_extension            20751
stand                            0


In [42]:
# List of pitch columns you want to impute
pitch_feature_cols = [
    'pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
    'plate_x', 'plate_z', 'pfx_x', 'pfx_z',
    'zone', 'release_spin_rate', 'release_extension', 'stand',
]

# Subset pitch_df to just those columns
pitch_features = pitch_df[pitch_feature_cols]

# Separate numeric and categorical columns by dtype
numeric_cols = pitch_features.select_dtypes(include=['number']).columns.tolist()
categorical_cols = pitch_features.select_dtypes(include=['object', 'category']).columns.tolist()

# Impute numeric columns with median (assign back instead of inplace)
for col in numeric_cols:
    median_val = pitch_df[col].median()
    pitch_df[col] = pitch_df[col].fillna(median_val)

# Impute categorical columns with mode (assign back instead of inplace)
for col in categorical_cols:
    mode_val = pitch_df[col].mode()[0]
    pitch_df[col] = pitch_df[col].fillna(mode_val)

# Check missing values after imputation (optional)
print("Missing values in pitch features after imputation:")
print(pitch_df[pitch_feature_cols].isna().sum())


Missing values in pitch features after imputation:
pitch_type           0
release_speed        0
release_pos_x        0
release_pos_z        0
plate_x              0
plate_z              0
pfx_x                0
pfx_z                0
zone                 0
release_spin_rate    0
release_extension    0
stand                0
dtype: int64


In [43]:
# Now create your model_df and drop rows with missing pitch features only (stance features are imputed)
model_df = pitch_df[[
    # Pitch features
    'pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
    'plate_x', 'plate_z', 'pfx_x', 'pfx_z',
    'zone', 'release_spin_rate', 'release_extension', 'stand', 'player_name', 'game_date', 
    
    # Stance features
    'avg_batter_y_position', 'avg_batter_x_position',
    'avg_foot_sep', 'avg_stance_angle',
    'avg_intercept_y_vs_batter', 'avg_intercept_y_vs_plate',
    'bat_side',
    
    # Target
    'description'
]].dropna()  # Now only drops rows with missing pitch features or description


In [44]:
model_df['game_date']

0        2023-09-30
1        2023-09-30
2        2023-09-30
3        2023-09-30
4        2023-09-30
            ...    
675806   2025-03-02
675807   2025-03-02
675808   2025-03-02
675809   2025-03-02
675810   2025-03-02
Name: game_date, Length: 675811, dtype: datetime64[ns]

In [45]:
model_df.columns

Index(['pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
       'plate_x', 'plate_z', 'pfx_x', 'pfx_z', 'zone', 'release_spin_rate',
       'release_extension', 'stand', 'player_name', 'game_date',
       'avg_batter_y_position', 'avg_batter_x_position', 'avg_foot_sep',
       'avg_stance_angle', 'avg_intercept_y_vs_batter',
       'avg_intercept_y_vs_plate', 'bat_side', 'description'],
      dtype='object')

In [46]:
# Define the mapping dictionary
outcome_scores = {
    'swinging_strike': 0,
    'swinging_strike_blocked': 0,
    'called_strike': 1,
    'missed_bunt': 1,
    'pitchout': 1,
    'blocked_ball': 2,
    'ball': 3,
    'foul': 4,
    'bunt_foul_tip': 4,
    'foul_bunt': 4,
    'foul_tip': 4,
    'hit_by_pitch': 6,
    'hit_into_play': 7
}

# Map the description column to numeric outcome
model_df['outcome_score'] = model_df['description'].map(outcome_scores)

# Now compute average outcomes by stance cluster or pitch type


In [47]:


feature_cols = [
    'release_speed', 'release_pos_x', 'release_pos_z',
    'plate_x', 'plate_z', 'pfx_x', 'pfx_z',
    'zone', 'release_spin_rate', 'release_extension',
    'avg_batter_y_position', 'avg_batter_x_position',
    'avg_foot_sep', 'avg_stance_angle',
    'avg_intercept_y_vs_batter', 'avg_intercept_y_vs_plate'
]


In [None]:

# Create sequences (your function)
def create_sequences(df, feature_cols, label_col, seq_len=5):
    sequences = []
    labels = []

    players = df['player_name'].unique()
    for player in players:
        player_df = df[df['player_name'] == player].sort_values(by='game_date')

        features = player_df[feature_cols].values
        target = player_df[label_col].values

        for i in range(len(player_df) - seq_len):
            seq_x = features[i:i+seq_len]
            seq_y = target[i+seq_len]  # next outcome

            sequences.append(seq_x)
            labels.append(seq_y)

    return np.array(sequences), np.array(labels)

# 🟡 Replace 'description_enc' with 'outcome_score'
X, y = create_sequences(model_df, feature_cols, label_col='outcome_score')
print(f"Input sequences shape: {X.shape}, Labels shape: {y.shape}")


Input sequences shape: (675106, 5, 16), Labels shape: (675106,)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:

# Normalize features
num_samples, seq_len, num_features = X_train.shape
X_train_2d = X_train.reshape(-1, num_features)
X_test_2d = X_test.reshape(-1, num_features)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_2d)
X_test_scaled = scaler.transform(X_test_2d)

X_train = X_train_scaled.reshape(num_samples, seq_len, num_features)
X_test = X_test_scaled.reshape(X_test.shape[0], seq_len, num_features)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)



In [None]:

# LSTM model for regression
class PitchLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])  # (batch, 1)
        return out


In [52]:
# Instantiate model
input_size = num_features
hidden_size = 64
model = PitchLSTM(input_size, hidden_size)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train
epochs = 14
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X).squeeze()
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 19778.7403
Epoch 2, Loss: 19365.4228
Epoch 3, Loss: 19352.4922
Epoch 4, Loss: 19333.8276
Epoch 5, Loss: 19321.2257
Epoch 6, Loss: 19310.1107
Epoch 7, Loss: 19299.3776
Epoch 8, Loss: 19285.5657
Epoch 9, Loss: 19266.8990
Epoch 10, Loss: 19249.0709
Epoch 11, Loss: 19230.0789
Epoch 12, Loss: 19208.8240
Epoch 13, Loss: 19182.0822
Epoch 14, Loss: 19156.7938


In [53]:

# Evaluate
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor).squeeze()
    mse = ((outputs - y_test_tensor) ** 2).mean().item()
    rmse = mse ** 0.5
    print(f"Test RMSE: {rmse:.4f}")


Test RMSE: 2.1462


In [None]:

# Encode 'bat_side' categorical variable to numeric
model_df['bat_side_enc'] = model_df['bat_side'].map({'R': 0, 'L': 1})

# Define stance features for clustering (numeric only)
stance_features_numeric = [
    'avg_batter_y_position', 'avg_batter_x_position',
    'avg_foot_sep', 'avg_stance_angle',
    'avg_intercept_y_vs_batter', 'avg_intercept_y_vs_plate',
    'bat_side_enc'
]

# Drop rows with missing values in stance features
stance_df = model_df[stance_features_numeric].dropna()

# Run KMeans clustering on cleaned data
kmeans = KMeans(n_clusters=5, random_state=42)
stance_clusters = kmeans.fit_predict(stance_df)

# Add stance_cluster back to the original dataframe
# Create a new column initialized with NaN
model_df['stance_cluster'] = float('nan')
# Assign cluster labels only for rows without NaNs in stance features
model_df.loc[stance_df.index, 'stance_cluster'] = stance_clusters

# Calculate average stance features per cluster (exclude NaNs)
stance_cluster_avgs = model_df.dropna(subset=['stance_cluster']).groupby('stance_cluster')[stance_features_numeric].mean().to_dict(orient='index')

# Optional: print cluster averages
for cluster_id, features_avg in stance_cluster_avgs.items():
    print(f"Stance Cluster {int(cluster_id)}:")
    for feature, avg_val in features_avg.items():
        print(f"  {feature}: {avg_val:.4f}")
    print()


Stance Cluster 0:
  avg_batter_y_position: 28.8767
  avg_batter_x_position: 27.9094
  avg_foot_sep: 26.8924
  avg_stance_angle: -7.5211
  avg_intercept_y_vs_batter: 28.9129
  avg_intercept_y_vs_plate: 0.0531
  bat_side_enc: 0.5333

Stance Cluster 1:
  avg_batter_y_position: 29.8070
  avg_batter_x_position: 26.2374
  avg_foot_sep: 27.3198
  avg_stance_angle: -42.0639
  avg_intercept_y_vs_batter: 33.9570
  avg_intercept_y_vs_plate: 4.1502
  bat_side_enc: 0.4153

Stance Cluster 2:
  avg_batter_y_position: 26.3459
  avg_batter_x_position: 27.6142
  avg_foot_sep: 30.6316
  avg_stance_angle: -1.5466
  avg_intercept_y_vs_batter: 32.3900
  avg_intercept_y_vs_plate: 6.0592
  bat_side_enc: 0.4579

Stance Cluster 3:
  avg_batter_y_position: 29.2270
  avg_batter_x_position: 27.8148
  avg_foot_sep: 29.1449
  avg_stance_angle: -20.1987
  avg_intercept_y_vs_batter: 31.7528
  avg_intercept_y_vs_plate: 2.5427
  bat_side_enc: 0.5043

Stance Cluster 4:
  avg_batter_y_position: 26.0438
  avg_batter_x_posi

Stance Cluster 0: 


Batter Y position: Moderate at ~28.9 units, indicating a fairly centered vertical stance.

Batter X position: Around 27.9 units, showing moderate horizontal placement in the box.

Foot separation: Relatively narrow (~26.9 units), suggesting a more compact stance.

Stance angle: Slightly open, about -7.5°, indicating the batter’s front foot is angled a bit outward.

Intercept Y vs batter: Moderate (~28.9), relates to timing or vertical placement relative to the batter.

Intercept Y vs plate: Near zero (~0.05), indicating minimal vertical shift relative to the plate.

Bat side encoding: ~0.53 (midway), indicating a fairly balanced mix of left- and right-handed batters.

Summary:
This cluster features moderately centered batters with compact stances and slightly open foot angles, representing a balanced group with nearly equal left/right batters.
___________________________________________________________________________________________

Stance Cluster 1


Batter Y position: Higher vertical position (~29.8 units).

Batter X position: Slightly more inward at ~26.2 units.

Foot separation: Slightly wider than cluster 0 (~27.3 units).

Stance angle: Much more open stance at about -42°, meaning the batter's front foot is angled substantially outward.

Intercept Y vs batter: Higher (~34), indicating distinct vertical timing/placement.

Intercept Y vs plate: Elevated (~4.15), suggesting a notable vertical offset relative to the plate.

Bat side encoding: Lower (~0.41), leaning towards right-handed batters.

Summary:
Batters in this cluster tend to have an open, wide stance with a high vertical position and more pronounced foot angle, mostly right-handed.

___________________________________________________________________________________________

Stance Cluster 2


Batter Y position: Lower vertical position (~26.3 units).

Batter X position: Around 27.6 units, moderate horizontal position.

Foot separation: Wider stance (~30.6 units).

Stance angle: Nearly square stance with a small angle (~-1.5°).

Intercept Y vs batter: High (~32.4), showing some vertical timing variation.

Intercept Y vs plate: Higher (~6.06), notable vertical offset.

Bat side encoding: Mid to lower (~0.46), mixed handedness but slightly right-leaning.

Summary:
This cluster shows batters with a lower, wider, and fairly square stance, with a significant vertical offset relative to the plate.

___________________________________________________________________________________________

Stance Cluster 3


Batter Y position: Moderate-high vertical position (~29.2).

Batter X position: Moderate horizontal position (~27.8).

Foot separation: Fairly wide stance (~29.1).

Stance angle: Moderately open (~-20.2°).

Intercept Y vs batter: High (~31.8).

Intercept Y vs plate: Moderate (~2.54).

Bat side encoding: About 0.5, balanced left/right split.

Summary:
Batters here have a moderately open, wider stance with elevated vertical placement and balanced handedness.
___________________________________________________________________________________________

Stance Cluster 4


Batter Y position: Low vertical position (~26.0).

Batter X position: Higher horizontal position (~28.9).

Foot separation: Widest stance (~38.1 units).

Stance angle: Slightly open (~-2.9°).

Intercept Y vs batter: Lowest vertical timing (~26.1).

Intercept Y vs plate: Near zero (~0.1).

Bat side encoding: Highest (~0.62), skewed towards left-handed batters.

Summary:
This cluster groups batters with the widest, lowest stance, nearly square foot angle, and a strong left-handed tendency.

In [55]:
# Count pitches of each type in each stance cluster
pitch_counts = model_df.groupby(['stance_cluster', 'pitch_type']).size().unstack(fill_value=0)

print("Pitch counts per stance cluster:")
print(pitch_counts)

pitch_counts_normalized = pitch_counts.div(pitch_counts.sum(axis=1), axis=0)
print(pitch_counts_normalized)

avg_outcomes = model_df.groupby(['stance_cluster', 'pitch_type'])['outcome_score'].mean().unstack()
print("Average pitch outcome per stance cluster:")
print(avg_outcomes)

best_pitches_per_cluster = avg_outcomes.idxmin(axis=1)
print("Best pitch type per stance cluster:")
print(best_pitches_per_cluster)

best_clusters_per_pitch = avg_outcomes.idxmin(axis=0)
print("Best stance cluster per pitch type:")
print(best_clusters_per_pitch)


Pitch counts per stance cluster:
pitch_type         CH  CS     CU  EP   FA     FC     FF   FO    FS    KC   KN  \
stance_cluster                                                                  
0.0             12777  27   8003  79  110  10367  45410   58  3814  2190   73   
1.0              4745   9   3012  21   50   3786  16221   37  1455   872   23   
2.0             19513  23  11355  95  172  14241  61165  147  5285  3490   81   
3.0             12023  14   7477  72   76   8932  38101   88  3276  2079   90   
4.0             12787  10   7891  65   60  10133  41856   75  3622  2194  104   

pitch_type      PO  SC     SI     SL     ST   SV  
stance_cluster                                    
0.0              8  13  20130  19227   8086  531  
1.0              6   5   7202   7365   3072  241  
2.0             12  28  27824  29027  11611  834  
3.0             10  17  16775  17199   6854  529  
4.0             12  12  18961  18231   7304  511  
pitch_type            CH        CS        

Description Recap:
Moderate, compact stance with slight openness (stance_angle ≈ -7.5°), average foot separation, and balanced handedness (bat_side_enc ≈ 0.53).

Pitch Outcomes:

Performs well on SI (Sinker) and KN (Knuckleball) — these are the most effective pitch types against this cluster.

Moderate performance on FA (Fastball), CH (Changeup), and FC (Cutter).

Less effective on SL (Slider) and ST (Sweeper).

Interpretation:
Well-rounded and balanced, with hitters capable of handling vertical movement (sinkers, knuckleballs) and standard fastballs. The compact posture may help with timing but isn’t as adaptable to sharp lateral movement, leading to lower effectiveness against sliders and sweepers.


____________________________________________________________________________________________

Stance Cluster 1

Description Recap:
Very closed stance (stance_angle ≈ -42°), higher vertical body position (batter_y ≈ 29.8), narrow stance width, and a slight right-handed skew (bat_side_enc ≈ 0.42).

Pitch Outcomes:

Best performance against SC (Screwball), SL (Slider), and ST (Sweeper) — all pitches with heavy lateral break.

Performs relatively poorly on FA (Fastball) and KN (Knuckleball).

Solid against FC (Cutter) and FS (Splitter).

Interpretation:
This closed stance may set up hitters better to read and track breaking pitches that move across the plate, especially those with glove-side movement. However, the more closed position may reduce bat speed or timing precision against faster pitches.

____________________________________________________________________________________________

Stance Cluster 2

Description Recap:
Low, wide, almost square stance (stance_angle ≈ -1.5°, foot_sep ≈ 30.6), moderate x/y position, mixed handedness.

Pitch Outcomes:

Very strong vs SC (Screwball), EP (Eephus), SL (Slider), and ST (Sweeper).

Slightly lower performance vs FA (Fastball) and CH (Changeup).

Moderate against CU (Curveball) and FS (Splitter).

Interpretation:
This wide, balanced stance helps hitters adjust to variable pitch speeds and movement, especially breaking pitches. The stance seems to trade off some reaction speed (hence lower FA performance) for stability and plate coverage against movement-heavy pitches.

____________________________________________________________________________________________

Stance Cluster 3

Description Recap:
Moderately open stance (stance_angle ≈ -18.5°), average foot separation, mid-level batter position, balanced left/right handedness (bat_side_enc ≈ 0.53).

Pitch Outcomes:

Best performance on EP (Eephus) — unique among clusters.

Decent against FA (Fastball), FC (Cutter), and SL (Slider).

Lower effectiveness on CH (Changeup) and CU (Curveball).

Interpretation:
This stance favors off-speed and slower pitches when they are extremely off-tempo (e.g., Eephus), suggesting hitters with strong discipline or tracking ability. Struggles with traditional off-speed like changeups may indicate difficulty adjusting to late movement or deceptive speed drops.


____________________________________________________________________________________________

Stance Cluster 4


Widest stance (foot_sep ≈ 27.6), lowest vertical position (batter_y ≈ 26.6), and strongest left-handed skew (bat_side_enc ≈ 0.61).

Pitch Outcomes:

Best performance across most pitch types, including CH (Changeup), FF (Four-seam Fastball), FS (Splitter), and KC (Knuckle Curve).

Performs well even on SL (Slider) and FO (Forkball).

Slightly weaker vs ST (Sweeper) and SC (Screwball) compared to Cluster 1/2.

Interpretation:
This stance offers power and balance, allowing solid coverage across the strike zone. It seems well-suited for both timing fastballs and adjusting to off-speed, making it the most "all-around" effective stance. It likely benefits from strong lower-body stability and plate discipline.
____________________________________________________________________________________________

General Takeaway


Closed or very open stances (Cluster 1, 3) often show strength vs slower or laterally breaking pitches but have mixed fastball results.

Wide, low stances (Cluster 2, 4) show greater success vs breaking and off-speed pitches, with Cluster 4 standing out as most versatile.

Compact, neutral stances (Cluster 0) show balanced outcomes, neither excelling nor struggling strongly, with particular success against sinkers and knuckleballs.



Clusters 1 & 2 handle screwballs the best — these stances likely prepare hitters for lateral break.

Cluster 3 excels at slower pitches, while 0 & 4 both perform best vs KN, suggesting either good vision, timing, or general unpredictability tolerance.

Cluster 4 dominates against fastballs and changeups, which are common — this could mean it's a "default effective" stance.

Cluster 1 is strongest vs tricky breaking pitches.

Cluster 0 is surprisingly good vs SI (Sinker) and KN, despite having a more neutral stance.
