# Predicting Expected Total Bases from Batted Ball Data

This notebook uses the TrackMan 2022 dataset to build predictive models of expected total bases
for batted balls in baseball. We compare Random Forest and XGBoost models,
using features like exit speed, launch angle, spin rate, and more.

# Data Upload & Cleaning


In [None]:
import pandas as pd
import numpy as np
# Read the raw TrackMan CSV export and preview its first five rows
trackman = pd.read_csv("Trackman(2).csv")
print(trackman.head(5))

# Measurements that must be present for a row to be kept; these are the
# same columns the notebook later selects as model features
required_columns = [
    "ExitSpeed", "Angle", "Distance", "Direction", "HitType",
    "HitSpinRate", "HangTime", "MaxHeight", "VertApprAngle", "HorzApprAngle",
]

# Keep rows that have a recorded PlayResult and none of the required
# measurements missing
has_play_result = trackman["PlayResult"].notna()
trackman = trackman.loc[has_play_result].dropna(subset=required_columns)



# Target Variable Construction


In [None]:
# Bases credited for each hit type. Any PlayResult value that is not a key
# here comes back as NaN from .map() and is then filled with 0.
total_base_map = dict(Single=1, Double=2, Triple=3, HomeRun=4)

bases_for_hits = trackman["PlayResult"].map(total_base_map)
trackman["TotalBases"] = bases_for_hits.fillna(0)

# Feature Selection & Encoding




In [None]:

# Columns used as model inputs (order is preserved in the design matrix)
features = [
    "ExitSpeed",
    "Angle",
    "Direction",
    "Distance",
    "HitSpinRate",
    "HangTime",
    "MaxHeight",
    "VertApprAngle",
    "HorzApprAngle",
    "HitType",
]

# Restrict to the inputs plus the target, discard rows with any missing
# value, then expand HitType into indicator columns. drop_first=True omits
# the indicator for one HitType level.
model_columns = [*features, "TotalBases"]
trackman = pd.get_dummies(
    trackman[model_columns].dropna(),
    columns=["HitType"],
    drop_first=True,
)

# Random Forest Model


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score



# Separate the target column from the predictors
y = trackman["TotalBases"]
X = trackman.drop(columns="TotalBases")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree Random Forest on the training split, then predict the test split
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Report test-set RMSE (square root of the MSE) and R²
test_mse = mean_squared_error(y_test, y_pred)
print("RMSE:", np.sqrt(test_mse))
print("R²:", r2_score(y_test, y_pred))

# Hyperparameter Tuning with GridSearchCV


In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate hyperparameter values explored by the grid search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Unfitted Random Forest that serves as the template estimator
rf_model = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over the grid, scored by negative MSE
grid = GridSearchCV(
    rf_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only
grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test)

tuned_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(tuned_mse)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid.best_params_)
print("RMSE:", rmse)
print("R²:", r2)

# Model Comparison: Random Forest vs. XGBoost

---



In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Baseline XGBoost regressor with fixed, untuned settings
baseline_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**baseline_params)

# Fit on the training split
xgb_model.fit(X_train, y_train)

# Predict the held-out split and report RMSE and R²
y_pred = xgb_model.predict(X_test)

baseline_mse = mean_squared_error(y_test, y_pred)
print("RMSE:", np.sqrt(baseline_mse))
print("R²:", r2_score(y_test, y_pred))



# Hyperparameter Tuning for XGBoost

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate XGBoost hyperparameter values explored by the grid search
param_grid = dict(
    max_depth=[4, 6, 8],
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[100, 200],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Unfitted XGBoost regressor that serves as the template estimator
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 5-fold cross-validated search over the grid, scored by negative MSE
grid = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only
grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split
best_xgb = grid.best_estimator_
y_pred = best_xgb.predict(X_test)

tuned_mse = mean_squared_error(y_test, y_pred)
print("Best Parameters:", grid.best_params_)
print("RMSE:", np.sqrt(tuned_mse))
print("R²:", r2_score(y_test, y_pred))


# Feature Correlation Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Training predictors with the training target appended as an extra column
# (assign works on a copy, so X_train itself is left untouched)
corr_df = X_train.assign(TotalBases=y_train)

# Pairwise correlation between every pair of columns
corr_matrix = corr_df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    ax=ax,
)
ax.set_title("Correlation Matrix of Features")
plt.show()

In [None]:
!pip install shap
import shap


# Interpreting Model Predictions with SHAP

In [None]:
# Tree-model explainer built from the tuned XGBoost estimator
explainer = shap.TreeExplainer(best_xgb)

# SHAP attributions for every row of the held-out test split
shap_values = explainer.shap_values(X_test)

# Bar-type summary plot ranking the features by their SHAP contribution
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="bar",
)

# Feature Selection and Dimensionality Reduction



In [None]:

# Predictors the notebook treats as low-impact following the SHAP analysis;
# they are removed from the full design matrix X
low_impact_features = [
    'HitType_PopUp',
    'HitType_GroundBall',
    'VertApprAngle',
    'HorzApprAngle',
    'HangTime',
]
X_reduced = X.drop(columns=low_impact_features)

# Re-split 80/20 with the same seed as before.
# NOTE: this rebinds X_train / X_test / y_train / y_test, so any cell run
# after this one sees the reduced-feature splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)

# Candidate XGBoost hyperparameter values explored by the grid search
param_grid = dict(
    max_depth=[4, 6, 8],
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[100, 200],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Unfitted XGBoost regressor that serves as the template estimator
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 5-fold cross-validated search over the grid, scored by negative MSE
grid = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the reduced-feature training split
grid.fit(X_train, y_train)

# NOTE: best_xgb is rebound here to the reduced-feature model
best_xgb = grid.best_estimator_
y_pred = best_xgb.predict(X_test)

# Report the chosen parameters plus test-set RMSE and R²
reduced_mse = mean_squared_error(y_test, y_pred)
print("Best Parameters:", grid.best_params_)
print("RMSE:", np.sqrt(reduced_mse))
print("R²:", r2_score(y_test, y_pred))


# Conclusions
- Random Forest and XGBoost performed well, with XGBoost slightly stronger after tuning.
- ExitSpeed and Angle (launch angle) were the most important predictors.
- Removing low-importance features simplified the model without hurting performance.

In [None]:
import pickle

# Serialize whatever estimator best_xgb currently refers to into model.pkl
# (opened in binary write mode, so an existing file is overwritten)
with open("model.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

In [None]:
# Offer model.pkl as a browser download when running inside Google Colab.
# The google.colab package is not importable outside the Colab runtime, so
# guard the import: elsewhere, report that the download was skipped instead
# of aborting the notebook run with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of model.pkl.")
else:
    files.download("model.pkl")