In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Train a Random Forest on slider pitch data to predict swings-and-misses,
# report hold-out metrics and the most important features, then rank players
# by their mean predicted swing-and-miss probability.

# Load and preprocess the dataset
data = pd.read_csv('slider_data.csv')  # Replace with your dataset file
data = data.dropna()  # Remove rows with missing values (in any column)

# Combine "swinging_strike" and "swinging_strike_blocked" into a single
# boolean target variable.
data['swing_and_miss'] = data['outcome'].isin(
    ['swinging_strike', 'swinging_strike_blocked']
)

# Define features and target variable.
# - 'outcome' must be excluded: the target is computed directly from it, so
#   leaving it in leaks the label into the features.
# - Only numeric/boolean columns are kept: the forest cannot be fitted on raw
#   string columns such as 'player_name'.
# NOTE(review): numeric identifier columns (if the dataset has any) are still
# included here -- drop them explicitly if they are not meaningful predictors.
features = data.drop(columns=['swing_and_miss', 'outcome']).select_dtypes(
    include=['number', 'bool']
)
target = data['swing_and_miss']

# Fail early with a clear message instead of an obscure error deeper in sklearn.
if features.shape[1] == 0:
    raise ValueError("No numeric feature columns available to train on.")
if target.nunique() < 2:
    raise ValueError(
        "Target 'swing_and_miss' needs both classes present to train a classifier."
    )

# Split data into training and testing sets.
# Stratify so both splits keep the same swing-and-miss rate as the full data.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

# Feature Importance (indices sorted from most to least important)
feature_importances = model.feature_importances_
sorted_indices = feature_importances.argsort()[::-1]

# Top 3 important features
top_features = features.columns[sorted_indices][:3]
print("Top 3 important features:", top_features)

# Rank players based on predicted swing and miss rates.
# Look up the probability column of the positive class explicitly rather than
# assuming it is column 1.
positive_column = list(model.classes_).index(True)
data['predicted_swing_and_miss_prob'] = model.predict_proba(features)[:, positive_column]

# Aggregate per player: sorting individual rows would rank pitches, not
# players, and could list the same player several times.
# NOTE: probabilities are computed on all rows, including the training rows,
# so this ranking is in-sample and likely optimistic.
top_players = (
    data.groupby('player_name', as_index=False)['predicted_swing_and_miss_prob']
    .mean()
    .sort_values(by='predicted_swing_and_miss_prob', ascending=False)
    .head(5)
)
print("Top 5 players:\n", top_players[['player_name', 'predicted_swing_and_miss_prob']])
