In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, KFold


In [4]:
df = pd.read_csv("braves_23_sim.csv")
df

Unnamed: 0,game_date,game_pk,player_name,batter,pitcher,stand,p_throws,pit_handR,bat_handR,home_team,...,responsible_fielder_x,responsible_fielder_y,responsible_fielder_angle,responsible_fielder_depth,responsible_fielder_coords_diff,responsible_fielder_angle_diff,fielder_landing_x,fielder_landing_y,fielder_landing_angle_radians,is_out
0,2023-10-12,748561,"Johnson, Pierce",592206,572955,R,R,1,1,PHI,...,-76.896164,116.285126,-33.475571,139.410368,16.465525,5.777129,-5.416530,-15.549107,-1.906000,1
1,2023-10-09,748563,"Yates, Kirby",592206,489446,R,R,1,1,ATL,...,136.387080,260.838400,27.604150,294.343518,27.580649,2.875241,-23.155956,-14.983121,-2.567292,0
2,2023-10-07,748562,"Strider, Spencer",592206,675911,R,R,1,1,ATL,...,-76.896164,116.285126,-33.475571,139.410368,104.970393,1.711256,57.038557,-88.121429,-0.996341,1
3,2023-10-07,748562,"Strider, Spencer",592206,675911,R,R,1,1,ATL,...,-76.896164,116.285126,-33.475571,139.410368,137.511766,17.756683,75.340691,-115.035934,-0.990962,1
4,2023-10-07,748562,"Strider, Spencer",592206,675911,R,R,1,1,ATL,...,136.387080,260.838400,27.604150,294.343518,61.869917,5.351455,-4.410295,61.712527,1.642140,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1768,2023-04-04,718711,"Dodd, Dylan",660766,689266,R,L,0,1,STL,...,-149.088490,256.868700,-30.131196,296.999843,76.551805,7.285287,10.581436,-75.816964,-1.432127,0
1769,2023-04-04,718711,"Dodd, Dylan",666185,689266,R,L,0,1,STL,...,68.324400,103.364760,33.464817,123.905195,41.286925,1.691603,18.544987,36.887581,1.104956,1
1770,2023-04-02,718743,"Tonkin, Michael",645302,543859,R,R,1,1,WSH,...,-36.778090,148.092070,-13.947022,152.590593,55.231617,7.668634,-0.017320,-55.231615,-1.571110,1
1771,2023-04-01,718753,"Jiménez, Joe",645302,641729,R,R,1,1,WSH,...,-75.237290,118.503030,-32.411366,140.369576,25.671836,1.810296,-17.887329,18.414305,2.341679,1


In [3]:
X = df[["launch_angle","launch_speed", "hit_direction", "hangtime", "distance", "landing_x", "landing_y", "responsible_fielder_depth"]]
y = df["is_out"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Detailed classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.83
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.71      0.74       183
           1       0.85      0.89      0.87       349

    accuracy                           0.83       532
   macro avg       0.82      0.80      0.81       532
weighted avg       0.83      0.83      0.83       532

Confusion Matrix:
[[130  53]
 [ 37 312]]


In [5]:
rf_model = RandomForestClassifier(random_state=42)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(rf_model, X, y, cv=kfold, scoring='accuracy')
print("Cross-validation accuracy: %.2f (%.2f)" % (results.mean(), results.std()))


Cross-validation accuracy: 0.79 (0.03)


In [6]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           cv=kfold, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total ti

In [1]:
# do new rndom forest model tomorrow morning before the meeting 


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the data
df = pd.read_csv("braves_23_sim.csv")

# Group the data by player position
positions = df['responsible_fielder'].unique()

# Dictionary to store models for each position
position_models = {}

# Iterate through each position and train a model
for position in positions:
    print(f"\nTraining model for position: {position}")
    
    # Filter the data for the current position
    position_df = df[df['responsible_fielder'] == position]
    
    # Define features and target
    X = position_df[["launch_angle", "launch_speed", "hit_direction", "hangtime", "distance", "landing_x", "landing_y", "responsible_fielder_depth"]]
    y = position_df["is_out"]
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Create a Random Forest Classifier
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Train the model
    rf_model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = rf_model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {position}: {accuracy:.2f}')
    
    # Detailed classification report
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    # Confusion matrix
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    # Store the model for the current position
    position_models[position] = rf_model
    
    # Cross-validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    results = cross_val_score(rf_model, X, y, cv=kfold, scoring='accuracy')
    print("Cross-validation accuracy: %.2f (%.2f)" % (results.mean(), results.std()))
    
    # Grid search for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }
    
    # Initialize the GridSearchCV object
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                               cv=kfold, scoring='accuracy', n_jobs=-1, verbose=2)
    
    # Fit the grid search to the data
    grid_search.fit(X, y)
    
    # Print the best parameters and best score
    print(f"Best parameters for {position}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {position}: {grid_search.best_score_:.2f}")



Training model for position: 5
Accuracy for 5: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.05      0.07        19
           1       0.81      0.92      0.86        83

    accuracy                           0.75       102
   macro avg       0.47      0.48      0.47       102
weighted avg       0.68      0.75      0.71       102

Confusion Matrix:
[[ 1 18]
 [ 7 76]]
Cross-validation accuracy: 0.79 (0.04)
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; tota

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Cross-validation accuracy: 0.82 (0.03)
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samp