In [None]:
import pandas as pd 
import math
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Read the positioning CSV into `df`, the DataFrame the later cells in this
# notebook select features from and add columns to.
df = pd.read_csv("../input/bravespositioning/braves_23_sim.csv")
# Bare trailing expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Select relevant features
features = [
    'responsible_fielder',
    'launch_speed', 'launch_angle',
    'hit_direction', 'distance', 'landing_x', 'landing_y',
    'responsible_fielder_x', 'responsible_fielder_y', 'responsible_fielder_depth',
]
target = ['is_out']

# X keeps every non-target column of df; the ColumnTransformer below only
# picks up the columns listed for it and drops the rest (default remainder).
X = df.drop(columns=['is_out'])
y = df['is_out']

categorical_features = ['responsible_fielder']
numerical_features = ['hit_direction', 'launch_speed', 'launch_angle', 'distance',
                      'landing_x', 'landing_y', 'responsible_fielder_x',
                      'responsible_fielder_y', 'responsible_fielder_depth']

# Preprocessing pipeline: standardise the numeric columns and one-hot encode
# the fielder column. sparse_threshold=0 forces a dense ndarray result so the
# matrices can be passed straight to Keras regardless of how many one-hot
# columns the encoder produces.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,
)

# Encode categorical variables with pandas and combine them with the target.
# These two frames are not consumed by the split/preprocessing below; they are
# kept so any other cell that reads `data_encoded` / `data_model` still works.
data_encoded = pd.get_dummies(df[features])
data_model = pd.concat([data_encoded, df[target]], axis=1)

# Split the data into training and testing sets. This is done exactly once:
# the original cell repeated the identical split and refit a second time,
# which produced the same arrays at twice the cost.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing: fit on the training rows only, then reuse the fitted
# statistics/categories on the test rows to avoid leaking test information.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [None]:
from tensorflow.keras.layers import BatchNormalization, LeakyReLU

def create_model(input_shape, hidden_units=(128, 64, 32), l2_strength=0.01,
                 leaky_alpha=0.1, dropout_rate=0.5):
    """Build (but do not compile) a feed-forward binary classifier.

    Each hidden stage is Dense -> LeakyReLU -> BatchNormalization -> Dropout,
    followed by a single sigmoid output unit. With the default arguments the
    network is identical to the original hard-coded 128/64/32 architecture.

    Args:
        input_shape: Shape tuple of one input sample, e.g. ``(n_features,)``.
        hidden_units: Iterable with the width of each hidden Dense layer, in
            order. May be empty, in which case the model is just the sigmoid
            output layer applied to the input.
        l2_strength: L2 penalty factor applied to every hidden Dense kernel.
        leaky_alpha: Negative-side slope passed to each LeakyReLU.
        dropout_rate: Drop fraction passed to each Dropout layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()

    # Only the first layer added to the model receives `input_shape`; after it
    # has been used once the kwargs dict is emptied.
    first_layer_kwargs = {'input_shape': input_shape}

    for units in hidden_units:
        model.add(Dense(units, kernel_regularizer=l2(l2_strength), **first_layer_kwargs))
        first_layer_kwargs = {}
        model.add(LeakyReLU(alpha=leaky_alpha))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # Single sigmoid unit: the output is a probability for the positive class.
    model.add(Dense(1, activation='sigmoid', **first_layer_kwargs))
    return model

# Create the model; the input width is the number of columns produced by the
# preprocessing step.
input_shape = (X_train.shape[1],)
model = create_model(input_shape)

# Compile the model
model.compile(optimizer='AdamW', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

# Define callbacks: stop when validation loss stalls for 10 epochs (restoring
# the best weights seen), and cut the learning rate by 5x after 5 stalled epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# Train the model; 30% of the training rows are held out by Keras for validation.
history = model.fit(X_train, y_train, epochs=100, validation_split=0.3, batch_size=32, callbacks=[early_stopping, reduce_lr])

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Plot the training history: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
# The loss panel previously had no axis labels and no legend, so the two
# curves could not be told apart.
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve

# Network scores for the training rows; these become the single input feature
# of the calibrator.
y_pred_prob_train = model.predict(X_train)

# Fit a one-feature logistic regression that maps a network score to the
# observed outcome. `fit` returns the estimator itself, so it can be chained.
calibration_model = LogisticRegression().fit(
    np.reshape(y_pred_prob_train, (-1, 1)),
    y_train,
)

# Network scores for the held-out test rows.
y_pred_prob_test = model.predict(X_test)

# Push the test scores through the calibrator and keep the second
# predict_proba column as the calibrated probability.
y_pred_prob_calibrated = calibration_model.predict_proba(
    np.reshape(y_pred_prob_test, (-1, 1))
)[:, 1]

# Bin the calibrated probabilities into 10 equal-width bins and compare the
# mean prediction in each bin with the observed positive rate.
prob_true_calibrated, prob_pred_calibrated = calibration_curve(
    y_test,
    y_pred_prob_calibrated,
    n_bins=10,
    strategy='uniform',
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(prob_pred_calibrated, prob_true_calibrated, marker='o', label='Calibrated Model')
ax.plot([0, 1], [0, 1], linestyle='--', label='Perfectly Calibrated')
ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('Fraction of Positives')
ax.set_title('Calibration Plot for Calibrated Model')
ax.legend()
plt.show()


In [None]:

from sklearn.metrics import brier_score_loss
# Brier score of the calibrated test-set probabilities against the true
# test labels, as computed by scikit-learn's brier_score_loss.
brier_score = brier_score_loss(y_test, y_pred_prob_calibrated)
print(f"Brier Score: {brier_score}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve

# Score the training rows with the network and collapse the result to 1-D.
y_pred_prob_train = model.predict(X_train).flatten()

# Calibrator: logistic regression with the network score as its only feature
# (a trailing axis is added because scikit-learn expects a 2-D feature matrix).
calibration_model = LogisticRegression()
calibration_model.fit(y_pred_prob_train[:, np.newaxis], y_train)

# Score the test rows the same way.
y_pred_prob_test = model.predict(X_test).flatten()

# Calibrated probability = second predict_proba column of the calibrator.
calibrated_class_probs = calibration_model.predict_proba(y_pred_prob_test[:, np.newaxis])
y_pred_prob_calibrated = calibrated_class_probs[:, 1]

# Reliability curve over 10 equal-width probability bins.
prob_true_calibrated, prob_pred_calibrated = calibration_curve(
    y_test, y_pred_prob_calibrated, n_bins=10, strategy='uniform'
)

plt.figure(figsize=(8, 6))
# Observed positive rate per bin versus mean predicted probability per bin.
plt.plot(
    prob_pred_calibrated,
    prob_true_calibrated,
    marker='o',
    label='Calibrated Model',
)
# Diagonal reference line: where a perfectly calibrated model would lie.
plt.plot(
    [0, 1],
    [0, 1],
    linestyle='--',
    label='Perfectly Calibrated',
)
plt.title('Calibration Plot for Calibrated Model')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.legend()
plt.show()



In [None]:
# Feature engineering: add derived features intended to improve the model.

In [2]:
# Euclidean distance between the (responsible_fielder_x, responsible_fielder_y)
# point and the (fielder_landing_x, fielder_landing_y) point, stored as a new
# `relative_distance` column on df.
# The assignment used to be pasted twice in this cell; it is computed once here.
# This cell needs the imports cell (numpy as np) and the data-loading cell to
# have been run first in the current kernel.
# NOTE(review): earlier cells refer to `landing_x` / `landing_y`; confirm that
# `fielder_landing_x` / `fielder_landing_y` are real columns of the CSV.
df['relative_distance'] = np.sqrt(
    (df['responsible_fielder_x'] - df['fielder_landing_x'])**2
    + (df['responsible_fielder_y'] - df['fielder_landing_y'])**2
)

# Display the DataFrame with the new column
df.head()

NameError: name 'np' is not defined

In [None]:
#Heat Maps 
import seaborn as sns
# Keep only the rows flagged as outs.
is_out_row = df['is_out'] == 1
outs_data = df[is_out_row]

# Distinct responsible_fielder values among the outs, in order of first appearance.
positions = outs_data['responsible_fielder'].unique()

# KDE styling shared by every figure.
kde_style = dict(cmap='coolwarm', fill=True, thresh=0, levels=100)

# One filled 2-D density figure per fielder value.
for position in positions:
    at_this_position = outs_data['responsible_fielder'] == position
    position_data = outs_data[at_this_position]

    fig, ax = plt.subplots(figsize=(10, 8))
    heatmap = sns.kdeplot(
        x=position_data['responsible_fielder_x'],
        y=position_data['responsible_fielder_y'],
        ax=ax,
        **kde_style,
    )
    ax.set_title(f'Heatmap of Where Responsible Fielders Are Making Outs (Position {position})')
    ax.set_xlabel('Responsible Fielder X Coordinate')
    ax.set_ylabel('Responsible Fielder Y Coordinate')
    plt.show()
