In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# under MyDrive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Location of the pitch-level run-value table on the mounted Google Drive.
RVS_CSV_PATH = '/content/drive/MyDrive/FA2023/IntroToAI/jlamber4-CSE30124-Fall2023-submissions/ProjectData/rvs.csv'

# Candidate pitch codes considered by the decision policy.
# NOTE(review): the displayed rows also contain the code 'CT', which is not in
# this list - confirm the list matches the codes in `pitch_type_condensed`.
pitches = [
    'FF',
    'SI',
    'SL',
    'CB',
    'CH',
    'KN',
]

mlb_data = pd.read_csv(RVS_CSV_PATH)

# Keep the frame as the last expression so the notebook renders it.
mlb_data

Unnamed: 0.1,Unnamed: 0,out_base_count_state,game_pk,at_bat_number,pitch_number,pitch_type_condensed,events,description,game_inning,game_inning_ab,balls,strikes,runs_post_ab,runs_from_event,runs_expected,next_out_base_count_state,next_runs_expected,delta_re
0,1,0_000_00,716352,1,1,FF,out,called_strike,716352_1,716352_1_1,0,0,0,0,1.617840,0_000_01,1.607383,-0.010456
1,2,0_000_01,716352,1,2,SI,out,foul,716352_1,716352_1_1,0,1,0,0,1.607383,0_000_02,1.559021,-0.048362
2,3,0_000_02,716352,1,3,SI,out,foul,716352_1,716352_1_1,0,2,0,0,1.559021,0_000_02,1.559021,0.000000
3,4,0_000_02,716352,1,4,CT,out,foul,716352_1,716352_1_1,0,2,0,0,1.559021,0_000_02,1.559021,0.000000
4,5,0_000_02,716352,1,5,SI,out,ball,716352_1,716352_1_1,0,2,0,0,1.559021,0_000_12,1.553814,-0.005206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709912,709913,1_110_31,718782,89,5,FF,out,swinging_strike,718782_9,718782_9_89,3,1,2,0,1.868902,1_110_32,1.779021,-0.089881
709913,709914,1_110_32,718782,89,6,FF,out,hit_into_play,718782_9,718782_9_89,3,2,2,1,1.779021,2_010_00,1.216978,0.437957
709914,709915,2_010_00,718782,90,1,FF,out,called_strike,718782_9,718782_9_90,0,0,1,0,1.216978,2_010_01,1.160057,-0.056921
709915,709916,2_010_01,718782,90,2,FF,out,foul,718782_9,718782_9_90,0,1,1,0,1.160057,2_010_02,1.178808,0.018751


In [73]:
def preprocess_data(data):
    """Build train/test feature matrices from the raw pitch-level table.

    Steps:
      1. Drop rows whose current or next game-state string is missing.
      2. One-hot encode ``pitch_type_condensed``.
      3. Expand both state strings into numeric columns via ``parse_state``
         (suffixed ``_current`` / ``_next``).
      4. Drop the raw categorical/text columns and split 80/20 with a fixed seed.

    The input frame is NOT modified; all work happens on a copy, so the cell
    calling this function can be re-run safely.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``out_base_count_state``, ``next_out_base_count_state``,
        ``pitch_type_condensed``, ``events``, ``description``, ``game_inning``,
        ``game_inning_ab`` and the target column ``delta_re``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, pitch_type_encoder)`` where the
        encoder is the fitted ``OneHotEncoder`` for the pitch type. The one-hot
        columns are the last columns of ``X_train`` / ``X_test``.

    Notes
    -----
    NOTE(review): the feature matrix still contains ``runs_expected``,
    ``next_runs_expected``, ``runs_from_event`` and the parsed next-state
    columns. In the displayed sample rows ``delta_re`` appears to be derivable
    from those columns, which would be target leakage - confirm and consider
    removing them (and the ``Unnamed: *`` / ``game_pk`` identifier columns)
    before trusting the model's scores.
    """
    state_columns = ['out_base_count_state', 'next_out_base_count_state']

    # Work on a private copy: dropping rows / re-typing columns in place would
    # silently alter the caller's DataFrame.
    data = data.dropna(subset=state_columns).copy()
    for column in state_columns:
        data[column] = data[column].astype(str)

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; fall back to the old keyword only on older installations.
    try:
        pitch_type_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        pitch_type_encoder = OneHotEncoder(sparse=False)

    pitch_type_encoded = pitch_type_encoder.fit_transform(data[['pitch_type_condensed']])
    # Reuse data's index: after dropna the index may have gaps, and a default
    # RangeIndex here would make the later column-wise concat misalign rows
    # and introduce NaN-filled extras.
    pitch_type_df = pd.DataFrame(
        pitch_type_encoded,
        columns=pitch_type_encoder.get_feature_names_out(['pitch_type_condensed']),
        index=data.index,
    )

    # parse_state returns a Series per row, so Series.apply already yields a
    # DataFrame with one column per parsed field.
    state_df = data['out_base_count_state'].apply(parse_state).add_suffix('_current')
    next_state_df = data['next_out_base_count_state'].apply(parse_state).add_suffix('_next')

    data = data.join(state_df).join(next_state_df)

    data = data.drop(columns=[
        'pitch_type_condensed',
        'out_base_count_state',
        'next_out_base_count_state',
        'events',
        'description',
        'game_inning',
        'game_inning_ab'
    ])

    # One-hot columns go last; decision_policy relies on that ordering.
    data = pd.concat([data, pitch_type_df], axis=1)

    X = data.drop(columns=['delta_re'])
    y = data['delta_re']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, pitch_type_encoder


def parse_state(state_str):
    """Decode a game-state string such as ``'1_110_32'`` into numeric fields.

    The string has three underscore-separated parts: outs, a three-character
    base-occupancy flag group (first, second, third), and a two-character
    count (balls then strikes).

    Parameters
    ----------
    state_str : str
        State in ``"<outs>_<bases>_<count>"`` form.

    Returns
    -------
    pandas.Series
        Integer entries keyed ``outs``, ``on_first``, ``on_second``,
        ``on_third``, ``balls``, ``strikes`` (in that order).

    Raises
    ------
    ValueError
        If the string does not split into exactly three parts or a field is
        not an integer.
    IndexError
        If the base or count part is shorter than expected.
    """
    outs_part, bases_part, count_part = state_str.split('_')

    fields = {}
    fields['outs'] = int(outs_part)
    fields['on_first'] = int(bases_part[0])
    fields['on_second'] = int(bases_part[1])
    fields['on_third'] = int(bases_part[2])
    fields['balls'] = int(count_part[0])
    fields['strikes'] = int(count_part[1])

    return pd.Series(fields)

def calculate_delta_re(current_erv, next_erv):
    """Return the change in expected run value between two states.

    Parameters
    ----------
    current_erv : number
        Expected run value before the event.
    next_erv : number
        Expected run value after the event.

    Returns
    -------
    number
        ``next_erv - current_erv``.
    """
    delta = next_erv - current_erv
    return delta

def train_model(X_train, y_train):
    """Fit a random-forest regressor on the training data and return it.

    The forest uses 100 trees and a fixed ``random_state`` of 42 so repeated
    runs on the same data build the same model.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training targets.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    forest = RandomForestRegressor(random_state=42, n_estimators=100)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return forest.fit(X_train, y_train)

def decision_policy(state_series, model, pitch_types, pitch_type_encoder, training_features):
    """Pick the pitch type with the lowest predicted ``delta_re`` for a state.

    Every candidate pitch is combined with the same state features, scored by
    ``model`` in a single ``predict`` call, and the first pitch with the
    strictly smallest prediction is returned.

    Parameters
    ----------
    state_series : pandas.Series or sequence
        State feature values. A Series is matched to the state columns by
        label (extra entries are ignored); a plain sequence is matched by
        position.
    model : object
        Fitted estimator exposing ``predict(DataFrame)``.
    pitch_types : iterable of str
        Candidate pitch codes; each must be a category known to the encoder.
    pitch_type_encoder : object
        Fitted one-hot encoder exposing ``transform`` and
        ``get_feature_names_out``.
    training_features : sequence of str
        Column order used to train ``model``. The encoder's one-hot columns
        are assumed to be the trailing entries.

    Returns
    -------
    str or None
        The best pitch code, or ``None`` when ``pitch_types`` is empty or no
        prediction is below ``+inf``.
    """
    pitch_types = list(pitch_types)
    if not pitch_types:
        return None

    # Loop-invariant lookups, computed once.
    encoded_columns = pitch_type_encoder.get_feature_names_out()
    state_columns = training_features[:-len(encoded_columns)]

    state_df = pd.DataFrame([state_series], columns=state_columns)

    # One identical state row per candidate pitch. ignore_index gives a clean
    # 0..n-1 index: a named Series (e.g. a row taken from X_test) would
    # otherwise carry its own label, and the column-wise concat below would
    # align on index and produce NaN-padded extra rows.
    candidates = pd.concat([state_df] * len(pitch_types), ignore_index=True)

    pitch_encoded = pitch_type_encoder.transform([[pitch] for pitch in pitch_types])
    pitch_encoded_df = pd.DataFrame(pitch_encoded, columns=encoded_columns)

    candidates = pd.concat([candidates, pitch_encoded_df], axis=1)
    # Restore the exact column order the model was trained with.
    candidates = candidates[training_features]

    predictions = model.predict(candidates)

    best_pitch = None
    best_delta_re = float('inf')
    # Strict '<' keeps the earliest pitch on ties.
    for pitch, predicted_delta_re in zip(pitch_types, predictions):
        if predicted_delta_re < best_delta_re:
            best_delta_re = predicted_delta_re
            best_pitch = pitch

    return best_pitch


In [71]:
# Build the 80/20 train/test split from the loaded table and keep the fitted
# pitch-type encoder, which decision_policy needs to encode candidate pitches.
X_train, X_test, y_train, y_test, pitch_type_encoder = preprocess_data(mlb_data)

# Fit the random forest on the training portion only; X_test / y_test are not
# used in this cell.
model = train_model(X_train, y_train)

