In [None]:
# !pip install -q numpy pandas tensorflow

## references
# https://keras.io/examples/structured_data/structured_data_classification_from_scratch/
# https://www.kaggle.com/datasets/pschale/mlb-pitch-data-20152018/code
# https://www.kaggle.com/code/ryancmcv/mlb-pitch-data
# https://stackoverflow.com/questions/64689483/how-to-do-multiclass-classification-with-keras

# to_ordinal https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_ordinal
# to_categorical https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical


import os

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import StringLookup

# Display the installed TensorFlow version as this cell's output.
tf.__version__

In [None]:

##!curl -s 'https://storage.googleapis.com/0x19f.com/media/kaggle-mlb-pitch-data-2015-2018.tgz' | tar xz

# The CSVs live in ./kaggle-mlb-pitch-data-2015-2018/ below the working directory.
# The trailing "2019_" is a file-name prefix: just look at 2019 to speed things up.
BASEBALL_FILES_BASE = os.getcwd() + "/kaggle-mlb-pitch-data-2015-2018/" + "2019_"

pitches = pd.read_csv(BASEBALL_FILES_BASE + 'pitches.csv')
atbats = pd.read_csv(BASEBALL_FILES_BASE + 'atbats.csv')

# Columns kept after joining pitches to at-bats, grouped by role.
situation_columns = ["ab_id", "inning", "top", "outs", "on_1b", "on_2b", "on_3b"]
matchup_columns = ["pitcher_id", "p_throws", "batter_id", "stand"]
pitch_state_columns = ["pitch_num", "b_count", "s_count"]
# TODO need to add running total of pitches for the current pitcher, and maybe strike percentage
# TODO add the previous n pitches before and their code outcome

# Raw columns the label is derived from (both are dropped again below).
label_source_columns = ["code", "pitch_type"]
# TODO also make the label have the location. not just FF (fourseam fastball, but also high inside)

selected_columns = (
    situation_columns + matchup_columns + pitch_state_columns + label_source_columns
)

# Target label: for now, whether the pitcher threw a hittable pitch, i.e. the
# pitch has one of these codes:
#   S - Swinging Strike
#   C - Called Strike
#   F - Foul
#   T - Foul Tip
#   L - Foul Bunt
# NOTE(review): every other code gets label 0 -- if the data also has "in play"
# codes, those pitches are labelled 0 as well; confirm that is intended.
hittablePitchCodes = ["S", "C", "F", "T", "L", ]

# Small-integer columns that are narrowed to int8 below.
# TODO prob can use a schema on the CSV read to get only the columns we're after, alias them to be more
#      useful, and cast to the right type
# TODO consider to_ordinal for cumulative things maybe? inning, outs, pitch_num, b_count, s_count
int8_columns = [
    'inning', 'top', 'outs', 'on_1b', 'on_2b', 'on_3b',
    'pitch_num', 'b_count', 's_count',
]

df = (
    pd.merge(pitches, atbats, how='inner', on='ab_id')[selected_columns]
    # remove anything without a pitch type or code (add more as helpful)
    .dropna(subset=['pitch_type', 'code'])
    .assign(
        # 1 when the pitch code is in hittablePitchCodes, else 0
        target_label=lambda frame: np.where(frame['code'].isin(hittablePitchCodes), 1, 0),
        # simple number codes for strings: 1 for 'R', 0 for anything else
        p_throws_right=lambda frame: np.where(frame['p_throws'].isin(['R']), 1, 0),
        b_stands_right=lambda frame: np.where(frame['stand'].isin(['R']), 1, 0),
    )
    .astype({'ab_id': 'int32', **dict.fromkeys(int8_columns, 'int8')})
    # drop ab_id now that the join has occurred, plus the raw columns encoded above
    .drop(columns=['ab_id', 'code', 'pitch_type', 'p_throws', 'stand'])
    # keep only rows whose index label is <= 10000
    .truncate(after=10000)
)


In [None]:
# Class balance: number of rows for each target_label value.
df.loc[:, "target_label"].value_counts()
  


In [None]:


# Hold out a reproducible 20% random sample for validation (fixed seed);
# every row that was not sampled becomes training data.
val_dataframe = df.sample(frac=0.2, random_state=1337)
train_dataframe = df.drop(index=val_dataframe.index)

split_sizes = (len(train_dataframe), len(val_dataframe))
print("Using %d samples for training and %d for validation" % split_sizes)


def dataframe_to_dataset(dataframe):
    """Turn a DataFrame into a shuffled ``tf.data.Dataset`` of (features, label) pairs.

    The ``target_label`` column becomes the label; every remaining column is
    passed through as an entry of the feature dict, keyed by column name.
    The caller's DataFrame is left unmodified.

    Args:
        dataframe: DataFrame that contains a ``target_label`` column.

    Returns:
        An unbatched dataset, shuffled with a buffer as large as the number
        of rows.
    """
    features = dataframe.drop(columns=["target_label"])
    labels = dataframe["target_label"]
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.shuffle(buffer_size=len(features))

BATCH_SIZE = 5

# Convert each split to a shuffled dataset and group it into batches.
train_ds = dataframe_to_dataset(train_dataframe).batch(BATCH_SIZE)
val_ds = dataframe_to_dataset(val_dataframe).batch(BATCH_SIZE)

# Prints the one-batch dataset object itself (its repr), not the batch values.
print(train_ds.take(1))

In [None]:
# Display the prepared frame (features plus target_label) as this cell's output.
df


In [None]:
# lookup_class is one of StringLookup, IntegerLookup
def encode_categorical_feature(feature, name, dataset, lookup_class):
    """Map a categorical model input to integer vocabulary indices.

    Args:
        feature: the symbolic model input to encode.
        name: key of this feature in the dataset's feature dict.
        dataset: dataset of (features, label) pairs the vocabulary is learned from.
        lookup_class: lookup layer class to instantiate (StringLookup or IntegerLookup).

    Returns:
        The output of the adapted lookup layer applied to ``feature``.
    """
    def select_feature_column(features, label):
        # Keep only this feature and give it a trailing axis of size 1.
        return tf.expand_dims(features[name], -1)

    vocabulary_source = dataset.map(select_feature_column)

    # Learn the set of values present and assign each a fixed integer index.
    index_lookup = lookup_class(output_mode="int")
    index_lookup.adapt(vocabulary_source)

    return index_lookup(feature)


# Parallel lists, one entry per feature: the encoded tensors that are
# concatenated into the network input, and the matching model inputs.
encodedFields = []
all_inputs = []

# Features whose raw values are strings; everything else is treated as integer.
string_fields = ["pitch_type"]

categorical_feature_names = [
    'pitcher_id', 'batter_id', 'top', 'on_1b', 'on_2b', 'on_3b',
    'b_count', 's_count', 'p_throws_right', 'b_stands_right', 'inning', 'outs', 'pitch_num',
]

for f in categorical_feature_names:
    print("encoding %s ..." % f)
    is_string_feature = f in string_fields

    # Declare the input with the dtype its lookup layer works on, so integer
    # ids reach IntegerLookup as int64 rather than through the float default.
    feature_input = keras.Input(
        shape=(1,),
        name=f,
        dtype="string" if is_string_feature else "int64",
    )
    all_inputs.append(feature_input)

    # Reuse the shared helper instead of repeating its body here; the
    # vocabulary is learned from the training split only.
    lookup_class = StringLookup if is_string_feature else IntegerLookup
    encodedFields.append(
        encode_categorical_feature(feature_input, f, train_ds, lookup_class)
    )


## TODO also encode the pitch_type label -- is this right?

# label_field = "pitch_type"
# print("encoding label %s ..." % label_field)
# label_input = keras.Input(shape=(1,), name=label_field) # TODO should this be an "keras.Output"?
# label_input_encoded = encode_categorical_feature(input, label_field, val_ds, IntegerLookup)
# encodedFields.append(label_input_encoded)
    

# Join every encoded feature into a single input tensor for the network.
# NOTE(review): the encoded features are integer vocabulary indices
# (output_mode="int") and are fed to the Dense layer as plain numbers, so ids
# such as pitcher_id are treated as ordinal values -- consider a one-hot or
# embedding encoding.
all_features = layers.concatenate(encodedFields)

# One hidden layer with dropout, then a single sigmoid unit for the 0/1 label.
hidden = layers.Dense(32, activation="relu")(all_features)
hidden = layers.Dropout(0.5)(hidden)
output = layers.Dense(1, activation="sigmoid")(hidden)

model = keras.Model(inputs=all_inputs, outputs=output)

# binary_crossentropy pairs with the single sigmoid output above;
# TODO switch to categorical_crossentropy if the label becomes multi-class.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])



In [None]:

# Train for 10 passes over the training batches, evaluating on the validation
# batches after each epoch.
model.fit(x=train_ds, validation_data=val_ds, epochs=10)
