In [12]:
# !pip install -q numpy pandas tensorflow

## references
# https://keras.io/examples/structured_data/structured_data_classification_from_scratch/
# https://www.kaggle.com/datasets/pschale/mlb-pitch-data-20152018/code
# https://www.kaggle.com/code/ryancmcv/mlb-pitch-data
# https://stackoverflow.com/questions/64689483/how-to-do-multiclass-classification-with-keras

# to_ordinal https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_ordinal
# to_categorical https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical


import os

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import StringLookup

tf.__version__

'2.13.0'

In [27]:

##!curl -s 'https://storage.googleapis.com/0x19f.com/media/kaggle-mlb-pitch-data-2015-2018.tgz' | tar xz

BASEBALL_FILES_BASE = os.getcwd() + "/kaggle-mlb-pitch-data-2015-2018/"
BASEBALL_FILES_BASE = BASEBALL_FILES_BASE + "2019_"  # just look at 2019 to speed things up


pitches = pd.read_csv(BASEBALL_FILES_BASE + 'pitches.csv')
atbats = pd.read_csv(BASEBALL_FILES_BASE + 'atbats.csv')

df = pd.merge(pitches, atbats, how='inner', on='ab_id')

df = df[[
    # situation stuff
    "ab_id", "inning", "top", "outs", "on_1b", "on_2b", "on_3b",

    # matchup
    "pitcher_id", "p_throws", "batter_id", "stand",

    # specific pitch state
    "pitch_num", "b_count", "s_count",
    # TODO need to add running total of pitches for the current pitcher, and maybe strike percentage
    # TODO add the previous n pitches before and their code outcome
    # 
    
    # this is the label we're trying to predict
    "code",
    "pitch_type",
    # TODO also make the label have the location. not just FF (fourseam fastball, but also high inside)
]]

# remove anything without a pitchtype (add more as helpful)
df = df.dropna(subset=['pitch_type', 'code'])

# make our target label
# for now, whether the pritcher threw a hitable pitch, that is having codes:
# S - Swinging Strike
# C - Called Strike
# F - Foul
# T - Foul Tip
# L - Foul Bunt

hittablePitchCodes = ["S", "C", "F", "T", "L", ]
df['target_label'] = np.where(df['code'].isin(hittablePitchCodes), 1, 0)


# simple number codes for strings
df['p_throws_right'] = np.where(df['p_throws'].isin(['R']), 1, 0)
df['b_stands_right'] = np.where(df['stand'].isin(['R']), 1, 0)

# get appropriate types. 
# TODO prob can use a schema on the CSV read to get only the columns we're after, alias them to be more
#      useful, and cast to the right type
# TODO consider to_ordinal for cumulative things maybe? inning, outs, pitch_num, b_count, s_count
df = df.astype({
    'ab_id': 'int32',
    'inning': 'int8',
    'top': 'int8',
    'outs': 'int8',
    'on_1b': 'int8',
    'on_2b': 'int8',
    'on_3b': 'int8',
    'pitch_num': 'int8',
    'b_count': 'int8',
    's_count': 'int8',
  })

# drop ab_id now that join has occured and others we've encoded
df = df.drop(['ab_id', 'code', 'pitch_type', 'p_throws', 'stand', ], axis=1)

df = df.truncate(after=10000)


In [31]:
df["pitch_type"].value_counts()



pitch_type
FF    3702
SL    1732
CH    1139
SI     841
CU     820
FT     673
FC     630
KC     302
FS     129
EP       3
Name: count, dtype: int64

In [45]:


val_dataframe = df.sample(frac=0.2, random_state=1337)
train_dataframe = df.drop(val_dataframe.index)

print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)


def dataframe_to_dataset(dataframe):
    dataframe = dataframe.copy()
    labels = dataframe.pop("pitch_type")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds

train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)

BATCH_SIZE = 5
train_ds = train_ds.batch(BATCH_SIZE)
val_ds = val_ds.batch(BATCH_SIZE)



Using 7977 samples for training and 1994 for validation


In [52]:
# lookup_class is one of StringLookup, IntegerLookup
def encode_categorical_feature(feature, name, dataset, lookup_class):
    # Create a lookup layer which will turn strings into integer indices
    lookup = lookup_class(output_mode="int")

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible string values and assign them a fixed integer index
    lookup.adapt(feature_ds)

    # Turn the input into integer indices
    encoded_feature = lookup(feature)
    return encoded_feature


encodedFields = []
all_inputs = []

string_fields = ["pitch_type", "pitch_type"]

for f in [
    'pitcher_id', 'batter_id', 'top', 'on_1b', 'on_2b', 'on_3b', 
    'b_count','s_count', 'p_throws_right', 'b_stands_right', 'inning', 'outs', 'pitch_num',
]:
    print("encoding %s ..." % f)
    input = keras.Input(shape=(1,), name=f)
    all_inputs.append(input)

    feature_ds = train_ds.map(lambda x, y: x[f])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    lookup_class = StringLookup if (f in string_fields) else IntegerLookup
    lookup = lookup_class(output_mode="int")
    lookup.adapt(feature_ds)
    encoded_feature = lookup(input)    
    encodedFields.append(encoded_feature)


## TODO also encode the pitch_type which is in the is this right? 

# label_field = "pitch_type"
# print("encoding label %s ..." % label_field)
# label_input = keras.Input(shape=(1,), name=label_field) # TODO should this be an "keras.Output"?
# label_input_encoded = encode_categorical_feature(input, label_field, val_ds, IntegerLookup)
# encodedFields.append(label_input_encoded)
    

all_features = layers.concatenate(encodedFields)

x = layers.Dense(32, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(all_inputs, output)

# TODO categorical_crossentropy or binary_crossentropy?
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])



encoding pitcher_id ...
encoding batter_id ...
encoding top ...
encoding on_1b ...
encoding on_2b ...
encoding on_3b ...
encoding b_count ...
encoding s_count ...
encoding p_throws_right ...
encoding b_stands_right ...
encoding inning ...
encoding outs ...
encoding pitch_num ...


In [49]:

model.fit(train_ds, epochs=10, validation_data=val_ds)


Epoch 1/10


2023-08-19 15:41:18.290617: W tensorflow/core/framework/op_kernel.cc:1805] OP_REQUIRES failed at cast_op.cc:121 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node 'binary_crossentropy/Cast' defined at (most recent call last):
    File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 736, in start
      self.io_loop.start()
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.9/asyncio/base_events.py", line 596, in run_forever
      self._run_once()
    File "/usr/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once
      handle._run()
    File "/usr/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 516, in dispatch_queue
      await self.process_one()
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 505, in process_one
      await dispatch(*args)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 412, in dispatch_shell
      await result
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 740, in execute_request
      reply_content = await reply_content
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 546, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_3163/581066605.py", line 1, in <module>
      model.fit(train_ds, epochs=10, validation_data=val_ds)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/engine/training.py", line 1742, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/engine/training.py", line 1338, in train_function
      return step_function(self, iterator)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/engine/training.py", line 1322, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/engine/training.py", line 1303, in run_step
      outputs = model.train_step(data)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/engine/training.py", line 1081, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
      return self.compiled_loss(
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/losses.py", line 142, in __call__
      losses = call_fn(y_true, y_pred)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/losses.py", line 268, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/stevenlinde/src/etc/aiml-inference-playground/.venv/lib/python3.9/site-packages/keras/src/losses.py", line 2421, in binary_crossentropy
      y_true = tf.cast(y_true, y_pred.dtype)
Node: 'binary_crossentropy/Cast'
Cast string to float is not supported
	 [[{{node binary_crossentropy/Cast}}]] [Op:__inference_train_function_127327]