In [None]:
# !pip install -q numpy pandas tensorflow

## references
# https://keras.io/examples/structured_data/structured_data_classification_from_scratch/
# https://www.kaggle.com/datasets/pschale/mlb-pitch-data-20152018/code
# https://www.kaggle.com/code/ryancmcv/mlb-pitch-data
# https://stackoverflow.com/questions/64689483/how-to-do-multiclass-classification-with-keras

# to_ordinal https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_ordinal
# to_categorical https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical

# TODO filter out feature values that have fewer than n instances (i.e. pitchers who only pitched a game or so); n=5 maybe?

import os

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras import layers
from keras.layers import Normalization
from keras.layers import IntegerLookup
from keras.layers import StringLookup



# Bare expression as the cell's last line, so the notebook displays the
# TensorFlow version that was imported above.
tf.__version__

In [None]:
# TODO there is a numpy util to get files from within tgz's and zips
##!curl -s 'https://storage.googleapis.com/0x19f.com/media/kaggle-mlb-pitch-data-2015-2018.tgz' | tar xz

# Common filename prefix for the CSVs: the dataset directory under the current
# working directory, plus "2019_" so only the 2019 files are read (to speed
# things up).
BASEBALL_FILES_BASE = os.getcwd() + "/kaggle-mlb-pitch-data-2015-2018/" + "2019_"

pitches = pd.read_csv(BASEBALL_FILES_BASE + 'pitches.csv')
atbats = pd.read_csv(BASEBALL_FILES_BASE + 'atbats.csv')

# One row per pitch, enriched with the columns of the at-bat it belongs to.
df = pitches.merge(atbats, how='inner', on='ab_id')

# Columns kept from the merged frame, grouped by role.

# situation stuff
situation_columns = ["ab_id", "inning", "top", "outs", "on_1b", "on_2b", "on_3b"]

# matchup
matchup_columns = ["pitcher_id", "p_throws", "batter_id", "stand"]

# specific pitch state
# TODO need to add running total of pitches for the current pitcher, and maybe strike percentage
# TODO add the previous n pitches before and their code outcome
pitch_state_columns = ["pitch_num", "b_count", "s_count"]

# this is the label we're trying to predict
# TODO also make the label have the location. not just FF (fourseam fastball, but also high inside)
label_columns = ["code", "pitch_type"]

# remove anything without values, hard to have defaults
# df = df.dropna(subset=['pitch_type', 'code'])
df = df[
    situation_columns + matchup_columns + pitch_state_columns + label_columns
].dropna()

## this is our current target label TODO make this categorical
# Pitch types that this notebook counts as a fastball.
FASTBALL_PITCH_TYPES = ["FC", "FF", "FT"]

# 1 where the pitch type is one of the fastball codes, 0 otherwise.
fastball_mask = df["pitch_type"].isin(FASTBALL_PITCH_TYPES)
df["is_fastball"] = np.where(fastball_mask, 1, 0)

# get appropriate types.
# TODO prob can use a schema on the CSV read to get only the columns we're after, alias them to be more
#      useful, and cast to the right type
# TODO consider to_ordinal for cumulative things maybe? inning, outs, pitch_num, b_count, s_count


# TODO revisit the sizes here if it helps
LARGE_INT = "int64"
SMALL_INT = LARGE_INT

# Cast every column that is later fed to an integer Keras input, so the frame's
# dtypes match the dtypes declared for the model inputs. pitcher_id and
# batter_id are listed among the integer features in feature_types, so they are
# cast here too instead of relying on whatever dtype read_csv inferred.
# p_throws / stand become pandas strings for the string inputs.
df = df.astype({
    'ab_id': LARGE_INT,
    'inning': SMALL_INT,
    'top': SMALL_INT,
    'outs': SMALL_INT,
    'on_1b': SMALL_INT,
    'on_2b': SMALL_INT,
    'on_3b': SMALL_INT,
    'pitcher_id': LARGE_INT,
    'batter_id': LARGE_INT,
    'pitch_num': LARGE_INT,
    'b_count': SMALL_INT,
    's_count': SMALL_INT,
    'p_throws': 'string',
    'stand': 'string',
  })


# ab_id was only needed as the join key and is not a useful feature; "code" and
# "pitch_type" are dropped with it so that only the model features and the
# is_fastball label remain.
# TODO remove this truncation when solid
df = (
    df
    .drop(columns=['ab_id', "code", "pitch_type"])
    .truncate(after=1000)
)

df.info()

In [None]:

# Hold out a seeded 20% sample of the rows for validation; every row that was
# not sampled stays in the training frame.
val_dataframe = df.sample(frac=0.2, random_state=1337)
train_dataframe = df.drop(index=val_dataframe.index)

split_sizes = (len(train_dataframe), len(val_dataframe))
print("Using %d samples for training and %d for validation" % split_sizes)


In [None]:

def dataframe_to_dataset(dataframe):
    """Turn a DataFrame into a shuffled tf.data.Dataset of (features, label).

    The "is_fastball" column is split off as the label; the remaining columns
    are passed as a dict keyed by column name. The input frame is not modified
    (the work is done on a copy). The shuffle buffer spans the whole frame.
    The returned dataset is not batched.
    """
    features = dataframe.copy()
    targets = features.pop("is_fastball")
    row_count = len(features)

    return tf.data.Dataset.from_tensor_slices(
        (dict(features), targets)
    ).shuffle(buffer_size=row_count)


# Build one dataset per split.
train_ds, val_ds = (
    dataframe_to_dataset(frame) for frame in (train_dataframe, val_dataframe)
)

# Peek at a single training element to check the feature dict and its label.
for sample_features, sample_target in train_ds.take(1):
    print("Input:", sample_features)
    print("Target:", sample_target)

    

In [None]:

def encode_feature_with_normalizer(feature, name, dataset, normalizer):
    """Adapt `normalizer` to one feature of `dataset` and apply it to `feature`.

    Args:
        feature: the input the adapted layer is applied to.
        name: key of the feature inside each element's feature dict.
        dataset: dataset yielding (features_dict, label) pairs.
        normalizer: layer exposing `adapt()` and callable on `feature`.

    Returns:
        The result of calling the adapted `normalizer` on `feature`.
    """
    # Keep only the requested feature and give each value a trailing axis.
    single_feature_ds = dataset.map(lambda x, y: tf.expand_dims(x[name], -1))

    normalizer.adapt(single_feature_ds)
    return normalizer(feature)


# Maps a Keras input dtype to the features of that dtype and to a factory that
# builds a fresh preprocessing layer for each feature.
#
# output_mode="multi_hot" is the canonical name of the encoding that used to be
# spelled "binary"; the old spelling is only a deprecated alias, so the
# canonical one is used here.
feature_types = {
    # TODO these become int32, we should maybe treat them that way the whole way through
    # TODO better ... use the pd type here too 
    SMALL_INT : {
        "features" : [
            "inning", "top", "outs", "on_1b", "on_2b", "on_3b", "b_count", "s_count", 
            "pitcher_id", "batter_id", "pitch_num", 
        ],
        "normalizer_provider" : lambda : IntegerLookup(output_mode="multi_hot")
    },
    "string" : {
        "features" : ["p_throws", "stand",],
        "normalizer_provider" : lambda : StringLookup(output_mode="multi_hot")
    },
}

# Parallel lists: the Keras input for each feature and its encoded output.
feature_inputs_combined = []
feature_layers_combined = []

for feature_type, feature_info in feature_types.items():
    make_normalizer = feature_info["normalizer_provider"]

    for feature_name in feature_info["features"]:
        # One input per feature, named after the column so that the dataset's
        # dict keys line up with the model inputs.
        feature_input = keras.Input(shape=(1,), name=feature_name, dtype=feature_type)

        # Each feature gets its own freshly built layer, adapted on the
        # training data only.
        feature_encoded = encode_feature_with_normalizer(
            feature_input, feature_name, train_ds, make_normalizer()
        )

        feature_inputs_combined.append(feature_input)
        feature_layers_combined.append(feature_encoded)
        print("encoded: %s (%s)" % (feature_name, feature_type))




In [None]:

# TODO eval https://stackoverflow.com/questions/69933345/expected-min-ndim-2-found-ndim-1-full-shape-received-none

# Concatenate every encoded feature, then a small dense head: one hidden layer
# with dropout, and a single sigmoid unit for the binary is_fastball target.
all_features = layers.concatenate(feature_layers_combined)
hidden = layers.Dense(32, activation="relu")(all_features)
hidden = layers.Dropout(0.5)(hidden)
output = layers.Dense(1, activation="sigmoid")(hidden)

model = keras.Model(inputs=feature_inputs_combined, outputs=output)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)




In [None]:

# train_ds / val_ds yield one unbatched example at a time (a dict of scalar
# tensors), but Keras expects every input to carry a leading batch axis.
# Without batching, the concatenated features reach the Dense layer as a rank-1
# tensor and it fails with "expected min_ndim=2, found ndim=1". Batching here
# leaves the datasets used to adapt the preprocessing layers untouched.
BATCH_SIZE = 32

model.fit(
    train_ds.batch(BATCH_SIZE),
    epochs=50,
    validation_data=val_ds.batch(BATCH_SIZE),
)



In [None]:
# Print the layer-by-layer summary of the model.
model.summary()