In [None]:

# good references
# https://www.tensorflow.org/tutorials/load_data/pandas_dataframe
# https://www.tensorflow.org/tutorials/load_data/csv

import os

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras import layers
from keras import Input

# Show the installed TensorFlow version in the cell output.
tf.__version__

In [None]:
# Load the pitch-by-pitch and at-bat tables, join them on `ab_id`, and derive
# the model inputs (`pitch_features`) and the binary target (`pitch_labels`).
#
# NOTE: this cell used to run the whole load/merge/select pipeline twice. The
# first pass (which additionally selected on_1b/on_2b/on_3b and "code") was
# completely overwritten by the second, so only the effective pass is kept.

# TODO there is a numpy util to get files from within tgz's and zips
##!curl -s 'https://storage.googleapis.com/0x19f.com/media/kaggle-mlb-pitch-data-2015-2018.tgz' | tar xz

BASEBALL_FILES_BASE = os.getcwd() + "/kaggle-mlb-pitch-data-2015-2018/"
BASEBALL_FILES_BASE = BASEBALL_FILES_BASE + "2019_"  # just look at 2019 to speed things up

# Columns kept from the merged pitches/at-bats frame.
SELECTED_COLUMNS = [
    # situation stuff
    "ab_id", "inning", "top", "outs",

    # would get baserunner stuff but these were all zero.
    # "on_1b", "on_2b", "on_3b",

    # matchup
    "pitcher_id", "p_throws", "batter_id", "stand",

    # TODO add catcher, team
    # specific pitch state
    "pitch_num", "b_count", "s_count",
    # TODO p0: multi hot encode the count
    # TODO need to add running total of pitches for the current pitcher, and maybe strike percentage
    # TODO add the previous n pitches before and their code outcome

    # this is the label we're trying to predict
    # "code",
    "pitch_type",
    # TODO also make the label have the location. not just FF (fourseam fastball, but also high inside)
]

# Pitch-type codes that count as a fastball for the binary target.
# TODO make this categorical
FASTBALL_PITCH_TYPES = ["FC", "FF", "FT"]

# TODO prob can use a schema on the CSV read to get only the columns we're after, alias them to be more
#      useful, and cast to the right type
# TODO consider to_ordinal for cumulative things maybe? inning, outs, pitch_num, b_count, s_count
FEATURE_DTYPES = {
    # string features
    'p_throws': 'string',
    'stand': 'string',

    # small card numeric features
    'inning': 'int8',
    'top': 'int8',
    'outs': 'int8',

    'pitch_num': 'int8',
    'b_count': 'int8',
    's_count': 'int8',

    # player identifiers, treat them as strings so we dont scale them
    'pitcher_id': 'string',
    'batter_id': 'string',
}

# Highest index label kept while iterating on the model. `truncate` is
# label-based, so after `dropna` this keeps at most MAX_INDEX_LABEL + 1 rows.
# TODO remove this truncation when solid
MAX_INDEX_LABEL = 100 * 1000

pitches = pd.read_csv(BASEBALL_FILES_BASE + 'pitches.csv')
atbats = pd.read_csv(BASEBALL_FILES_BASE + 'atbats.csv')

df = (
    pd.merge(pitches, atbats, how='inner', on='ab_id')[SELECTED_COLUMNS]
    # remove anything without values, hard to have defaults
    .dropna()
    # this is our current target label
    .assign(
        is_fastball=lambda frame: np.where(
            frame["pitch_type"].isin(FASTBALL_PITCH_TYPES), 1, 0
        )
    )
    .astype(FEATURE_DTYPES)
    # drop ab_id now that the join has occurred, not a useful feature;
    # also pitch_type since we're predicting is_fastball
    .drop(['ab_id', 'pitch_type'], axis=1)
    .truncate(after=MAX_INDEX_LABEL)
)

pitch_features = df.copy()
pitch_labels = pitch_features.pop('is_fastball')

# `info()` writes its report itself and returns None, so it is not wrapped in
# print() (that only appended a stray "None" line to the output).
pitch_features.info()
print(pitch_features.shape)

pitch_features.head()


In [None]:


def _encode_feature(name, col):
    """Build the Keras input and the encoded tensor for one feature column.

    Args:
        name: Column name; used as the name of the Keras input.
        col: pandas Series holding the column's values.

    Returns:
        A ``(col_input, encoded)`` pair: the symbolic ``Input`` for the column
        and the tensor produced by its preprocessing layers (one-hot for
        "string" columns, normalized values for everything else).
    """
    col_type_str = str(col.dtype)
    col_input = tf.keras.Input(
        shape=(1,), name=name, dtype=tf.as_dtype(col_type_str)
    )

    if col_type_str == "string":
        # Categorical feature: map each distinct string to an index, then one-hot it.
        feature_lookup = layers.StringLookup(vocabulary=np.unique(col))
        one_hot = layers.CategoryEncoding(
            num_tokens=feature_lookup.vocabulary_size(), output_mode="one_hot"
        )
        encoded = one_hot(feature_lookup(col_input))
    else:  # TODO should prob do a better job of figuring out numerics
        norm = layers.Normalization()
        # Adapt on an (n, 1) float array so the statistics are computed over the
        # sample axis and line up with the (1,)-shaped Input above. A bare 1-D
        # Series leaves no separate feature axis for the default axis=-1.
        norm.adapt(np.asarray(col, dtype="float32").reshape(-1, 1))
        encoded = norm(col_input)

    return col_input, encoded


# feature_inputs: column name -> Keras Input
# processed_inputs: encoded tensors, in column order, ready to be concatenated
feature_inputs, processed_inputs = {}, []

for name, col in pitch_features.items():
    print("processing: %s ..." % name)
    feature_inputs[name], encoded_feature = _encode_feature(name, col)
    processed_inputs.append(encoded_feature)

feature_inputs, len(processed_inputs)


In [None]:
# Join every encoded feature tensor into one vector, then wrap the whole
# preprocessing graph in a model that maps the raw feature inputs to it.
concat_layer = layers.Concatenate()
all_processed_inputs = concat_layer(processed_inputs)
pitch_preprocessing = tf.keras.Model(
    inputs=feature_inputs,
    outputs=all_processed_inputs,
)

# Optional: render the preprocessing graph.
# tf.keras.utils.plot_model(model = pitch_preprocessing , rankdir="LR", dpi=72, show_shapes=True)

In [None]:

# Training hyperparameters.
EPOCHS = 100
BATCH_SIZE = 30 * 1000

# Dense classifier applied to the concatenated, preprocessed feature vector.
seq_model = tf.keras.Sequential([
    layers.Dense(256, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    # Sigmoid so the output is a probability: the "binary_crossentropy" string
    # loss defaults to from_logits=False and the "accuracy" metric thresholds
    # predictions at 0.5, so a raw-logit output would be scored incorrectly.
    layers.Dense(1, activation="sigmoid"),
  ])

# End-to-end model: raw feature inputs -> preprocessing -> classifier.
preprossesed_inputs = pitch_preprocessing(feature_inputs)
result = seq_model(preprossesed_inputs)
pitch_model = tf.keras.Model(feature_inputs, result)

# NOTE(review): run_eagerly=True is kept from the original; it disables graph
# compilation and is normally only needed while debugging - confirm it is
# still required before long training runs.
pitch_model.compile("adam", "binary_crossentropy", run_eagerly=True, metrics=["accuracy"])

# Keras takes the multi-input features as a dict of arrays keyed by input name.
pitch_features_dict = {
  name: np.array(value) for name, value in pitch_features.items()
}
pitch_model.fit(x=pitch_features_dict, y=pitch_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)

