In [None]:

# good references
# https://www.tensorflow.org/tutorials/load_data/pandas_dataframe
# https://www.tensorflow.org/tutorials/load_data/csv

import os

from collections import defaultdict

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import Input
from keras.utils import to_categorical

tf.__version__

In [None]:
# Load the 2019 slice of the Kaggle MLB pitch data, join every pitch to its
# at-bat row, and keep only the columns used as model features or as the label.
# (The data is read and merged exactly once; `pitches`, `atbats` and `df` are
# the names the rest of the notebook relies on.)

# TODO there is a numpy util to get files from within tgz's and zips
##!curl -s 'https://storage.googleapis.com/0x19f.com/media/kaggle-mlb-pitch-data-2015-2018.tgz' | tar xz

# File-name prefix: <cwd>/kaggle-mlb-pitch-data-2015-2018/2019_
# just look at 2019 to speed things up
BASEBALL_FILES_BASE = os.path.join(os.getcwd(), "kaggle-mlb-pitch-data-2015-2018", "2019_")

pitches = pd.read_csv(BASEBALL_FILES_BASE + 'pitches.csv')
atbats = pd.read_csv(BASEBALL_FILES_BASE + 'atbats.csv')

# inner join on the at-bat id: only pitches with a matching at-bat row survive
df = pd.merge(pitches, atbats, how='inner', on='ab_id')

# TODO prob can use a schema on the CSV read to get only the columns we're after, alias them to be more
#      useful, and cast to the right type
# TODO consider to_ordinal for cumulative things maybe? inning, outs, pitch_num, b_count, s_count
df = df[[
    # situation stuff
    "ab_id", "inning", # "top",

    # would get baserunner stuff but these were all zero.
    #"on_1b", "on_2b", "on_3b",

    # matchup fields
    "pitcher_id", "p_throws", "stand", "batter_id",

    # TODO add catcher, team
    # TODO need to add running total of pitches for the current pitcher, and maybe strike percentage
    # TODO add the previous n pitches before and their code outcome

    # specific pitch state, this becomes situation and is then dropped
    "b_count", "s_count", "outs", "pitch_num",

    # this is the label we're trying to predict
    # TODO also make the label have the location, not just FF (fourseam fastball, but also high inside)
    "pitch_type",
]]

# Clean the merged frame, derive the categorical features, and split the result
# into model features (`pitch_features`) and one-hot labels (`pitch_labels`).

# remove anything without values, hard to have defaults
df = df.dropna()

# set the appropriate types.
TYPE_DICT = {
    # string features
    'p_throws': 'string',
    'stand': 'string',

    # small card numeric features and player ids: treat as strings to avoid norming with mean
    # 'top': 'string',
    'inning': 'string',
    'outs': 'string',
    'pitcher_id': 'string',
    'batter_id': 'string',

    # situation stuff as int8s
    'b_count': 'int8',
    's_count': 'int8',
    'pitch_num': 'int8',

    # ab_id as a long, helps make it readable
    'ab_id': 'int64',
}
df = df.astype(TYPE_DICT)


# add the pitch count situation: pair balls and strikes from the int8 columns
# (so there are no decimals) and convert the pair to a string so it can be
# one-hot encoded like the other categorical features.
df["situation"] = df[["b_count", "s_count"]].to_numpy().tolist()
df['situation'] = df['situation'].astype("string")

# add the previous pitch type within the same at-bat ('-' for the first pitch).
# groupby's `sort` argument is a boolean that only orders the group keys, so the
# rows are ordered by pitch_num explicitly before shifting; the assignment then
# aligns the shifted values back onto df by index, leaving df's row order alone.
df['lag_pitch'] = (
    df.sort_values(['ab_id', 'pitch_num'])
      .groupby('ab_id')['pitch_type']
      .shift(1, fill_value='-')
)
df['lag_pitch'] = df['lag_pitch'].astype("string")
# TODO consider concatenating all of the previous pitches

# TODO figure out "enough" data; for now subset to iterate faster
# (truncate is label based: it keeps rows whose index label is <= 300000)
truncated_df = df.truncate(after=300*1000)

# finally, remove ab_id and the columns already folded into "situation"
# because we dont need them at this point (pitch_type stays: it is the label)
truncated_df = truncated_df.drop(columns=['ab_id', 'b_count', 's_count', 'pitch_num'])


# ok, ready to split into features and labels. pitch_features is a new frame
# built from truncated_df without the label column.
pitch_features = truncated_df.drop(columns=["pitch_type"])

# the labels we process using a lookup to normalize to one_hot encoding.
# np.unique already returns the sorted distinct values.
all_pitch_types = np.unique(truncated_df['pitch_type'])
pitch_type_lookup = layers.StringLookup(vocabulary=all_pitch_types, output_mode='one_hot')
pitch_labels = pitch_type_lookup(truncated_df.pitch_type)

# info() writes its report itself and returns None, so it is not wrapped in print()
pitch_features.info()
print(pitch_features.shape)

print(all_pitch_types)
pitch_features.head()

In [None]:
## TODO refactor this into a library, it is also in titanic.ipynb

# attempts to (eventually) preprocess a dataframe of features
# returns all of the collected inputs and preprocessed features 
def process_dataframe_inputs(feature_df):
  """Build one Keras input per column plus a concatenated, preprocessed tensor.

  Every column of `feature_df` gets a `tf.keras.Input` of shape (1,) named
  after the column. Columns whose dtype maps to `tf.string` (object columns
  are treated as strings) go through a `StringLookup` built from the column's
  distinct values followed by a `CategoryEncoding` layer. Every other column
  goes through a `Normalization` layer adapted on the column's values.

  Args:
    feature_df: pandas DataFrame of features; its values are used to build
      the lookup vocabularies and to adapt the normalization layers.

  Returns:
    A tuple `(input_layers, preprocessed_inputs)`: the list of Keras inputs
    (one per column, in column order) and the output of the preprocessing
    model applied to those inputs (all encoded columns concatenated).

  Raises:
    ValueError: if `feature_df` has no columns.
  """
  if len(feature_df.columns) == 0:
    raise ValueError("feature_df has no columns to preprocess")

  # walk through each feature in the dataframe, make an input for it,
  # preprocess it using the feature_df data, then collect it into lists of
  # inputs and preprocessed inputs, respectively
  input_layers, preprocessed_inputs = [], []
  for name, col in feature_df.items():

    # TODO do this with log()
    print("processing %s ..." % name)

    # convert to tensorflow types. this is straightforward except
    # for objects, treat them as strings.
    dtype_str = str(col.dtype) if col.dtype != object else "string"
    cur_dtype = tf.as_dtype(dtype_str)
    cur_input = tf.keras.Input(shape=(1,), name=name, dtype=cur_dtype)
    input_layers.append(cur_input)

    if cur_dtype == tf.string:
      # categorical column: map each string to an index, then encode the index
      lookup = layers.StringLookup(vocabulary=np.unique(col))
      encode = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())
      preprocessed_inputs.append(encode(lookup(cur_input)))

    else:
      # numeric column: normalize with statistics learned from the column.
      # adapt() is documented for NumPy arrays, tensors and datasets, so the
      # pandas Series is converted to an array first.
      norm = layers.Normalization(axis=None)
      norm.adapt(np.asarray(col))
      preprocessed_inputs.append(norm(cur_input))

  # prepare the return values by concatenating the preprocessed inputs
  # and creating a model for which they serve as outputs.
  preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
  preprocessing_model = tf.keras.Model(input_layers, preprocessed_inputs_cat)
  preprocessed_inputs = preprocessing_model(input_layers)

  return input_layers, preprocessed_inputs


In [None]:
# Assemble the classifier: per-column preprocessing feeds a small dense
# network whose softmax output has one unit per slot of the one-hot label.
input_layers, preprocessed_inputs = process_dataframe_inputs(pitch_features)

# output width matches the label encoding produced by pitch_type_lookup
n_label_slots = len(pitch_type_lookup.get_vocabulary())

hidden_layers = [
  layers.Dropout(0.1),
  layers.Dense(64, activation="relu"),
  layers.Dense(32, activation="relu"),
  layers.Dense(n_label_slots, activation="softmax"),
]
seqential_hidden_model = tf.keras.Sequential(hidden_layers)

# chain preprocessing -> dense stack and expose it as one end-to-end model
result = seqential_hidden_model(preprocessed_inputs)
pitch_model = tf.keras.Model(input_layers, result)
pitch_model.compile(
  optimizer="adam",
  loss="categorical_crossentropy",
  metrics=["accuracy"],
)


In [None]:

# The model has one named input per feature column, so hand it a dict that
# maps each column name to that column's values as a NumPy array.
pitch_features_dict = dict(
  (name, np.array(value)) for name, value in pitch_features.items()
)

# Train, holding out 33% of the samples for validation.
hist = pitch_model.fit(
  x=pitch_features_dict,
  y=pitch_labels,
  batch_size=10*1000,
  epochs=50,
  validation_split=0.33,
)


In [None]:

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# a single color iterator shared by both axes so every curve gets its own color
colors_iter = iter(mcolors.BASE_COLORS.values())


def _plot_history_curves(axis, ylabel, metric_names, legend_loc):
    """Label `axis`, draw one curve per metric from `hist`, then add a legend."""
    axis.set_ylabel(ylabel)
    for metric in metric_names:
        axis.plot(hist.epoch, hist.history[metric], color=next(colors_iter), label=metric)
    axis.legend(loc=legend_loc)


# left axis: training and validation loss per epoch
fig, axisLeft = plt.subplots()
axisLeft.set_xlabel("epoch")
_plot_history_curves(axisLeft, "loss", ("loss", "val_loss"), 'upper left')

# right axis (shares the x axis): training and validation accuracy per epoch
axisRight = axisLeft.twinx()
_plot_history_curves(axisRight, "accuracy", ("accuracy", "val_accuracy"), 'lower left')

plt.show()


In [None]:

# Spot-check the trained model: predict the last 20 rows of df and print, for
# each one, the row itself, its true pitch type and the five most probable
# predicted pitch types.
check_sample_in = df.tail(20).copy()
check_sample_in = check_sample_in.reset_index(drop=True)
## check_sample_in.info()

check_sample_label = check_sample_in.pop("pitch_type")

# df still carries columns that were dropped before the model was built
# (ab_id, b_count, s_count, pitch_num); feed the model only the columns it
# has inputs for. check_sample_in keeps every column for the printout below.
check_sample_dict = {
  name: np.array(check_sample_in[name]) for name in pitch_features.columns
}

probs = pitch_model.predict(check_sample_dict)

# The labels were encoded by a StringLookup in one_hot mode, which reserves
# index 0 for out-of-vocabulary values and places the vocabulary after it, so
# the "[other]" slot comes first, not last.
predicted_pitch_types = ["[other]"] + all_pitch_types.tolist()
assert len(predicted_pitch_types) == probs.shape[1], "label names do not match model output width"

for i, row in check_sample_in.iterrows():
  print("%s\n%s" % ("-"*10, row))
  print("label: %s" % check_sample_label[i])
  # highest probability first
  pitch_probs = sorted(zip(probs[i], predicted_pitch_types), reverse=True)
  for prob, pitch_type in pitch_probs[:5]:
    print("%s: %0.10f" % (pitch_type, prob))
  