In [None]:
import pickle
import random

import numpy as np
import sklearn.preprocessing

import stengel.model.pitch_data
import stengel.model.pitch_outcome

In [None]:
def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE(review): pickle.load can execute code embedded in the file, so this
    # should only ever be pointed at data files the project itself produced.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# The 2009 pitch records are stored as a plain dict and rebuilt into PitchData.
pitch_data = stengel.model.pitch_data.PitchData.from_dict(
    _read_pickle("../data/python/pitch_data_2009.p"))

# Per-pitcher lookups, later indexed by the names in pitch_data.pitchers.
pitch_density = _read_pickle("../data/python/density_renders.p")
compressed_density = _read_pickle("../data/python/compressed_renders.p")
pitcher_quantiles = _read_pickle("../data/python/pitcher_quantiles.p")

In [None]:
# Prepare the main pitch data: filter nulls in place, then keep only pitchers
# with more than 3000 recorded pitches.
pitch_data.filter_nulls(in_place=True)
pitch_counts = pitch_data.pitches_per_pitcher()
# The position of each count in pitch_counts is what gets passed on as the
# pitcher id to filter by.
high_count_pitchers = [
    pitcher_id
    for pitcher_id, num_pitches in enumerate(pitch_counts)
    if num_pitches > 3000
]
pitch_data.filter_by_pitcher_id(high_count_pitchers, in_place=True)

In [None]:
# Build the three kinds of density arrays we'll be evaluating. Each has one
# flattened row per entry of pitch_data.pitchers, in that order.
def _flatten_per_pitcher(renders):
    """Look up every pitcher in `renders` and stack the flattened results."""
    rows = [renders[pitcher].reshape([-1]) for pitcher in pitch_data.pitchers]
    return np.array(rows)


def _centered(values):
    """Subtract the mean taken over every element (a single scalar)."""
    return values - np.mean(values)


def _standardized(values):
    """Centre and scale each column (axis 0) by its own mean and std."""
    # NOTE(review): a column with zero spread divides by zero here.
    return (values - np.mean(values, axis=0)) / np.std(values, axis=0)


density_array = _centered(_flatten_per_pitcher(pitch_density))

density_compressed_array = _centered(_flatten_per_pitcher(compressed_density))

quantile_array = _standardized(_flatten_per_pitcher(pitcher_quantiles))

In [None]:
def split_data(data, train_size=0.6, validation_size=0.2, random_seed=1729):
    """Split a pitch data set into training, validation, and test sets.

    Rows are assigned to the three sets through a pseudo-random permutation
    seeded with ``random_seed``. The same seed therefore always produces the
    same split, and ``data`` itself is not reordered.

    Args:
        data: Pitch data object exposing ``num_observations`` and
            ``filter_rows(indices, reassign_ids=...)``; the objects returned
            by ``filter_rows`` must carry a ``pitch_data`` attribute.
        train_size: Fraction of the observations placed in the training set.
        validation_size: Fraction placed in the validation set. Whatever is
            left after training and validation becomes the test set.
        random_seed: Seed for the row permutation. ``None`` seeds from fresh
            entropy, giving a different split on every call.

    Returns:
        A ``(train_data, valid_data, test_data)`` tuple. The ``pitch_data`` of
        each part is standardised with a scaler fitted on the training part
        only, so no validation/test statistics leak into the scaling.

    Raises:
        ValueError: If the requested fractions give a negative row count or
            ask for more rows than ``data`` holds.
    """
    total_obs = data.num_observations
    train_obs = int(train_size * total_obs)
    valid_obs = int(validation_size * total_obs)
    # Validate on the integer counts so float rounding in the fractions
    # (e.g. 0.7 + 0.3) cannot trigger a spurious rejection.
    if train_obs < 0 or valid_obs < 0 or train_obs + valid_obs > total_obs:
        raise ValueError(
            "train_size and validation_size must be non-negative and must not "
            "request more than the {} available observations".format(total_obs)
        )

    # Draw the row order from a private generator instead of shuffling `data`
    # in place: the seed then fully determines the split, repeated calls on the
    # same object yield identical partitions, and no global RNG state changes.
    # NOTE(review): relies on filter_rows accepting an unsorted index array.
    order = np.random.RandomState(random_seed).permutation(total_obs)
    train_indices = order[:train_obs]
    valid_indices = order[train_obs:train_obs + valid_obs]
    test_indices = order[train_obs + valid_obs:]

    train_data = data.filter_rows(train_indices, reassign_ids=False)
    valid_data = data.filter_rows(valid_indices, reassign_ids=False)
    test_data = data.filter_rows(test_indices, reassign_ids=False)

    # Fit the scaling on training rows only, then apply it to all three parts.
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(train_data.pitch_data)
    train_data.pitch_data = scaler.transform(train_data.pitch_data)
    valid_data.pitch_data = scaler.transform(valid_data.pitch_data)
    test_data.pitch_data = scaler.transform(test_data.pitch_data)
    return train_data, valid_data, test_data

In [None]:
def attach_density(data, density_type):
    """Attach the requested per-pitcher density array to a pitch data set.

    Args:
        data: Object that receives a ``pitch_density`` attribute.
        density_type: ``"full"``, ``"compressed"`` or ``"quantile"`` selects
            the matching notebook-level array; any other value attaches
            nothing.

    Returns:
        ``(data, size)`` where ``size`` is a one-element list for a recognised
        type and ``None`` otherwise. The sizes are hard-coded; presumably they
        are the flattened row width of each array - TODO confirm.
    """
    if density_type == "full":
        per_pitcher, width = density_array, 1000
    elif density_type == "compressed":
        per_pitcher, width = density_compressed_array, 322
    elif density_type == "quantile":
        per_pitcher, width = quantile_array, 21
    else:
        # Anything else (including the "none" default used by the callers)
        # leaves `data` exactly as it was passed in.
        return data, None

    data.pitch_density = per_pitcher
    return data, [width]

In [None]:
def evaluate_parameters(data, hidden_nodes, density_type="none", density_hidden_nodes=None,
                        batter_embed_size=None, pitcher_embed_size=None,
                        learning_rate=0.1, batch_size=64, train_steps=300000, print_every=5000):
    """Train one PitchOutcomeModel configuration and report how the fit went.

    The density selected by ``density_type`` is attached to ``data``, the data
    is split with ``split_data`` (default arguments), and a model built from
    the remaining arguments is trained on the training part while being
    handed the validation part.

    Returns:
        ``(model.fit_log, model.fit_time)`` as left behind by training.
    """
    data, density_size = attach_density(data, density_type)
    # The test partition is produced by the split but not used here.
    train_data, valid_data, _unused_test = split_data(data)

    # Player counts are only supplied when the matching embedding size is set.
    num_batters = None
    if batter_embed_size:
        num_batters = len(train_data.batters)
    num_pitchers = None
    if pitcher_embed_size:
        num_pitchers = len(train_data.pitchers)

    model_options = dict(
        batch_size=batch_size,
        learning_rate=learning_rate,
        hidden_nodes=hidden_nodes,
        num_batters=num_batters,
        batter_embed_size=batter_embed_size,
        num_pitchers=num_pitchers,
        pitcher_embed_size=pitcher_embed_size,
        density_size=density_size,
        density_hidden_nodes=density_hidden_nodes,
    )
    model = stengel.model.pitch_outcome.PitchOutcomeModel(**model_options)
    model.train(train_data, valid_data, train_steps, print_every)
    return model.fit_log, model.fit_time

In [None]:
def print_summary(eval_results):
    """Print a formatted table summarising a set of model runs.

    Args:
        eval_results: Mapping from a run name to the ``(fit_log, fit_time)``
            pair of that run, where ``fit_log["validation_score"]`` is a
            non-empty sequence of scores.

    The table has one row per run showing its fit time, its lowest validation
    score and its final validation score. Rows are ordered by final score,
    lowest first; runs with equal final scores are ordered by name.
    """
    rows = []
    for name, result in eval_results.items():
        scores = result[0]["validation_score"]
        rows.append((scores[-1], name, result[1], min(scores)))

    # Sort whole rows in one pass. Sorting each column separately would break
    # ties on the final score by that column's own values and could pair a
    # name with another run's fit time or best score.
    rows.sort(key=lambda row: (row[0], row[1]))

    print("Model Name                       Fit Time  Best Perp.  Last Perp.")
    print("=================================================================")
    for last_score, name, fit_time, min_score in rows:
        print("{:<30}   {:>8.1f}    {:0.3f}    {:0.3f}".format(name, fit_time, min_score, last_score))

In [None]:
def evaluate_basic_structure(data, num_hidden):
    """Evaluate a hidden-layer layout with every other setting left at default.

    ``num_hidden`` is forwarded as the ``hidden_nodes`` argument of
    ``evaluate_parameters`` and training runs for 100000 steps.
    """
    return evaluate_parameters(data, hidden_nodes=num_hidden, train_steps=100000)

# Train one model per hidden-layer layout, in the order listed, then tabulate.
basic_structure_runs = {
    label: evaluate_basic_structure(pitch_data, layers)
    for label, layers in [
        ("No Hidden Layer", []),
        ("48-node Hidden Layer", [48]),
        ("96-node Hidden Layer", [96]),
        ("192-node Hidden Layer", [192]),
        ("192x96-node Hidden Layers", [192, 96]),
    ]
}

print_summary(basic_structure_runs)

In [None]:
def evaluate_encodings(data, batter_embed_size, pitcher_embed_size):
    """Evaluate a pair of embedding sizes on a single 192-node hidden layer.

    Training runs for 150000 steps with ``print_every`` set to 10000; the
    result of ``evaluate_parameters`` is returned unchanged.
    """
    settings = dict(
        batter_embed_size=batter_embed_size,
        pitcher_embed_size=pitcher_embed_size,
        train_steps=150000,
        print_every=10000,
    )
    return evaluate_parameters(data, [192], **settings)

# Each entry is (batter embedding size, pitcher embedding size); None means no
# size is supplied for that side.
encoding_runs = {
    label: evaluate_encodings(pitch_data, batter_size, pitcher_size)
    for label, (batter_size, pitcher_size) in [
        ("Batter Encoding", (16, None)),
        ("Pitcher Encoding", (None, 16)),
        ("Batter and Pitcher Encoding", (16, 16)),
        ("Batter and Pitcher Large Encoding", (32, 32)),
    ]
}

print_summary(encoding_runs)

In [None]:
def evaluate_density(data, density_type):
    """Evaluate one density representation with 32-wide player embeddings.

    Uses a single 192-node hidden layer, 250000 training steps and
    ``print_every`` of 10000; returns what ``evaluate_parameters`` returns.
    """
    settings = dict(
        density_type=density_type,
        batter_embed_size=32,
        pitcher_embed_size=32,
        train_steps=250000,
        print_every=10000,
    )
    return evaluate_parameters(data, [192], **settings)

# One run per density representation understood by attach_density.
density_runs = {
    label: evaluate_density(pitch_data, kind)
    for label, kind in [
        ("Full Density", "full"),
        ("Compressed Density", "compressed"),
        ("Quantile Density", "quantile"),
    ]
}

print_summary(density_runs)

In [None]:
def evaluate_hidden_density(data, hidden_density):
    """Evaluate hidden-layer sizes for the compressed density input.

    ``hidden_density`` is forwarded as ``density_hidden_nodes``. The rest of
    the configuration is fixed: a 192-node hidden layer, batter/pitcher
    embedding sizes of 16 and 24, 250000 training steps and ``print_every``
    of 10000.
    """
    settings = dict(
        density_type="compressed",
        density_hidden_nodes=hidden_density,
        batter_embed_size=16,
        pitcher_embed_size=24,
        train_steps=250000,
        print_every=10000,
    )
    return evaluate_parameters(data, [192], **settings)

# Compare two widths for the hidden layer applied to the compressed density.
hidden_density_runs = {
    label: evaluate_hidden_density(pitch_data, layers)
    for label, layers in [
        ("Small Hidden", [64]),
        ("Big Hidden", [128]),
    ]
}

print_summary(hidden_density_runs)