In [None]:
import math
import pickle
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas.tools.plotting as pdplot
import sklearn.mixture
import sklearn.model_selection
import sklearn.neighbors.kde
import sklearn.preprocessing

import stengel.model.pitch_data
import stengel.model.pitch_density

In [None]:
# Load the pickled 2009 pitch dictionary and wrap it in a PitchData object.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at a
# trusted file.
with open("../data/python/pitch_data_2009.p", "rb") as f:
    raw_pitch_dict = pickle.load(f)
pitch_data = stengel.model.pitch_data.PitchData.from_dict(raw_pitch_dict)

# filter_nulls(in_place=True) mutates pitch_data (presumably dropping records
# with null values -- confirm in stengel.model.pitch_data).
pitch_data.filter_nulls(in_place=True)

# Keep only the pitchers with more than 3000 pitches. The position of each
# count in pitches_per_pitcher() is used as the pitcher id.
high_volume_pitchers = []
for i, pitch_count in enumerate(pitch_data.pitches_per_pitcher()):
    if pitch_count > 3000:
        high_volume_pitchers.append(i)
high_pitch_data = pitch_data.filter_by_pitcher_id(high_volume_pitchers)

In [None]:
# Show the pitch count of every pitcher retained in high_pitch_data.
# print() with a single argument behaves identically on Python 2 and 3, and
# matches the print(...) calls used elsewhere in this notebook.
print(high_pitch_data.pitches_per_pitcher())

In [None]:
def pitcher_density_data(pitcher_data):
    """Return the density-model columns of a pitcher's pitch matrix.

    Looks up the positions of "velocity_y", "accel_x" and "accel_z" in
    ``PitchData.variable_names`` and slices those columns out of
    ``pitcher_data.pitch_data``. Columns are returned in ``variable_names``
    order and every row is kept.

    Args:
        pitcher_data: object exposing a 2-D indexable ``pitch_data``
            attribute (presumably a PitchData instance -- confirm with callers).

    Returns:
        ``pitcher_data.pitch_data[:, columns]`` restricted to the three
        density variables.
    """
    wanted = ("velocity_y", "accel_x", "accel_z")
    density_columns = []
    for position, variable in enumerate(stengel.model.pitch_data.PitchData.variable_names):
        if variable in wanted:
            density_columns.append(position)
    return pitcher_data.pitch_data[:, density_columns]

# Build a 70/30 train/test split of the density variables for every pitcher,
# keyed by the entries of high_pitch_data.pitchers. The position of each entry
# is used as the pitcher id. A fixed random_state keeps the split reproducible.
density_data = {}
for id_, name in enumerate(high_pitch_data.pitchers):
    single_pitcher_data = high_pitch_data.filter_by_pitcher_id([id_])
    density_matrix = pitcher_density_data(single_pitcher_data)
    train, test = sklearn.model_selection.train_test_split(
        density_matrix,
        test_size=0.3,
        random_state=1729,
    )
    density_data[name] = dict(train=train, test=test)

In [None]:
# Fit a single StandardScaler on the pooled training rows of every pitcher, so
# that each per-pitcher density model is given the same feature scaling.
# sklearn.preprocessing is imported explicitly in the notebook's import cell;
# a bare `import sklearn.<other submodule>` is not guaranteed to load it.
train_combined = np.concatenate([v["train"] for v in density_data.values()], axis=0)
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(train_combined)

In [None]:
def score_bandwidth_value(bandwidth):
    """Return the mean held-out score of per-pitcher density models.

    For every entry of the module-level ``density_data``, a
    ``PitchDensityEstimator`` is built with ``bandwidth`` and the shared
    module-level ``scaler``, fitted on that entry's "train" split and scored
    on its "test" split.

    Args:
        bandwidth: kernel bandwidth forwarded to ``PitchDensityEstimator``.

    Returns:
        ``np.mean`` of the per-pitcher test scores.
    """
    def _held_out_score(split):
        # Fit on the training rows, evaluate on the held-out rows.
        estimator = stengel.model.pitch_density.PitchDensityEstimator(bandwidth, scaler)
        estimator.fit(split["train"])
        return estimator.score(split["test"])

    return np.mean([_held_out_score(split) for split in density_data.values()])

In [None]:
# Candidate kernel bandwidths: 0.09 through 0.20 in steps of 0.01.
bandwidths = [
    0.09, 0.10, 0.11, 0.12, 0.13, 0.14,
    0.15, 0.16, 0.17, 0.18, 0.19, 0.20,
]
# Mean held-out score for each candidate, in the same order as `bandwidths`.
bandwidth_scores = list(map(score_bandwidth_value, bandwidths))

In [None]:
# Mean test score as a function of kernel bandwidth: a connecting line with
# red circle markers drawn over it.
fig, ax = plt.subplots()
ax.plot(bandwidths, bandwidth_scores)
ax.plot(bandwidths, bandwidth_scores, "ro")
ax.set_xlabel("Kernel bandwidth")
ax.set_ylabel("Mean score on test data")
ax.set_title("KDE Performance By Bandwidth")
plt.show()

In [None]:
# Per-column minimum and maximum of the standardized pooled training data
# (presumably inspected to choose the render bounds -- confirm).
train_scaled = scaler.transform(train_combined)
column_minima = np.amin(train_scaled, axis=0)
column_maxima = np.amax(train_scaled, axis=0)
print(column_minima)
print(column_maxima)

In [None]:
# Fit one density model per pitcher (bandwidth 0.13, shared scaler) on that
# pitcher's training split and render it over a fixed grid: per-axis bounds
# given by mins/maxes, with 10 cells requested along each of the three axes.
pitcher_density_renders = {}
for name, data in density_data.items():
    model = stengel.model.pitch_density.PitchDensityEstimator(
        bandwidth=0.13,
        scaler=scaler,
    )
    model.fit(data["train"])
    grid_mins = [-2.0, -2.5, -3.0]
    grid_maxes = [3.0, 2.5, 2.0]
    grid_resolutions = [10, 10, 10]
    render = model.render(mins=grid_mins,
                          maxes=grid_maxes,
                          resolutions=grid_resolutions)
    pitcher_density_renders[name] = render

In [None]:
# Persist the per-pitcher density renders with the highest pickle protocol.
# open() replaces the Python 2-only file() builtin (removed in Python 3) and
# matches how the pitch data file is opened earlier in this notebook.
with open("../data/python/density_renders.p", "wb") as outfile:
    pickle.dump(pitcher_density_renders, outfile, pickle.HIGHEST_PROTOCOL)

In [None]:
def _plot_voxel_histogram(voxel_values, xlabel, title):
    """Show a 50-bin histogram of the flattened ``voxel_values``."""
    plt.hist(np.reshape(voxel_values, [-1]), 50)
    plt.xlabel(xlabel)
    plt.ylabel("Number of Voxels")
    plt.title(title)
    plt.show()


# Element-wise minimum and maximum of each voxel across all pitcher renders.
# dict.values() is a non-indexable view on Python 3, so materialize it as a
# list before taking the first render as the starting value.
all_renders = list(pitcher_density_renders.values())
render_minima = all_renders[0]
render_maxima = all_renders[0]
for render in all_renders:
    render_minima = np.minimum(render_minima, render)
    render_maxima = np.maximum(render_maxima, render)

_plot_voxel_histogram(render_minima,
                      "Voxel Minimum",
                      "Distribution of Density Voxel Minima")
_plot_voxel_histogram(render_maxima,
                      "Voxel Maximum",
                      "Distribution of Density Voxel Maxima")

In [None]:
# Boolean mask over the flattened grid: keep a voxel only if at least one
# pitcher's render exceeds 0.02 there.
retained_voxels = np.reshape(render_maxima > 0.02, -1)

# Flatten each pitcher's render and keep just the retained voxels.
compressed_renders = {
    name: np.reshape(render, -1)[retained_voxels]
    for name, render in pitcher_density_renders.items()
}

In [None]:
# Persist the compressed per-pitcher renders with the highest pickle protocol.
# open() replaces the Python 2-only file() builtin (removed in Python 3) and
# matches how the pitch data file is opened earlier in this notebook.
with open("../data/python/compressed_renders.p", "wb") as outfile:
    pickle.dump(compressed_renders, outfile, pickle.HIGHEST_PROTOCOL)

In [None]:
# Summarize each pitcher's training split by seven percentiles per column.
# np.percentile(..., axis=0) yields one row per percentile level; transposing
# before flattening puts each column's seven percentiles next to each other
# (assumes the training split is 2-D, rows x variables).
quantile_levels = [1, 5, 25, 50, 75, 95, 99]
pitcher_quantiles = {}
for name, data in density_data.items():
    quantiles = np.percentile(data["train"], quantile_levels, axis=0)
    pitcher_quantiles[name] = quantiles.T.reshape(-1)

In [None]:
# Persist the per-pitcher quantile vectors with the highest pickle protocol.
# open() replaces the Python 2-only file() builtin (removed in Python 3) and
# matches how the pitch data file is opened earlier in this notebook.
with open("../data/python/pitcher_quantiles.p", "wb") as outfile:
    pickle.dump(pitcher_quantiles, outfile, pickle.HIGHEST_PROTOCOL)