In [2]:
import os
import json
import pandas as pd

### Load data

In [3]:
# Directory holding the raw files; every file is read as JSON Lines, i.e. one
# JSON document per line.
data_dir = 'anonymized-files-wisd'
data_list = []

# os.listdir returns entries in arbitrary order, so sort them to keep the row
# order of the frames built from data_list reproducible across runs.
for filename in sorted(os.listdir(data_dir)):
    path = os.path.join(data_dir, filename)
    # Skip anything that is not a regular file (e.g. a checkpoint directory),
    # which open() would otherwise fail on.
    if not os.path.isfile(path):
        continue
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines, such as a trailing newline at end of file.
            if line.strip():
                data_list.append(json.loads(line))

In [4]:
# Flatten the nested records into one table; nested keys end up as
# dot-separated column names (e.g. `summary_acts.pitch.result`).
df = pd.json_normalize(data=data_list)
df.head(n=5)

Unnamed: 0,events,samples_ball,samples_bat,units.length,units.velocity,units.acceleration,units.angle,summary_acts.pitch.eventId,summary_acts.pitch.result,summary_acts.pitch.action,...,summary_score.runs.innings,summary_score.runs.play,summary_score.outs.inning,summary_score.outs.play,summary_score.count.balls.plateAppearance,summary_score.count.balls.play,summary_score.count.strikes.plateAppearance,summary_score.count.strikes.play,summary_acts.pitch.type,summary_acts.hit.eventId
0,[],"[{'time': 0.0103363, 'pos': [-1.61897673474511...",[{'event': 'No'}],foot,mph,mph/s,degree,0da10a0e-60ec-4714-b086-2001c7c01ed9,Ball,Called,...,"[{'team1': 0, 'team2': 0}]",0,0,0,1,1,2,0,,
1,[],"[{'time': 0.0296728, 'pos': [-0.80462875757812...",[{'event': 'No'}],foot,mph,mph/s,degree,7e78552a-4ab3-4667-a748-024841d6f9cc,Ball,Called,...,"[{'team1': 0, 'team2': 0}, {'team1': 0, 'team2...",0,1,0,0,1,0,0,,
2,[],"[{'time': 0.0382683, 'pos': [-1.08782534756770...",[{'event': 'No'}],foot,mph,mph/s,degree,ae2f8cfe-934d-486d-86f9-b05b6dcc8ffb,Ball,Called,...,"[{'team1': 1, 'team2': 0}, {'team1': 0, 'team2...",0,1,0,0,1,0,0,,
3,[],"[{'time': 0.0430903, 'pos': [-1.08296228279852...","[{'event': 'First', 'time': -0.430037700000000...",foot,mph,mph/s,degree,58e88a96-6f8a-4d37-894c-2f57ebad50e8,Ball,Called,...,"[{'team1': 0, 'team2': 0}, {'team1': 0, 'team2...",0,2,0,2,1,2,0,,
4,[],"[{'time': 0.0369063, 'pos': [-1.50433512772240...",[{'event': 'No'}],foot,mph,mph/s,degree,5f2aa62c-727d-4da3-82ef-0810a75dcba5,Ball,Called,...,"[{'team1': 0, 'team2': 0}, {'team1': 0, 'team2...",0,2,0,0,1,0,0,Sinker,


In [8]:
# Inspect the dtype pandas inferred for each flattened column.
df.dtypes

events                                          object
samples_ball                                    object
samples_bat                                     object
units.length                                    object
units.velocity                                  object
units.acceleration                              object
units.angle                                     object
summary_acts.pitch.eventId                      object
summary_acts.pitch.result                       object
summary_acts.pitch.action                       object
summary_acts.pitch.speed.mph                   float64
summary_acts.pitch.speed.kph                   float64
summary_acts.pitch.speed.mps                   float64
summary_acts.pitch.spin.rpm                    float64
summary_acts.hit.speed.mph                     float64
summary_acts.hit.speed.kph                     float64
summary_acts.hit.speed.mps                     float64
summary_acts.hit.spin.rpm                      float64
summary_sc

In [12]:
# List the distinct values in each `units.*` column to see whether any record
# uses a unit other than foot, mph, mph/s or degree.
for quantity in ('length', 'velocity', 'acceleration', 'angle'):
    print(f'{quantity}: ', df[f'units.{quantity}'].unique())

length:  ['foot']
velocity:  ['mph']
acceleration:  ['mph/s']
angle:  ['degree']


In [15]:
# Alternative way to load the data: build the frame without flattening, so
# each top-level key stays a single column holding its nested dict/list.
non_flat_df = pd.DataFrame(data=data_list)
non_flat_df.head(n=5)

Unnamed: 0,units,summary_acts,summary_score,events,samples_ball,samples_bat
0,"{'length': 'foot', 'velocity': 'mph', 'acceler...",{'pitch': {'eventId': '0da10a0e-60ec-4714-b086...,"{'runs': {'game': {'team1': 0, 'team2': 0}, 'i...",[],"[{'time': 0.0103363, 'pos': [-1.61897673474511...",[{'event': 'No'}]
1,"{'length': 'foot', 'velocity': 'mph', 'acceler...",{'pitch': {'eventId': '7e78552a-4ab3-4667-a748...,"{'runs': {'game': {'team1': 1, 'team2': 0}, 'i...",[],"[{'time': 0.0296728, 'pos': [-0.80462875757812...",[{'event': 'No'}]
2,"{'length': 'foot', 'velocity': 'mph', 'acceler...",{'pitch': {'eventId': 'ae2f8cfe-934d-486d-86f9...,"{'runs': {'game': {'team1': 1, 'team2': 0}, 'i...",[],"[{'time': 0.0382683, 'pos': [-1.08782534756770...",[{'event': 'No'}]
3,"{'length': 'foot', 'velocity': 'mph', 'acceler...",{'pitch': {'eventId': '58e88a96-6f8a-4d37-894c...,"{'runs': {'game': {'team1': 1, 'team2': 3}, 'i...",[],"[{'time': 0.0430903, 'pos': [-1.08296228279852...","[{'event': 'First', 'time': -0.430037700000000..."
4,"{'length': 'foot', 'velocity': 'mph', 'acceler...",{'pitch': {'eventId': '5f2aa62c-727d-4da3-82ef...,"{'runs': {'game': {'team1': 9, 'team2': 0}, 'i...",[],"[{'time': 0.0369063, 'pos': [-1.50433512772240...",[{'event': 'No'}]


In [16]:
# Print the row count of the flattened and the non-flattened frame, one per
# line, so the two loading approaches can be compared.
print(len(df), len(non_flat_df), sep='\n')

1251
1251


### Find metrics for hit

In [74]:
# filter for pitches with a hit
hit_df = df.dropna(subset=['summary_acts.hit.eventId'], ignore_index=True)
print(len(hit_df))

325


Metrics required to determine whether a hit was a barrel:
- launch angle
- exit velocity

In [75]:
# Expand the `events` column (hit details) into flat `events.*` columns and
# remove the columns that are not needed afterwards.
#
# The expansion below pairs events with pitches row by row, which is only
# correct when every row carries exactly one event. A row with several events
# would make explode() emit extra rows and shift every later event onto the
# wrong pitch; a row with none has nothing to expand. Fail loudly instead of
# producing silently misaligned data.
events_per_row = hit_df['events'].str.len()
if not events_per_row.eq(1).all():
    raise ValueError(
        'expected exactly one event per hit row, found counts: '
        f'{sorted(events_per_row.dropna().unique().tolist())}'
    )

events_df = pd.json_normalize(hit_df['events'].explode().to_list())
events_df = events_df.add_prefix('events.')
# json_normalize returns a fresh 0..n-1 index; reuse hit_df's index so the
# join matches rows explicitly rather than relying on both indexes coinciding.
events_df.index = hit_df.index
hit_df = hit_df.join(events_df)
hit_df = hit_df.drop(columns=['events', 'units.length', 'units.velocity', 'units.acceleration', 'units.angle',
                              'summary_acts.pitch.speed.kph', 'summary_acts.pitch.speed.mps', 
                              'summary_acts.hit.speed.kph', 'summary_acts.hit.speed.mps'])

In [81]:
# Split each `events.start.angle` pair into two named columns: the first
# element is stored as `events.spray_angle`, the second as
# `events.launch_angle`.
# NOTE(review): the [spray, launch] ordering is assumed by these column names;
# confirm it against the data specification.
angle_df = pd.DataFrame(
    hit_df['events.start.angle'].tolist(),
    columns=['events.spray_angle', 'events.launch_angle'],
    # Reuse hit_df's index: pd.concat(axis=1) aligns on index labels, so a
    # default 0..n-1 index here would only line up if hit_df had one too.
    index=hit_df.index,
)
hit_df = hit_df.drop(columns=['events.start.angle'])
hit_df = pd.concat([hit_df, angle_df], axis=1)

In [82]:
# Show the first hit row as a Series to inspect the columns after expansion.
hit_df.iloc[0]

samples_ball                                   [{'time': -0.0046734, 'pos': [-1.0477735799342...
samples_bat                                    [{'event': 'First', 'time': -0.3944424, 'head'...
summary_acts.pitch.eventId                                  b120cf14-305c-442c-a739-c499bf61eec8
summary_acts.pitch.result                                                            HitIntoPlay
summary_acts.pitch.action                                                                    NaN
summary_acts.pitch.speed.mph                                                                84.0
summary_acts.pitch.spin.rpm                                                               2720.0
summary_acts.hit.speed.mph                                                                  84.0
summary_acts.hit.spin.rpm                                                                 1560.0
summary_score.runs.game.team1                                                                  4
summary_score.runs.game.team2 

In [29]:
# launch angle: events.launch_angle
# exit velocity: summary_acts.hit.speed.mph

[{'start': {'angle': [-24.99839701395031, 3.2914346803695023]},
  'type': 'Hit',
  'teamId': {'mlbId': 90068},
  'personId': {'mlbId': 485007791},
  'eventId': 'ca663ed8-d834-4957-93db-e98c28270b6a'}]