In [1]:
import pandas as pd
import numpy as np

## The Data
The data comes from the MLB [PitchFX](http://www.fangraphs.com/library/misc/pitch-fx/) dataset. It's publicly available and is updated __very__ frequently.

I used [`mlb_terminal`](https://github.com/slnovak/mlb_terminal) to collect 2 months worth of data from the 2013 season. You can see it in the bash script `scrape-mlb.sh`.

In [2]:
! open http://gd2.mlb.com/components/game/mlb/year_2012/month_06/day_01/gid_2012_06_01_arimlb_sdnmlb_1/inning/inning_2.xml

Couldn't get a file descriptor referring to the console


In [3]:
df = pd.read_csv("./baseball-pitches.csv")
df.head()

Unnamed: 0,pitch_time,inning,top_or_bottom,pitcher_name,hitter_name,pitch_type,x,y,start_speed,end_speed,...,break_angle,break_length,pitch_name,type_confidence,zone,nasty,spin_dir,spin_rate,comments,unk
0,2013-10-01 20:07:43 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,B,78.97,164.92,93.2,85.3,...,-41.3,6.3,FT,0.894,9.0,65.0,120.583,2541.561,,
1,2013-10-01 20:07:57 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,S,82.4,131.24,93.4,85.6,...,-44.6,5.4,FT,0.895,12.0,62.0,128.371,2589.087,,
2,2013-10-01 20:08:12 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,S,96.14,161.47,89.1,82.8,...,-10.4,5.8,SL,0.931,8.0,32.0,148.073,1133.227,,
3,2013-10-01 20:08:31 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,S,106.44,163.19,90.0,83.3,...,2.6,6.8,SL,0.926,8.0,34.0,189.793,430.593,,
4,2013-10-01 20:09:09 -0400,1,Top,Francisco Liriano,Ryan Ludwick,B,163.95,194.28,87.7,81.6,...,-3.1,7.3,SL,0.915,13.0,55.0,140.567,482.08,,


Let's limit this to a few less columns.

Cleaning the `pitch_name` column.

In [4]:
lu = """FA,Fastball
FF,Fastball
FT,Fastball
FC,Cut fastball
FS,Fastball (sinker|split-fingered)
SI,Fastball (sinker|split-fingered)
SF,Fastball (sinker|split-fingered)
SL,Slider
CH,Changeup
CB,Curveball
CU,Curveball
KC,Curveball
KN,Knuckleball
EP,Eephus
UN,Unidentified
XX,Unidentified
PO,Pitch out
FO,Pitch out""".split('\n')

In [5]:
for row in lu:
    row = row.split(',')
    abbrv, name = row[0], row[1]
    df['pitch_name'] = df['pitch_name'].replace(abbrv, name)
df['pitch_name'] = df['pitch_name']
# df = df[df.pitch_name.isin(df.pitch_name.value_counts().head(8).index)]

In [7]:
df.iloc[:,:10].head()

Unnamed: 0,pitch_time,inning,top_or_bottom,pitcher_name,hitter_name,pitch_type,x,y,start_speed,end_speed
0,2013-10-01 20:07:43 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,B,78.97,164.92,93.2,85.3
1,2013-10-01 20:07:57 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,S,82.4,131.24,93.4,85.6
2,2013-10-01 20:08:12 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,S,96.14,161.47,89.1,82.8
3,2013-10-01 20:08:31 -0400,1,Top,Francisco Liriano,Shin-Soo Choo,S,106.44,163.19,90.0,83.3
4,2013-10-01 20:09:09 -0400,1,Top,Francisco Liriano,Ryan Ludwick,B,163.95,194.28,87.7,81.6


In [8]:
df.iloc[:,25:].head()

Unnamed: 0,break_y,break_angle,break_length,pitch_name,type_confidence,zone,nasty,spin_dir,spin_rate,comments,unk
0,23.8,-41.3,6.3,Fastball,0.894,9.0,65.0,120.583,2541.561,,
1,23.8,-44.6,5.4,Fastball,0.895,12.0,62.0,128.371,2589.087,,
2,23.8,-10.4,5.8,Slider,0.931,8.0,32.0,148.073,1133.227,,
3,23.8,2.6,6.8,Slider,0.926,8.0,34.0,189.793,430.593,,
4,23.8,-3.1,7.3,Slider,0.915,13.0,55.0,140.567,482.08,,


In [9]:
df.iloc[:,25:].head()

Unnamed: 0,break_y,break_angle,break_length,pitch_name,type_confidence,zone,nasty,spin_dir,spin_rate,comments,unk
0,23.8,-41.3,6.3,Fastball,0.894,9.0,65.0,120.583,2541.561,,
1,23.8,-44.6,5.4,Fastball,0.895,12.0,62.0,128.371,2589.087,,
2,23.8,-10.4,5.8,Slider,0.931,8.0,32.0,148.073,1133.227,,
3,23.8,2.6,6.8,Slider,0.926,8.0,34.0,189.793,430.593,,
4,23.8,-3.1,7.3,Slider,0.915,13.0,55.0,140.567,482.08,,


In [10]:
df = df[df.pitch_name.isin(["IN", "Pitch out", "SC"])==False]
df = df[df.pitch_name.isnull()==False]

In [11]:
df.to_csv("./baseball-pitches-clean.csv", index=False)