In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
import sklearn.metrics
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Root folder of the raw data and the seasons (as strings) to load.
folder_dir = '../data/'
date_range = [str(season) for season in range(2012, 2018)]

In [None]:
# Read every season's regular-season PitchFX CSV and stack them into a single frame.
yearly_frames = []
for year in date_range:
    data_path = "MLB_" + year + "/MLB_PitchFX_" + year + "_RegularSeason.csv"
    year_data = pd.read_csv(filepath_or_buffer=folder_dir + data_path, header=0)
    yearly_frames.append(year_data)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated rows on every iteration.
data = pd.concat(yearly_frames)

data.head()

In [None]:
'''
TODO:
    -More exploratory data analysis
    -Feature engineering/Clean data/select features
    -Define target variable
    
'''

In [None]:
# Drop the three base-runner columns and the 'y' column, then show the resulting shape.
columns_to_drop = ['on_1b', 'on_2b', "on_3b", 'y']
data = data.drop(columns_to_drop, axis=1)
data.shape

In [None]:
# Histogram of the number of missing values per column.
null_counts = data.isna().sum()
_ = plt.hist(null_counts)

In [None]:
# Show how many columns there are, the name of the first one, and the full list.
column_names = data.columns.values
print(len(column_names))
print(column_names[0])
column_names

In [None]:
# Missing-value count for every column.
data.isna().sum()

In [None]:
# Dtype of every column.
data.dtypes

In [None]:
# Group the column names by dtype, keyed by the dtype's name (e.g. "int64").
columns_by_dtype = data.columns.to_series().groupby(data.dtypes).groups
dtype_groups = {}
for dtype, column_index in columns_by_dtype.items():
    dtype_groups[dtype.name] = column_index
dtype_groups

In [None]:
# Names of the columns stored as int64.
dtype_groups["int64"].values

In [None]:
# Names of the columns stored as float64.
dtype_groups["float64"].values

In [None]:
# Names of the columns stored as object.
dtype_groups["object"].values

In [None]:
#Plotting all Int64 typed features
# One numbered figure per column, titled with the column name.
for fig_num, column in enumerate(dtype_groups["int64"].values):
    plt.figure(fig_num, figsize=(10, 5))
    plt.title(column)
    plt.hist(data[column], rwidth=0.5)

In [None]:
#Plotting all Float64 typed features
# One numbered figure per column; missing values are removed before binning.
for fig_num, column in enumerate(dtype_groups["float64"].values):
    plt.figure(fig_num, figsize=(10, 5))
    plt.title(column)
    observed_values = data[column].dropna()
    plt.hist(observed_values, rwidth=0.5)

In [None]:
# Bar chart of value counts for the first object-typed column, titled "Dates".
first_object_column = dtype_groups["object"][0]
value_counts = data[first_object_column].value_counts()
_ = value_counts.plot(kind="bar", figsize=(20, 10), title="Dates")

In [None]:
#Plotting all Object typed features
# Bar chart of value counts for each object-typed column.
for column in dtype_groups["object"]:
    print(column)
    # Series.plot draws on the current axes when no `ax` is given, which stacked every
    # feature onto a single chart. Create a dedicated figure per feature instead,
    # as the numeric histogram loops already do.
    fig, ax = plt.subplots(figsize=(20, 10))
    data[column].value_counts().plot(kind="bar", ax=ax, title=column)

In [None]:
# Histogram of the 'zone' column. Missing values are dropped first, consistent with
# the float-feature histograms, so NaNs cannot upset the bin-range computation.
plt.hist(data['zone'].dropna())

In [None]:
# Histogram of the 'spin_dir' column. Missing values are dropped first, consistent with
# the float-feature histograms, so NaNs cannot upset the bin-range computation.
plt.hist(data['spin_dir'].dropna())

### Rolling_mean feature

In [None]:
# Keep a random sample of 1000 rows.
# A fixed random_state makes the sample, and every result derived from it,
# reproducible across kernel restarts.
# NOTE(review): this overwrites `data`, so all later cells (target creation, the saved
# CSV, the train/test split and the baseline model) run on these 1000 rows rather than
# the full data set -- confirm that is intended.
data = data.sample(n = 1000, random_state = 42)
data.shape

In [None]:
# Start speed of every pitch whose pitch_type is "FF".
is_ff = data['pitch_type'] == "FF"
data.loc[is_ff, "start_speed"]

In [None]:
# Per-pitcher rolling mean (window of 5 rows) of start speed over the "FF" pitches.
ff_pitches = data.loc[data['pitch_type'] == "FF"]
speed_by_pitcher = ff_pitches.groupby('pitcher')['start_speed']
speed_by_pitcher.rolling(5).mean()

## 1. Target variable
We classify a pitch as good if it induces a swinging strike, a called strike, a foul, or if it generates a
ball in play that is converted to an out

In [None]:
# Distinct values of the 'outcome' column.
data['outcome'].unique()

In [None]:
# Distinct values of the 'umpcall' column.
data['umpcall'].unique()

In [None]:
# Distinct values of the 'descr' column.
data['descr'].unique()

In [None]:
#Using descr feature to define good pitch
g_outcomes=['Foul Tip','Swinging Strike','In play, out(s)', 'Called Strike', 'Foul','Foul (Runner Going)','Foul Bunt',
       'Swinging Strike (Blocked)', 'Missed Bunt' ]

#Using outcome feature to use more detailed descriptions (eg: lineout vs groundout). I think this approach is better
g_outcomes2=['Flyout', 'Fly Out','Groundout','Pop Out','Forceout','Double Play','Grounded into DP','Fielders Choice','Fielders Choice Out', 
             'Bunt Groundout','Bunt Pop Out','Sac Fly DP','Bunt Lineout','Foul']

In [None]:
#Create target feature
data['y']=np.where((data['outcome'].isin(g_outcomes2)) | (data['umpcall']=='S'),1,0)
a=plt.hist(data['y'])

In [None]:
# Sanity check: the distinct target values and whether any target is missing.
data['y'].unique(), data['y'].isnull().any()

In [None]:
# Column names after adding the target.
data.columns

## 2. Divide in Train (2012-2016) and Test (2017)

In [None]:
# Derive the season from the first four characters of the date string;
# it is used to split train and test.
data['year']=data['date'].str.slice(0,4)

# Keep only the columns we are going to use.
# 'pitch_type' used to be listed twice, which produced a duplicated column:
# data['pitch_type'] then returned a DataFrame instead of a Series and the saved CSV
# contained the column twice.
features=['umpcall', 'outcome','start_speed','pfx_x','pfx_z','px','pz','break_y','break_angle','break_length','pitch_type',
         'spin_dir','nasty','pitch_count','descr','y','year','zone','strikes','balls','pitcher']
# Minimal feature set for the baseline model, plus the split key and the target.
baseline_features=['start_speed','px','pz','year','y']

data=data[features]
data_base=data[baseline_features]

In [None]:
# Write the selected columns to data_clean.csv inside the data folder.
save_path = '{}{}'.format(folder_dir, 'data_clean.csv')
data.to_csv(save_path)

In [None]:
# Rows whose year is '2017' form the test set; all other rows form the training set.
is_test_year = data_base['year'] == '2017'
train_rows = data_base[~is_test_year]
test_rows = data_base[is_test_year]

# Separate the target from the inputs and drop the split key from the inputs.
y_train = train_rows['y']
X_train = train_rows.drop(['y', 'year'], axis=1)

y_test = test_rows['y']
X_test = test_rows.drop(['y', 'year'], axis=1)

X_train.head()

In [None]:
X_train.shape

## 3. Baseline with Logistic Regression

In [None]:
# Expand the inputs with interaction terms (no bias column).
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
# Keep the transformed matrices: the results used to be discarded, so the model was
# trained on the raw features only. The transformer is fitted on the training data
# and then applied unchanged to the test data instead of being re-fitted on it.
# NOTE: this overwrites X_train / X_test with NumPy arrays; re-run the split cell
# before re-running this one, otherwise the features are expanded twice.
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

In [None]:
# Shapes of the train and test feature matrices.
X_train.shape, X_test.shape

In [None]:
# Fit a logistic regression with default settings as the baseline classifier.
# fit() returns the estimator itself, so the cell still displays the fitted model.
baseline = LogisticRegression().fit(X_train, y_train)
baseline

In [None]:
# Coefficients learned by the fitted baseline model
# (zip them with the feature names to label each one).
baseline.coef_

In [None]:
# Score the baseline on the test set.
preds = baseline.predict(X_test)
# Second column of predict_proba: the predicted probability of the second class.
probs = baseline.predict_proba(X_test)[:, 1]

baseline_acc = sklearn.metrics.accuracy_score(y_test, preds)
baseline_loss = sklearn.metrics.log_loss(y_test, probs)
# Share of the most frequent label in the test set, i.e. the accuracy of always
# predicting the majority class.
majority_class = y_test.value_counts(normalize=True).max()

report_values = (
    'Baseline accuracy', baseline_acc,
    'Baseline log loss', baseline_loss,
    'Majority class', majority_class,
)
print('%s: %0.2f, %s: %0.2f, %s: %0.2f ' % report_values)