In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures,OneHotEncoder

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [2]:
# Where the cleaned CSV lives, relative to this notebook.
folder_dir = '../data/'
data_path = "data_clean.csv"

# Read the file (first row is the header) and drop the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved -- confirm).
data = pd.read_csv(folder_dir + data_path, header=0).drop(columns='Unnamed: 0')

# For quick experiments the frame can be truncated here, e.g. data.head(500).

In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(3902210, 22)

In [4]:
# Per-column flag: does the column contain at least one missing value?
has_missing = data.isnull().any()

# Display only the columns for which the flag is True.
has_missing[has_missing]

outcome    True
nasty      True
pitcher    True
dtype: bool

In [9]:
# List the available columns; the model features are picked from these in the next cell.
data.columns

Index(['umpcall', 'outcome', 'start_speed', 'pfx_x', 'pfx_z', 'px', 'pz',
       'break_y', 'break_angle', 'break_length', 'pitch_type', 'spin_dir',
       'nasty', 'pitch_count', 'descr', 'y', 'year', 'strikes', 'balls',
       'pitcher', 'pitch_type.1', 'count_b_p', 'zone_1', 'zone_2', 'zone_3',
       'zone_4', 'zone_5', 'zone_6', 'zone_7', 'zone_8', 'zone_9', 'zone_11',
       'zone_12', 'zone_13'],
      dtype='object')

## Train, test, validation split

In [10]:
# Columns used as model inputs by the cells below.
model_feats = ['px', 'pz', 'pfx_x', 'pfx_z', 'start_speed', 'spin_dir',
               'pitch_count', 'balls', 'strikes']

# Target column, kept as its own Series (shares the index of `data`).
y = data['y']

# `data` is deliberately NOT narrowed to `model_feats` here: the split cell
# reads other columns of `data` (e.g. `date`) and selects `model_feats`
# itself. Leaving `data` untouched also keeps this cell safe to re-run.

In [None]:
# Chronological split on the `date` column:
#   train: 2012-01-01 <= date < 2017-01-01
#   test : date >= 2017-01-01
# NOTE(review): requires `data` to still contain a `date` column (the column
# listing displayed earlier shows `year` but no `date` -- confirm), and the
# time-series cross-validation used later assumes rows are in chronological
# order -- confirm upstream.
data["date"] = pd.to_datetime(data["date"])

train_start = datetime.datetime(2012, 1, 1)
test_start = datetime.datetime(2017, 1, 1)

# Boolean masks make both boundaries explicit, do not depend on the frame
# being sorted, and guarantee that no row lands in both sets.
train_mask = (data["date"] >= train_start) & (data["date"] < test_start)
test_mask = data["date"] >= test_start

# Features come from `data`; targets come from the `y` Series extracted when
# the feature list was defined (it shares the index of `data`).
X_train = data.loc[train_mask, model_feats]
y_train = y.loc[train_mask]

X_test = data.loc[test_mask, model_feats]
y_test = y.loc[test_mask]

## Logistic Regression

In [None]:
# Logistic-regression pipeline and its hyper-parameter grid.

# Keep the splitter object itself rather than the one-shot generator returned
# by `.split(X_train)`: the object can be passed as `cv=` to any number of
# searches, whereas a generator is exhausted after its first use.
my_cv = TimeSeriesSplit(n_splits=3)

# Polynomial expansion -> standardisation -> logistic regression.
steps = [
    ('polyfeat', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    # liblinear supports both penalties in the grid below; the current
    # default solver (lbfgs) rejects penalty='l1'.
    ('lr', LogisticRegression(solver='liblinear')),
]
mypipeline = Pipeline(steps)

# Grid: C over 1e-5 ... 1e4 (powers of ten) crossed with L1 and L2 penalties.
parameters = dict(
    lr__C=[10**i for i in range(-5, 5)],
    lr__penalty=['l1', 'l2'],
)

In [None]:
# Exhaustive search over `parameters` for the logistic-regression pipeline.
# - Scored with 'neg_log_loss': scikit-learn maximises scores, so log loss is
#   exposed negated (values closer to 0 are better). Plain 'log_loss' is not
#   a valid scorer name.
# - Folds come from TimeSeriesSplit so every validation fold lies after the
#   rows it was trained on; this relies on X_train being in chronological
#   order.
# - refit=True retrains the best configuration on all of X_train.
lr_grid_search = GridSearchCV(
    mypipeline,
    param_grid=parameters,
    scoring='neg_log_loss',
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=2,
    refit=True,
)

lr_grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the logistic-regression search.
best_1 = lr_grid_search.best_score_
print(best_1)

# Take the fitted classifier (the 'lr' step, third in the pipeline) out of
# the refit best pipeline and show its settings.
best_lr_pipeline = lr_grid_search.best_estimator_
model = best_lr_pipeline.named_steps['lr']
print(model)

In [None]:
# Fitted logistic-regression step ('lr', third in the pipeline) of the best pipeline.
model = lr_grid_search.best_estimator_.named_steps['lr']

In [None]:
# Predict with the whole refit pipeline, not the bare 'lr' step: the
# classifier was trained on polynomial-expanded, standardised features, so
# X_test has to pass through the same transformations first.
# Column 1 is the probability of the second class listed in `classes_`.
probabilities_lr = lr_grid_search.best_estimator_.predict_proba(X_test)[:, 1]

# Persist the probabilities as text, one value per line.
np.savetxt('probabilities_lr.txt', probabilities_lr, delimiter=',', newline='\n')

In [None]:
# Map every logistic-regression coefficient to the feature it belongs to.
# The classifier sees the output of the 'polyfeat' step, so the names must
# come from that step; zipping against the raw X_train columns would
# mislabel the coefficients and silently drop most of them.
best_lr_pipeline = lr_grid_search.best_estimator_
poly_step = best_lr_pipeline.named_steps['polyfeat']
lr_step = best_lr_pipeline.named_steps['lr']

# `get_feature_names_out` is the current API; older scikit-learn releases
# only provide `get_feature_names`.
if hasattr(poly_step, 'get_feature_names_out'):
    poly_names = list(poly_step.get_feature_names_out(X_train.columns))
else:
    poly_names = list(poly_step.get_feature_names(X_train.columns.tolist()))

lr_coefs = lr_step.coef_[0]
assert len(poly_names) == len(lr_coefs), "feature names and coefficients are misaligned"

coef_dict = dict(zip(poly_names, lr_coefs))
coef_dict

## Random Forest

In [None]:
# Random-forest pipeline and its hyper-parameter grid.
# The forest is the only step (no polynomial expansion or scaling). It is
# seeded so that repeated runs of the notebook give the same model.
steps = [('rf', RandomForestClassifier(random_state=42))]

mypipeline = Pipeline(steps)

# Grid: 2 split thresholds x 3 depths (4, 6, 8) x 3 forest sizes (200, 300, 400).
param_grid_rf = dict(
    # Not currently searched:
    # rf__min_samples_leaf=np.logspace(4, 5, num=5, base=4, endpoint=False, dtype=int),
    rf__min_samples_split=[100, 1000],
    rf__max_depth=list(range(4, 9, 2)),
    rf__n_estimators=list(range(200, 500, 100)),
)

In [None]:
# Exhaustive search over `param_grid_rf` for the random-forest pipeline.
# - Scored with 'neg_log_loss': scikit-learn maximises scores, so log loss is
#   exposed negated (values closer to 0 are better). Plain 'log_loss' is not
#   a valid scorer name.
# - Folds come from TimeSeriesSplit so every validation fold lies after the
#   rows it was trained on; this relies on X_train being in chronological
#   order.
# - n_jobs=-1 uses all available cores; refit=True retrains the best
#   configuration on all of X_train.
rf_grid_search = GridSearchCV(
    mypipeline,
    param_grid=param_grid_rf,
    scoring='neg_log_loss',
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
    refit=True,
)

rf_grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the random-forest search.
best_1 = rf_grid_search.best_score_
print(best_1)

# Look the forest up by step name: the RF pipeline has a single step, so a
# positional lookup at index 2 would raise IndexError.
model = rf_grid_search.best_estimator_.named_steps['rf']
print(model)

In [None]:
# Fitted forest from the best pipeline, looked up by step name (the pipeline
# has only one step, so index 2 does not exist).
model = rf_grid_search.best_estimator_.named_steps['rf']

In [None]:
# Class probabilities for the held-out rows (all columns returned by the model are kept).
probabilities_rf = model.predict_proba(X_test)

# Progress message; the carriage returns keep it on a single console line.
print('\rSaving class probabilities.', end='\r')

# Write the probability matrix as comma-separated text into the data folder.
rf_proba_path = folder_dir + 'probabilities_rf.txt'
np.savetxt(rf_proba_path, probabilities_rf, delimiter=',', newline='\n')

In [None]:
# Deliberate halt for "Run All": the original cell was an intentionally
# invalid statement used as a stop marker (presumably so the plotting cell
# below is only run by hand). Raising explicitly keeps the notebook parseable
# while still stopping execution at this point.
raise RuntimeError('Intentional stop: run the cells below manually.')

In [None]:
# RF feature importances, ordered from most to least important.
# `model` must be the fitted random forest here.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# The forest was trained directly on X_train, so its importances line up
# one-to-one with X_train's columns.
feature_names = X_train.columns[indices]

# Bar plot of the sorted importances.
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(range(len(indices)), importances[indices], color="r", align="center")
ax.set_title('Feature Importance Graph for RF')
ax.set_xlabel('Feature Name')
ax.set_ylabel('Feature Importance')
ax.set_xticks(list(range(len(indices))))
ax.set_xticklabels(feature_names, rotation=45, horizontalalignment='right')
ax.set_xlim([-1, len(indices)])

# Save before showing, so the file is written while the figure is still open.
# The image goes to `folder_dir`, the same place the RF probabilities are
# written. NOTE(review): the original referenced an undefined `rf_dir`;
# confirm the intended output folder.
print('\rSaving feature_importances barplot... ', end='')
fig.savefig(folder_dir + 'feature_importances_rf.png')
print('done.')

plt.show()