In [1]:
from utils import *

import pandas as pd; pd.set_option('precision', 4)
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB as NB

from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score



## Loading FanGraphs Data

In [2]:
twtc = pd.read_json('labelled_data.json', orient='split')
orig_cols = twtc.columns

twtc['primary_position'].replace('DH', '1B', inplace=True)
twtc['primary_position'].replace('UTIL', 'INF', inplace=True)

twtc.loc[twtc.key_fangraphs.isin(['', 0, None]), 'key_fangraphs'] = np.nan
twtc = twtc[~twtc.key_fangraphs.isnull()]
twtc = twtc[~twtc.report.isin(['', 0, np.nan, None])]


twtc.loc[~twtc.key_fangraphs.isnull(), 'uid'] = (twtc \
                                                .loc[~twtc.key_fangraphs.isnull(), 'key_fangraphs'] \
                                                .apply(lambda x: int(x) 
                                                       if not str(x).strip().startswith('sa') 
                                                       and str(x).strip() != ''
                                                       else x) \
                                                .astype(str) + "_" + twtc.year.astype(str))
twtc = twtc[~twtc.uid.isnull()]

print(twtc.shape)
twtc.head()

(5315, 30)


Unnamed: 0,name,age,year,primary_position,eta,report,Arm,Changeup,Control,Curveball,...,uid,key_uuid,key_bbref,key_bbref_minors,mlb_played_first,birthdate,debut_age,key_fangraphs,expected,label
0,Luis Perdomo,,2015,RHP,2018,Not to be confused with the Luis Perdomo the C...,0,50,50,0,...,14682_2015,69c08698-cd47-4e3e-b942-913a3c0b4bce,perdolu02,perdom002lui,2016.0,1993-05-09T00:00:00.000Z,22.6,14680.0,24.104,1
1,Bruce Maxwell,24.0,2015,C,2016,Maxwell led NCAA Division III with 15 homers ...,55,0,0,0,...,13866_2015,1a8850d5-52a7-45ed-88ef-e9cfc7bdee6b,maxwebr01,maxwel001bru,2016.0,1990-12-20T00:00:00.000Z,25.0,13870.0,25.261,1
3,Anthony Banda,22.4,2016,LHP,2017,"In 2011, the D-backs drafted Banda in the 33rd...",0,45,50,55,...,14706_2016,2e627856-cb46-4c36-ad1e-604f35414565,bandaan01,banda-000ant,2017.0,1993-08-10T00:00:00.000Z,23.4,14710.0,24.49,1
4,Mike Wright,23.0,2013,RHP,2014,The East Carolina product jumped on the fast t...,0,50,50,0,...,12586_2013,75103df0-e411-4822-ba04-3edf4ee05a67,wrighmi01,wright000den,2015.0,1990-01-03T00:00:00.000Z,25.0,12590.0,25.261,1
7,Brendan Rodgers,18.4,2015,SS,2019,Ranked No. 1 on MLB.com's Draft Top 200 enteri...,60,0,0,0,...,17907_2015,42abe3dc-5672-407e-9cf9-2c4cdcef1a82,rodgebr02,rodger000bre,2019.0,1996-08-09T00:00:00.000Z,22.4,17910.0,24.008,1


### Batting Data

In [3]:
fg_b = pd.read_csv('fg_bat.csv')
fg_b['Level'] = fg_b.Team.str.extract('\((.*)\)')
fg_b['uid'] = (fg_b.PlayerId.astype(str) + "_" + fg_b.Season.astype(str))

fg_b = fg_b.sort_values(['uid', 'Pitches'], ascending=False).reset_index(drop=True)
fg_b = fg_b[~fg_b.uid.duplicated()]

print(fg_b.shape)
fg_b.head()

(33092, 35)


Unnamed: 0,Name,Season,Team,Age,PA,BB%,K%,BB/K,AVG,OBP,...,Pull%,Cent%,Oppo%,SwStr%,Balls,Strikes,Pitches,PlayerId,Level,uid
0,Arturo Lara,2017,Rangers (A+),—,237,0.038,0.1941,0.1957,0.1918,0.2241,...,0.4667,0.2333,0.3,0.1556,204.0,490.0,694.0,sa977838,A+,sa977838_2017
1,Luke Barker,2019,Brewers (AAA),—,3,0.0,0.6667,0.0,0.0,0.0,...,1.0,0.0,0.0,0.1429,5.0,9.0,14.0,sa977837,AAA,sa977837_2019
3,Luke Barker,2017,Brewers (A+),—,0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,sa977837,A+,sa977837_2017
4,Parker Curry,2019,Dodgers (AA),—,8,0.0,0.625,0.0,0.125,0.125,...,0.0,0.3333,0.6667,0.4074,5.0,22.0,27.0,sa977821,AA,sa977821_2019
5,Parker Curry,2018,Dodgers (AA),—,1,0.0,1.0,0.0,0.0,0.0,...,,,,0.4,1.0,4.0,5.0,sa977821,AA,sa977821_2018


### Pitching Data

In [4]:
fg_p = pd.read_csv('fg_pitch.csv')

fg_p['Level'] = fg_p.Team.str.extract('\((.*)\)')
fg_p['uid'] = (fg_p.PlayerId.astype(str) + "_" + fg_p.Season.astype(str)).str.strip()

fg_p = fg_p.sort_values(['uid', 'Pitches'], ascending=False).reset_index(drop=True)
fg_p = fg_p[~fg_p.uid.duplicated()]

print(fg_p.shape)
fg_p.head()

(32166, 29)


Unnamed: 0,Season,Name,Team,Age,IP,HR/9,K%,BB%,WHIP,BABIP,...,Pull%,Cent%,Oppo%,SwStr%,Balls,Strikes,Pitches,PlayerId,Level,uid
0,2017,Patrick Duester,Dodgers (R),23.0,35.2,1.2617,0.2053,0.0993,1.2617,0.2577,...,0.3981,0.2816,0.3204,0.1355,201.0,323.0,524.0,sa999479,R,sa999479_2017
2,2019,Victor Castaneda,Brewers (A),20.0,44.0,0.8182,0.2849,0.0753,1.3182,0.3478,...,0.479,0.2773,0.2437,0.1527,234.0,434.0,668.0,sa978399,A,sa978399_2019
3,2018,Victor Castaneda,Brewers (R),19.0,47.1,1.9014,0.1835,0.0688,1.5634,0.3245,...,0.4658,0.2547,0.2795,0.1205,253.0,477.0,730.0,sa978399,R,sa978399_2018
4,2017,Christian Stolo,Dodgers (A),23.0,39.2,0.9076,0.2267,0.0814,1.4874,0.3628,...,0.3932,0.2821,0.3248,0.1322,247.0,411.0,658.0,sa977917,A,sa977917_2017
6,2019,Luke Barker,Brewers (AAA),27.0,30.0,0.6,0.2883,0.0631,0.6667,0.1618,...,0.4571,0.3,0.2429,0.1629,154.0,288.0,442.0,sa977837,AAA,sa977837_2019


### Merging

In [38]:
df = twtc.merge(fg_p, how='left', on='uid', suffixes=('', '_p'))
df = df.merge(fg_b, how='left', on='uid', suffixes=('', '_h'))

df.Level.fillna('AMTR', inplace=True)

df.fillna({'Age': df['Age_h'], 'Level': df['Level_h']}, inplace=True)
df.fillna({'age': df['Age']}, inplace=True)
df.fillna({c: 0 for c in df.columns.difference(orig_cols)}, inplace=True)

filled_ages = (pd.to_datetime(df.year, format='%Y') - 
               pd.to_datetime(df.birthdate, format='%Y-%m-%d').dt.tz_convert(None)).dt.days / 365.25
df.age.replace('—', filled_ages, inplace=True)

df = df[~df.age.isnull()]
df = df.drop(columns=dropped_cols)
df = df.filter(regex='^(?!key)\w+$', axis='columns')

df = df.reset_index(drop=True)

df = onehot_encode_column(df.copy(), 'primary_position')
df = onehot_encode_column(df.copy(), 'Level')

df = df.dropna()

df.report = apply_text_mask(df.report, processes=2)
df.report = df.report.str.replace('(\d+)', 'NUMBER', regex=True)

print(df.shape)
df.head()

(5215, 59)


Unnamed: 0,name,age,report,Arm,Changeup,Control,Curveball,Cutter,Fastball,Field,...,RF,RHP,SS,A,A+,A-,AA,AAA,AMTR,R
0,Luis Perdomo,22.0,Not to be confused with the Luis Perdomo the C...,0,50,50,0,0,65,0,...,0,1,0,1,0,0,0,0,0,0
1,Bruce Maxwell,24.0,Maxwell led NCAA Division III with 15 homers ...,55,0,0,0,0,0,50,...,0,0,0,0,0,0,0,0,1,0
2,Anthony Banda,22.4,"In 2011, the D-backs drafted Banda in the 33rd...",0,45,50,55,0,55,0,...,0,0,0,0,0,0,1,0,0,0
3,Mike Wright,23.0,The East Carolina product jumped on the fast t...,0,50,50,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,Brendan Rodgers,18.4,Ranked No. 1 on MLB.com's Draft Top 200 enteri...,60,0,0,0,0,0,55,...,0,0,1,0,0,0,0,0,1,0


## `Sk-learn` Modeling

In [None]:
CLFS = {
    'lr': lambda: LogisticRegression(max_iter=2000, solver='lbfgs'), # C=1e-2
    'svm': lambda: LinearSVC(max_iter=5000),
    'nb':  lambda: NB,
    'knn': lambda: KNeighborsClassifier(10),
    'rf': lambda: RandomForestClassifier(n_estimators=100),
    'nn': lambda: MLPClassifier(alpha=1e-2, max_iter=1000),
    'ada': lambda: AdaBoostClassifier(),
    'sgd': lambda: SGDClassifier(max_iter=1000, tol=1e3),
    'gb': GradientBoostingClassifier,
}

voters = ['rf', 'nn', 'sgd', 'svm']
CLFS['vote'] = lambda: VotingClassifier([(v, CLFS[v]) for v in voters])

In [None]:
def fit_pipeline(X_train, y_train, featurizer='union', clf='lr', min_df=1):
    tfidf_pipe = Pipeline([
        ('report_tfidf', Pipeline([
            ('selector', ItemSelector(key='report')),
            ('tfidf', TfidfVectorizer(
                max_features=10000, 
                min_df=min_df,
                strip_accents='unicode'
            )),
        ]))
    ])

    meta = ItemSelector(key=X_train.drop(columns='report').columns)

    features = None
    if featurizer == 'union':
        features = FeatureUnion([
            ('metadata', meta),
            ('tfidf', tfidf_pipe)
        ])
    elif featurizer == 'tfidf':
        features = tfidf_pipe
    elif featurizer == 'metadata':
        features = meta
    else:
        raise ValueError(f'Invalid featurizer: {featurizer}')

    clf_model = CLFS[clf]()
    print(f'Training {type(clf_model).__name__} with {featurizer} features on {X_train.shape} training set.')

    pipe = Pipeline([
        ('featurizer', features),
        ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)),
        ('scale', StandardScaler(with_mean=False)),
        ('clf', clf_model)
    ]).fit(X_train, y_train)

    return pipe

In [None]:
def top_features(clf, fts, coef=None, order='top', n=10):
    if coef is None:
        coef = clf.coef_.reshape(-1)
    for f, c in sorted(zip(fts, coef), key=lambda x: x[1], reverse=(order == 'top'))[:n]:
        print(f'{f}: {c:.4f}')

In [None]:
y = df['label'].copy()
X = df.drop(columns=['name', 'label']).copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_train.head()

In [None]:
clf = fit_pipeline(X_train, y_train, 'metadata', 'lr')
print(classification_report(y_test, clf.predict(X_test)))

top_features(clf.get_params()['clf'], [c for c in X.columns if c != 'report'])

In [None]:
clf = fit_pipeline(X_train, y_train, 'tfidf', 'lr', min_df=100)
print(classification_report(y_test, clf.predict(X_test)))

clf_tfidf = clf.get_params()['featurizer'].get_params()['report_tfidf'].get_params()['tfidf']
top_features(None, clf_tfidf.get_feature_names(), clf_tfidf.idf_)

In [None]:
clf_tfidf.

In [None]:

#list(sorted(zip(X.columns, clf.get_params()['clf'].coef_.reshape(-1)), key=lambda x: x[1], reverse=True))[:10]

In [None]:
clf = fit_pipeline(X_train, y_train, 'union', 'lr')
print(classification_report(y_test, clf.predict(X_test)))