In [2]:
def pitch_classification(mlbid, output):
    #importing required libraries
    import pandas as pd
    import lightgbm as lgb
    import numpy as np
    from pybaseball import statcast_pitcher, playerid_lookup
    from sklearn.metrics import precision_score, confusion_matrix, classification_report
    from sklearn.model_selection import train_test_split
    
    #gathering pitch-by-pitch data for the pitcher
    data = statcast_pitcher('2019-03-15','2019-11-30',mlbid)
    
    #removes all pitches that are intentional balls or pitchouts
    data = data.loc[(data['description']!= 'intent_ball') & (data['pitch_type']!='PO')]
    
    #getting rid of any pitch types that aren't correctly classified
    data = data.loc[data['pitch_type'].isnull() == False]
    
    new_data = data.loc[(data['release_pos_x'].isnull()==False) & 
                            (data['release_pos_z'].isnull()==False)]
    
    #getting all pitch types for the pitcher
    pitches = new_data.pitch_type.unique().tolist()
    
    #this loop removes pitch types that aren't thrown more than 7.5% of the time to make classifying more accurate
    for i in range(len(pitches)):
        if (len(new_data[new_data['pitch_type']== pitches[i]])/len(new_data)) <= 0.075:
            new_data = new_data[new_data['pitch_type']!= pitches[i]]
        
    #defining new pitch list if some pitch types were dropped in the loop above
    pitches = new_data.pitch_type.unique().tolist()
    
    #converting pitche types from strings to integers
    new_data['pitch_type'].replace(pitches, list(range(len(pitches))),inplace = True)
    new_data['pitch_type'] = new_data['pitch_type'].apply(pd.to_numeric)
    
    #getting the necessary columns for the model
    X1 = new_data[['release_pos_x', 'release_pos_z', 'strikes','balls']]

    #singling out the variable we want to predict
    Y1 = new_data['pitch_type']
    
    #splitting the data into training and testing sets
    X_train,X_test,y_train,y_test=train_test_split(X1,Y1,test_size=0.3,random_state=0)
    
    d_train=lgb.Dataset(X_train, label=y_train)
    
    #setting up the parameters
    params={}
    params['num_leaves'] = 10
    params['learning_rate']=0.03
    params['boosting_type']='gbdt' #GradientBoostingDecisionTree
    params['objective']='multiclass' #Multi-class target feature
    params['metric']='multi_logloss' #metric for multi-class
    params['max_depth']= 5
    params['num_class']= len(pitches) #no.of unique values in the target class not inclusive of the end value
    params['min_data_in_leaf'] = 10
    params['class_weight'] = 'balanced'
    
    #training the model
    clf=lgb.train(params,d_train,500)
    
    #prediction on the test dataset
    y_pred_1=clf.predict(X_test)
    
    #argmax() method 
    y_pred_1 = [np.argmax(line) for line in y_pred_1]
    
    #using precision score for error metrics
    score = precision_score(y_pred_1,y_test,average=None).mean()
    
    matrix = confusion_matrix(y_test, y_pred_1)
    
    report = classification_report(y_test, y_pred_1)
    
    if output == 'score':
        return score
    
    elif output == 'matrix':
        return matrix
    
    elif output == 'report':
        return report
    else:
        return score

In [3]:
from pybaseball import playerid_lookup

In [4]:
playerid_lookup('verlander','justin')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,verlander,justin,434378,verlj001,verlaju01,8700,2005.0,2020.0


In [5]:
pitch_classification(434378, 'score')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Gathering Player Data


0.5382811821755901

In [7]:
pitch_classification(622491, 'score')

Gathering Player Data


0.5178694932553993

In [14]:
#the rest of the notebook is me calculating the precision scores for 16 players and adding the data to a df
playerid_lookup('Scherzer', 'Max')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,scherzer,max,453286,schem001,scherma01,3137,2008.0,2020.0


In [15]:
ms_score = pitch_classification(453286, 'score')

Gathering Player Data


In [16]:
names = []
prec_scores = []
adj_scores = []

In [17]:
names.append('Max Scherzer')
prec_scores.append(ms_score)
adj_scores.append(ms_score/0.25)

In [18]:
playerid_lookup('cole', 'gerrit')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,cole,gerrit,543037,coleg001,colege01,13125,2013.0,2020.0


In [19]:
gc_score = pitch_classification(543037, 'score')

Gathering Player Data


In [20]:
names.append('Gerrit Cole')
prec_scores.append(gc_score)
adj_scores.append(gc_score/(1/3))

In [21]:
playerid_lookup('glasnow','tyler')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,glasnow,tyler,607192,glast001,glasnty01,14374,2016.0,2020.0


In [22]:
tg_score = pitch_classification(607192, 'score')

Gathering Player Data


In [23]:
names.append('Tyler Glasnow')
prec_scores.append(tg_score)
adj_scores.append(tg_score/0.5)

In [24]:
playerid_lookup('bumgarner','madison')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,bumgarner,madison,518516,bumgm001,bumgama01,5524,2009.0,2020.0


In [25]:
mb_score = pitch_classification(518516, 'score')

Gathering Player Data


In [26]:
names.append('Madison Bumgarner')
prec_scores.append(mb_score)
adj_scores.append(mb_score/(1/3))

In [27]:
playerid_lookup('paddack','chris')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,paddack,chris,663978,paddc001,paddach01,20099,2019.0,2020.0


In [28]:
cp_score = pitch_classification(663978, 'score')

Gathering Player Data


In [29]:
names.append('Chris Paddack')
prec_scores.append(cp_score)
adj_scores.append(cp_score/(1/3))

In [30]:
playerid_lookup('degrom','jacob')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,degrom,jacob,594798,degrj001,degroja01,10954,2014.0,2020.0


In [31]:
jd_score = pitch_classification(594798, 'score')

Gathering Player Data


In [32]:
names.append('Jacob DeGrom')
prec_scores.append(jd_score)
adj_scores.append(jd_score/(1/3))

In [33]:
playerid_lookup('verlander','justin')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,verlander,justin,434378,verlj001,verlaju01,8700,2005.0,2020.0


In [34]:
jv_score = pitch_classification(434378, 'score')

Gathering Player Data


In [35]:
names.append('Justin Verlander')
prec_scores.append(jv_score)
adj_scores.append(jv_score/(1/3))

In [36]:
playerid_lookup('hendricks','kyle')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,hendricks,kyle,-1,,,-1,,
1,hendricks,kyle,543294,hendk001,hendrky01,12049,2014.0,2020.0


In [37]:
kh_score = pitch_classification(543294, 'score')

Gathering Player Data


In [38]:
names.append('Kyle Hendricks')
prec_scores.append(kh_score)
adj_scores.append(kh_score/0.25)

In [39]:
playerid_lookup('berrios','jose')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,berrios,jose,621244,berrj001,berrijo01,14168,2016.0,2020.0


In [40]:
jb_score = pitch_classification(621244, 'score')

Gathering Player Data


In [41]:
names.append('Jose Berrios')
prec_scores.append(jb_score)
adj_scores.append(jb_score/0.25)

In [42]:
playerid_lookup('odorizzi','jake')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,odorizzi,jake,543606,odorj001,odorija01,6397,2012.0,2019.0


In [43]:
jo_score = pitch_classification(543606, 'score')

Gathering Player Data


In [44]:
names.append('Jake Odorizzi')
prec_scores.append(jo_score)
adj_scores.append(jo_score/(1/3))

In [45]:
playerid_lookup('kershaw','clayton')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,kershaw,clayton,477132,kersc001,kershcl01,2036,2008.0,2019.0


In [46]:
ck_score = pitch_classification(477132, 'score')

Gathering Player Data


In [47]:
names.append('Clayton Kershaw')
prec_scores.append(ck_score)
adj_scores.append(ck_score/(1/3))

In [48]:
playerid_lookup('paxton','james')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,paxton,james,572020,paxtj001,paxtoja01,11828,2013.0,2020.0


In [49]:
jp_score = pitch_classification(572020,'score')

Gathering Player Data


In [50]:
names.append('James Paxton')
prec_scores.append(jp_score)
adj_scores.append(jp_score/(1/3))

In [51]:
playerid_lookup('snell','blake')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,snell,blake,605483,snelb001,snellbl01,13543,2016.0,2020.0


In [52]:
bs_score = pitch_classification(605483,'score')

Gathering Player Data


In [53]:
names.append('Blake Snell')
prec_scores.append(bs_score)
adj_scores.append(bs_score/(1/3))

In [54]:
playerid_lookup('nola','aaron')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,nola,aaron,605400,nolaa001,nolaaa01,16149,2015.0,2020.0


In [55]:
an_score = pitch_classification(605400,'score')

Gathering Player Data


In [56]:
names.append('Aaron Nola')
prec_scores.append(an_score)
adj_scores.append(an_score/0.25)

In [57]:
playerid_lookup('flaherty','jack')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,flaherty,jack,-1,,,-1,,
1,flaherty,jack,-1,,,-1,,
2,flaherty,jack,656427,flahj002,flaheja01,17479,2017.0,2020.0


In [58]:
jf_score = pitch_classification(656427, 'score')

Gathering Player Data


In [59]:
names.append('Jack Flaherty')
prec_scores.append(jf_score)
adj_scores.append(jf_score/0.25)

In [60]:
playerid_lookup('buehler','walker')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,buehler,walker,621111,buehw001,buehlwa01,19374,2017.0,2020.0


In [61]:
wb_score = pitch_classification(621111,'score')

Gathering Player Data


In [62]:
names.append('Walker Buehler')
prec_scores.append(wb_score)
adj_scores.append(wb_score/0.25)

In [63]:
names

['Max Scherzer',
 'Gerrit Cole',
 'Tyler Glasnow',
 'Madison Bumgarner',
 'Chris Paddack',
 'Jacob DeGrom',
 'Justin Verlander',
 'Kyle Hendricks',
 'Jose Berrios',
 'Jake Odorizzi',
 'Clayton Kershaw',
 'James Paxton',
 'Blake Snell',
 'Aaron Nola',
 'Jack Flaherty',
 'Walker Buehler']

In [64]:
data = {'Name':names, 'Precision Score':prec_scores, 'Adjusted Score':adj_scores}

In [66]:
import pandas as pd
df = pd.DataFrame(data)

In [67]:
final_df = df.sort_values('Adjusted Score', ascending = False)

In [68]:
final_df = final_df.reset_index(drop=True)

In [69]:
final_df

Unnamed: 0,Name,Precision Score,Adjusted Score
0,Blake Snell,0.730963,2.192889
1,Jose Berrios,0.541334,2.165335
2,Aaron Nola,0.526152,2.104608
3,Clayton Kershaw,0.69463,2.083889
4,Chris Paddack,0.623755,1.871265
5,Walker Buehler,0.446816,1.787264
6,Kyle Hendricks,0.430539,1.722157
7,Jake Odorizzi,0.55278,1.658339
8,Justin Verlander,0.538281,1.614844
9,Max Scherzer,0.396749,1.586996
