In [1]:
#Code for if using Google Colab

!pip -q install pybaseball
!pip -q install MLB-StatsAPI


[K     |████████████████████████████████| 415 kB 5.1 MB/s 
[K     |████████████████████████████████| 291 kB 48.6 MB/s 
[K     |████████████████████████████████| 856 kB 50.9 MB/s 
[K     |████████████████████████████████| 69 kB 2.7 MB/s 
[?25h  Building wheel for pymrmr (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0,'') #You may need to include the path to where these folders are stored if using Colab
import BaeBall as bb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("error")
import numpy.random
import statsapi
import os
import datetime
from sklearn.preprocessing import MinMaxScaler


In [11]:

day=datetime.date(year=2022,month=8,day=14)
DoubleThresh=0

#Previously generated training data. See "train_data_generator.ipynb"
MatchupTrainData=pd.read_csv('Matchup_TrainingData.csv')
NoMatchupTrainData=pd.read_csv('NoMatchup_TrainingData.csv')


#Features of interest for our two models
MatchupCols=['AVG', 'OBP', 'SLG',
       'Contact% (pi)', 'Home', 'MatchupAverage',
       'BallparkNumber', 'era', 'h9', 'k9', 'whip', 'avg',
        'Game 1', 'Game 2', 'Game 3', 'Game 4', 'Game 5',]
NoMatchupCols=['AVG', 'OBP', 'SLG',
       'Contact% (pi)', 'Home', 
       'BallparkNumber', 'era', 'h9', 'k9', 'whip', 'avg',
        'Game 1', 'Game 2', 'Game 3', 'Game 4', 'Game 5',]

SelectedMatchup=MatchupTrainData[MatchupCols]
SelectedNoMatchup=NoMatchupTrainData[NoMatchupCols]

#Scaling features
Scaler=MinMaxScaler()
ScaledSelectedMatchup=pd.DataFrame(Scaler.fit_transform(SelectedMatchup),columns=MatchupCols)
Scaler=MinMaxScaler()
ScaledSelectedNoMatchup=pd.DataFrame(Scaler.fit_transform(SelectedNoMatchup),columns=NoMatchupCols)

#Training models
Matchup_Model=LogisticRegression(solver='saga',max_iter=4000,random_state=12)
TrainedMatchupModel=Matchup_Model.fit(ScaledSelectedMatchup,MatchupTrainData['TestLabels'])
NoMatchup_Model=LogisticRegression(solver='saga',max_iter=4000,random_state=12)
TrainedNoMatchupModel=NoMatchup_Model.fit(ScaledSelectedNoMatchup,NoMatchupTrainData['TestLabels'])

#Getting test data for date of interest
MatchupTestData,matchup_log,matchup_progress_dfs=bb.get_feature_matrix(number_of_batters=200,date=str(day),
                                                number_of_games=5,GetMatchupValues=1,train=0)
NoMatchupTestData,nomatch_uplog,nomatchup_progress_dfs=bb.get_feature_matrix(number_of_batters=200,date=str(day),
                                                  number_of_games=5,GetMatchupValues=0,train=0)

#Scaling test data
Scaler=MinMaxScaler()
ScaledMatchupTestData=pd.DataFrame(Scaler.fit_transform(MatchupTestData[MatchupCols]),columns=MatchupCols)
Scaler=MinMaxScaler()
ScaledNoMatchupTestData=pd.DataFrame(Scaler.fit_transform(NoMatchupTestData[NoMatchupCols]),columns=NoMatchupCols)

#Predicting using models
MatchupProbs=TrainedMatchupModel.predict_proba(ScaledMatchupTestData)[:,1]
NoMatchupProbs=TrainedNoMatchupModel.predict_proba(ScaledNoMatchupTestData)[:,1]

#Creating prediction dataframes. These list probabilities for all n players in the test data
MatchupDF=pd.DataFrame()
MatchupDF['Players']=MatchupTestData.Name.values
MatchupDF['Team']=MatchupTestData.Team.values
MatchupDF['Probabilities']=MatchupProbs

MatchupDF=MatchupDF.sort_values(by='Probabilities',ascending=False)

NoMatchupDF=pd.DataFrame()
NoMatchupDF['Players']=NoMatchupTestData.Name.values
NoMatchupDF['Team']=NoMatchupTestData.Team.values
NoMatchupDF['Probabilities']=NoMatchupProbs

NoMatchupDF=NoMatchupDF.sort_values(by='Probabilities',ascending=False)

CombinedDF=pd.concat((MatchupDF.iloc[0:10,:],NoMatchupDF.iloc[0:10,:]),axis=0,keys=['Matchup','No Matchup'])
CombinedDF=CombinedDF.sort_values(by='Probabilities',ascending=False)
CombinedDF=CombinedDF.drop_duplicates(subset='Players',keep='first')
CombinedDF=CombinedDF.droplevel(level=1)
CombinedDF.index.name="Model"

## **Batter Predictions**

In [13]:
MatchupDF.iloc[[0,1],:] #Top 2 hit probabilities when considering only batter with previous records against the starting pitcher  

Unnamed: 0,Players,Team,Probabilities
3,Jose Ramirez,CLE,0.88129
2,Paul Goldschmidt,STL,0.875941


In [14]:
NoMatchupDF.iloc[[0,1],:] #Top 2 hit probabilities without considering batter-pitcher matchup stats

Unnamed: 0,Players,Team,Probabilities
40,Luis Arraez,MIN,0.90034
97,Gio Urshela,MIN,0.880166


In [15]:
CombinedDF.iloc[[0,1],:] #Top 2 hit probabilities when combining the above two. 

Unnamed: 0_level_0,Players,Team,Probabilities
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No Matchup,Luis Arraez,MIN,0.90034
Matchup,Jose Ramirez,CLE,0.88129


## **Information about model coefficients for Logistic Regression**

In [8]:
MatchupCoeffs=pd.DataFrame()
MatchupCoeffs['Variables']=list(ScaledMatchupTestData.columns)
MatchupCoeffs['Model Coefficients']=TrainedMatchupModel.coef_[0]
MatchupCoeffs

Unnamed: 0,Variables,Model Coefficients
0,AVG,1.370514
1,OBP,-0.034673
2,SLG,0.224716
3,Contact% (pi),-0.025133
4,Home,0.026346
5,MatchupAverage,0.257208
6,BallparkNumber,-0.203471
7,era,0.507483
8,h9,0.53427
9,k9,0.359055


In [9]:
NoMatchupCoeffs=pd.DataFrame()
NoMatchupCoeffs['Variables']=list(ScaledNoMatchupTestData.columns)
NoMatchupCoeffs['Model Coefficients']=TrainedNoMatchupModel.coef_[0]
NoMatchupCoeffs

Unnamed: 0,Variables,Model Coefficients
0,AVG,0.947196
1,OBP,-0.087931
2,SLG,0.265785
3,Contact% (pi),-0.036098
4,Home,-0.046852
5,BallparkNumber,-0.087237
6,era,0.332885
7,h9,0.528062
8,k9,0.022807
9,whip,-0.656591
