In [None]:
#Install third-party dependencies when running on Google Colab (a fresh Colab runtime does not ship them).
#`-q` keeps pip's output short. These are shell escapes (`!`), so they only work inside IPython/Jupyter.
#NOTE(review): MatchupSheet/feature code is not visible here; pybaseball is not imported directly by this
#notebook, so it is presumably needed by the local BaeBall module - confirm. MLB-StatsAPI presumably
#provides the `statsapi` module imported in the next cell. Versions are unpinned, so reruns may drift.
!pip -q install pybaseball
!pip -q install MLB-StatsAPI


[?25l[K     |▉                               | 10 kB 26.5 MB/s eta 0:00:01[K     |█▋                              | 20 kB 8.4 MB/s eta 0:00:01[K     |██▍                             | 30 kB 11.8 MB/s eta 0:00:01[K     |███▏                            | 40 kB 4.6 MB/s eta 0:00:01[K     |████                            | 51 kB 4.6 MB/s eta 0:00:01[K     |████▊                           | 61 kB 5.4 MB/s eta 0:00:01[K     |█████▌                          | 71 kB 6.0 MB/s eta 0:00:01[K     |██████▎                         | 81 kB 5.6 MB/s eta 0:00:01[K     |███████                         | 92 kB 6.2 MB/s eta 0:00:01[K     |███████▉                        | 102 kB 5.3 MB/s eta 0:00:01[K     |████████▊                       | 112 kB 5.3 MB/s eta 0:00:01[K     |█████████▌                      | 122 kB 5.3 MB/s eta 0:00:01[K     |██████████▎                     | 133 kB 5.3 MB/s eta 0:00:01[K     |███████████                     | 143 kB 5.3 MB/s eta 0:00:01[K   

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0,'/content/drive/MyDrive/BeatingTheStreak/2022Finalized')
import BaeBall as bb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("error")
import numpy.random
import statsapi
import os
import datetime
from sklearn.preprocessing import MinMaxScaler


In [None]:
#Backtest window. The main loop starts at `startdate` and runs while day<enddate,
#so `enddate` itself is not evaluated.
startdate=datetime.date(year=2022,month=4,day=7)
enddate=datetime.date(year=2022,month=8,day=9)
day=startdate #loop cursor; advanced one calendar day at a time

#Threshold for "doubling up": the second-ranked batter is only taken when his
#predicted probability is strictly greater than this value; otherwise only the top pick is used.
#Left at 0 (i.e. always double up) on the author's belief that doubling is always best -
#this can be tested / theoretically proven.
DoubleThresh=0


#Feature columns fed to each model. The no-matchup model uses exactly the same
#features minus the batter-vs-pitcher history column.
MatchupCols=['AVG', 'OBP', 'SLG',
       'Contact% (pi)', 'Home', 'MatchupAverage',
       'BallparkNumber', 'era', 'h9', 'k9', 'whip', 'avg',
        'Game 1', 'Game 2', 'Game 3', 'Game 4', 'Game 5',]
NoMatchupCols=[Col for Col in MatchupCols if Col!='MatchupAverage']

#Load the two large training sheets (produced by "train_data_generator.ipynb").
MatchupTrainData=pd.read_csv('/content/drive/MyDrive/BeatingTheStreak/NewFinalizedTraining/FinalizedTrainingData_UpdatedMatchup.csv')
NoMatchupTrainData=pd.read_csv('/content/drive/MyDrive/BeatingTheStreak/NewFinalizedTraining/FinalizedTrainingData_NoMatchup.csv')

#Two models are trained: one only on batter/pitcher pairs that have faced each other before,
#and one that ignores matchup history, because a previous matchup does not always exist.
#Both share the same logistic-regression settings.
LogRegSettings=dict(solver='saga',max_iter=4000,random_state=12)

#--- Matchup model: keep only the selected features, min-max scale them, fit ---
SelectedMatchup=MatchupTrainData.loc[:,MatchupCols]
Scaler=MinMaxScaler()
MatchupScaledValues=Scaler.fit_transform(SelectedMatchup)
ScaledSelectedMatchup=pd.DataFrame(MatchupScaledValues,columns=MatchupCols)
Matchup_Model=LogisticRegression(**LogRegSettings)
TrainedMatchupModel=Matchup_Model.fit(ScaledSelectedMatchup,MatchupTrainData['TestLabels'])

#--- No-matchup model: same steps with a fresh scaler ---
#(`Scaler` is rebound here, so the scaler fitted on the matchup data is not kept.)
SelectedNoMatchup=NoMatchupTrainData.loc[:,NoMatchupCols]
Scaler=MinMaxScaler()
NoMatchupScaledValues=Scaler.fit_transform(SelectedNoMatchup)
ScaledSelectedNoMatchup=pd.DataFrame(NoMatchupScaledValues,columns=NoMatchupCols)
NoMatchup_Model=LogisticRegression(**LogRegSettings)
TrainedNoMatchupModel=NoMatchup_Model.fit(ScaledSelectedNoMatchup,NoMatchupTrainData['TestLabels'])

#Running streak counters, one per strategy. "BestOf" is a basic ensemble: the picks are the
#two highest probabilities found across both the matchup and no-matchup models.
CurrentBestof=CurrentMatchupStreak=CurrentNoMatchupStreak=0

#Longest streak recorded so far for each strategy
BestOfBest=MatchupBest=NoMatchupBest=0

#Empty frames that will collect information about each strategy's predictions
MatchupResults,NoMatchupResults,BestOfResults=(pd.DataFrame() for _ in range(3))

#Folders holding the per-date feature sheets saved/reused by the backtest
MatchupFeatSheetDir='/content/drive/MyDrive/BeatingTheStreak/FeatSheetsFor2022Season/Matchup'
NoMatchupFeatSheetDir='/content/drive/MyDrive/BeatingTheStreak/FeatSheetsFor2022Season/NoMatchup'


#Helpers shared by the three strategies (matchup, no-matchup, best-of). The per-strategy
#logic used to be copy-pasted three times, which let the copies drift apart.

def _rank_predictions(test_data,probs):
  """Build one day's prediction table, most likely candidate first.

  test_data: feature sheet for a single date; its `Name` and `TestLabels` columns are read.
  probs: one predicted probability per row of `test_data`, in the same row order.
  Returns a DataFrame with columns Players, Probabilities, GroundTruths sorted by
  descending Probabilities.
  """
  ranked=pd.DataFrame()
  ranked['Players']=test_data.Name.values
  ranked['Probabilities']=probs
  ranked['GroundTruths']=test_data.TestLabels.values
  return ranked.sort_values(by='Probabilities',ascending=False)


def _update_streak(ranked,streak,best,double_thresh):
  """Apply one day's pick(s) to a running streak.

  ranked: table from `_rank_predictions` (or the combined best-of table), best pick first.
  streak / best: current streak and longest streak before this day.
  double_thresh: the second pick is only taken when its probability is > this value.
  Returns (streak, best, doubled) where `doubled` tells whether a second pick was taken.

  Rules: a truthy label on the top pick adds 1; if a second pick is taken, a truthy label adds
  another 1 and a falsy one resets the streak to 0; a falsy label on the top pick resets to 0.
  """
  #Guard on the row count so a one-row table cannot raise when looking at the second pick.
  doubled=len(ranked)>1 and ranked['Probabilities'].iloc[1]>double_thresh
  if ranked['GroundTruths'].iloc[0]:
    streak+=1
    if doubled:
      if ranked['GroundTruths'].iloc[1]:
        streak+=1
      else:
        streak=0
    best=max(best,streak)
  else:
    streak=0
  return streak,best,doubled


def _log_picks(results,ranked,date_str,streak,doubled):
  """Append the day's pick rows (top pick, plus the second pick when `doubled`) to `results`.

  Each appended row carries Date, Players, Probabilities, GroundTruths and the streak value
  after the whole day was scored. Returns the extended DataFrame; `results` is not modified.
  """
  for position in range(2 if doubled else 1):
    row=ranked.iloc[position,:].copy().to_frame().T
    row.insert(0,'Date',date_str)
    row['Streak']=streak
    #Row-wise concat is quadratic, but the backtest only logs a couple of rows per day.
    results=pd.concat((results,row),axis='rows',ignore_index=True)
  return results


while day<enddate:
  DateStr=str(day)
  #Advance the cursor right away so that every early `continue` below moves on to the next date.
  day=day+datetime.timedelta(days=1)

  #Get MLB schedule for the given date. If it's empty, continue.
  CurrentSched=statsapi.schedule(date=DateStr)
  if len(CurrentSched)==0:
    continue

  MatchupSheetPath=os.path.join(MatchupFeatSheetDir,DateStr+'__Matchup.csv')
  NoMatchupSheetPath=os.path.join(NoMatchupFeatSheetDir,DateStr+'__NoMatchup.csv')

  #Read the existing feature sheets if we've created them before.
  #This saves a lot of time rather than regathering all the data from every source.
  if os.path.exists(MatchupSheetPath) and os.path.exists(NoMatchupSheetPath):
    MatchupTestData=pd.read_csv(MatchupSheetPath)
    NoMatchupTestData=pd.read_csv(NoMatchupSheetPath)
  else:
    #Rare for errors to be thrown here, but sometimes exhibition matches can mess with things, etc.
    #Still best-effort (the date is skipped), but the reason is reported and
    #KeyboardInterrupt/SystemExit are no longer swallowed as they were by the bare `except:`.
    try:
      MatchupTestData,_,_=bb.get_feature_matrix(number_of_batters=200,date=DateStr,
                                                number_of_games=5,GetMatchupValues=1)
      NoMatchupTestData,_,_=bb.get_feature_matrix(number_of_batters=200,date=DateStr,
                                                  number_of_games=5,GetMatchupValues=0)
    except Exception as err:
      print(" Date: %s \n Skipped: could not build feature matrices (%r) \n" % (DateStr,err))
      continue

    #Saving our feature matrices so we do not need to regenerate on subsequent runs.
    #A failed cache write (e.g. missing folder) must not throw away data we already have,
    #so it is handled separately from the fetch above.
    try:
      os.makedirs(MatchupFeatSheetDir,exist_ok=True)
      os.makedirs(NoMatchupFeatSheetDir,exist_ok=True)
      MatchupTestData.to_csv(MatchupSheetPath)
      NoMatchupTestData.to_csv(NoMatchupSheetPath)
    except OSError as err:
      print(" Date: %s \n Warning: could not cache feature sheets (%r) \n" % (DateStr,err))

  #Can occur in rare cases for various reasons. Both sheets are checked because either
  #one being empty would make the scaler below fail.
  if len(MatchupTestData)==0 or len(NoMatchupTestData)==0:
    print(" Date: %s \n Skipped: empty feature sheet \n" % DateStr)
    continue

  #Scaling data
  #NOTE(review): the scalers are re-fitted on each day's test rows instead of reusing the
  #scalers fitted on the training data, so test features are scaled differently from what the
  #models were trained on. Left unchanged because it alters every prediction - confirm intent.
  Scaler=MinMaxScaler()
  ScaledMatchupTestData=pd.DataFrame(Scaler.fit_transform(MatchupTestData[MatchupCols]),columns=MatchupCols)
  Scaler=MinMaxScaler()
  ScaledNoMatchupTestData=pd.DataFrame(Scaler.fit_transform(NoMatchupTestData[NoMatchupCols]),columns=NoMatchupCols)

  #Predicting and storing predictions (column 1 of predict_proba = probability of the positive class)
  MatchupProbs=TrainedMatchupModel.predict_proba(ScaledMatchupTestData)[:,1]
  NoMatchupProbs=TrainedNoMatchupModel.predict_proba(ScaledNoMatchupTestData)[:,1]
  MatchupDF=_rank_predictions(MatchupTestData,MatchupProbs)
  NoMatchupDF=_rank_predictions(NoMatchupTestData,NoMatchupProbs)

  #Matchup strategy
  CurrentMatchupStreak,MatchupBest,Doubled=_update_streak(MatchupDF,CurrentMatchupStreak,MatchupBest,DoubleThresh)
  MatchupResults=_log_picks(MatchupResults,MatchupDF,DateStr,CurrentMatchupStreak,Doubled)

  #No-matchup strategy. Whether its second pick is logged now depends on its OWN second
  #probability (previously the matchup model's probability was checked by mistake).
  CurrentNoMatchupStreak,NoMatchupBest,Doubled=_update_streak(NoMatchupDF,CurrentNoMatchupStreak,NoMatchupBest,DoubleThresh)
  NoMatchupResults=_log_picks(NoMatchupResults,NoMatchupDF,DateStr,CurrentNoMatchupStreak,Doubled)

  #Best-of strategy: pool the top 10 of each model and rank them together.
  CombinedDF=pd.concat((MatchupDF.iloc[0:10,:],NoMatchupDF.iloc[0:10,:]),axis=0,ignore_index=True)
  CombinedDF=CombinedDF.sort_values(by='Probabilities',ascending=False)
  #drop_duplicates returns a new frame; the result used to be discarded, so the same player
  #could fill both pick slots and a single hit would be counted twice.
  CombinedDF=CombinedDF.drop_duplicates(subset='Players',ignore_index=True,keep='first')
  CurrentBestof,BestOfBest,Doubled=_update_streak(CombinedDF,CurrentBestof,BestOfBest,DoubleThresh)
  BestOfResults=_log_picks(BestOfResults,CombinedDF,DateStr,CurrentBestof,Doubled)

  print(" Date: %s \n Current Streaks: MATCHUP = %s NO_MATCHUP = %s BEST_OF = %s \n Best Streaks: MATCHUP = %s NO_MATCHUP = %s BEST_OF = %s \n" % (DateStr,CurrentMatchupStreak,CurrentNoMatchupStreak,CurrentBestof,MatchupBest,NoMatchupBest,BestOfBest))