<a href="https://colab.research.google.com/github/jackl14/Baseball-Prediction-Project/blob/main/MLB_Predictor_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy.stats import binomtest
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup
import re
from contextlib import redirect_stdout
import io
import sys


# Create a StringIO object to capture stdout so that tensorflow doesn't print extra outputs
stopper = io.StringIO()



mlb_teams = {
    'Arizona Diamondbacks': 'ARI',
    'Atlanta Braves': 'ATL',
    'Baltimore Orioles': 'BAL',
    'Boston Red Sox': 'BOS',
    'Chicago White Sox': 'CHW',
    'Chicago Cubs': 'CHC',
    'Cincinnati Reds': 'CIN',
    'Cleveland Guardians': 'CLE',
    'Colorado Rockies': 'COL',
    'Detroit Tigers': 'DET',
    'Houston Astros': 'HOU',
    'Kansas City Royals': 'KCR',
    'Los Angeles Angels': 'LAA',
    'Los Angeles Dodgers': 'LAD',
    'Miami Marlins': 'MIA',
    'Milwaukee Brewers': 'MIL',
    'Minnesota Twins': 'MIN',
    'New York Yankees': 'NYY',
    'New York Mets': 'NYM',
    'Oakland Athletics': 'OAK',
    'Philadelphia Phillies': 'PHI',
    'Pittsburgh Pirates': 'PIT',
    'San Diego Padres': 'SDP',
    'San Francisco Giants': 'SFG',
    'Seattle Mariners': 'SEA',
    'St. Louis Cardinals': 'STL',
    'Tampa Bay Rays': 'TBR',
    'Texas Rangers': 'TEX',
    'Toronto Blue Jays': 'TOR',
    'Washington Nationals': 'WSN'
}


def extract_numbers(string):
    numbers = re.findall(r'\d+', string)
    if len(numbers) >= 2:
        return int(numbers[0]), int(numbers[1])
    else:
        return None, None

def readInHistoricalData():
  df = pd.read_csv('/content/MLBDataframeWithRatings.csv')

  #Getting rid of faulty entries
  df.dropna(inplace=True)
  df = df.drop(df[df.apply(lambda row: row.astype(str).str.contains('Not Found').any(), axis=1)].index)

  return df

def get_last_ratings(df_historical):
    df = df_historical
    ratings = {}

    # Get unique team codes from the dataframe
    teams = mlb_teams.values()

    # Iterate over unique team codes to get the last rating for each team
    for team_code in teams:
      found = False
      i = 1

      while found == False:
        i = i + 1
        j = len(df) - i

        try:

          if df.loc[j,"Home_ID"] == team_code:
            ratings[team_code] = df.loc[j,"Home Rating"]
            found = True
          elif df.loc[j,"Away_ID"] == team_code:
            ratings[team_code] = df.loc[j,"Away Rating"]
            found = True
        except KeyError:
          pass

    return ratings

def regressLastYearRatingsToMean():
  for key in ratings:
    ratings[key] = ratings[key] - (ratings[key] - 1500) / 2



def scrapeLatestData():

  #Scrapes 2024 game outcomes
  r=requests.get("https://www.baseball-reference.com/leagues/majors/2024-schedule.shtml")
  parsing = BeautifulSoup(r.text, 'html.parser')

  #Finds game entries
  games = parsing.find_all('p', class_='game')

  #Initialize lists
  home_teams = []
  away_teams = []
  home_scores = []
  away_scores = []

  #Loop through games
  for game in games:

      #Extract team names
      teams = game.find_all('a')
      home_team = teams[0].text.strip()
      away_team = teams[1].text.strip()

      #Handle cases where game is in the future
      gtext = game.text
      if ':' not in gtext:

        #Get scores
        home_score, away_score = extract_numbers(gtext)
        #Add to lists
        home_teams.append(home_team)
        away_teams.append(away_team)
        home_scores.append(home_score)
        away_scores.append(away_score)

  # Create a Pandas DataFrame
  data = {
      'Home Team': home_teams,
      'Away Team': away_teams,
      'R Home': home_scores,
      'R Away': away_scores
  }

  df = pd.DataFrame(data)

  #Get rid of faulty rows
  df.dropna(inplace=True)

  return df

def expectation(elo1,elo2): #That player 1 wins
    #Elo equation
    expectation = (1+10**((elo2-elo1)/400))**-1

    return expectation

def update_rating(expectation,player_rating,result,score1,score2):
    k = 20
    normal_win_factor = 3.55 #This is the average value that a team wins a baseball game by
    #Rating updater
    blowout_proportion = abs(score1-score2) # Swings elo changes more in blowouts
    normalized_blowout = blowout_proportion  / normal_win_factor
    normalized_blowout = normalized_blowout ** 0.5 #Square root maps it closer to 1

    return player_rating + k * normalized_blowout * (result - expectation)

def resolveOneGame(i,df):

    #Look at home and away teams
    home = df.loc[i,'Home Team']
    if home == "Arizona D'Backs":
      home = "Arizona Diamondbacks"

    away = df.loc[i,'Away Team']
    if away == "Arizona D'Backs":
      away = "Arizona Diamondbacks"
    #print(home,away)

    #Get Codes
    home_code = mlb_teams[home]
    away_code = mlb_teams[away]

    #Look up their ratings
    helo = ratings[home_code]
    aelo = ratings[away_code]
    exp= expectation(helo,aelo)
    #print(aelo,helo,exp)

    #Look up the game scores
    homescore = df.loc[i,'R Home']
    awayscore = df.loc[i,'R Away']

    #Stores their ratings from before (potentially usefull later)
    bhratings = ratings[home_code]
    baratings = ratings[away_code]

    #Updates ratings based on game outcome
    if homescore > awayscore:
        ratings[home_code] = update_rating(exp,helo,1,homescore,awayscore)
        ratings[away_code] = update_rating(1-exp,aelo,0,homescore,awayscore)
    elif homescore < awayscore:
        ratings[home_code] = update_rating(exp,helo,0,homescore,awayscore)
        ratings[away_code] = update_rating(1-exp,aelo,1,homescore,awayscore)
    else:
        ratings[home_code] = update_rating(exp,helo,0.5,homescore,awayscore)
        ratings[away_code] = update_rating(1-exp,aelo,0.5,homescore,awayscore)

    #Not strictly necessary but I used it when scraping the 2023 data
    return bhratings, baratings

def resolve_all_games(df_new):
  for i in range(len(df_new)):
    resolveOneGame(i,df_new)


def trainNueralNet(df):

  #Convert dataframe data into y data
  y_tuple = np.array([list(row) for row in df[['R Home', 'R Away']].values])

  #Doing the same for x
  x_tuple = np.array([list(row) for row in df[['Home Rating', 'Away Rating', 'HomePitcherERA', 'AwayPitcherERA']].values.astype(float)])

  #Train Test Split
  ts = 0.2
  rs = 42
  x_train, x_test, y_train, y_test = train_test_split(x_tuple, y_tuple, test_size=ts, random_state=rs)

  #Global
  global model
  model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation='relu', input_shape=(4,)),  # Input layer
        tf.keras.layers.Dense(16, activation='relu'),  # Hidden layer
        tf.keras.layers.Dense(2)  # Output layer with 2 neurons for 2 output features
    ])

  # Compile the model
  model.compile(optimizer='adam', loss=tf.keras.losses.Huber(), metrics=['mae'])

  # Train the model
  history = model.fit(x_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

  # Evaluate the model
  loss, mae = model.evaluate(x_test, y_test)

  print(f"Test Loss: {loss}")
  print(f"Test MAE: {mae}")

  return x_train, x_test, y_train, y_test

def getNNpreds(home_rating, away_rating, homeERA, awayERA):
  X = np.array([[home_rating, away_rating, homeERA, awayERA]])
  #Blocks a print line
  with redirect_stdout(stopper):
    #Makes predictions
    prediction = model.predict(X)
  return prediction

def visualizeStats(x_test, y_test):

  #Initialize counters
  home_win_pred = 0
  home_win_wrong = 0
  away_win_pred = 0
  away_win_wrong = 0

  high_pred = 0
  high_wrong = 0
  low_pred = 0
  low_wrong = 0

  #Loop through test data and update counters
  for i in range(len(y_test)):

    #Home vs away
    pred = getNNpreds(x_test[i][0],x_test[i][1],x_test[i][2],x_test[i][3])
    if y_test[i][0] > y_test[i][1]:
      #Home win
      if pred[0][0] > pred[0][1]:
        #Home predicted to win
        home_win_pred += 1
      else:
        home_win_wrong += 1
    else:
      #Away win
      if pred[0][0] < pred[0][1]:
        #Away predicted to win
        away_win_pred += 1
      else:
        away_win_wrong += 1

    #High scoring vs low scoring
    true_score = y_test[i][0] + y_test[i][1]
    pred_score = pred[0][0] + pred[0][1]

    if true_score > 8.5:
      if pred_score > 8.5:
        high_pred +=1
      else:
        high_wrong += 1
    else:
      if pred_score < 8.5:
        low_pred += 1
      else:
        low_wrong += 1

    #Give feedback on how long it might take
    if i%100 == 0:
      print(f"The model has run prediction {i} out of {len(y_test)}")

  #Plotting
  confusion_matrix_ha = np.array([[home_win_pred, home_win_wrong],
                            [away_win_wrong, away_win_pred]])
  plt.imshow(confusion_matrix_ha, interpolation='nearest', cmap=plt.cm.Blues)
  plt.title('Confusion Matrix (Home vs away)')
  plt.colorbar()
  plt.xlabel('Winner prediction: Home (left) vs Away (right)')
  plt.ylabel('True Outcome: Home Win (top) vs Away Win (bottom)')
  for i in range(2):
      for j in range(2):
          plt.text(j, i, str(confusion_matrix_ha[i][j]), ha='center', va='center', color='orange')

  plt.show()

  confusion_matrix_hl = np.array([[high_pred, high_wrong],
                            [low_wrong,low_pred]])
  plt.imshow(confusion_matrix_hl, interpolation='nearest', cmap=plt.cm.Blues)
  plt.title('Confusion Matrix (High vs Low Scoring)')
  plt.colorbar()
  plt.xlabel('Score prediction: High (left) vs Low (right)')
  plt.ylabel('True Outcome: High (top) vs Low (bottom)')
  for i in range(2):
      for j in range(2):
          plt.text(j, i, str(confusion_matrix_hl[i][j]), ha='center', va='center', color='orange')

  plt.show()


  #Return how many games the model successfuly predicted
  return home_win_pred + away_win_pred, home_win_wrong + away_win_wrong

def binomialTest(right,wrong):
  # Total number of trials
  n_trials = right + wrong

  # Probability of success for a fair coin flip (0.5)
  p_null = 0.5

  # Perform binomial test
  result = binomtest(right, n_trials, p_null)
  p_value = result.pvalue

  # Print the p-value
  print(f"The p-value for the binomial test is: {p_value}")

  # Check if the process guesses better than a coin flip
  alpha = 0.05  # significance level
  if p_value < alpha:
      print("The nueral net guesses the winner better than a coin flip (reject the null hypothesis).")
  else:
      print("The nueral net does cannot be said to guess better than a coin flip (fail to reject the null hypothesis at a significance level of 0.05).")


def promptForData():
  try:
    #Take inputs
    home_team = input("Enter the home team: ")
    home_id = mlb_teams[home_team]
    away_team = input("Enter the away team: ")
    away_id = mlb_teams[away_team]
    home_era = float(input("Enter the ERA of the home pitcher: "))
    away_era = float(input("Enter the ERA of the away pitcher: "))
    raway = float(ratings[away_id])
    rhome = float(ratings[home_id])

    #Printing
    print(f"{home_team}: {rhome} {away_team}: {raway}")
    #Finding probability of a home win
    prob = expectation(rhome,raway)
    print(f"According to the elos, {home_team} have a {prob} chance of winning while {away_team} have a {1 - prob} chance.")
    prediction = getNNpreds(rhome, raway, home_era, away_era)
    print(f"{home_team}: {prediction[0][0]}")
    print(f"{away_team}: {prediction[0][1]}")

  except Exception as e:
    # Handle the user entering something that would cause an error
    print(f"An error occurred: {e}")
  print()


def main():
  df_hist = readInHistoricalData()

  global ratings

  ratings = get_last_ratings(df_hist)
  regressLastYearRatingsToMean()
  newDf = scrapeLatestData()
  resolve_all_games(newDf)
  x_train, x_test, y_train, y_test = trainNueralNet(df_hist)
  right, wrong = visualizeStats(x_test, y_test)
  binomialTest(right,wrong)
  while True:
    promptForData()

main()



Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 319.4264 - mae: 319.9258 - val_loss: 32.0255 - val_mae: 32.5209
Epoch 2/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 15.1401 - mae: 15.6309 - val_loss: 4.9835 - val_mae: 5.4672
Epoch 3/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.1968 - mae: 5.6770 - val_loss: 4.4842 - val_mae: 4.9606
Epoch 4/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.3562 - mae: 4.8302 - val_loss: 3.4722 - val_mae: 3.9420
Epoch 5/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 3.5331 - mae: 4.0017 - val_loss: 2.7309 - val_mae: 3.1987
Epoch 6/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2.9884 - mae: 3.4553 - val_loss: 2.6809 - val_mae: 3.1402
Epoch 7/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2.7781 - m

KeyboardInterrupt: Interrupted by user