In [1]:
# Import libraries
import datetime
import os
import pickle
import smtplib
import ssl
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import more_itertools as mit
import numpy as np
import pandas as pd
import requests as re
import sqlalchemy
import statsapi
from bs4 import BeautifulSoup
from pyowm.owm import OWM
from pyowm.utils import timestamps
from selenium import webdriver
from selenium.webdriver.common.by import By
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Start the Firefox WebDriver session used by the batting-split scraping cell.
# NOTE(review): no driver.quit() is visible in this notebook, so the browser
# window presumably stays open after a run - confirm and close it if so.
driver = webdriver.Firefox()

In [2]:
# Connect to a local postgres database for weather data.
# The connection URL embeds the database password, so it is read from the
# WEATHER_DB_URL environment variable when that is set; the original local
# default is kept as a fallback so existing setups keep working.
engine = sqlalchemy.create_engine(
    os.environ.get('WEATHER_DB_URL', 'postgresql://postgres:password@localhost:5432/Weather')
)

In [3]:
# Get game info for today's games: one row per scheduled game holding the game
# id, both team ids, both probable starting pitchers and characters 11-15 of the
# game's datetime string (the HH:MM part when the value is an ISO-8601
# timestamp - presumably UTC; TODO confirm against the statsapi docs).
schedule_columns = ['game_id', 'away_id', 'home_id', 'apitcher', 'hpitcher', 'time']
f = pd.DataFrame(
    [
        {
            'game_id': game['game_id'],
            'away_id': game['away_id'],
            'home_id': game['home_id'],
            'apitcher': game['away_probable_pitcher'],
            'hpitcher': game['home_probable_pitcher'],
            'time': game['game_datetime'][11:16],
        }
        for game in statsapi.schedule(date=str(datetime.date.today()))
    ],
    # Passing the columns explicitly keeps the frame's shape even when the
    # schedule is empty.
    columns=schedule_columns,
)

# Keep only games where both probable pitchers are announced. The two
# conditions are combined into ONE mask: chaining f[mask_a][mask_b] applies a
# full-length mask to an already filtered frame, which makes pandas reindex the
# mask and emit a UserWarning.
both_pitchers_known = (f['apitcher'] != '') & (f['hpitcher'] != '')
f = f[both_pitchers_known].reset_index(drop=True)

  # Remove the CWD from sys.path while we load stuff.


In [4]:
# Web-scrape starting pitcher data
def _lookup_pitcher_id(name):
    """Return the player id that statsapi.lookup_player resolves ``name`` to.

    A single match is used as is. With several matches, the last one whose
    primary position abbreviation is 'P' is used (the same choice the original
    loop made).

    Raises:
        LookupError: when no usable match exists. Previously ``pid`` was left
            unbound (NameError on the first pitcher) or silently kept the id of
            the previous pitcher, attaching the wrong stats to the game.
    """
    matches = statsapi.lookup_player(name)
    if len(matches) == 1:
        return matches[0]['id']
    pitcher_ids = [
        player['id']
        for player in matches
        if player['primaryPosition']['abbreviation'] == 'P'
    ]
    if not pitcher_ids:
        raise LookupError(
            'No pitcher found for {!r} ({} lookup matches)'.format(name, len(matches))
        )
    return pitcher_ids[-1]


def _career_pitching_stats(names, suffix):
    """Collect career pitching stats for every pitcher name in ``names``.

    Returns a defaultdict mapping '<stat key><suffix>' to a list with one value
    per pitcher that reported that key, in the order of ``names``.
    """
    stats_by_key = defaultdict(list)
    for name in names:
        pid = _lookup_pitcher_id(name)
        # NOTE(review): ['stats'][0] assumes the API returns at least one career
        # line; a pitcher without one would presumably raise IndexError here.
        career = statsapi.player_stat_data(pid, group='pitching', type='career')['stats'][0]['stats']
        for key, value in career.items():
            stats_by_key[key + suffix].append(value)
    return stats_by_key


# away pitchers
pa = _career_pitching_stats(f['apitcher'], '_away')

# home pitchers
ph = _career_pitching_stats(f['hpitcher'], '_home')

# Keep only the stat keys that have a value for every game row; a key with
# fewer values than rows was missing for at least one pitcher and is discarded.
n_games = f.shape[0]
ph = {key: values for key, values in ph.items() if not len(values) < n_games}
pa = {key: values for key, values in pa.items() if not len(values) < n_games}

# Turn both stat tables into dataframes and attach them to the game frame
# side by side (away stats first, then home stats).
ph = pd.DataFrame(ph)
pa = pd.DataFrame(pa)
f = pd.concat([f, pa, ph], axis=1)

# Load the team-id -> abbreviation lookup table.
abbre = pd.read_csv('Data/teamabbre.csv')

# Attach the abbreviation twice: first matched on the away team id, then on
# the home team id. The helper 'teamid' column is dropped after each merge, and
# the second merge disambiguates the colliding lookup columns with the
# '_away' / '_home' suffixes.
with_away_abbr = f.merge(abbre, left_on=['away_id'], right_on=['teamid'])
with_away_abbr = with_away_abbr.drop(columns=['teamid'])
with_both_abbr = with_away_abbr.merge(
    abbre,
    left_on=['home_id'],
    right_on=['teamid'],
    suffixes=['_away', '_home'],
)
f = with_both_abbr.drop(columns=['teamid'])

# web-scrape team batting splits (away teams, then home teams)

# Team abbreviations whose baseball-reference code differs from the one in the
# abbreviation file; any team not listed here is used unchanged.
BREF_TEAM_CODES = {
    'WAS': 'WSN',
    'CWS': 'CHW',
    'ANA': 'LAA',
    'SF': 'SFG',
    'KC': 'KCR',
    'LA': 'LAD',
    'TB': 'TBR',
    'SD': 'SDP',
}

# Position of each stat within the space-separated tokens of a split row.
SPLIT_TOKEN_INDEX = {'BA': 15, 'OBP': 16, 'SLG': 17, 'OPS': 18, 'BAbip': 26}


def _scrape_batting_splits(teams, row, suffix, year=2021):
    """Scrape one batting-split row per team from baseball-reference.

    Args:
        teams: iterable of team abbreviations (one page load per entry).
        row: index of the line inside the text of the "hmvis" element to read
            (the original code read line 2 for away teams and line 1 for home
            teams).
        suffix: appended to every stat name to build the column names.
        year: season requested in the URL; defaults to 2021, the value that
            was previously hard-coded.

    Returns:
        DataFrame with columns BA/OBP/SLG/OPS/BAbip + ``suffix``, one row per
        entry of ``teams``, holding the raw string tokens.
    """
    splits = defaultdict(list)
    for team in tqdm(teams):
        teamm = BREF_TEAM_CODES.get(team, team)
        driver.get(
            "https://www.baseball-reference.com/teams/split.cgi?t=b&team="
            + teamm + "&year=" + str(year)
        )
        # find_element(By.ID, ...) replaces find_element_by_id, which was
        # removed in Selenium 4.3; this form also exists in Selenium 3.
        t = driver.find_element(By.ID, "hmvis").text

        tokens = t.split('\n')[row].split(' ')
        for stat, position in SPLIT_TOKEN_INDEX.items():
            splits[stat + suffix].append(tokens[position])

        # pause between page loads
        time.sleep(2)
    return pd.DataFrame(splits)


# away team batting splits
a = _scrape_batting_splits(f['teamabb_away'], row=2, suffix='_away')

# home team batting splits
h = _scrape_batting_splits(f['teamabb_home'], row=1, suffix='_home')

# Put the game info and both batting-split tables side by side.
f = pd.concat([f, a, h], axis=1)

# Build the modelling frame by removing the columns that are not used for
# training; 'game_id', 'time' and 'teamabb_home' are still present here.
not_used_for_training = ['away_id', 'home_id', 'apitcher', 'hpitcher', 'teamabb_away']
pre = f.drop(not_used_for_training, axis='columns')

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [01:22<00:00,  5.89s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [01:14<00:00,  5.31s/it]


In [5]:
# geolocation of each home venue
# One dict per team with the keys:
#   "team"    - abbreviation compared against pre['teamabb_home'] in the
#               weather-forecast cell
#   "address" - postal address string (not read by any code visible in this notebook)
#   "lat"/"lng" - coordinates passed to the OpenWeatherMap one_call request
# Note: the "TOR" entry carries a Buffalo, NY address and coordinates.
# NOTE(review): several addresses are P.O. boxes, so their coordinates are
# presumably city-level rather than the ballpark itself - confirm if precision matters.
venues = [{"team":"ANA","address":"2000 Gene Autry Way, Anaheim, CA. 92806","lat":33.799572,"lng":-117.889031},
      {"team":"ARI","address":"P.O. Box 2095, Phoenix, AZ. 85001","lat":33.452922,"lng":-112.038669},
      {"team":"ATL","address":"P.O. Box 4064, Atlanta, GA. 30302","lat":33.74691,"lng":-84.391239},
      {"team":"BAL","address":"333 W. Camden Street, Baltimore, MD. 21201","lat":39.285243,"lng":-76.620103},
      {"team":"BOS","address":"4 Yawkey Way, Boston, MA 02215","lat":42.346613,"lng":-71.098817},
      {"team":"CHC","address":"1060 Addison Street, Chicago, IL 60616","lat":41.947201,"lng":-87.656413},
      {"team":"CWS","address":"333 W. 35th Street, Chicago, IL 60616","lat":41.830883,"lng":-87.635083},
      {"team":"CIN","address":"100 Cinergy Field, Cincinnati, OH 45202","lat":39.107183,"lng":-84.507713},
      {"team":"CLE","address":"2401 Ontario Street, Cleveland, OH 44115","lat":41.495149,"lng":-81.68709},
      {"team":"COL","address":"Coors Field, 2001 Blake Street, Denver, CO 80205-2000","lat":39.75698,"lng":-104.965329},
      {"team":"DET","address":"Comerica Park, 2100 Woodward Ave., Detroit, MI 48201","lat":42.346354,"lng":-83.059619},
      {"team":"MIA","address":"2269 NW 199th Street, Miami, FL 33056","lat":25.954428,"lng":-80.238164},
      {"team":"HOU","address":"P.O. Box 288, Houston, TX 77001-0288","lat":29.76045,"lng":-95.369784},
      {"team":"KC","address":"P.O. Boz 419969, Kansas City, MO 64141","lat":39.10222,"lng":-94.583559},
      {"team":"LA","address":"1000 Elysian Park Ave., Los Angeles, CA 90012","lat":34.072437,"lng":-118.246879},
      {"team":"MIL","address":"P.O. Box 3099, Milwaukee, WI 53201-3099","lat":43.04205,"lng":-87.905599},
      {"team":"MIN","address":"501 Chicago Ave. S., Minneapolis, MN 55415","lat":44.974346,"lng":-93.259616},
      {"team":"WAS","address":"1500 South Capitol Street SE, Washington, DC","lat":38.87,"lng":-77.01},
      {"team":"NYM","address":"Roosevelt Ave & 126th Street, New York, NY 11368","lat":40.75535,"lng":-73.843219},
      {"team":"NYY","address":"Yankee Stadium, E. 161 Street & River Ave., New York, NY 10451","lat":40.819782,"lng":-73.929939},
      {"team":"OAK","address":"Oakland Coliseum, 700 Coliseum Way, Oakland, Ca 94621-1918","lat":37.74923,"lng":-122.196487},
      {"team":"PHI","address":"P.O. Box 7575, Philadelphia, PA 19101","lat":39.952313,"lng":-75.162392},
      {"team":"PIT","address":"600 Stadium Circle, Pittsburgh, PA 15212","lat":40.461503,"lng":-80.008924},
      {"team":"STL","address":"250 Stadium Plaza, St. Louis, MO 63102","lat":38.629683,"lng":-90.188247},
      {"team":"SD","address":"P.O. Box 2000, San Diego, CA 92112-2000","lat":32.752148,"lng":-117.143635},
      {"team":"SF","address":"Pacific Bell Park, 24 Willie Mays Plaza, San Francisco, CA 94107","lat":37.77987,"lng":-122.389754},
      {"team":"SEA","address":"P.O. Box 41000, 411 First Ave. S., Seattle, WA 98104","lat":47.60174,"lng":-122.330829},
      {"team":"TB","address":"1 Tropicana Drive, St. Petersburg, FL 33705","lat":27.768487,"lng":-82.648191},
      {"team":"TEX","address":"1000 Ballpark Way, Arlington, TX 76011","lat":32.750156,"lng":-97.081117},
      {"team":"TOR","address":"1 James D. Griffin Plaza, Buffalo, NY ","lat":42.8804,"lng":-78.8738}]

In [6]:
# OpenWeatherMap API
# The API key is read from the OWM_API_KEY environment variable when it is set,
# so the secret does not have to live in the notebook; the original placeholder
# value is kept as the fallback.
owm = OWM(os.environ.get('OWM_API_KEY', 'xxxxx'))
# Weather manager used by the forecast cell to issue one_call requests.
mgr = owm.weather_manager()

In [9]:
# Get the weather forecasts based on game location.
# For every game row, the venue whose "team" equals the home-team abbreviation
# is looked up, its hourly forecast is requested, and the first forecast whose
# hour (characters 11-12 of the ISO reference time) equals the first two
# characters of the game's time string is recorded.
# A game with no matching venue or no matching forecast hour adds no row.
weather_fields = ['game_id', 'pressure', 'temperature', 'temperature_feelslike', 'humidity', 'rainprob']
wdf = {field: [] for field in weather_fields}
for row_number in range(pre.shape[0]):
    game = pre.iloc[row_number]
    for stadium in venues:
        if stadium['team'] != game['teamabb_home']:
            continue
        hourly = mgr.one_call(stadium['lat'], stadium['lng']).forecast_hourly
        match = next(
            (w for w in hourly if w.reference_time('iso')[11:13] == game['time'][:2]),
            None,
        )
        if match is None:
            continue
        fahrenheit = match.temperature('fahrenheit')
        wdf['game_id'].append(game['game_id'])
        wdf['pressure'].append(match.pressure['press'])
        wdf['temperature'].append(fahrenheit['temp'])
        wdf['temperature_feelslike'].append(fahrenheit['feels_like'])
        wdf['humidity'].append(match.humidity)
        wdf['rainprob'].append(match.precipitation_probability)

In [11]:
# Persist the collected forecasts for future use: rows are appended to the
# 'mlbweather' table of the local database (the dataframe index is not written).
weather_rows = pd.DataFrame(wdf)
weather_rows.to_sql(name='mlbweather', con=engine, if_exists='append', index=False)

In [14]:
# Take in temperature.
# The forecast cell adds no row for a game without a matching venue or forecast
# hour, so wdf can be shorter than pre. Attaching the temperatures by position
# would then shift every later value onto the wrong game. Looking the value up
# by game_id keeps each temperature with its own game and leaves NaN where no
# forecast was recorded.
temperature_by_game = (
    pd.DataFrame(wdf)
    .drop_duplicates('game_id')  # map() needs a unique index
    .set_index('game_id')['temperature']
)
pre['temp'] = pre['game_id'].map(temperature_by_game)

In [15]:
# Reformat the time: keep only the leading two characters of each time string
# and store them as a float.
def _leading_hour(clock):
    """Return the first two characters of ``clock`` converted to float."""
    return float(clock[:2])


pre['time'] = pre['time'].map(_leading_hour)

In [17]:
# Remove the identifier columns that are not model inputs.
identifier_columns = ['game_id', 'teamabb_home']
pre = pre.drop(identifier_columns, axis='columns')

In [18]:
# Replace the placeholder strings used for missing averages ('.---' and '-.--')
# with 0, then convert every column to float.
missing_markers = ['.---', '-.--']
pre = pre.replace(missing_markers, 0).astype(float)

In [19]:
# Select predictors for training.
# The pitcher and batting column lists are the same stat names with an
# '_away' or '_home' suffix, so they are generated from two base lists.
pitcher_stats = [
    'avg', 'obp', 'slg', 'ops', 'stolenBasePercentage', 'era', 'whip',
    'strikePercentage', 'groundOutsToAirouts', 'pitchesPerInning',
    'strikeoutWalkRatio', 'strikeoutsPer9Inn', 'walksPer9Inn',
    'hitsPer9Inn', 'runsScoredPer9', 'homeRunsPer9',
]
batting_stats = ['BA', 'OBP', 'SLG', 'OPS', 'BAbip']

pca = [stat + '_away' for stat in pitcher_stats]   # away starting pitcher
pch = [stat + '_home' for stat in pitcher_stats]   # home starting pitcher
bca = [stat + '_away' for stat in batting_stats]   # away team batting split
bch = [stat + '_home' for stat in batting_stats]   # home team batting split
temp = ['temp', 'time']                            # temperature and game hour

In [21]:
# load model and make predictions on over/under
# Restrict and order the feature columns to the selected predictors.
pre = pre[pca+pch+bca+bch+temp]
# The model file is opened in a with-block so the handle is closed again
# (pickle.load(open(...)) left it open).
# NOTE: unpickling can execute arbitrary code - only load model files you created yourself.
with open('total.sav', 'rb') as model_file:
    model = pickle.load(model_file)
# The raw model output is exponentiated before it is stored
# (presumably the model was fit on the log of the total - TODO confirm).
f['predictions'] = np.exp(model.predict(pre))

In [23]:
# Format the predictions in a string to email to myself: one line per game,
# "<away probable pitcher><TAB><prediction with two decimals>".
report_lines = []
for pitcher, prediction in zip(f['apitcher'], f['predictions']):
    report_lines.append(pitcher + '\t' + "{:.2f}".format(prediction) + '\n')
s = ''.join(report_lines)

In [25]:
# send the email
port = 465  # For SSL
smtp_server = "smtp.gmail.com"
sender_email = "leowei08@gmail.com"
receiver_email2 = "leowei08@gmail.com"
# The account password is read from the GMAIL_PASSWORD environment variable
# when set, so it does not have to be stored in the notebook; the original
# placeholder is kept as the fallback.
password = os.environ.get('GMAIL_PASSWORD', 'password')
# The MIME headers declare the body as UTF-8 so that pitcher names containing
# accented characters are displayed correctly by the mail client.
message = """\
Subject: Predictions Today {today}
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit


{content}."""

# smtplib encodes a str message with the ASCII codec, which raises
# UnicodeEncodeError as soon as a name contains a non-ASCII character.
# Encoding to UTF-8 bytes ourselves avoids that (bytes are sent as given).
payload = message.format(
    today=str(datetime.date.today()).replace('-', ''),
    content=s,
).encode('utf-8')

context = ssl.create_default_context()
with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
    server.login(sender_email, password)
    server.sendmail(sender_email, receiver_email2, payload)

In [24]:
# load model and make predictions for score difference
# The model file is opened in a with-block so the handle is closed again
# (pickle.load(open(...)) left it open).
# NOTE: unpickling can execute arbitrary code - only load model files you created yourself.
with open('diff.sav', 'rb') as model_file:
    model = pickle.load(model_file)
# Overwrites the 'predictions' column with this model's raw output.
f['predictions'] = model.predict(pre)

In [26]:
# Format the predictions in a string to email to myself: one line per game,
# "<away probable pitcher><TAB><prediction with two decimals>".
s = ''.join(
    pitcher + '\t' + "{:.2f}".format(prediction) + '\n'
    for pitcher, prediction in zip(f['apitcher'], f['predictions'])
)

In [27]:
# send the email
port = 465  # For SSL
smtp_server = "smtp.gmail.com"
sender_email = "leowei08@gmail.com"
receiver_email2 = "leowei08@gmail.com"
# The account password is read from the GMAIL_PASSWORD environment variable
# when set, so it does not have to be stored in the notebook; the original
# placeholder is kept as the fallback.
password = os.environ.get('GMAIL_PASSWORD', 'password')
# The MIME headers declare the body as UTF-8 so that pitcher names containing
# accented characters are displayed correctly by the mail client.
message = """\
Subject: Predictions Today {today}
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit


{content}."""

# smtplib encodes a str message with the ASCII codec, which raises
# UnicodeEncodeError as soon as a name contains a non-ASCII character.
# Encoding to UTF-8 bytes ourselves avoids that (bytes are sent as given).
payload = message.format(
    today=str(datetime.date.today()).replace('-', ''),
    content=s,
).encode('utf-8')

context = ssl.create_default_context()
with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
    server.login(sender_email, password)
    server.sendmail(sender_email, receiver_email2, payload)