# MLB Gameday Scraper
This scraper goes out to [mlb.com](https://www.mlb.com/scores/) and pulls down the summary data on that page (final runs, hits, errors, team, team record, pitcher, pitcher record, etc. — but not the inning-by-inning boxes). It saves the data as a CSV in the `data` directory.

We use five Selenium browser windows in parallel because the page loads its content with JavaScript, which we have to wait for on every request.

In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs

import re
import threading
import queue
import time

import pandas as pd
import numpy as np

In [2]:
# Notebook setup: IPython magics for inline plots and module auto-reloading,
# followed by warning suppression for the rest of the session.
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Blanket filter: ignore warnings of every category.
warnings.filterwarnings('ignore')
# FutureWarning is already covered by the blanket filter above; this line
# just makes that intent explicit.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Pitcher season line, e.g. "12-3, 2.45 ERA" -> (wins, losses, ERA).
# Raw string so the regex escapes are not treated as (invalid) string escapes.
_PITCHER_LINE = re.compile(r"(\d+)\-(\d+),\s+(\d+\.\d+) ERA")


def _away_home_text(panel, tag, css_class):
    """Return (away, home) stripped text of the first two matches, or None if fewer than two exist."""
    found = panel.find_all(tag, {'class': css_class})
    if len(found) < 2:
        return None
    return found[0].text.strip(), found[1].text.strip()


def get_box_data(day, driver, load_wait=5):
    """Scrape the final box-score summary of every game listed for one day.

    Args:
        day: Date string that is appended to the scores URL
            (the caller passes 'YYYY-MM-DD').
        driver: Selenium webdriver used to load and render the page.
        load_wait: Seconds to sleep after ``driver.get`` so the page's
            JavaScript can finish rendering. Defaults to the original 5.

    Returns:
        A list with one dict per game panel. All values are strings taken
        from the page. Team names, abbreviations and runs are always
        present; team records, hits, errors and pitcher details are added
        only when they can be found. Panels lacking team names or runs are
        skipped. Runs may be blank strings (usually a postponement).
    """
    #get url
    url = f'https://www.mlb.com/scores/{day}'
    driver.get(url)
    time.sleep(load_wait)  # give the page's javascript time to load
    soup = bs(driver.page_source.encode("utf-8"), "lxml")

    #scrape data from page
    games = []
    for m in soup.find_all('div', {'class': 'g5-component--mlb-scores__panel--primary'}):
        game = {}

        # Without both teams the row cannot be identified, so skip the panel
        # instead of letting an IndexError kill the worker thread.
        names = _away_home_text(m, 'span', 'g5-component--mlb-scores__team__info__name--long')
        abbrs = _away_home_text(m, 'span', 'g5-component--mlb-scores__team__info__name--abbrev')
        if names is None or abbrs is None:
            continue
        game['away_team_name'], game['home_team_name'] = names
        game['away_team_abbr'], game['home_team_abbr'] = abbrs

        # Team records ("wins-losses") - we can live without this.
        records = _away_home_text(m, 'span', 'g5-component--mlb-scores__team__info__record')
        if records is not None:
            for side, record in zip(('away', 'home'), records):
                parts = record.split('-')
                if len(parts) >= 2:
                    game[f'{side}_team_wins'] = parts[0]
                    game[f'{side}_team_losses'] = parts[1]

        # Can't live without runs. Note that a postponement usually still
        # has the cells, just with blank text, so blank runs can get through.
        runs = _away_home_text(m, 'td', 'g5-component--mlb-scores__linescore__table--summary__cell--runs')
        if runs is None:
            continue
        game['away_team_runs'], game['home_team_runs'] = runs

        # Hits and errors are recorded when present; a missing cell no
        # longer raises.
        for stat in ('hits', 'errors'):
            values = _away_home_text(
                m, 'td',
                f'g5-component--mlb-scores__linescore__table--summary__cell--{stat}')
            if values is not None:
                game[f'away_team_{stat}'], game[f'home_team_{stat}'] = values

        # Pitchers of record - we can live without this. Fields parsed before
        # a failure are kept, as before.
        try:
            pitcher_cells = m.find_all('td', {'class': 'g5-component--mlb-scores__player__text'})
            # The player id is the last path segment of the profile link.
            game['winning_pitcher'] = pitcher_cells[0].find('a')['href'].split('/')[-1]
            game['losing_pitcher'] = pitcher_cells[1].find('a')['href'].split('/')[-1]

            stat_lines = m.find_all('span', {'class': 'g5-component--mlb-scores__player__text__line-2'})
            for idx, role in enumerate(('winning', 'losing')):
                parsed = _PITCHER_LINE.search(stat_lines[idx].text)
                game[f'{role}_pitcher_wins'] = parsed.group(1)
                game[f'{role}_pitcher_losses'] = parsed.group(2)
                game[f'{role}_pitcher_era'] = parsed.group(3)
        except (IndexError, KeyError, TypeError, AttributeError):
            # IndexError: fewer than two pitchers/stat lines; TypeError: no
            # <a> tag; KeyError: no href; AttributeError: stat line did not
            # match the pattern.
            pass

        games.append(game)

    return games

In [4]:
def do_work(q, lock):
    """Worker loop: scrape queued days in one Firefox window and save them.

    Each day taken from the queue is scraped with ``get_box_data``; the
    resulting rows are appended to 'data/mlb-gameday.csv' while holding
    ``lock``, so only one worker rewrites the file at a time.

    Args:
        q: ``queue.Queue`` of day strings to scrape.
        lock: ``threading.Lock`` shared by all workers, guarding the CSV.

    Every dequeued day is marked done even when scraping or saving fails
    (the failure is printed), so ``q.join()`` cannot hang on a dead worker,
    and the browser is always closed when the worker exits.
    """
    #start a web browser
    driver = webdriver.Firefox()
    driver.implicitly_wait(10)

    try:
        #start working through the queue
        while True:
            # get_nowait avoids the empty()/get() race: another worker could
            # take the last item between the two calls and leave this one
            # blocked forever.
            try:
                day = q.get_nowait()
            except queue.Empty:
                break

            try:
                games = get_box_data(day, driver)

                #some days have no games
                if not games:
                    print(f"{day} no games.")
                    continue

                new_games = pd.DataFrame(games)
                new_games['date'] = day

                #save the games to disk
                with lock:  # released even if reading or writing raises
                    try:
                        existing = pd.read_csv('data/mlb-gameday.csv', low_memory=False)
                    except (FileNotFoundError, pd.errors.EmptyDataError):
                        # first write: nothing saved yet
                        existing = None
                    # Any other read error propagates, so an unreadable file
                    # is never overwritten with only the new rows.
                    if existing is None:
                        game_df = new_games
                    else:
                        game_df = pd.concat([existing, new_games])
                    game_df['date'] = pd.to_datetime(game_df.date).dt.date
                    game_df.to_csv('data/mlb-gameday.csv', index=False)

                print(f"{day} done.")
            except Exception as exc:
                # Thread boundary: report the failure and move on to the next
                # day rather than silently killing this worker.
                print(f"{day} failed: {exc!r}")
            finally:
                q.task_done()
    finally:
        driver.quit()

In [5]:
#fill the queue with dates that we need games from
q = queue.Queue(maxsize=0)

#get last date from disk if we've already saved some data
try:
    game_df = pd.read_csv('data/mlb-gameday.csv', low_memory=False)
    # strftime raises ValueError on NaT (e.g. a file with no rows)
    last_saved = pd.to_datetime(game_df.date.max()).strftime('%Y-%m-%d')
except (FileNotFoundError, pd.errors.EmptyDataError, AttributeError, ValueError):
    # no file, empty file, no 'date' column, or no parsable date
    last_saved = None

if last_saved is None:
    # nothing saved yet: start from the first day itself
    first_day = pd.Timestamp('2002-03-01')
else:
    # resume on the day after the newest saved one
    first_day = pd.Timestamp(last_saved) + pd.Timedelta(days=1)

# fill queue with all the dates until yesterday (inclusive)
# pd.datetime was removed from pandas; use pd.Timestamp instead
yesterday = (pd.Timestamp.now().normalize() - pd.Timedelta(days=1)).date()
for ts in pd.date_range(first_day, pd.Timestamp(yesterday)):
    # baseball doesn't happen before march or in december
    if 3 <= ts.month <= 11:
        q.put(ts.strftime('%Y-%m-%d'))
q.qsize()

1569

In [None]:
num_threads = 5    # num of firefox windows
lock = threading.Lock()    # shared by all workers and passed to do_work

#start the workers
for i in range(num_threads):
    # daemon=True replaces the deprecated Thread.setDaemon(True)
    worker = threading.Thread(target=do_work, args=(q, lock), daemon=True)
    worker.start()

#wait for workers to finish: blocks until task_done() has been called
#for every day that was put on the queue
q.join()