# MLB Odds Scraper for covers.com
This notebook puls down historic data about MLB baseball odds from covers.com. We also get the final score and date/time of the game. Data is placed in the data/ directory as a csv file.

# Imports, Etc.

In [1]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

import pandas as pd
import numpy as np

import re
import threading
import queue
import time

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# Build Queue

In [4]:
#fill the queue with dates that we need games from
q = queue.Queue(maxsize=0)

#get last date from disk if we've already saved some data
try:
    game_df = pd.read_csv(r'C:\Users\james\Documents\MLB\Data\covers.csv', low_memory=False)
    get_day = pd.to_datetime(game_df.date.max()).strftime('%Y-%m-%d')
except:
    get_day = '2022-03-01'
    
# fill queue with all the dates until yesterday
yesterday = pd.datetime.now().date() - pd.Timedelta(days=1)
days = []
while  pd.to_datetime(get_day).date() < yesterday:
    get_day = (pd.to_datetime(get_day).date() + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    if pd.to_datetime(get_day).month<3:continue    # baseball doesn't happen before march
    if pd.to_datetime(get_day).month>11:continue   # baseball doesn't happen in december
    q.put(get_day)
q.qsize()

292

# Scrape

In [5]:
def get_covers_data(day, driver):
    url = f'https://www.covers.com/Sports/MLB/Matchups?selectedDate={day}'
    driver.get(url)
    time.sleep(5) # give it a couple of more second to load
    soup = bs(driver.page_source.encode("utf-8"), "lxml")
    
    #make sure we're on the right day
    nav_date = soup.find('a', {'class':'cmg_active_navigation_item'})
    if nav_date['data-date']==day:
        pass
    else:
        print(f"{day}: no games")
        return []
    
    # grab the data
    games = []
    scraped_games = soup.findAll('div',{'class':'cmg_matchup_game_box'})
    for g in scraped_games:
        game = {}
        game['home_moneyline'] = g['data-game-odd']
        game['away_team_abbr'] = g['data-away-team-shortname-search']
        game['date'] = g['data-game-date']
        game['home_team_abbr'] = g['data-home-team-shortname-search']
        try:
            game['home_score'] =g.find('div',{'class':'cmg_matchup_list_score_home'}).text.strip()
        except:
            pass
        try:
            game['away_score'] =g.find('div',{'class':'cmg_matchup_list_score_away'}).text.strip()
        except:
            pass
        games.append(game)
    return games            

In [7]:
def do_work(q, lock):
    #start a web browser
    chromedriver_path= r"C:\Users\james\Documents\MLB\chromedriver.exe"
    driver = webdriver.Chrome(chromedriver_path)
    
    #start working through the queue
    while not q.empty():
        day = q.get()
        games = get_covers_data(day, driver)
        
        #some days have no games
        if len(games)==0:
            q.task_done()
            continue
        
        new_games = pd.DataFrame(games)
        
        #save the games to disk
        lock.acquire()
        try:
            game_df = pd.read_csv(r'C:\Users\james\Documents\MLB\Data\covers.csv', low_memory=False)
        except:
            game_df = pd.DataFrame()
        game_df = pd.concat([game_df,new_games])
        game_df.to_csv(r'C:\Users\james\Documents\MLB\Data\covers.csv', index=False)
        lock.release()
        
        q.task_done()
        print(f"{day} done.")
    driver.quit()

In [None]:
num_threads = 6    # num of firefox windows
lock = threading.Lock()

#start the workers
for i in range(num_threads):
    worker = threading.Thread(target=do_work, args=(q,lock,))
    worker.setDaemon(True)
    worker.start()
        
#wait for workers to finish
q.join()

2022-03-06: no games
2022-03-07: no games
2022-03-03: no games
2022-03-02: no games
2022-03-05: no games
2022-03-04: no games
2022-03-09: no games
2022-03-10: no games
2022-03-08: no games
2022-03-11: no games
2022-03-12: no games
2022-03-13: no games
2022-03-14: no games
2022-03-15: no games
2022-03-16: no games
2022-03-18 done.
2022-03-17 done.
2022-03-19 done.
2022-03-20 done.
2022-03-21 done.
2022-03-22 done.
2022-03-23 done.
2022-03-25 done.
2022-03-24 done.
2022-03-26 done.
2022-03-27 done.
2022-03-28 done.
2022-03-30 done.
2022-03-29 done.
2022-03-31 done.
2022-04-01 done.
2022-04-02 done.
2022-04-03 done.
2022-04-05 done.
2022-04-04 done.
2022-04-06 done.
2022-04-07 done.
2022-04-08 done.
2022-04-09 done.
2022-04-10 done.
2022-04-12 done.
2022-04-11 done.
2022-04-13 done.
2022-04-14 done.
2022-04-15 done.
2022-04-16 done.
2022-04-18 done.
2022-04-17 done.
2022-04-19 done.
2022-04-20 done.
2022-04-21 done.
2022-04-22 done.
2022-04-23 done.
2022-04-24 done.
2022-04-25 done.
2022-

Exception in thread Thread-7:
Traceback (most recent call last):
  File "C:\Users\james\anaconda3\lib\threading.py", line 973, in _bootstrap_inner
    self.run()
  File "C:\Users\james\anaconda3\lib\threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\james\AppData\Local\Temp\ipykernel_24172\3411687224.py", line 9, in do_work
  File "C:\Users\james\AppData\Local\Temp\ipykernel_24172\2560376373.py", line 5, in get_covers_data
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 679, in page_source
    return self.execute(Command.GET_PAGE_SOURCE)['value']
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common

2022-11-24: no games
2022-11-27: no games


Exception in thread Thread-10:
Traceback (most recent call last):
  File "C:\Users\james\anaconda3\lib\threading.py", line 973, in _bootstrap_inner
    self.run()
  File "C:\Users\james\anaconda3\lib\threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\james\AppData\Local\Temp\ipykernel_24172\3411687224.py", line 9, in do_work
  File "C:\Users\james\AppData\Local\Temp\ipykernel_24172\2560376373.py", line 5, in get_covers_data
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 679, in page_source
    return self.execute(Command.GET_PAGE_SOURCE)['value']
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.commo

2022-11-28: no games


Exception in thread Thread-6:
Traceback (most recent call last):
  File "C:\Users\james\anaconda3\lib\threading.py", line 973, in _bootstrap_inner
    self.run()
  File "C:\Users\james\anaconda3\lib\threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\james\AppData\Local\Temp\ipykernel_24172\3411687224.py", line 9, in do_work
  File "C:\Users\james\AppData\Local\Temp\ipykernel_24172\2560376373.py", line 5, in get_covers_data
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 679, in page_source
    return self.execute(Command.GET_PAGE_SOURCE)['value']
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\james\anaconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common