# MLB Odds Scraper for covers.com
This notebook pulls down historic data about MLB baseball odds from covers.com. We also get the final score and date/time of the game. Data is placed in the data/ directory as a csv file.

# Imports, Etc.

In [1]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

import pandas as pd
import numpy as np

import re
import threading
import queue
import time
from datetime import datetime, timedelta

#

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Also install an explicit "ignore" filter for FutureWarning (already covered
# by the blanket filter above, so this line is redundant but harmless).
warnings.simplefilter(action='ignore', category=FutureWarning)

# Build Queue

In [3]:
# Fill the queue with the dates that we still need games from.
q = queue.Queue(maxsize=0)

# Resume from the latest date already saved on disk. If the file is missing,
# empty, unparsable, or has no usable `date` column, start from the beginning.
try:
    game_df = pd.read_csv('data/covers.csv', low_memory=False)
    get_day = pd.to_datetime(game_df.date.max()).strftime('%Y-%m-%d')
except (OSError, ValueError, AttributeError, TypeError):
    # OSError: file missing/unreadable; ValueError: empty or malformed CSV, or
    # NaT from an empty column; AttributeError: no `date` column;
    # TypeError: mixed-type `date` column that cannot be compared by max().
    get_day = '2012-03-01'

# Queue every date after the starting day, up to and including yesterday.
yesterday = (datetime.now() - timedelta(days=1)).date()
days = []  # not used in this cell; kept in case other cells reference it
current_day = pd.to_datetime(get_day).date()
while current_day < yesterday:
    current_day += timedelta(days=1)
    get_day = current_day.strftime('%Y-%m-%d')
    # Only March through November is scraped (no baseball in Dec-Feb).
    if not 3 <= current_day.month <= 11:
        continue
    q.put(get_day)
q.qsize()

3299

# Scrape

In [4]:
def get_covers_data(day, driver):
    """Scrape the covers.com MLB matchups page for a single day.

    Args:
        day: Date string placed in the ``selectedDate`` query parameter and
            compared against the page's active navigation date
            (the caller passes ``YYYY-MM-DD`` strings).
        driver: Selenium webdriver used to load the page.

    Returns:
        A list with one dict per game box found on the page. Each dict has
        the keys ``home_moneyline``, ``away_team_abbr``, ``date`` and
        ``home_team_abbr``; ``home_score`` / ``away_score`` are added only
        when the corresponding score element exists. Returns an empty list
        when the page's active navigation date is missing or differs from
        ``day``.

    Raises:
        KeyError: if a game box lacks one of the required ``data-*``
            attributes.
    """
    url = f'https://www.covers.com/Sports/MLB/Matchups?selectedDate={day}'
    driver.get(url)
    time.sleep(5)  # give the page a few more seconds to finish loading
    soup = bs(driver.page_source.encode("utf-8"), "lxml")

    # Make sure the page actually shows the requested day. A missing
    # navigation marker is treated like a mismatch instead of raising.
    nav_date = soup.find('a', {'class': 'cmg_active_navigation_item'})
    if nav_date is None or nav_date.get('data-date') != day:
        print(f"{day}: no games")
        return []

    # Optional score fields, mapped to the CSS class holding each value.
    score_classes = (
        ('home_score', 'cmg_matchup_list_score_home'),
        ('away_score', 'cmg_matchup_list_score_away'),
    )

    games = []
    for box in soup.find_all('div', {'class': 'cmg_matchup_game_box'}):
        game = {
            'home_moneyline': box['data-game-odd'],
            'away_team_abbr': box['data-away-team-shortname-search'],
            'date': box['data-game-date'],
            'home_team_abbr': box['data-home-team-shortname-search'],
        }
        for key, css_class in score_classes:
            score = box.find('div', {'class': css_class})
            # Leave the key out when the page shows no score for this game.
            if score is not None:
                game[key] = score.text.strip()
        games.append(game)
    return games

In [5]:
def do_work(q, lock):
    """Worker loop: scrape queued days and append the games to the CSV.

    Starts its own Firefox webdriver, then pulls days from ``q`` until the
    queue is empty. Games for each day are merged into ``data/covers.csv``
    while holding ``lock``. Every dequeued day is marked with
    ``q.task_done()`` even if scraping or saving fails, so ``q.join()`` in
    the caller cannot hang on a failed day. The browser is always closed
    when the worker exits.

    Args:
        q: ``queue.Queue`` of day strings to scrape.
        lock: Lock shared by all workers that guards reads/writes of the CSV.
    """
    # start a web browser
    driver = webdriver.Firefox()
    driver.implicitly_wait(10)

    try:
        # work through the queue
        while True:
            # get_nowait() avoids the empty()/get() race in which another
            # worker takes the last item and this one blocks forever.
            try:
                day = q.get_nowait()
            except queue.Empty:
                break

            try:
                games = get_covers_data(day, driver)

                # some days have no games
                if not games:
                    continue

                new_games = pd.DataFrame(games)

                # save the games to disk; `with` releases the lock on error
                with lock:
                    try:
                        game_df = pd.read_csv('data/covers.csv', low_memory=False)
                    except (FileNotFoundError, pd.errors.EmptyDataError):
                        # Start a fresh frame only when there is no data yet.
                        # Any other read error propagates so the existing
                        # file is not overwritten.
                        game_df = pd.DataFrame()
                    game_df = pd.concat([game_df, new_games])
                    game_df.to_csv('data/covers.csv', index=False)

                print(f"{day} done.")
            except Exception as err:
                # Report and move on so one bad day does not kill the worker.
                print(f"{day} failed: {err!r}")
            finally:
                q.task_done()
    finally:
        driver.quit()

In [6]:
num_threads = 6    # num of firefox windows (one browser per worker thread)
lock = threading.Lock()  # shared by all workers to serialize CSV access

# Start the workers. They are daemon threads, so they do not keep the
# interpreter alive on their own.
for i in range(num_threads):
    # daemon=True replaces Thread.setDaemon(), deprecated since Python 3.10
    worker = threading.Thread(target=do_work, args=(q, lock), daemon=True)
    worker.start()

# Block until every queued day has been marked done by a worker.
q.join()

2012-03-03 done.
2012-03-02 done.
2012-03-04 done.
2012-03-05 done.
2012-03-06 done.
2012-03-07 done.
2012-03-08 done.
2012-03-09 done.
2012-03-10 done.
2012-03-11 done.
2012-03-12 done.
2012-03-13 done.
2012-03-14 done.
2012-03-16 done.
2012-03-15 done.
2012-03-17 done.
2012-03-19 done.
2012-03-18 done.
2012-03-20 done.
2012-03-22 done.
2012-03-21 done.
2012-03-23 done.
2012-03-24 done.
2012-03-25 done.
2012-03-26 done.
2012-03-27 done.
2012-03-28 done.
2012-03-29 done.
2012-03-30 done.
2012-03-31 done.
2012-04-01 done.
2012-04-03 done.
2012-04-04 done.
2012-04-02 done.
2012-04-05 done.
2012-04-06 done.
2012-04-07 done.
2012-04-08 done.
2012-04-09 done.
2012-04-10 done.
2012-04-11 done.
2012-04-12 done.
2012-04-13 done.
2012-04-16 done.
2012-04-15 done.
2012-04-14 done.
2012-04-17 done.
2012-04-18 done.
2012-04-19 done.
2012-04-20 done.
2012-04-22 done.
2012-04-21 done.
2012-04-23 done.
2012-04-24 done.
2012-04-25 done.
2012-04-26 done.
2012-04-28 done.
2012-04-27 done.
2012-04-29 don