## Daily racing stats scraping notebook
***

In [1]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup

from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from currency_converter import CurrencyConverter

import json 
import hashlib
from typing import List

import time
import random
import decimal

In [22]:
race_cols = [
    "date",
    "track", 
    "race_name", 
    "race_age_group", 
    "race_class", 
    "distance", 
    "going", 
    "runners", 
    "track_type", 
    "race_type",
    "off_time", 
    "winning_time",
    "prize_money_json", 
    "purse"
]

horse_cols = [
    "date",
    "race_name",
    "track",
    "horse_id",  
    "horse_name",
    "horse_age",
    "horse_sex",  
    "dam", 
    "sire", 
    "owner", 
    "trainer", 
    "jockey", 
    "weight", 
    "sp",
    "position", 
    "beaten_by", 
    "winnings", 
    "stall_number", 
    "race_comments",
    "estimated_running_time"
]

In [75]:
race_df = pd.DataFrame(columns=race_cols)
horse_df = pd.DataFrame(columns=horse_cols)

In [152]:
runner.text.split('\n')

['2nd',
 'nk',
 '7',
 '(7)',
 'Tio Mio',
 '39-5',
 '1/1f',
 'T: D LoughnaneJ: Cieren Fallon',
 'Slightly worse than midfield, pushed and took closer order before 2f out, ridden and joined leader inside final furlong, edged out close home op 10/11',
 'My Stable',
 'My Stable']

In [120]:
class Horse:
    def __init__(self, browser, race_name, winnings_json, race_type):
        self.browser = browser 
        self.race_name = race_name
        self.winnings_json = winnings_json
        self.race_type = race_type
        
    def get_horse_id(self):
        hash_object = hashlib.md5(self.horse_name.encode())
        return hash_object.hexdigest()
    
    def get_horse_winnings(self):
        winnings_dict = json.loads(self.winnings_json)
        if str(self.position) in winnings_dict.keys(): 
            return winnings_dict[str(self.position)]
        else:
            return 0
        
    def get_runner_info(self, runner_info):
        """ 
        gets information for selected horse in the race
        """
        position, beaten_by, stall_number, horse_number, horse_name, age_weight, sp, trainer_jockey, race_comments = self.get_right_runner_config(runner_info)
            
        if (len(age_weight) <= 6) & (age_weight[2] not in ['7', '8', '9']):
            weight = age_weight[1:]
            age = age_weight[0]
        else:
            age = age_weight[0:2]
            weight = age_weight[2:]
            
        tj_split = trainer_jockey.split('J: ')
        
        self.browser.current_url.split('/')
        self.date = self.browser.current_url.split('/')[5]
        self.track = self.browser.current_url.split('/')[6]
        
        self.trainer = tj_split[0].replace('T: ', '')
        self.jockey = tj_split[1]
        if position in ['PU', 'F']:
            self.position = position 
        else:
            self.position= position[:-2]
        self.horse_name = horse_name
        self.horse_id = self.get_horse_id() 
        self.horse_age = age 
        self.horse_weight = weight
        self.beaten_by = beaten_by 
        self.sp = sp 
        self.race_comments = race_comments
        
        self.winnings = self.get_horse_winnings()
        self.stall_number = stall_number
        self.estimated_running_time = "TODO"
    
    def get_right_runner_config(self, runner_info):
        if self.race_type in ['Chase', 'Hurdle', 'NHF']:
            if len(runner_info) == 9:
                [horse_position, horse_number, horse_name, age_weight, sp, trainer_jockey, race_comments, text, text2] = runner_info 
                beaten_by = 'N/A'
                stall_number='N/A'
                
            elif len(runner_info) == 10:
                [horse_position, beaten_by, horse_number, horse_name, age_weight, sp, trainer_jockey, race_comments, text, text2] = runner_info
                stall_number='N/A'
        else:
            if len(runner_info) == 10:
                [horse_position, horse_number, stall_number, horse_name, age_weight, sp, trainer_jockey, race_comments, text, text2] = runner_info 
                beaten_by = 'N/A'
                
            elif len(runner_info) == 11:
                [horse_position, beaten_by, horse_number, stall_number, horse_name, age_weight, sp, trainer_jockey, race_comments, text, text2] = runner_info
                
        try:   
            return horse_position, beaten_by, stall_number, horse_number, horse_name, age_weight, sp, trainer_jockey, race_comments
        except:
            return print(runner_info)
        
    def get_breeding_info(self):
        # Links to click on the horse 
        deal_with_popup(browser, "div[class^='ResultRunner__StyledHorseName-sc-58kifh-5']", "popup checking horse main page")
        horseLinks = self.browser.find_elements(By.XPATH, "//a")
        selectedHorse = [i for i in horseLinks if i.text == self.horse_name]
        if len(selectedHorse) > 0:
            time.sleep(float(decimal.Decimal(random.randrange(20, 50))/100))
            browser.execute_script("arguments[0].click();", selectedHorse[0])

        time.sleep(float(decimal.Decimal(random.randrange(200, 700))/100))
        deal_with_popup(browser, "table[class^='Header__DataTable']", "popup checking horse breeding page")
        # get breeding info for the horse
        breeding_info = browser.find_elements(By.CSS_SELECTOR, "table[class^='Header__DataTable']")[0].text.split('\n')
        [age2, trainer2, horse_sex, sire, dam, owner] = breeding_info 

        self.horse_sex = horse_sex.split(' ')[1]
        self.sire = " ".join(sire.split(' ')[1:])
        self.dam = " ".join(dam.split(' ')[1:])
        self.owner = " ".join(owner.split(' ')[1:])
        self.browser.back()
        time.sleep(float(decimal.Decimal(random.randrange(100, 500))/100))
        
    def add_to_df(self, df):
        """ 
        appends row to pandas dataframe
        TODO: this will change to append row to postgres table using sqlalchemy
        """
        row_dict = {
            "date": self.date, 
            "race_name": self.race_name,
            "track": self.track,
            "horse_id": self.horse_id,  
            "horse_name": self.horse_name,
            "horse_age": self.horse_age,
            "horse_sex": self.horse_sex,  
            "dam": self.dam, 
            "sire": self.sire, 
            "owner": self.owner, 
            "trainer": self.trainer, 
            "jockey": self.jockey, 
            "weight": self.horse_weight, 
            "sp": self.sp,
            "position": self.position, 
            "beaten_by": self.beaten_by, 
            "winnings": self.winnings, 
            "stall_number": self.stall_number, 
            "race_comments": self.race_comments,
            "estimated_running_time": self.estimated_running_time
        }
        df = df.append(row_dict, ignore_index=True)
        return df
        
class Race:
    def __init__(self, browser):
        self.browser=browser
        
    def get_race_info(self):
        self.browser.current_url.split('/')
        self.date = self.browser.current_url.split('/')[5]
        self.track = self.browser.current_url.split('/')[6]
        deal_with_popup(browser, "div[class^='RacePage__SummaryWrapper']", "popup checking race")
        race_info = self.browser.find_elements(By.CSS_SELECTOR, "div[class^='RacePage__SummaryWrapper']")[0].text.split('\n')
        
        self.race_name = race_info[0]
        race_sub_info = self.get_variable_race_sub_info(race_info[1].split('  |   '))
        self.race_age_group = race_sub_info[0]
        self.race_class = race_sub_info[1]
        self.distance = self.get_distance_in_yards(race_sub_info[2])
        self.going = race_sub_info[3]
        self.runners = race_sub_info[4].split(' ')[0]
        self.track_type = race_sub_info[5]
        timing = race_info[3].split('  |   ') 
        self.off_time = timing[0].split(': ')[1]
        self.winning_time = self.get_time_in_seconds(timing[1].split(': ')[1])
        self.race_type = self.get_race_type()
        prize_money = self.get_prize_money_dict()
        self.prize_money_json = json.dumps(prize_money)
        self.purse = sum(prize_money.values())
        
    def get_race_type(self):
        lower_case_race_name = self.race_name.lower()
        if "chase" in lower_case_race_name:
            return "Chase"
        elif "hurdle" in lower_case_race_name:
            return "Hurdle"
        elif ('nhf' in lower_case_race_name) | ('Flat' in lower_case_race_name):
            return "NHF"
        else:
            return "Flat"
        
    def get_variable_race_sub_info(self, race_sub_info: List[str]) -> List[str]:
        """ 
        Deals with issue where Irish courses don't use race class so need to set that as N/A
        """
        if len(race_sub_info) == 5:
            [race_age_group, distance, going, num_runners, track] = race_sub_info
            class_of_race = 'N/A'  
        else:
            [race_age_group, class_of_race, distance, going, num_runners, track] = race_sub_info
        
        return [race_age_group, class_of_race, distance, going, num_runners, track] 
        
        
    def get_time_in_seconds(self, winning_time: str) -> float:
        """
        Input looks like this '5m 6.71s' -> convert to total seconds
        """
        if len(winning_time.split(' ')) == 1:
            seconds = float(winning_time.replace('s',''))
            self.winning_time = seconds
            return seconds
        else:
            [min, sec] = winning_time.split(' ')
            min_to_seconds = int(min.replace('m', '')) * 60
            seconds = float(sec.replace('s',''))
            total_time =  min_to_seconds + seconds
            self.winning_time = total_time
            return total_time
        
    def get_distance_in_yards(self, distance: str) -> int:
        """ 
        Converts the race distance to yards for ease of analysis
        """
        segments = distance.split(' ')
        race_yards = 0
        for item in segments:
            if 'm' in item:
                miles = item.replace('m','')
                miles_to_yards = int(miles) * 1760
                race_yards += miles_to_yards
            elif 'f' in item: 
                furlongs = item.replace('f', '')
                furlongs_to_yards = int(furlongs) * 220 
                race_yards += furlongs_to_yards 
            elif 'y' in item:
                yards = item.replace('y', '')
                race_yards += int(yards)
        return race_yards
    
    def get_winnings_in_gbp(self, winnings_string: str) -> float:
        """ 
        Where prize money is in euros, convert to gbp using the currency converter API 
        (uses European Central Bank rates)
        """
        c = CurrencyConverter()
        winning_val = winnings_string[1:].replace(',','')
        if winnings_string[0] == '€':
            winnings = float(winning_val)
            return c.convert(winnings, 'EUR', 'GBP')
        else: 
            return float(winning_val)
    
    def get_prize_money_dict(self) -> dict:
        """ 
        Gets the prize money for the race and returns dict of {'position': 'winnings'}
        """
        prize_money_elements = self.browser.find_elements(By.CSS_SELECTOR, "div[class^='PrizeMoney__PrizeSummary-sc-199orl7-3']")[0].text.split('\n')
        prize_money = {} 
        for i, item in enumerate(prize_money_elements):
            if (i % 2 == 0) | (i == 0):
                key = prize_money_elements[i][:-3] #removes text just leaves the position number as a string
                val = prize_money_elements[i+1] 
                prize_money[key] = self.get_winnings_in_gbp(winnings_string = val) 
                
        return prize_money
    
    def add_to_df(self, df):
        """ 
        appends row to pandas dataframe
        TODO: this will change to append row to postgres table using sqlalchemy
        """
        row_dict = {
            "date": self.date,
            "track": self.track, 
            "race_name": self.race_name, 
            "race_age_group": self.race_age_group,
            "race_class": self.race_class,
            "distance": self.distance,
            "going": self.going,
            "runners": self.runners,
            "track_type": self.track_type,
            "race_type": self.race_type,
            "off_time": self.off_time,
            "winning_time": self.winning_time, 
            "prize_money_json": self.prize_money_json, 
            "purse": self.purse
        }
        df = df.append(row_dict, ignore_index=True)
        return df
    
    
class Day:
    def __init__(self, browser, date, race_df, horse_df):
        self.browser = browser
        self.date = date 
        self.race_df = race_df 
        self.horse_df = horse_df
    
    def get_stats(self):
        """ 
        date needs to be format yyyy-mm-dd 2021-12-05
        """
        url_date = str(self.date).split(' ')[0]
        self.browser.get(f'https://www.sportinglife.com/racing/results/{url_date}')
        time.sleep(float(decimal.Decimal(random.randrange(200, 300))/100))
        cookies_button = self.browser.find_elements(By.CSS_SELECTOR, "button[class^='BaseButton__BaseButtonStyled-e225m1-0']")
        if len(cookies_button) > 0:
            cookies_button[0].click() 
        num_races = len(self.browser.find_elements(By.CSS_SELECTOR, "span[class^='Race__RaceTime-sc-16yubq3-1']"))
        race_selectors = ["span[class^='Race__RaceTime-sc-16yubq3-1']", 
                        "div[class^='FutureRace__RaceName-sc-1yen8s9-0']", 
                        "div[class^='FutureRace__RaceDetailsContainer-sc-1yen8s9-1']"] # this gives impression of clicking on different parts of the race button

        for i in range(0, num_races):
            # First get info for the race itself
            print(self.race_df)
            timeout = 10
            idx = random.randint(0,2)
            WebDriverWait(self.browser, timeout).until(EC.visibility_of_element_located((By.CSS_SELECTOR, race_selectors[idx])))
            self.browser.refresh()
            time.sleep(float(decimal.Decimal(random.randrange(100, 200))/100))
            deal_with_popup(self.browser, race_selectors[idx], "popup checking 1")
            races = self.browser.find_elements(By.CSS_SELECTOR, race_selectors[idx])
            
            time.sleep(float(decimal.Decimal(random.randrange(20, 50))/100))
            self.browser.execute_script("arguments[0].click();", races[i])
            time.sleep(float(decimal.Decimal(random.randrange(500, 1000))/100))
            new_race = Race(self.browser)
            new_race.get_race_info()
            
            self.race_df = new_race.add_to_df(self.race_df)
            # print(self.race_df.tail(1))
            # Now get info for each runner in the race
            num_runners = len(self.browser.find_elements(By.CSS_SELECTOR, "div[class^='ResultRunner__StyledResultRunner']") )
            for runs_idx in range(0, num_runners):
                self.browser.refresh()
                time.sleep(float(decimal.Decimal(random.randrange(100, 200))/100))
                deal_with_popup(self.browser, "div[class^='ResultRunner__StyledResultRunner']", "popup checking 2")
                runner = self.browser.find_elements(By.CSS_SELECTOR, "div[class^='ResultRunner__StyledResultRunner']")[runs_idx]
                runner_info = runner.text.split('\n')
                horse = Horse(self.browser, new_race.race_name, new_race.prize_money_json, new_race.race_type)
                deal_with_popup(self.browser, "div[class^='ResultRunner__StyledResultRunner']", "popup checking 3") 
                if len(runner_info) == 0:
                    runner = self.browser.find_elements(By.CSS_SELECTOR, "div[class^='ResultRunner__StyledResultRunner']")[runs_idx]
                    runner_info = runner.text.split('\n')
                horse.get_runner_info(runner_info) 
                horse.get_breeding_info()
                self.horse_df = horse.add_to_df(self.horse_df)
            
            # print(self.horse_df.tail(1))
            self.browser.back()
            time.sleep(float(decimal.Decimal(random.randrange(500, 1200))/100))
        return self.race_df, self.horse_df
        

In [112]:
def deal_with_popup(browser, css_class, message):
    popup = browser.find_elements(By.CSS_SELECTOR, "div[class^='washington-campaign Campaign CampaignType--popup Campaign--css']")
    if len(popup) > 0: 
        time.sleep(float(decimal.Decimal(random.randrange(1000, 1300))/100))
        print(message)
        try:
            popup[0].click()
        except:
            browser.find_elements(By.XPATH, "//body")[0].click()
    while len(browser.find_elements(By.CSS_SELECTOR, css_class)) == 0:
        time.sleep(float(decimal.Decimal(random.randrange(1000, 1100))/100))
        if len(popup) > 0: 
            try:
                popup[0].click()
            except:
                browser.find_elements(By.XPATH, "//body")[0].click()
        print(message)
    

In [90]:
str(day.date).split(' ')[0]

'2021-01-05'

In [122]:
chrome_options = Options()
chrome_options.add_argument("--headless")

browser = webdriver.Chrome(ChromeDriverManager().install())
date_range = pd.date_range('2021-01-02', periods=1)
for date in date_range:
    print(f'getting stats for {date}')
    day = Day(browser, date, race_df, horse_df) 
    race_df, horse_df = day.get_stats()
    


[WDM] - 

[WDM] - Current google-chrome version is 96.0.4664
[WDM] - Get LATEST driver version for 96.0.4664
[WDM] - Driver [/Users/jackpickard/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  browser = webdriver.Chrome(ChromeDriverManager().install())


getting stats for 2021-01-02 00:00:00
Empty DataFrame
Columns: [date, track, race_name, race_age_group, race_class, distance, going, runners, track_type, race_type, off_time, winning_time, prize_money_json, purse]
Index: []


WebDriverException: Message: chrome not reachable
  (Session info: chrome=96.0.4664.55)
Stacktrace:
0   chromedriver                        0x000000010d1c4269 __gxx_personality_v0 + 582729
1   chromedriver                        0x000000010d14fc33 __gxx_personality_v0 + 106003
2   chromedriver                        0x000000010cd0ccdf chromedriver + 171231
3   chromedriver                        0x000000010ccfb7c2 chromedriver + 100290
4   chromedriver                        0x000000010ccfc155 chromedriver + 102741
5   chromedriver                        0x000000010ccfdf42 chromedriver + 110402
6   chromedriver                        0x000000010ccf6c22 chromedriver + 80930
7   chromedriver                        0x000000010cd0e2b3 chromedriver + 176819
8   chromedriver                        0x000000010cd71b0c chromedriver + 584460
9   chromedriver                        0x000000010cd5fc23 chromedriver + 511011
10  chromedriver                        0x000000010cd3575e chromedriver + 337758
11  chromedriver                        0x000000010cd36a95 chromedriver + 342677
12  chromedriver                        0x000000010d1808ab __gxx_personality_v0 + 305803
13  chromedriver                        0x000000010d197863 __gxx_personality_v0 + 399939
14  chromedriver                        0x000000010d19cc7f __gxx_personality_v0 + 421471
15  chromedriver                        0x000000010d198bba __gxx_personality_v0 + 404890
16  chromedriver                        0x000000010d174e51 __gxx_personality_v0 + 258097
17  chromedriver                        0x000000010d1b4158 __gxx_personality_v0 + 516920
18  chromedriver                        0x000000010d1b42e1 __gxx_personality_v0 + 517313
19  chromedriver                        0x000000010d1cb6f8 __gxx_personality_v0 + 612568
20  libsystem_pthread.dylib             0x00007fff6cbfb109 _pthread_start + 148
21  libsystem_pthread.dylib             0x00007fff6cbf6b8b thread_start + 15


In [116]:
race_df

Unnamed: 0,date,track,race_name,race_age_group,race_class,distance,going,runners,track_type,race_type,off_time,winning_time,prize_money_json,purse


# Appendix 
***

In [None]:
# browser.get('https://www.sportinglife.com/racing/results/2021-12-05')
# time.sleep(float(decimal.Decimal(random.randrange(200, 300))/100))
# cookies_button = browser.find_elements(By.CSS_SELECTOR, "button[class^='BaseButton__BaseButtonStyled-e225m1-0']")
# if len(cookies_button) > 0:
#     cookies_button[0].click() 
# num_races = len(browser.find_elements(By.CSS_SELECTOR, "span[class^='Race__RaceTime-sc-16yubq3-1']"))
# race_selectors = ["span[class^='Race__RaceTime-sc-16yubq3-1']", 
#                   "div[class^='FutureRace__RaceName-sc-1yen8s9-0']", 
#                   "div[class^='FutureRace__RaceDetailsContainer-sc-1yen8s9-1']"] # this gives impression of clicking on different parts of the race button

# for i in range(0, num_races):
#     # First get info for the race itself
#     timeout = 10
#     idx = random.randint(0,2)
#     WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.CSS_SELECTOR, race_selectors[idx])))
#     browser.refresh()
#     time.sleep(float(decimal.Decimal(random.randrange(100, 200))/100))
#     deal_with_popup(browser, race_selectors[idx], "popup checking 1")
#     races = browser.find_elements(By.CSS_SELECTOR, race_selectors[idx])
    
#     time.sleep(float(decimal.Decimal(random.randrange(20, 50))/100))
#     browser.execute_script("arguments[0].click();", races[i])
#     time.sleep(float(decimal.Decimal(random.randrange(500, 1000))/100))
#     new_race = Race(browser)
#     new_race.get_race_info()
    
#     race_df = new_race.add_to_df(race_df)
    
#     # Now get info for each runner in the race
#     num_runners = len(browser.find_elements(By.CSS_SELECTOR, "div[class^='ResultRunner__StyledResultRunner']") )
#     for runs_idx in range(0, num_runners):
#         browser.refresh()
#         time.sleep(float(decimal.Decimal(random.randrange(100, 200))/100))
#         deal_with_popup(browser, "div[class^='ResultRunner__StyledResultRunner']", "popup checking 2")
#         runner = browser.find_elements(By.CSS_SELECTOR, "div[class^='ResultRunner__StyledResultRunner']")[runs_idx]
#         runner_info = runner.text.split('\n')
#         horse = Horse(browser, new_race.race_name, new_race.prize_money_json, new_race.race_type)
#         deal_with_popup(browser, "div[class^='ResultRunner__StyledResultRunner']", "popup checking 3") 
#         horse.get_runner_info(runner_info) 
#         horse.get_breeding_info()
#         horse_df = horse.add_to_df(horse_df)
    
#     browser.back()
#     time.sleep(float(decimal.Decimal(random.randrange(500, 1200))/100))
#     print(f'{i+1}/{num_races}')