## Daily racing stats scraping notebook
***

In [102]:
import decimal
import hashlib
import json
import random
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from currency_converter import CurrencyConverter
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [99]:

# Column layout of the race-level table (one row per scraped race).
race_cols = [
    # where / when
    "date", "track",
    # race description
    "race_name", "race_age_group", "race_class",
    # conditions
    "distance", "going", "runners", "track_type", "race_type",
    # timing
    "off_time", "winning_time",
    # prize money
    "prize_money_json", "purse",
]

# Column layout of the runner-level table (one row per horse per race).
horse_cols = [
    # keys
    "race_id", "horse_id",
    # the horse itself
    "horse_name", "horse_age", "horse_sex",
    # breeding
    "dam", "sire",
    # connections
    "owner", "trainer", "jockey",
    # performance in the race
    "weight", "position", "beaten_by", "winnings", "stall_number",
    "horse_running_description", "estimated_running_time",
]

In [117]:
# Empty result tables, one per column layout; rows are added as pages are scraped.
race_df, horse_df = (
    pd.DataFrame(columns=column_names) for column_names in (race_cols, horse_cols)
)

In [107]:
from typing import List

class Horse:
    """A single runner, scraped through a shared Selenium browser session.

    Attributes are filled in incrementally by the ``get_*`` methods:
    ``name`` and ``position`` and ``id`` by ``get_horse_details`` and
    ``sire`` by ``get_breeding_details``.
    """

    def __init__(self, browser):
        """Keep a handle on the browser used to read the horse's pages."""
        self.browser = browser

    def get_horse_id(self):
        """Return a stable identifier: the MD5 hex digest of the horse's name.

        Raises:
            AttributeError: if ``self.name`` has not been set yet.
        """
        hash_object = hashlib.md5(self.name.encode())
        return hash_object.hexdigest()

    def get_horse_details(self, position, name=None):
        """Record the finishing position and derive the horse id.

        Args:
            position: finishing position of the horse in the race.
            name: optional horse name. When given it is stored on the
                instance first; otherwise ``self.name`` must already be set,
                because the id is derived from it.
        """
        if name is not None:
            self.name = name
        self.position = position
        # Bound-method call: the previous `self.get_horse_id(self)` passed the
        # instance twice and always raised TypeError.
        self.id = self.get_horse_id()

    def get_breeding_details(self, sire):
        """Record the horse's sire."""
        self.sire = sire

    # function to get horse id
    # function to add record to races dataset
    # function to add horse to horse dataset if not already in there
        
class Race:
    """One race result page, read through a Selenium browser.

    ``get_race_info`` parses the page the browser is currently showing into
    attributes on the instance; ``add_to_df`` turns those attributes into a
    row of the race table.
    """

    # Shared currency converter, created on first use so that the class does
    # not depend on a notebook-level global being defined beforehand.
    _converter = None

    def __init__(self, browser):
        """Keep a handle on the browser that is positioned on the race page."""
        self.browser = browser

    @classmethod
    def _get_converter(cls):
        """Return the shared CurrencyConverter, building it on first call."""
        if cls._converter is None:
            cls._converter = CurrencyConverter()
        return cls._converter

    def get_race_info(self):
        """Parse the current race page into attributes on this instance.

        The date and track are taken from path segments 5 and 6 of the
        current URL; everything else comes from the race summary block and
        the prize-money block of the page.
        """
        url_parts = self.browser.current_url.split('/')
        self.date = url_parts[5]
        self.track = url_parts[6]

        race_info = self.browser.find_elements(By.CSS_SELECTOR, "div[class^='RacePage__SummaryWrapper']")[0].text.split('\n')

        self.race_name = race_info[0]
        race_sub_info = self.get_variable_race_sub_info(race_info[1].split('  |   '))
        self.race_age_group = race_sub_info[0]
        self.race_class = race_sub_info[1]
        self.distance = self.get_distance_in_yards(race_sub_info[2])
        self.going = race_sub_info[3]
        self.runners = race_sub_info[4].split(' ')[0]
        self.track_type = race_sub_info[5]
        timing = race_info[3].split('  |   ')
        self.off_time = timing[0].split(': ')[1]
        self.winning_time = self.get_time_in_seconds(timing[1].split(': ')[1])
        self.race_type = 'Need to dev'
        prize_money = self.get_prize_money_dict()
        self.prize_money_json = json.dumps(prize_money)
        self.purse = sum(prize_money.values())

    def get_variable_race_sub_info(self, race_sub_info: List[str]) -> List[str]:
        """Normalise the summary fields to a fixed six-item layout.

        The summary line has either five fields (no race class) or six.
        When the class is missing it is filled in as ``'N/A'``.

        Returns:
            ``[race_age_group, class_of_race, distance, going, num_runners, track]``

        Raises:
            ValueError: if the input has neither five nor six fields.
        """
        if len(race_sub_info) == 5:
            race_age_group, distance, going, num_runners, track = race_sub_info
            class_of_race = 'N/A'
        elif len(race_sub_info) == 6:
            race_age_group, class_of_race, distance, going, num_runners, track = race_sub_info
        else:
            raise ValueError(
                f'expected 5 or 6 race summary fields, got {len(race_sub_info)}: {race_sub_info!r}'
            )

        return [race_age_group, class_of_race, distance, going, num_runners, track]

    def get_time_in_seconds(self, winning_time: str) -> float:
        """Convert a time such as ``'5m 6.71s'`` or ``'58.3s'`` to seconds.

        The result is also stored on ``self.winning_time``.
        """
        parts = winning_time.split()
        if len(parts) == 1:
            total_time = float(parts[0].replace('s', ''))
        else:
            # Local names avoid shadowing the builtin `min`.
            minutes_part, seconds_part = parts
            total_time = int(minutes_part.replace('m', '')) * 60 + float(seconds_part.replace('s', ''))
        self.winning_time = total_time
        return total_time

    def get_distance_in_yards(self, distance: str) -> int:
        """Convert a distance such as ``'2m 3f 110y'`` to a total in yards.

        Each space-separated segment is scaled by its unit letter:
        ``m`` by 1760, ``f`` by 220 and ``y`` by 1. Segments without any of
        these letters are ignored.
        """
        yards_per_unit = (('m', 1760), ('f', 220), ('y', 1))
        race_yards = 0
        for item in distance.split():
            for unit, factor in yards_per_unit:
                if unit in item:
                    race_yards += int(item.replace(unit, '')) * factor
                    break
        return race_yards

    def get_winnings_in_gbp(self, winnings_string: str) -> float:
        """Convert a prize string such as ``'£1,234.50'`` to a GBP amount.

        The first character is taken as the currency symbol. Amounts prefixed
        with ``€`` are converted from EUR to GBP; any other prefix is treated
        as already being in GBP.
        """
        amount = float(winnings_string[1:].replace(',', ''))
        if winnings_string[0] == '€':
            return self._get_converter().convert(amount, 'EUR', 'GBP')
        return amount

    def get_prize_money_dict(self) -> dict:
        """Read the prize-money block of the page into ``{position: GBP amount}``.

        The block's text is read as alternating lines: a position label
        followed by its amount. A trailing label with no amount is ignored
        rather than raising ``IndexError``.
        """
        prize_money_elements = self.browser.find_elements(By.CSS_SELECTOR, "div[class^='PrizeMoney__PrizeSummary-sc-199orl7-3']")[0].text.split('\n')
        prize_money = {}
        for label, amount in zip(prize_money_elements[0::2], prize_money_elements[1::2]):
            # Dropping the last three characters of the label is meant to
            # leave just the position number as a string.
            prize_money[label[:-3]] = self.get_winnings_in_gbp(winnings_string=amount)

        return prize_money

    def add_to_df(self, df):
        """Return a new DataFrame consisting of ``df`` plus this race as a row.

        ``df`` itself is not modified. ``get_race_info`` must have been
        called first so that the attributes read here exist.
        """
        row_dict = {
            "date": self.date,
            "track": self.track,
            "race_name": self.race_name,
            "race_age_group": self.race_age_group,
            "race_class": self.race_class,
            "distance": self.distance,
            "going": self.going,
            "runners": self.runners,
            "track_type": self.track_type,
            "race_type": self.race_type,
            "off_time": self.off_time,
            "winning_time": self.winning_time,
            "prize_money_json": self.prize_money_json,
            "purse": self.purse
        }
        new_row = pd.DataFrame([row_dict])
        if len(df) == 0:
            # Nothing to concatenate with: keep the caller's column order and
            # let the first row decide the dtypes.
            columns = list(df.columns) + [col for col in new_row.columns if col not in df.columns]
            return new_row.reindex(columns=columns)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        return pd.concat([df, new_row], ignore_index=True)
        

In [108]:
RESULTS_URL = 'https://www.sportinglife.com/racing/results/yesterday'
PAGE_LOAD_TIMEOUT = 10  # seconds to wait for the race list to become visible


def _human_pause(min_hundredths, max_hundredths):
    """Sleep for a random time between the two bounds, given in 1/100 s.

    The upper bound is exclusive. Randomised pauses keep the click rhythm
    irregular between page actions.
    """
    time.sleep(random.randrange(min_hundredths, max_hundredths) / 100)


# Selenium 4 takes the driver path through a Service object; passing it
# positionally is deprecated.
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser.get(RESULTS_URL)
_human_pause(200, 300)

# Dismiss the cookie banner if one is shown.
cookies_button = browser.find_elements(By.CSS_SELECTOR, "button[class^='BaseButton__BaseButtonStyled-e225m1-0']")
if cookies_button:
    cookies_button[0].click()

race_selectors = ["span[class^='Race__RaceTime-sc-16yubq3-1']",
                  "div[class^='FutureRace__RaceName-sc-1yen8s9-0']",
                  "div[class^='FutureRace__RaceDetailsContainer-sc-1yen8s9-1']"] # this gives impression of clicking on different parts of the race button
num_races = len(browser.find_elements(By.CSS_SELECTOR, race_selectors[0]))

for i in range(num_races):
    idx = random.randint(0, len(race_selectors) - 1)
    # Refresh first and wait afterwards, so the wait actually covers the page
    # state that the element lookup below runs against.
    browser.refresh()
    WebDriverWait(browser, PAGE_LOAD_TIMEOUT).until(EC.visibility_of_element_located((By.CSS_SELECTOR, race_selectors[idx])))
    _human_pause(100, 200)
    races = browser.find_elements(By.CSS_SELECTOR, race_selectors[idx])
    if i >= len(races):
        # num_races was counted with the first selector; another selector may
        # match fewer elements, which previously surfaced as an IndexError.
        print(f'{i+1}/{num_races} skipped: selector matched only {len(races)} races')
        continue
    browser.execute_script("arguments[0].click();", races[i])
    _human_pause(500, 1000)
    new_race = Race(browser)
    new_race.get_race_info()
    race_df = new_race.add_to_df(race_df)
    browser.back()
    _human_pause(500, 1200)
    print(f'{i+1}/{num_races}')


[WDM] - 

[WDM] - Current google-chrome version is 96.0.4664
[WDM] - Get LATEST driver version for 96.0.4664
[WDM] - Driver [/Users/jackpickard/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  browser = webdriver.Chrome(ChromeDriverManager().install())


1/30
2/30
3/30
4/30
5/30
6/30
7/30
8/30
9/30
10/30
11/30
12/30
13/30
14/30
15/30
16/30
17/30
18/30
19/30
20/30
21/30
22/30
23/30
24/30
25/30
26/30
27/30
28/30
29/30
30/30


In [113]:
# Total prize money on offer at each track.
race_df.groupby('track')['purse'].sum()

track
cork           172302.34000
huntingdon     162206.00000
kelso          105429.00000
punchestown    154901.50122
Name: purse, dtype: float64

In [48]:
# Average winning time (in seconds) at each track.
race_df.groupby(['track'])['winning_time'].mean()

track
aintree          352.971250
chepstow         323.657500
sandown          288.992857
wetherby         321.400000
wolverhampton    116.105714
Name: winning_time, dtype: float64

# Horse details (runners and breeding info)

***

In [36]:
# Every runner row on the results page the browser is currently showing.
runners = browser.find_elements(
    by=By.CSS_SELECTOR,
    value="div[class^='ResultRunner__StyledResultRunner']",
)

# Raw text fields of the first runner, one list item per line of text.
runners[0].text.split('\n')

['1st',
 '3',
 'Tiquer',
 '1311-9',
 '25/1',
 'T: Joe PontingJ: D A Jacob',
 'Prominent, led 4 out, shaken up after 3 out and went clear, stayed on well, unchallenged home straight op 20/1 tchd 18/1',
 'My Stable',
 'My Stable']

In [41]:
# Find the anchor whose visible text is the horse's name and open it.
horseLinks = browser.find_elements(By.XPATH, "//a")
selectedHorse = list(filter(lambda link: link.text == 'Tiquer', horseLinks))
if selectedHorse:
    selectedHorse[0].click()

In [47]:
# Breeding details: the text of the first header data table, split into lines.
browser.find_elements(
    by=By.CSS_SELECTOR,
    value="table[class^='Header__DataTable']",
)[0].text.split('\n')

['Age 13 (Foaled 3rd April 2008)',
 'Trainer Joe Ponting',
 'Sex Gelding',
 'Sire Equerry',
 'Dam Tirenna',
 'Owner Mr J Ponting']