In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import re

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import NavigableString

import time
import numpy as np
import csv

# Configure Firefox to run headless (no visible browser window).
options = Options()
options.headless = True

# Start the Firefox session using the geckodriver binary at this relative path.
# `driver` is a module-level object; the crawl function defined further down uses it directly.
# NOTE(review): `options.headless` and the `executable_path` argument appear to be
# deprecated in newer Selenium 4 releases — confirm against the installed Selenium version.
driver = webdriver.Firefox(executable_path="../Others/geckodriver.exe",options=options)

In [None]:
# Load the race results that were scraped previously; its `horseid` column is
# used later to decide which horse pages to crawl.
race_result = pd.read_csv('race_result_2021season.csv') #, encoding= 'unicode_escape', low_memory=False)

In [None]:
# Step over text nodes (e.g. whitespace) to reach the neighbouring tag
def get_sibling(tag, previous=False):
    """Return the nearest sibling of ``tag`` that is not a ``NavigableString``.

    Walks forward through ``next_sibling`` by default, or backward through
    ``previous_sibling`` when ``previous`` is truthy, skipping every
    ``NavigableString`` node on the way.

    Returns the first non-string sibling found, or ``None`` when the chain
    of siblings ends first.
    """
    # Both directions use the same walk; only the attribute followed differs.
    direction = "previous_sibling" if previous else "next_sibling"

    node = getattr(tag, direction)
    while isinstance(node, NavigableString):
        node = getattr(node, direction)
    return node

In [None]:
# crawl function
def form_record(url, horseid):
    """Load one form-record page and return its table rows.

    Parameters
    ----------
    url : str
        Address of the page, fetched with the module-level Selenium ``driver``.
    horseid
        Identifier appended as the last field of every row; also printed when
        the page fails to load.

    Returns
    -------
    list of list
        One row per link whose ``href`` contains "RaceDate". Each row holds the
        link text, then the cleaned text of every following sibling tag of the
        link's parent, then ``horseid``. An empty list is returned when the page
        contains "No information." or when the expected element does not appear
        within 10 seconds.
    """
    # Fetch the page
    driver.get(url)

    # Nothing to scrape for this horse
    if "No information." in driver.page_source:
        return []

    # Wait up to 10 seconds so that the dynamic content has time to load.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "htable_eng_text")))
    except Exception as e:
        # Report and return "no rows". Returning the exception object itself
        # would be treated by the caller as row data and break the crawl.
        print("An exception occurred: ", horseid, repr(e))
        return []

    # Load the page into BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Every link whose href contains "RaceDate" marks the first cell of one row
    # (presumably the race index link — verify against the page markup).
    race_links = soup.find_all(href=re.compile("RaceDate"))

    # 'form_record_list' is the whole table, 'output' is a single row
    form_record_list = []

    for link in race_links:
        # First field: the text of the link itself
        output = [link.text.strip()]

        # Walk the remaining cells of the row: the tag siblings that follow
        # the link's parent.
        cell = get_sibling(link.parent)
        while cell is not None:
            output.append(cell.text
                          .strip()
                          # Collapse multi-line cells onto one line (the original
                          # author notes this is for the running positions).
                          .replace('\n','')
                          .replace(' '*20,' ')
                          .replace("-", " ")
                         )
            cell = get_sibling(cell)

        output.append(horseid)
        form_record_list.append(output)

    return form_record_list

In [None]:
# Crawl each unique horse id once: every form-record page covers a single horse.
id_fetch = list(race_result.horseid.unique())
n = len(id_fetch)

start_time = time.time()

url_front = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId="

# Path of the CSV file the scraped rows are written to
filepath = "form_record_2021.csv"

# NOTE(review): the Selenium `driver` is not closed in this cell — call
# driver.quit() once no further crawling is needed.
with open(filepath, 'w', newline='') as csvfile:
    mywriter = csv.writer(csvfile)
    # Header row
    mywriter.writerow(["RaceIndex", "Pla", "Date", "RC/Track/Course", "Dist", "Ground", 
                      "RaceClass", "Draw", "Rating", "Trainer", "Jockey", "LBW", "WinOdds", 
                       "ActWt", "RunPo", "FinishTime", "Declare_Horse_Wt", "Gear", "VideoReplay", "horseid"])

    for i, horse_id in enumerate(id_fetch):
        # Pause between requests
        time.sleep(5)
        url = url_front + str(horse_id) + "&Option=1"

        # Fetch and parse the page for this horse
        content = form_record(url, horse_id)

        # Guard against a non-list result (e.g. an exception object handed back
        # after a load failure) so that one bad page cannot abort the crawl.
        if not isinstance(content, list):
            print(horse_id, i, "skipped:", repr(content))
            continue

        # Rows are written as plain lists. Converting them to a NumPy array
        # first is unnecessary and fails on rows of unequal length.
        if len(content) > 0:
            mywriter.writerows(content)
            print(horse_id, i, "saved.")

print("It takes:", (time.time() - start_time)/60, "minutes")
print("Done")