In [1]:
%%time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import NavigableString

import time
import numpy as np

from selenium.webdriver.chrome.options import Options
import threading

#--------------------------------------------------------------------------------------
from multiprocessing.dummy import Pool as ThreadPool

# Thread-local storage object. NOTE(review): no function visible in this cell
# reads or writes it — confirm whether it is still needed.
threadLocal = threading.local()


# Function to open web driver
def get_driver():
    """Start a headless Chrome session and return its WebDriver.

    The chromedriver executable is taken from ``/usr/local/chromedriver``.
    Every call launches a new browser instance; the caller owns the returned
    driver and is responsible for shutting it down.
    """
    headless_opts = Options()
    headless_opts.add_argument("--headless")
    return webdriver.Chrome("/usr/local/chromedriver", options=headless_opts)


# Step to the adjacent tag, skipping any whitespace/text nodes in between
def get_sibling(tag, previous=False):
    """Return the nearest sibling of ``tag`` that is not a NavigableString.

    Args:
        tag: Node to start from; it must expose ``next_sibling`` and
            ``previous_sibling``.
        previous: When truthy, walk backwards through ``previous_sibling``;
            otherwise walk forwards through ``next_sibling``.

    Returns:
        The first sibling in the chosen direction that is not a
        ``NavigableString``, or ``None`` once the chain of siblings runs out.
    """
    # Both directions follow the same walk; only the attribute name differs.
    step = "previous_sibling" if previous else "next_sibling"
    node = getattr(tag, step)
    while isinstance(node, NavigableString):
        node = getattr(node, step)
    return node



# crawl function
def _extract_form_rows(soup, horse_id):
    """Collect one list of cell texts per race row found in ``soup``."""
    rows = []
    # Each row is located through a tag whose href contains "RaceDate".
    for link in soup.find_all(href=re.compile("RaceDate")):
        # The link text becomes the first field of the row.
        row = [link.text.strip()]
        # Remaining fields: every following sibling tag of the link's parent.
        cell = get_sibling(link.parent)
        while cell is not None:
            row.append(cell.text
                       .strip()
                       .replace('\n', '')
                       .replace(' ' * 20, ' '))
            cell = get_sibling(cell)
        # The horse id is appended as the last field of every row.
        row.append(horse_id)
        rows.append(row)
    return rows


def form_record(url):
    """Scrape the form-record rows from one horse page.

    Args:
        url: Page address containing ``HorseId=<id>``; the 12 characters that
            follow ``HorseId=`` are used as the horse id.

    Returns:
        A list of rows, each a list of strings ending with the horse id.
        An empty list is returned when the page contains "No information."
        or when the results table does not appear within 10 seconds.

    The browser session is always shut down before returning, including on
    the early-return paths and when an exception propagates.
    """
    driver = get_driver()
    try:
        # Fetch the page
        driver.get(url)

        # get horse id from url
        horse_id = url.split('HorseId=')[1][0:12]

        # Nothing to scrape for this horse.
        if driver.page_source.find("No information.") != -1:
            return []

        # Give the dynamic content up to 10 seconds to render.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "htable_eng_text")))
        except Exception:
            print("An exception occurred: ", horse_id)
            return []

        # Load the rendered page into BeautifulSoup and pull out the rows.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return _extract_form_rows(soup, horse_id)
    finally:
        # quit() ends the whole session, not just the current window, so a
        # browser/driver process is not left behind for every URL crawled.
        # NOTE(review): quit() was previously commented out while
        # investigating a "quit unexpectedly" report — revisit if it recurs.
        driver.quit()


# Function for multi-threading
def main(url_list=None, workers=10):
    """Run ``form_record`` over many URLs on a pool of worker threads.

    Args:
        url_list: URLs to crawl. When omitted, the module-level ``urls`` list
            is used, so existing ``main()`` calls behave as before.
        workers: Number of worker threads in the pool (default 10).

    Returns:
        A list with one ``form_record`` result per URL, in input order.

    Raises:
        Any exception raised by ``form_record``; the pool is closed and joined
        before the exception propagates.
    """
    targets = urls if url_list is None else url_list

    pool = ThreadPool(workers)
    try:
        return pool.map(form_record, targets)
    finally:
        # Always release the worker threads, even when a task raised.
        pool.close()
        pool.join()


if __name__ == "__main__":
    # Common prefix of every horse page; the horse id and "&Option=1" follow it.
    url_front = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId="

    # The previously scraped race results supply the horse ids to crawl.
    race_result = pd.read_csv('clean/race_result.csv')

    # Each form-record page covers one horse, so crawl every unique id once.
    id_fetch = list(race_result["horseid"].unique())

    # `urls` has to exist at module level before main() runs: main() reads it.
    urls = [url_front + str(_id) + "&Option=1" for _id in id_fetch]

    # main() returns one list of rows per URL ...
    result = main()

    # ... which is flattened into a single list of rows.
    result = [row for page_rows in result for row in page_rows]

    write_to_csv = pd.DataFrame(
        result,
        columns=[
            "RaceIndex", "Pla", "Date", "RC/Track/Course", "Dist",
            "Ground", "RaceClass", "Draw", "Rating", "Trainer",
            "Jockey", "LBW", "WinOdds", "ActWt", "RunPo",
            "FinishTime", "Declare_Horse_Wt", "Gear", "VideoReplay", "horseid",
        ],
    )

  return caller(func, *(extras + args), **kw)


An exception occurred:  HK_2015_V325
CPU times: user 10min 2s, sys: 1min 45s, total: 11min 48s
Wall time: 1h 8min


In [3]:
# Preview the assembled form-record table as the cell's rich output.
write_to_csv

Unnamed: 0,RaceIndex,Pla,Date,RC/Track/Course,Dist,Ground,RaceClass,Draw,Rating,Trainer,Jockey,LBW,WinOdds,ActWt,RunPo,FinishTime,Declare_Horse_Wt,Gear,VideoReplay,horseid
0,356,12,21/01/2018,"ST / Turf / ""A""",2000,G,5,12,35,A Lee,M L Yeung,7-3/4,16,126,13 13 13 13 12,2.05.83,1095,B/TT,,HK_2012_P405
1,210,03,22/11/2017,"HV / Turf / ""C+3""",2200,G,5,6,35,A Lee,M F Poon,1-1/4,8.7,121,12 11 10 8 3 3,2.18.54,1095,B/TT,,HK_2012_P405
2,145,01,29/10/2017,"HV / Turf / ""A""",2200,GF,5,5,30,A Lee,M L Yeung,1-1/4,5.8,128,9 6 5 5 5 1,2.18.77,1086,B/TT,,HK_2012_P405
3,092,04,08/10/2017,"ST / Turf / ""B+2""",2000,G,5,2,31,A Lee,M F Poon,2-1/2,7.9,120,4 7 5 6 4,2.03.98,1082,B/TT,,HK_2012_P405
4,002,09,03/09/2017,"ST / Turf / ""B""",1600,G,5,10,32,A Lee,M F Poon,4,40,118,11 9 8 9,1.36.80,1084,B/TT,,HK_2012_P405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83289,814,09,07/07/21,"HV / Turf / ""A""",1200,G,3,12,71,C H Yip,B Shinn,4-1/4,87,124,11 10 9,1.10.16,1090,--,,HK_2020_E396
83290,820,12,11/07/21,"ST / Turf / ""A""",1200,GF,4,3,52,P O'Sullivan,R Maia,9,172,125,8 9 12,1.09.89,1036,--,,HK_2020_E200
83291,824,06,11/07/21,"ST / Turf / ""A""",1200,GF,3,1,73,F C Lor,C Y Ho,4-1/2,15,127,3 4 6,1.09.25,1117,--,,HK_2020_E315
83292,826,08,11/07/21,"ST / Turf / ""A""",1400,GF,3,11,75,K W Lui,C Y Ho,4-3/4,87,131,12 12 13 8,1.21.71,1062,--,,HK_2020_E347


In [4]:
# Destination file for the scraped form records.
filepath = "clean/form_record.csv"

# index=False keeps the DataFrame's row index out of the CSV.
write_to_csv.to_csv(path_or_buf=filepath, index=False)