In [1]:
%%time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

import time

from selenium.webdriver.chrome.options import Options
import threading

#--------------------------------------------------------------------------------------
from multiprocessing.dummy import Pool as ThreadPool

threadLocal = threading.local()


# Function to open web driver
def get_driver():
    """Create and return a new headless Chrome WebDriver.

    The chromedriver binary is expected at ``/usr/local/chromedriver``.
    The caller owns the returned driver and is responsible for quitting it.
    """
    # NOTE(review): passing the driver path positionally relies on the
    # legacy ``executable_path`` argument; newer Selenium 4 releases expect a
    # ``Service`` object instead -- confirm against the installed version.
    headless_opts = Options()
    headless_opts.add_argument("--headless")
    return webdriver.Chrome("/usr/local/chromedriver", options=headless_opts)


def table(url):
    """Scrape the sectional-time table of a single race page.

    Parameters
    ----------
    url : str
        Page address. It must contain ``RaceDate=`` followed by a
        10-character date and ``RaceNo=`` followed by the race number;
        both values are copied into every returned row.

    Returns
    -------
    list of list
        One list per ``<tr>`` in the table body, laid out as: the non-empty
        ``td.string`` values, six section-time slots (``<p>`` strings), six
        margin slots (``<i>`` strings), the date and the race number.
        Missing section-time / margin slots are padded with ``''``.
        An empty list is returned when the table does not show up within
        10 seconds or when it has no body.

    Notes
    -----
    The browser is always shut down before returning, including on the
    timeout path and when an exception propagates, so failed pages do not
    leave orphaned Chrome processes behind.
    """
    # Parse the identifiers first so a malformed url fails before a browser
    # is started.
    date = str(url.split('RaceDate=')[1][0:10])
    match = str(url.split('RaceNo=')[1])

    driver = get_driver()
    try:
        driver.get(url)

        # Wait up to 10 secs so that the dynamic content has time to load.
        # Give up on this page (best effort) if the table never appears.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//table[@class="table_bd f_tac race_table"]')))
        except Exception as e:
            print(e, date, match, url)
            return []

        page_html = driver.page_source
    finally:
        # quit() ends the whole session, so a separate close() is not needed.
        driver.quit()

    soup = BeautifulSoup(page_html, 'html.parser')
    race_table = soup.find('table', class_='table_bd f_tac race_table')
    body = race_table.find('tbody') if race_table is not None else None
    if body is None:
        # Treat an unexpected page layout like a failed load instead of
        # letting an AttributeError abort the whole crawl.
        print('No sectional time rows found', date, match, url)
        return []

    output_list = []

    for tr in body.find_all('tr'):
        # .string is None for tags with several children and may be empty;
        # both kinds of value are dropped.
        position = [td.string for td in tr.find_all('td') if td.string]
        section_time = [p.string for p in tr.find_all('p') if p.string]
        margin_behind = [i.string for i in tr.find_all('i') if i.string]

        # Keep a consistent width of 6 slots for each group.
        section_time = section_time + [''] * (6 - len(section_time))
        margin_behind = margin_behind + [''] * (6 - len(margin_behind))

        output_list.append(
            position + section_time + margin_behind + [date, match])

    return output_list


# Function for multi-threading
def main(url_list=None, n_threads=10):
    """Scrape many race pages concurrently with a thread pool.

    Parameters
    ----------
    url_list : list of str, optional
        Pages to scrape. When omitted, the module-level ``urls`` list is
        used, which keeps the original ``main()`` call working.
    n_threads : int, optional
        Number of worker threads (default 10).

    Returns
    -------
    list
        One entry per url, in input order; each entry is whatever
        ``table(url)`` returned for that page.
    """
    if url_list is None:
        url_list = urls

    pool = ThreadPool(n_threads)
    try:
        return pool.map(table, url_list)
    finally:
        # Release the worker threads even when a page raised.
        pool.close()
        pool.join()


if __name__ == "__main__":
    url_front = "https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate="
    urls = []  # each entry will end like: 01/01/2010&RaceNo=1

    # Load the previously cleaned race results; they supply every unique
    # race day / match combination.
    race_result_full = pd.read_csv('clean/race_result_full.csv')

    # Keep the url on the frame itself so a failing page can be traced back
    # to its race easily.
    race_day = pd.to_datetime(race_result_full['date'], format='%Y/%m/%d').dt.strftime('%d/%m/%Y')
    race_no = race_result_full['match'].astype(str)
    race_result_full['url'] = url_front + race_day + '&RaceNo=' + race_no

    # Crawl each distinct url only once.
    urls = list(race_result_full['url'].unique())

    result = main()

    # Flatten the per-page lists of rows into one list of rows.
    result = [row for page_rows in result for row in page_rows]

    write_to_csv = pd.DataFrame(
        result,
        columns=(
            ["finishing_order", "horse_no", "horse", "time"]
            + [f"section_time_{k}" for k in range(1, 7)]
            + [f"margin_behind_{k}" for k in range(1, 7)]
            + ["date", "match"]
        ),
    )

Message: 
 06/06/2018 6 https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate=06/06/2018&RaceNo=6
Message: 
 03/05/2020 3 https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate=03/05/2020&RaceNo=3
Message: 
 05/06/2019 3 https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate=05/06/2019&RaceNo=3
Message: 
 16/06/2019 1 https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate=16/06/2019&RaceNo=1
Message: 
 16/02/2020 6 https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate=16/02/2020&RaceNo=6
Message: 
 19/12/2009 8 https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate=19/12/2009&RaceNo=8
Message: 
 13/04/2011 4 https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate=13/04/2011&RaceNo=4
Message: 
 13/01/2010 3 https://racing.hk

In [2]:
# Display the DataFrame built from the scraped rows (the bare last
# expression of a cell is rendered by the notebook).
write_to_csv

Unnamed: 0,finishing_order,horse_no,horse,time,section_time_1,section_time_2,section_time_3,section_time_4,section_time_5,section_time_6,margin_behind_1,margin_behind_2,margin_behind_3,margin_behind_4,margin_behind_5,margin_behind_6,date,match
0,1,7,TELEPHATIA(P405),1:49.08,14.91,22.54,23.42,24.31,23.90,,7-3/4,15-1/4,12,5-1/4,SH,,01/01/2015,1
1,2,1,NAMJONG TURBO(N250),1:49.08,13.95,21.86,24.10,24.75,24.42,,1-3/4,5,6,2,SH,,01/01/2015,1
2,3,12,HEAR THE ROAR(M152),1:49.38,14.51,22.30,23.38,24.59,24.60,,5-1/4,11-1/4,7-3/4,2-3/4,2,,01/01/2015,1
3,4,5,CASA JUNIOR(M366),1:49.40,13.79,21.82,23.98,25.15,24.66,,3/4,3-3/4,4,2-1/2,2,,01/01/2015,1
4,5,8,JOYFUL MISSION(S094),1:49.40,14.71,22.02,23.70,24.63,24.34,,6-1/2,10-3/4,9-1/4,4-1/2,2,,01/01/2015,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121841,7,3,CHARLES THE GREAT(N171),1:09.40,24.56,22.62,22.22,,,,6-1/4,3-3/4,4-1/2,,,,26/10/2014,8
121842,8,4,FREDERICK ENGELS(N187),1:09.46,24.32,22.70,22.44,,,,4-3/4,2-3/4,4-3/4,,,,26/10/2014,8
121843,9,8,BULLISH FRIEND(N333),1:09.53,24.00,22.94,22.59,,,,2-3/4,2-1/4,5-1/4,,,,26/10/2014,8
121844,10,2,STERLING CITY(N152),1:09.76,24.24,22.74,22.78,,,,4-1/4,2-1/2,6-3/4,,,,26/10/2014,8


In [3]:
# Destination file for the scraped sectional times.
filepath = "clean/sectional_time.csv"

# Write the frame without its positional index column.
write_to_csv.to_csv(path_or_buf=filepath, index=False)