In [1]:
import sys
from pathlib import Path

# Make the sibling "horse" directory importable so that `calculate_run` can be
# imported below. The path is derived from the kernel's current working
# directory (<cwd>/../horse) -- assumes the kernel was started in the
# notebook's own folder; TODO confirm when running from elsewhere.
module_path = str(Path.cwd().parents[0] / "horse")

# Guard against adding the same entry again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

from calculate_run import calculate_run

In [2]:
from selenium.webdriver.chrome.options import Options
# Chrome launch options for running the browser without a visible window.
# NOTE(review): the driver created later in this notebook is built without
# `options=chrome_options` (that call is commented out), so these flags are
# currently unused.
chrome_options = Options()
chrome_options.add_argument('--headless')  # enable headless mode
chrome_options.add_argument('--disable-gpu')  # work around a Google Chrome bug (per the original author's note)

In [3]:
def update_date_traverse_status(_id, status):
    """Store the traversal outcome for one race date.

    Sets the ``traverse_status`` field of the document in the ``match_date``
    collection (database ``horse`` on the local MongoDB server) whose ``_id``
    equals ``_id``. Nothing is returned.

    Args:
        _id: ``_id`` of the ``match_date`` document to update.
        status: Value written to ``traverse_status`` (this notebook passes 1
            after a completed traversal and -1 after a Selenium lookup/timeout
            failure).
    """
    import pymongo

    selector = {'_id': _id}
    change = {"$set": {"traverse_status": status}}
    # The client is used as a context manager so the connection is closed on exit.
    with pymongo.MongoClient("mongodb://localhost:27017/") as client:
        client["horse"]["match_date"].update_one(selector, change)

In [4]:
def get_match_date_from_mongo():
    """Load the race dates that still have to be traversed.

    Queries the ``match_date`` collection of the ``horse`` database on the
    local MongoDB server for documents whose ``traverse_status`` is none of
    -1, 1 or 2.

    Returns:
        list: The matching documents, fully read into memory before the
        connection is closed.
    """
    import pymongo

    excluded_statuses = (-1, 1, 2)
    pending_filter = {
        "$and": [{"traverse_status": {"$ne": code}} for code in excluded_statuses]
    }
    with pymongo.MongoClient("mongodb://localhost:27017/") as client:
        # list() drains the cursor while the client is still open.
        return list(client["horse"]["match_date"].find(pending_filter))
    

In [5]:
def update_horse_to_mongo(horse):
    """Upsert one horse's result document into the ``match`` collection.

    A document is identified by the triple (``pos``, ``match_no``, ``date``).
    If one already exists it is replaced by ``horse``; otherwise ``horse`` is
    inserted (``upsert=True``). Nothing is returned.

    Args:
        horse: dict holding the result fields; must contain the keys
            ``'pos'``, ``'match_no'`` and ``'date'``.
    """
    import pymongo

    with pymongo.MongoClient("mongodb://localhost:27017/") as client:
        identity = {key: horse[key] for key in ("pos", "match_no", "date")}
        client["horse"]["match"].replace_one(identity, horse, upsert=True)
        
        

In [6]:
def normalize_between(between):
    """Convert the text of the ``between`` result column into a float.

    Presumably the column is the margin (in lengths) to the horse in front --
    TODO confirm against the results page.

    Accepted forms:
      * ``''``, ``'-'``, ``'--'``, ``'---'`` (any substring of ``'---'``) -> ``None``
      * ``'頸位'`` (neck), ``'短馬頭位'`` (short head), ``'頭位'`` (head),
        ``'鼻位'`` (nose), or any substring of one of them -> ``0.0``
      * whole number, e.g. ``'2'`` -> ``2.0``
      * bare fraction, e.g. ``'3/4'`` -> ``0.75``
      * whole number plus fraction, e.g. ``'1-1/4'`` -> ``1.25``
        (if the part after the dash is not an ``a/b`` fraction it counts as 0)

    Args:
        between: Raw cell text.

    Returns:
        float | None: The parsed value, or ``None`` when the text is a
        placeholder, is not a string, or cannot be parsed.
    """
    if not isinstance(between, str):
        return None

    # Substring tests (not equality) are kept from the original implementation
    # so that partial placeholders / labels are treated the same way as before.
    if between in '---':
        return None
    if any(between in label for label in ('頸位', '短馬頭位', '頭位', '鼻位')):
        return 0.0

    parts = between.split('-')
    try:
        if len(parts) == 1:
            pieces = parts[0].split('/')
            if len(pieces) == 1:
                return float(pieces[0])
            if len(pieces) == 2:
                # Bare fraction: previously this path hit an unassigned
                # variable and silently produced None.
                return float(pieces[0]) / float(pieces[1])
            return None
        if len(parts) == 2:
            pieces = parts[1].split('/')
            fraction = float(pieces[0]) / float(pieces[1]) if len(pieces) == 2 else 0
            return float(parts[0]) + fraction
    except (ValueError, ArithmeticError):
        # Non-numeric text or a zero denominator: treat as "no usable value".
        return None
    return None

In [7]:
def _finish_time_to_seconds(finish_time_text):
    """Convert a finish-time string of the form ``M:SS.hh`` into seconds.

    The text before ``':'`` is read as minutes, the text between ``':'`` and
    ``'.'`` as seconds, and the text after ``'.'`` as hundredths of a second
    (it is always divided by 100). Returns ``None`` when any of the three
    parts is not an integer.
    """
    colon_idx = finish_time_text.find(':')
    dot_idx = finish_time_text.find('.')
    try:
        minutes = int(finish_time_text[:colon_idx])
        seconds = int(finish_time_text[colon_idx + 1:dot_idx])
        hundredths = int(finish_time_text[dot_idx + 1:])
    except ValueError:
        return None
    return minutes * 60 + seconds + hundredths / 100


def read_match_result(driver, data):
    """Scrape the race-result page loaded in ``driver`` and store every runner.

    Reads the race header (location, race number, distance, going, track and
    incident report) once, then walks the rows of the ``.performance`` table
    and upserts one document per row through ``update_horse_to_mongo``.

    Args:
        driver: Selenium WebDriver already showing a single race result page.
        data: dict for the race date; the keys ``'date'`` and ``'date_str'``
            are copied into every stored document.

    Raises:
        selenium.common.exceptions.NoSuchElementException: an expected header
            element or row cell is missing (raised by ``find_element``).
        AttributeError: a header regular expression does not match its text.
        TypeError: the location could not be determined (header text lacks
            "賽事日期"), so the normalized track key cannot be built.
    """
    import re
    # Imported locally so the function does not depend on a later cell having
    # imported `By` into the notebook's globals.
    from selenium.webdriver.common.by import By

    def text_of(parent, selector):
        """Return the text of the first element under ``parent`` matching ``selector``."""
        return parent.find_element(By.CSS_SELECTOR, selector).text

    def int_or_none(text):
        """Return ``int(text)`` for all-digit text, otherwise ``None``."""
        return int(text) if text.isdigit() else None

    # --- race header -------------------------------------------------------
    location = None
    meeting_text = text_of(driver, ".raceMeeting_select span.f_fl.f_fs13")
    if "賽事日期" in meeting_text:
        # Skip the token after "賽事日期:" and keep the rest of the line.
        location = re.search(r"賽事日期:\s*\S*\s*(.*)", meeting_text).group(1)

    # Race number is the parenthesised token of the table caption cell.
    match_no = re.search(
        r"\S*\s*\((\S*)\)", text_of(driver, ".race_tab thead td:first-child")
    ).group(1)

    distance = re.search(
        r"\S*\s*-\s*(\S*)米*",
        text_of(driver, ".race_tab tbody tr:nth-child(2) td:first-child"),
    ).group(1)

    land_status = text_of(driver, ".race_tab tbody tr:nth-child(2) td:nth-child(3)")
    track = text_of(driver, ".race_tab tbody tr:nth-child(3) td:nth-child(3)")
    report = text_of(driver, ".race_incident_report .info_p").replace("<br>", "\n")

    # Key passed to calculate_run(): location + first token of track + distance digits.
    distance_digits = re.search(r"(\d+)", distance).group(1)
    track_name = re.search(r"^(\S+)|\s+-\s+.+$", track).group(1)
    normalize_track = location + track_name + distance_digits

    # Races whose going text contains one of these characters are flagged below.
    bad_land = any(substring in land_status for substring in ['濕', '黏', '慢'])

    # --- one document per table row -----------------------------------------
    for performance in driver.find_elements(By.CSS_SELECTOR, ".performance tbody tr"):
        pos_text = text_of(performance, "td:first-child")
        horse_name_text = text_of(performance, "td:nth-child(3)")
        jockey_text = text_of(performance, "td:nth-child(4)")
        trainer_text = text_of(performance, "td:nth-child(5)")
        jwt_text = text_of(performance, "td:nth-child(6)")
        wt_text = text_of(performance, "td:nth-child(7)")
        draw_text = text_of(performance, "td:nth-child(8)")
        between_text = text_of(performance, "td:nth-child(9)")
        progress_text = text_of(performance, "td:nth-child(10)")
        finish_time_text = text_of(performance, "td:nth-child(11)")
        odd_text = text_of(performance, "td:nth-child(12)")

        normalized_time = _finish_time_to_seconds(finish_time_text)

        # Rating relative to the standard time for this track key. Best effort:
        # any failure in the lookup leaves `run` as None.
        # NOTE(review): calculate_run() is intentionally called per row; its
        # result may depend on documents written by earlier iterations -- confirm
        # before hoisting it out of the loop. The meaning of 0.16 is not visible
        # here -- TODO document.
        run = None
        if normalized_time is not None:
            try:
                standard_time = calculate_run(normalize_track).iloc[0]
                run = 100 - (normalized_time - standard_time['marks']) / 0.16 * standard_time['dev']
            except Exception:
                run = None

        # Leading integer of the position text, if there is one.
        pos_match = re.search(r"^\s*(\d+)", pos_text)
        normalized_pos = int(pos_match.group(1)) if pos_match else None

        normalized_between = normalize_between(between_text)

        horse = {
            "date": data['date'],
            "date_str": data['date_str'],
            "pos": pos_text,
            "horse_name": horse_name_text,
            "jockey": jockey_text,
            "trainer": trainer_text,
            "jwt": int_or_none(jwt_text),
            "wt": int_or_none(wt_text),
            "draw": int_or_none(draw_text),
            "finish_time": finish_time_text,
            "normalized_time": normalized_time,
            "odd": odd_text,
            "match_no": match_no,
            "location": location,
            "distance": distance,
            "land_status": land_status,
            "track": track,
            "report": report,
            "normalize_track": normalize_track,
            "between": between_text,
            "progress": progress_text,
            "run": run,
            "normalized_pos": normalized_pos,
            "normalized_between": normalized_between
        }
        if bad_land:
            horse['ignored'] = 1
            horse['examined'] = 1

        update_horse_to_mongo(horse)

In [8]:
def traverse_result(driver, data):
    """Scrape every race of the meeting currently shown in ``driver``.

    Waits up to 10 seconds for the ``.js_racecard`` container, collects the
    result-page URLs of the rows whose venue text contains "沙田" or "跑馬地"
    (the current page URL first, then each link in the row until one whose
    class contains "local"), runs ``read_match_result`` on each URL, and
    finally marks the date as traversed (status 1).

    Args:
        driver: Selenium WebDriver positioned on the results page of one date.
        data: dict for the race date; forwarded to ``read_match_result`` and
            its ``'_id'`` is used for the status update.

    Raises:
        selenium.common.exceptions.TimeoutException: the race-card container
            did not appear in time.
    """
    import time
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC

    wanted_venues = ("沙田", "跑馬地")
    container_locator = (By.CSS_SELECTOR, ".js_racecard")
    container = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(container_locator)
    )

    result_urls = []
    for row in container.find_elements(By.CSS_SELECTOR, "tr"):
        venue_text = row.find_element(By.CSS_SELECTOR, "td.f_tar").text
        if all(name not in venue_text for name in wanted_venues):
            continue
        print(venue_text)

        # The page that is already open is recorded as the first URL of the row.
        landing_url = driver.current_url
        print(landing_url)
        result_urls.append(landing_url)

        for link in row.find_elements(By.CSS_SELECTOR, "td a"):
            # Stop at the first link whose class attribute mentions "local".
            if "local" in link.get_attribute("class"):
                break
            target = link.get_attribute("href")
            print(target)
            result_urls.append(target)

    for target in result_urls:
        # Avoid reloading the page that is already displayed.
        if driver.current_url != target:
            driver.get(target)
        read_match_result(driver, data)
        time.sleep(1)

    update_date_traverse_status(data.get('_id'), 1)
            

In [9]:
# Snapshot of the race-date documents that have not been traversed yet
# (traverse_status not in {-1, 1, 2}); consumed by the scraping loop below.
datas = get_match_date_from_mongo()

In [10]:
# Dump the pending race dates for a quick visual check before scraping.
print(datas)

[{'_id': ObjectId('634929a0be5aff7fd24f674e'), 'date_str': '16/10/2022', 'date': datetime.datetime(2022, 10, 16, 0, 0)}, {'_id': ObjectId('634929a1be5aff7fd24f6755'), 'date_str': '15/10/2022', 'date': datetime.datetime(2022, 10, 15, 0, 0)}, {'_id': ObjectId('6357bcfebe5aff7fd25b2622'), 'date_str': '26/10/2022', 'date': datetime.datetime(2022, 10, 26, 0, 0)}, {'_id': ObjectId('6357bcfebe5aff7fd25b2629'), 'date_str': '23/10/2022', 'date': datetime.datetime(2022, 10, 23, 0, 0)}, {'_id': ObjectId('6357bcfebe5aff7fd25b262f'), 'date_str': '19/10/2022', 'date': datetime.datetime(2022, 10, 19, 0, 0)}]


In [11]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
# Headless variant using `chrome_options`; currently disabled in favour of a
# visible browser window.
#driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
# ChromeDriverManager().install() supplies the chromedriver path for Service.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Results landing page; the scraping loop also reloads it after a failed date.
url = 'https://racing.hkjc.com/racing/information/Chinese/racing/LocalResults.aspx'
driver.get(url)



Current google-chrome version is 106.0.5249
Get LATEST chromedriver version for 106.0.5249 google-chrome
Driver [C:\Users\kenyeung\.wdm\drivers\chromedriver\win32\106.0.5249.61\chromedriver.exe] found in cache


In [12]:
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from datetime import datetime
import traceback
# For every pending race date: choose it in the page's date selector, submit,
# and scrape the meeting through traverse_result().
for idx, data in enumerate(datas):    
    date_str = data['date_str']   
    # Skip dates that are today or later (presumably results are not final
    # yet -- TODO confirm); such dates keep their status and are retried on a
    # later run.
    if datetime.now().date() <= datetime.strptime(date_str, "%d/%m/%Y").date():
        print("skip {date}".format(date = date_str))
        continue
    _id = data.get('_id')
    try:
        selectId = driver.find_element(By.ID, "selectId")
        # Raises NoSuchElementException when the selector has no entry for this date.
        date_option = selectId.find_element(By.CSS_SELECTOR, "option[value='{d}']".format(d=date_str))
        value = date_option.get_attribute('value')
        from selenium.webdriver.support.ui import Select
        select = Select(selectId)          
        select.select_by_value(value)
        print(date_str)
        submitBtn = driver.find_element(By.ID, "submitBtn")
        submitBtn.click()
        traverse_result(driver, data)
    except (NoSuchElementException,TimeoutException) as error:
        # Mark the date as failed (-1) so it is excluded from future runs, then
        # go back to the landing page so the next date starts from a known state.
        # NOTE(review): other exception types are not caught and abort the loop.
        update_date_traverse_status(_id, -1)
        driver.get(url)                
        
        traceback.print_exc()
    # Safety cap: stop once index 80 of `datas` has been processed.
    if idx >= 80:
        break

16/10/2022
沙田:
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16&Racecourse=ST&RaceNo=2
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16&Racecourse=ST&RaceNo=3
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16&Racecourse=ST&RaceNo=4
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16&Racecourse=ST&RaceNo=5
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16&Racecourse=ST&RaceNo=6
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16&Racecourse=ST&RaceNo=7
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/16&Racecourse=ST&RaceNo=8
https://racing.hkjc.com/racing/information/chinese/Racin

Traceback (most recent call last):
  File "C:\Users\kenyeung\AppData\Local\Temp/ipykernel_13988/1939520798.py", line 22, in <module>
    traverse_result(driver, data)
  File "C:\Users\kenyeung\AppData\Local\Temp/ipykernel_13988/2833814994.py", line 7, in traverse_result
    racecard_container = WebDriverWait(driver, 10).until(
  File "C:\Users\kenyeung\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\support\wait.py", line 87, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message: 
Stacktrace:
Backtrace:
	Ordinal0 [0x00251ED3+2236115]
	Ordinal0 [0x001E92F1+1807089]
	Ordinal0 [0x000F66FD+812797]
	Ordinal0 [0x001255DF+1005023]
	Ordinal0 [0x001257CB+1005515]
	Ordinal0 [0x00157632+1209906]
	Ordinal0 [0x00141AD4+1120980]
	Ordinal0 [0x001559E2+1202658]
	Ordinal0 [0x001418A6+1120422]
	Ordinal0 [0x0011A73D+960317]
	Ordinal0 [0x0011B71F+964383]
	GetHandleVerifier [0x004FE7E2+2743074]
	GetHandleVerifier [0

skip 26/10/2022
23/10/2022
沙田:
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23&Racecourse=ST&RaceNo=2
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23&Racecourse=ST&RaceNo=3
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23&Racecourse=ST&RaceNo=4
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23&Racecourse=ST&RaceNo=5
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23&Racecourse=ST&RaceNo=6
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23&Racecourse=ST&RaceNo=7
https://racing.hkjc.com/racing/information/chinese/Racing/LocalResults.aspx?RaceDate=2022/10/23&Racecourse=ST&RaceNo=8
https://racing.hkjc.com/racing/informati

In [14]:
# End the WebDriver session and close the browser started earlier.
driver.quit()