In [64]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
from datetime import datetime
import scrapy

### Links to scrape

In [None]:
# Hand-picked HKJC racing pages to scrape (judging by the URL paths: entries,
# horses listed by location, jockey ranking, and one jockey's past record).
# NOTE(review): `links` is not referenced by any later cell visible here — confirm it is still needed.
links = [
    "https://racing.hkjc.com/racing/information/English/racing/Entries.aspx",
    "https://racing.hkjc.com/racing/information/English/Horse/ListByLocation.aspx?Location=HK",
    "https://racing.hkjc.com/racing/information/English/Jockey/JockeyRanking.aspx",
    "https://racing.hkjc.com/racing/information/English/Jockey/JockeyPastRec.aspx?JockeyId=PZ&Season=Current",
]

### Get all the links from the subNav menu

In [14]:
def extract_menu_links(url, menu_rel=None, timeout=30):
    """Collect the absolute URLs linked from a page's navigation menu.

    Args:
        url: Page to download; also used as the base for resolving relative hrefs.
        menu_rel: Optional value of the ``rel`` attribute of a menu header ``<a>``.
            When given, only the ``div.pullmenu`` inside that header's parent
            ``<li>`` is scanned. When omitted, every ``div.subNav`` is scanned.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        A list of unique absolute URLs in first-seen (document) order. An empty
        list is returned when ``menu_rel`` is given but no matching header exists.
    """
    html_content = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html_content, "html.parser")
    links = []

    def process_menu(menu_div):
        # find_all() searches all descendants, so <dl>/<dt> elements nested in
        # "pullmenu" submenus are already visited here; no recursion is needed.
        for dl in menu_div.find_all("dl"):
            for dt in dl.find_all("dt"):
                anchor = dt.find("a", href=True)
                if anchor is None:
                    continue
                href = anchor["href"]
                # Skip empty hrefs and pseudo-links that do not point at a page.
                if href and not href.startswith(("javascript:", "mailto:")):
                    links.append(urljoin(url, href))

    # Find target menu section using rel attribute
    if menu_rel:
        menu_header = soup.find("a", rel=menu_rel)
        if not menu_header:
            return []
        parent_li = menu_header.find_parent("li")
        menu_div = parent_li.find("div", class_="pullmenu") if parent_li else None
        if menu_div:
            process_menu(menu_div)
    else:
        for nav in soup.find_all("div", class_="subNav"):
            process_menu(nav)

    # dict.fromkeys de-duplicates while keeping a stable, reproducible order
    # (set() ordering varies between interpreter runs).
    return list(dict.fromkeys(links))


# Scan only the menu whose header <a> has rel="racing-info" on the racing index
# page, then show the collected links as the cell output.
target = "https://racing.hkjc.com/racing/english/index.aspx"
rel = "racing-info"
all_links = extract_menu_links(target, menu_rel=rel)
all_links


['https://racing.hkjc.com/racing/information/English/Racing/JKCScheduledRides.aspx',
 'https://racing.hkjc.com/racing/information/English/Racing/Localtrackwork.aspx',
 'https://racing.hkjc.com/racing/english/racing-info/newhorse.asp',
 'https://racing.hkjc.com/racing/english/racing-info/racing_course_time.aspx',
 'https://racing.hkjc.com/racing/information/english/Racing/summary.aspx',
 'https://racing.hkjc.com/racing/information/English/Reports/RaceReportFull.aspx',
 'https://racing.hkjc.com/racing/information/English/Racing/ExceptionalFactors.aspx',
 'https://racing.hkjc.com/racing/information/English/tnc/tncStat.aspx',
 'https://member.hkjc.com/member/english/horse-owner/list-of-bloodstock-agents.aspx',
 'https://racing.hkjc.com/racing/information/english/Horse/HorseFormerName.aspx',
 'https://racing.hkjc.com/racing/information/English/Racing/TNCEntries.aspx',
 'https://campaign.hkjc.com/en/racing/conghua-movement-records.aspx',
 'https://racing.hkjc.com/racing/english/racing-info/r

### Test: Parsing tables from a link

In [None]:
# Download the Draw.aspx page once; its raw bytes (`url_get.content`) are handed
# to scrape_racing_tables() at the end of this cell.
url = "https://racing.hkjc.com/racing/information/English/racing/Draw.aspx"
url_get = requests.get(url)


def scrape_racing_tables(html_content):
    """Parse the per-race statistics tables of a page into one DataFrame.

    For every ``div.searchResult`` containing a ``table.table_bd``, the text of
    the first cell of the ``tr.bg_blue`` row is used as the race label and each
    data row is turned into one record.

    Args:
        html_content: HTML markup (str or bytes) of the page to parse.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Race``, ``Draw``, ``Runners``,
        ``Win``, ``Second``, ``Third``, ``Fourth``, ``Win%``, ``Place%``,
        ``Show%`` and ``Fourth%`` (all values are stripped strings). The columns
        are present even when no rows were found.
    """
    column_names = (
        "Draw",
        "Runners",
        "Win",
        "Second",
        "Third",
        "Fourth",
        "Win%",
        "Place%",
        "Show%",
        "Fourth%",
    )
    soup = BeautifulSoup(html_content, "html.parser")

    all_data = []

    for result in soup.find_all("div", class_="searchResult"):
        table = result.find("table", class_="table_bd")
        if table is None:
            continue

        # Extract race header information; tolerate a header row without a <td>.
        header = table.find("tr", class_="bg_blue")
        header_cell = header.td if header is not None else None
        race_info = header_cell.text.strip() if header_cell is not None else "Unknown Race"

        # html.parser does not synthesize a <tbody>, so fall back to the table
        # itself when the markup has none instead of failing on None.
        body = table.tbody if table.tbody is not None else table

        for row in body.find_all("tr"):
            cols = row.find_all("td")
            # NOTE(review): rows need at least 11 cells although only the first
            # 10 are read — presumably there is a trailing column; confirm.
            if len(cols) >= 11:
                record = {"Race": race_info}
                for column, cell in zip(column_names, cols):
                    record[column] = cell.text.strip()
                all_data.append(record)

    return pd.DataFrame(all_data, columns=["Race", *column_names])


# Parse the page downloaded above; the resulting DataFrame is the cell output.
scrape_racing_tables(url_get.content)

### Horse Info

In [49]:
def scrape_horse_info_links(url: str, timeout: float = 30) -> list[str]:
    """Return the absolute URLs of the horse links found on a listing page.

    Every ``<a class="table_eng_text">`` inside a ``<td class="table_eng_text">``
    that carries a non-empty ``href`` contributes one entry.

    Args:
        url: Listing page to download; also the base for resolving hrefs.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        The resolved URLs in document order (duplicates are kept).
    """
    html_content = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html_content, "html.parser")

    hrefs = []
    for cell in soup.find_all("td", class_="table_eng_text"):
        for link in cell.find_all("a", {"class": "table_eng_text"}):
            href = link.get("href")
            if href:
                # urljoin handles root-relative, page-relative and already
                # absolute hrefs; plain concatenation only works for the first.
                hrefs.append(urljoin(url, href))
    return hrefs


# links = scrape_horse_info_links(
#     "https://racing.hkjc.com/racing/information/english/Horse/ListByLocation.aspx?Location=HK"
# )

# TODO: Rewrite the function below; its parsing logic is too convoluted.


def _coerce_profile_value(raw_value):
    """Turn a scraped string into an int or a 4-int list when it is purely numeric."""
    try:
        # Amounts such as "$1,234" become 1234.
        digits = raw_value.replace("$", "").replace(",", "")
        if digits.isdigit():
            return int(digits)
        # Four hyphen-separated numbers such as "1-2-3-10" become [1, 2, 3, 10].
        parts = raw_value.split("-")
        if len(parts) == 4:
            return [int(part) for part in parts]
    except ValueError:
        # Not actually numeric (e.g. a hyphenated phrase): keep the text as is.
        pass
    return raw_value


def parse_horse_profile(url, timeout=30):
    """Download a horse profile page and flatten it into a dict.

    The result always contains ``"form_links"`` (link text -> absolute URL, taken
    from the first ``td.table_eng_text``) and, when a ``span.title_text`` exists,
    ``"name"``. Every ``<tr>`` of every ``table.table_eng_text`` is then read as
    a label in the first cell and a value in the third cell:

    * a ``<select>`` value becomes a list of its option texts;
    * a non-javascript link becomes ``"<key> Text"`` and ``"<key> URL"``;
    * a value containing parentheses is stored as ``"Current Stable Location"``
      and ``"Arrival Date"`` (ISO date when it parses as ``dd/mm/YYYY``);
    * numeric strings are converted, and ``"a/b"`` keys are split into separate
      entries paired with the ``/``-separated parts of the value.

    NOTE(review): a later cell defines another ``parse_horse_profile`` which
    replaces this one once executed.
    NOTE(review): the parenthesis rule is applied regardless of the row's label —
    confirm no other row contains parentheses.

    Args:
        url: Profile page to download; also the base for resolving hrefs.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        dict mapping field names to the parsed values.
    """
    html_content = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html_content, "html.parser")
    profile = {"form_links": {}}

    # Name extraction (skipped when the title span is missing)
    if title := soup.find("span", {"class": "title_text"}):
        profile["name"] = title.get_text(strip=True)

    # Form links parsing
    if left_col := soup.find("td", class_="table_eng_text"):
        profile["form_links"] = {
            link.text.strip(): urljoin(url, link["href"])
            for link in left_col.find_all("a", href=True)
            if not link["href"].startswith("javascript")
        }

    # Main data parsing
    for table in soup.find_all("table", class_="table_eng_text"):
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            # Rows without a third cell (titles, spacers) carry no key/value pair.
            if len(cells) < 3:
                continue

            # presumably cells[1] is only a separator column — TODO confirm
            key = cells[0].get_text(strip=True).rstrip(":*")
            value_cell = cells[2]

            # Handle dropdowns FIRST so their options are not mistaken for text.
            if select := value_cell.find("select"):
                profile[key] = [
                    opt.get_text(strip=True) for opt in select.find_all("option")
                ]
                continue  # Skip link processing for this cell

            # Then handle links (but skip javascript links)
            if link := value_cell.find(
                "a", href=lambda h: h and not h.startswith("javascript")
            ):
                profile[f"{key} Text"] = link.get_text(strip=True)
                profile[f"{key} URL"] = urljoin(url, link["href"])
                continue

            # Extract and clean value
            raw_value = value_cell.get_text(separator=" ", strip=True)

            # Date handling: "<location> (<dd/mm/YYYY>)"
            if "(" in raw_value and ")" in raw_value:
                location, date_part = raw_value.split("(", 1)
                date_str = date_part.split(")")[0].strip()
                try:
                    date = datetime.strptime(date_str, "%d/%m/%Y").date().isoformat()
                except ValueError:
                    # Not a dd/mm/YYYY date: keep the raw text between the parentheses.
                    date = date_str
                profile["Current Stable Location"] = location.strip()
                profile["Arrival Date"] = date
                continue

            # Numeric conversion
            value = _coerce_profile_value(raw_value)

            # Split compound keys
            if "/" in key:
                parts = key.split("/")
                values = value.split("/") if isinstance(value, str) else [value]
                # zip() stops at the shorter side, so surplus key parts are dropped.
                for p, v in zip(parts, values):
                    profile[p.strip()] = v.strip() if isinstance(v, str) else v
            else:
                profile[key] = value

    return profile


# Try the parser on a single horse profile page; the returned dict is the cell output.
parse_horse_profile(
    "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K079"
)


{'form_links': {'Form Records': 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K079',
  'Rating/Wt/Placing': 'https://racing.hkjc.com/racing/information/English/Horse/RatingResultWeight.aspx?HorseId=HK_2024_K079',
  'Performance by Distance': 'https://racing.hkjc.com/racing/information/English/Horse/Performance.aspx?HorseId=HK_2024_K079',
  'Trackwork Records': 'https://racing.hkjc.com/racing/information/English/Trackwork/TrackworkResult.aspx?HorseId=HK_2024_K079',
  'Veterinary Records': 'https://racing.hkjc.com/racing/information/English/VeterinaryRecords/OveHorse.aspx?HorseId=HK_2024_K079',
  'Movement Records': 'https://racing.hkjc.com/racing/information/English/Horse/MovementRecords.aspx?HorseId=HK_2024_K079',
  'Overseas formrecords': 'https://racing.hkjc.com/racing/content/english/pp_formsheet/fse_K079.pdf',
  'Other Horses': 'https://racing.hkjc.com/racing/information/English/Horse/SelectHorse.aspx'},
 'name': 'KINGDOM OF RICHES (K079)',
 '

In [89]:
def parse_horse_profile_formlinks(url, timeout=30):
    """Return the form links of a horse profile page as ``{link text: absolute URL}``.

    Only the first ``<td class="table_eng_text">`` of the page is inspected.
    Anchors with an empty href or a ``javascript`` pseudo-link are skipped.

    Args:
        url: Profile page to download; also the base for resolving hrefs.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        dict mapping each link's stripped text to its absolute URL; empty when
        the cell is missing or holds no usable links.
    """
    html_content = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html_content, "html.parser")
    profile = {}

    # Form links parsing
    if left_col := soup.find("td", class_="table_eng_text"):
        for link in left_col.find_all("a", href=True):
            href = link["href"]
            # The previous check (`if not link["href"]`) was inverted and kept
            # only anchors with an empty href, so real links were never stored.
            if href and not href.startswith("javascript"):
                profile[link.text.strip()] = urljoin(url, href)

    return profile


def parse_horse_profile(url):
    """Generator yielding one Scrapy request for ``url`` whose callback is ``parse``.

    Calling this function performs no download by itself; it only produces the
    request object once the generator is iterated.
    """
    request = scrapy.Request(url=url, callback=parse)
    yield request


def parse(response):
    """Print the serialized ``table.horseProfile`` element(s) found in ``response``.

    The table is located through an absolute XPath, so the lookup yields an
    empty list whenever the page layout differs from that exact path.
    """
    profile_xpath = (
        "/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[1]/table[@class='horseProfile']"
    )
    selector = scrapy.selector.Selector(response)
    matches = selector.xpath(profile_xpath).getall()
    print(matches)


# parse_horse_profile_formlinks(
#     "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K079"
# )

# NOTE: `parse_horse_profile` here is the generator function defined earlier in
# this cell, so this call only creates a generator object (see the cell output);
# the Scrapy request is not built or sent until the generator is iterated.
parse_horse_profile(
    "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K079"
)


<generator object parse_horse_profile at 0x11b61dcb0>

In [1]:
from scrapy.crawler import CrawlerProcess
from scrapy import Spider, Request


class testCrawl(Spider):
    """Minimal spider that fetches one horse profile page and prints its profile table."""

    # Spider name under which Scrapy registers this crawler.
    name = "test"
    # The single page requested by start_requests().
    url = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2020_E486&Option=1"

    def start_requests(self):
        """Yield the one request for ``self.url``, handled by ``parse_profile``."""
        request = Request(url=self.url, callback=self.parse_profile)
        yield request

    def parse_profile(self, response):
        """Print each serialized ``table.horseProfile`` element matched in the response."""
        # Absolute path: matches only while the page keeps this exact layout.
        selection = response.xpath(
            "/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[1]/table[@class='horseProfile']"
        )
        for fragment in selection.getall():
            print(fragment)


# Run the spider inside this kernel with default settings.
# NOTE(review): process.start() runs the Twisted reactor — presumably this cell
# cannot be executed a second time without restarting the kernel; confirm.
process = CrawlerProcess()
process.crawl(testCrawl)
process.start()


2025-02-12 13:45:08 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-02-12 13:45:08 [scrapy.utils.log] INFO: Versions: lxml 5.3.1.0, libxml2 2.12.9, cssselect 1.2.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.12.0 (v3.12.0:0fb18b02c8, Oct  2 2023, 09:45:56) [Clang 13.0.0 (clang-1300.0.29.30)], pyOpenSSL 25.0.0 (OpenSSL 3.4.1 11 Feb 2025), cryptography 44.0.1, Platform macOS-15.3-x86_64-i386-64bit
2025-02-12 13:45:08 [scrapy.addons] INFO: Enabled addons:
[]
2025-02-12 13:45:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-02-12 13:45:08 [scrapy.extensions.telnet] INFO: Telnet Password: 4273a144d0eded04
2025-02-12 13:45:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2025-02-12 13:45:08 [scrapy.crawler] INFO: Overridden settings:
{}
2025-02-12 13:

<table class="horseProfile">
    <tbody>
        <tr>
            <td style="width: 280px;" valign="top">
                <table style="width: 280px; border:0;">
                    <tbody>
                        <tr>
                            <td class="subsubheader" colspan="2"><span class="title_text">ROMANTIC WARRIOR (E486)</span></td>
                        </tr>
                        <tr>
                            <td style="width: 105px;" valign="top">
                                <table id="Table3" cellspacing="0" cellpadding="0">
                                    <tbody>
                                        <tr>
                                            <td><a onclick="window.open(this.href,'_blank','height=330,width=370');return false;" href="/racing/english/horse_img.asp?filename=E486_l.jpg"><img src="/racing/content/Images/horse/E486_s.jpg" onerror="this.style.opacity='0'; this.parentNode.removeAttribute('onclick'); this.parentNode.removeAttribute('href');