In [9]:
import time
import csv
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, ElementClickInterceptedException

In [None]:
# Scrape match links for past seasons (click left arrow to go backwards)
import re
import time
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import JavascriptException

START_URL = "https://www.sofascore.com/tournament/football/singapore/premier-league/634"
SEASON_IDS = [78478, 59708, 48772]


def build_driver(headless=False):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--window-size=1400,900")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--lang=en-US")
    return webdriver.Chrome(options=opts)


def parse_event_id(url: str):
    m = re.search(r"#id:(\d+)", url)
    return int(m.group(1)) if m else None


def js_collect_links(driver):
    return driver.execute_script("""
        const out = [];
        const anchors = document.querySelectorAll("a[href*='/football/match/']");
        for (const a of anchors) {
            const href = a.getAttribute("href");
            if (!href) continue;
            if (href.includes("#id:")) out.push(new URL(href, location.origin).href);
        }
        return Array.from(new Set(out));
    """)


def wait_match_list_present(driver, timeout=20):
    wait = WebDriverWait(driver, timeout)
    wait.until(lambda d: d.execute_script(
        "return document.querySelectorAll(\"a[href*='/football/match/']\").length"
    ) > 0)


def js_click_prev_round(driver):
    """
    Click the LEFT arrow button that loads older rounds.
    """
    try:
        return driver.execute_script(r"""
            const sig = "M6 11.99 14.058 4l1.432 1.42-6.636 6.57 6.646 6.6L14.078 20z";
            const paths = Array.from(document.querySelectorAll("button svg path"));

            for (const p of paths) {
                const d = (p.getAttribute("d") || "").trim();
                if (d === sig) {
                    const btn = p.closest("button");
                    if (!btn) continue;

                    const ariaDisabled = btn.getAttribute("aria-disabled");
                    const isDisabled = btn.disabled || ariaDisabled === "true" || btn.hasAttribute("disabled");
                    if (isDisabled) return false;

                    btn.scrollIntoView({block: "center"});
                    btn.click();
                    return true;
                }
            }
            return false;
        """)
    except JavascriptException:
        return False


def scrape_one_season_by_clicking_prev(driver, season_id, max_clicks=80, pause=1.2, stop_when_no_new=5):
    # Load season page and wait for the match list to render
    driver.get(f"{START_URL}#id:{season_id}")
    time.sleep(2.0)

    wait_match_list_present(driver)
    time.sleep(1.0)

    all_links = set()
    no_new = 0

    # initial collect
    for u in js_collect_links(driver):
        all_links.add(u)

    for i in range(max_clicks):
        before = len(all_links)

        clicked = js_click_prev_round(driver)
        if not clicked:
            print(f"season {season_id} | prev button disabled/not found -> stop")
            break

        # Give the site some time to load the next round
        time.sleep(pause)

        for u in js_collect_links(driver):
            all_links.add(u)

        after = len(all_links)
        print(f"season {season_id} | click {i+1} | links {after}")

        if after == before:
            no_new += 1
        else:
            no_new = 0

        if no_new >= stop_when_no_new:
            print(f"season {season_id} | no new links for {stop_when_no_new} clicks -> stop")
            break

    rows = []
    for u in sorted(all_links):
        eid = parse_event_id(u)
        if eid is not None:
            rows.append({"season_id": season_id, "event_id": eid, "match_url": u})

    # de-dupe
    seen = set()
    uniq = []
    for r in rows:
        key = (r["season_id"], r["event_id"])
        if key not in seen:
            seen.add(key)
            uniq.append(r)

    return uniq


def save_csv(rows, out_path="data/spl_match_links_3_seasons.csv"):
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["season_id", "event_id", "match_url"])
        w.writeheader()
        w.writerows(rows)
    print("saved:", out_path, "rows:", len(rows))


if __name__ == "__main__":
    driver = build_driver(headless=False)
    try:
        all_rows = []
        for sid in SEASON_IDS:
            print(f"\n=== scraping season {sid} ===")
            all_rows.extend(scrape_one_season_by_clicking_prev(driver, sid))

        # global de-dupe
        seen = set()
        final_rows = []
        for r in all_rows:
            key = (r["season_id"], r["event_id"])
            if key not in seen:
                seen.add(key)
                final_rows.append(r)

        print("TOTAL:", len(final_rows))
        print("SAMPLE:", final_rows[:5])
        save_csv(final_rows)

    finally:
        driver.quit()



=== scraping season 78478 ===
season 78478 | click 1 | links 16
season 78478 | click 2 | links 20
season 78478 | click 3 | links 27
season 78478 | click 4 | links 31
season 78478 | click 5 | links 35
season 78478 | prev button disabled/not found -> stop

=== scraping season 59708 ===
season 59708 | click 1 | links 12
season 59708 | click 2 | links 12
season 59708 | click 3 | links 12
season 59708 | click 4 | links 17
season 59708 | click 5 | links 24
season 59708 | click 6 | links 28
season 59708 | click 7 | links 32
season 59708 | click 8 | links 36
season 59708 | click 9 | links 40
season 59708 | click 10 | links 44
season 59708 | click 11 | links 48
season 59708 | click 12 | links 52
season 59708 | click 13 | links 56
season 59708 | click 14 | links 60
season 59708 | click 15 | links 64
season 59708 | click 16 | links 68
season 59708 | click 17 | links 72
season 59708 | click 18 | links 76
season 59708 | click 19 | links 80
season 59708 | click 20 | links 84
season 59708 | click 21

In [None]:
# Scrape future fixture links for the current season
# Click the right arrow to move to newer rounds and collect match URLs.

import re
import time
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import JavascriptException

START_URL = "https://www.sofascore.com/tournament/football/singapore/premier-league/634"
SEASON_IDS = [78478]


def build_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--window-size=1400,900")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--lang=en-US")
    return webdriver.Chrome(options=opts)


def parse_event_id(url: str):
    m = re.search(r"#id:(\d+)", url)
    return int(m.group(1)) if m else None


def js_collect_links(driver):
    return driver.execute_script("""
        const out = [];
        const anchors = document.querySelectorAll("a[href*='/football/match/']");
        for (const a of anchors) {
            const href = a.getAttribute("href");
            if (!href) continue;
            if (href.includes("#id:")) out.push(new URL(href, location.origin).href);
        }
        return Array.from(new Set(out));
    """)


def wait_match_list_present(driver, timeout=20):
    wait = WebDriverWait(driver, timeout)
    wait.until(lambda d: d.execute_script(
        "return document.querySelectorAll(\"a[href*='/football/match/']\").length"
    ) > 0)


def js_click_next_round(driver):
    """Click the right arrow button to load newer rounds.

    Returns True if clicked, False if the button is missing/disabled.
    """
    try:
        return driver.execute_script(r"""
            // right-chevron path signature
            const sig = "M18 12.01 9.942 20 8.51 18.58l6.636-6.57L8.5 5.41 9.922 4z";
            const paths = Array.from(document.querySelectorAll("button svg path"));

            for (const p of paths) {
                const d = (p.getAttribute("d") || "").trim();
                if (d === sig) {
                    const btn = p.closest("button");
                    if (!btn) continue;

                    // check disabled states
                    const ariaDisabled = btn.getAttribute("aria-disabled");
                    const isDisabled = btn.disabled || ariaDisabled === "true" || btn.hasAttribute("disabled");
                    if (isDisabled) return false;

                    btn.scrollIntoView({block: "center"});
                    btn.click();
                    return true;
                }
            }
            return false;
        """)
    except JavascriptException:
        return False


def scrape_one_season_by_clicking_next(driver, season_id, max_clicks=80, pause=0.9, stop_when_no_new=5):
    driver.get(f"{START_URL}#id:{season_id}")
    wait_match_list_present(driver)

    all_links = set()
    no_new = 0

    # initial collect
    for u in js_collect_links(driver):
        all_links.add(u)

    for i in range(max_clicks):
        before = len(all_links)

        clicked = js_click_next_round(driver)
        if not clicked:
            print(f"season {season_id} | next button disabled/not found -> stop")
            break

        time.sleep(pause)

        # after click, collect links again
        for u in js_collect_links(driver):
            all_links.add(u)

        after = len(all_links)
        print(f"season {season_id} | next click {i+1} | links {after}")

        if after == before:
            no_new += 1
        else:
            no_new = 0

        if no_new >= stop_when_no_new:
            print(f"season {season_id} | no new links for {stop_when_no_new} clicks -> stop")
            break

    rows = []
    for u in sorted(all_links):
        eid = parse_event_id(u)
        if eid is not None:
            rows.append({"season_id": season_id, "event_id": eid, "match_url": u})

    # de-dupe by (season_id, event_id)
    seen = set()
    uniq = []
    for r in rows:
        key = (r["season_id"], r["event_id"])
        if key not in seen:
            seen.add(key)
            uniq.append(r)

    return uniq


def save_csv(rows, out_path="data/spl_match_links_next_3_seasons.csv"):
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["season_id", "event_id", "match_url"])
        w.writeheader()
        w.writerows(rows)
    print("saved:", out_path, "rows:", len(rows))


if __name__ == "__main__":
    driver = build_driver(headless=False)
    try:
        all_rows = []
        for sid in SEASON_IDS:
            all_rows.extend(scrape_one_season_by_clicking_next(driver, sid))

        # global de-dupe
        seen = set()
        final_rows = []
        for r in all_rows:
            key = (r["season_id"], r["event_id"])
            if key not in seen:
                seen.add(key)
                final_rows.append(r)

        print("TOTAL:", len(final_rows))
        print("SAMPLE:", final_rows[:5])
        save_csv(final_rows)

    finally:
        driver.quit()

season 78478 | next click 1 | links 18
season 78478 | next click 2 | links 22
season 78478 | next click 3 | links 29
season 78478 | next click 4 | links 36
season 78478 | next click 5 | links 43
season 78478 | next click 6 | links 43
season 78478 | next click 7 | links 43
season 78478 | next click 8 | links 43
season 78478 | next click 9 | links 43
season 78478 | next click 10 | links 47
season 78478 | next click 11 | links 52
season 78478 | next click 12 | links 60
season 78478 | next click 13 | links 66
season 78478 | next click 14 | links 71
season 78478 | next click 15 | links 75
season 78478 | next click 16 | links 76
season 78478 | next button disabled/not found -> stop
TOTAL: 76
SAMPLE: [{'season_id': 78478, 'event_id': 14195558, 'match_url': 'https://www.sofascore.com/football/match/albirex-niigata-singapore-balestier-khalsa/OkbsGGn#id:14195558'}, {'season_id': 78478, 'event_id': 14270526, 'match_url': 'https://www.sofascore.com/football/match/albirex-niigata-singapore-balestie

In [None]:
# Merge past + future match links into one file
import pandas as pd

# Input/output files
prev_path = "data/spl_match_links_3_seasons.csv"
next_path = "data/spl_match_links_next_3_seasons.csv"
out_path  = "data/spl_match_links_merged.csv"

# Read both CSVs
prev_df = pd.read_csv(prev_path)
next_df = pd.read_csv(next_path)

# Mark whether a row is a future fixture
prev_df["is_future_fixture"] = 0
next_df["is_future_fixture"] = 1

# Stack the two tables
merged = pd.concat([prev_df, next_df], ignore_index=True)

# Clean types (helps avoid weird joins later)
merged["season_id"] = pd.to_numeric(merged["season_id"], errors="coerce").astype("Int64")
merged["event_id"]  = pd.to_numeric(merged["event_id"], errors="coerce").astype("Int64")

# Deduplicate by (season_id, event_id). If something appears twice,
# keep the row marked as future_fixture = 1.
merged = merged.sort_values(["season_id", "event_id", "is_future_fixture"], ascending=[True, True, False])
merged = merged.drop_duplicates(subset=["season_id", "event_id"], keep="first")

# Save
merged.to_csv(out_path, index=False)
print("saved:", out_path, "rows:", len(merged))
print(merged.head())


saved: spl_match_links_merged.csv rows: 358
     season_id  event_id                                          match_url  \
188      48772  11041308  https://www.sofascore.com/football/match/albir...   
182      48772  11041309  https://www.sofascore.com/football/match/albir...   
267      48772  11041310  https://www.sofascore.com/football/match/tampi...   
284      48772  11041311  https://www.sofascore.com/football/match/young...   
204      48772  11041312  https://www.sofascore.com/football/match/brune...   

     is_future_fixture  
188                  0  
182                  0  
267                  0  
284                  0  
204                  0  


In [None]:
# Fetch match statistics (SofaScore API) for each event_id
import os
import re
import json
import time
import random
import csv
import unicodedata
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

IN_PATH = "data/spl_match_links_merged.csv"
OUT_PATH = "data/spl_stats_3_seasons.csv"
CACHE_DIR = ".sofascore_stats_cache"

STATS_URL_TMPL = "https://www.sofascore.com/api/v1/event/{event_id}/statistics"

RESUME = True          # set True if you want to continue from existing OUT_PATH
HEADLESS = True
MAX_EVENTS = None       # set e.g. 20 to test quickly


def build_driver(headless=False):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--window-size=1400,900")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--lang=en-US")
    return webdriver.Chrome(options=opts)


def selenium_fetch_json(driver, url, timeout=20, debug=False):
    driver.get(url)
    wait = WebDriverWait(driver, timeout)

    try:
        wait.until(lambda d: len(d.find_elements("tag name", "pre")) > 0)
        txt = driver.find_element("tag name", "pre").text
        if debug:
            print("PRE_HEAD:", txt[:120].replace("\n", " "))
        return json.loads(txt)
    except Exception:
        try:
            txt = driver.find_element("tag name", "body").text
            if debug:
                print("BODY_HEAD:", txt[:120].replace("\n", " "))
            return json.loads(txt)
        except Exception:
            if debug:
                print("Could not parse JSON. Current URL:", driver.current_url)
            return None


def slugify_stat_name(name: str) -> str:
    s = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("ascii")
    s = s.lower().strip()
    s = s.replace("&", " and ")
    s = re.sub(r"[^a-z0-9]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s


def flatten_sofascore_statistics(stats_json: dict, period: str = "ALL") -> dict:
    row = {}
    if not stats_json:
        return row

    blocks = stats_json.get("statistics", [])
    block = next((b for b in blocks if b.get("period") == period), blocks[0] if blocks else None)
    if not block:
        return row

    for group in block.get("groups", []):
        for item in group.get("statisticsItems", []):
            base = slugify_stat_name(item.get("name", ""))

            # collision guard
            if f"home_{base}" in row or f"away_{base}" in row:
                k = item.get("key")
                if k:
                    base = f"{base}__{slugify_stat_name(k)}"
                else:
                    c = 2
                    while f"home_{base}_{c}" in row:
                        c += 1
                    base = f"{base}_{c}"

            home_val = item.get("homeValue", item.get("home"))
            away_val = item.get("awayValue", item.get("away"))

            row[f"home_{base}"] = home_val
            row[f"away_{base}"] = away_val

            if "homeTotal" in item:
                row[f"home_{base}_total"] = item.get("homeTotal")
            if "awayTotal" in item:
                row[f"away_{base}_total"] = item.get("awayTotal")

    return row


def ensure_dir(path: str):
    os.makedirs(path, exist_ok=True)


def cache_path(event_id: int) -> str:
    return os.path.join(CACHE_DIR, f"{event_id}.json")


def load_cached(event_id: int):
    p = cache_path(event_id)
    if os.path.exists(p):
        try:
            with open(p, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return None
    return None


def save_cached(event_id: int, data: dict):
    p = cache_path(event_id)
    try:
        with open(p, "w", encoding="utf-8") as f:
            json.dump(data, f)
    except Exception:
        pass


def write_rows(rows, out_path):
    cols = set()
    for r in rows:
        cols.update(r.keys())

    base_cols = [c for c in ["season_id", "event_id", "fetch_ok"] if c in cols]
    other_cols = sorted([c for c in cols if c not in base_cols])
    fieldnames = base_cols + other_cols

    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)


def main():
    ensure_dir(CACHE_DIR)

    df = pd.read_csv(IN_PATH)

    if "event_id" not in df.columns and "match_url" in df.columns:
        df["event_id"] = df["match_url"].str.extract(r"#id:(\d+)").astype("Int64")

    df = df.dropna(subset=["event_id"]).copy()
    df["event_id"] = df["event_id"].astype(int)

    season_map = None
    if "season_id" in df.columns:
        season_map = df.drop_duplicates("event_id").set_index("event_id")["season_id"].to_dict()

    event_ids = df["event_id"].drop_duplicates().tolist()
    print("loaded event_ids:", len(event_ids))

    if MAX_EVENTS is not None:
        event_ids = event_ids[:MAX_EVENTS]
        print("testing only first:", len(event_ids))

    done_ids = set()
    rows = []

    if RESUME and os.path.exists(OUT_PATH):
        old = pd.read_csv(OUT_PATH)
        if "event_id" in old.columns:
            done_ids = set(old["event_id"].dropna().astype(int).tolist())
        rows = old.to_dict(orient="records")
        print("resume ON | already in OUT:", len(done_ids))
    else:
        # overwrite mode
        if os.path.exists(OUT_PATH):
            print("resume OFF | will overwrite existing OUT_PATH")

    driver = build_driver(headless=HEADLESS)

    try:
        fetched = 0
        for idx, eid in enumerate(event_ids, 1):
            if RESUME and eid in done_ids:
                continue

            sid = season_map.get(eid) if season_map else None

            stats_json = load_cached(eid)
            if not stats_json:
                url = STATS_URL_TMPL.format(event_id=eid)

                debug = (fetched < 3)  # show first 3 fetches
                stats_json = selenium_fetch_json(driver, url, timeout=20, debug=debug)

                if not stats_json:
                    row = {"event_id": eid, "fetch_ok": 0}
                    if sid is not None:
                        row["season_id"] = sid
                    rows.append(row)
                    print("FAIL", eid, "|", idx, "/", len(event_ids))
                    time.sleep(0.8 + random.random() * 0.6)
                    continue

                save_cached(eid, stats_json)

            flat = flatten_sofascore_statistics(stats_json, period="ALL")
            row = {"event_id": eid, "fetch_ok": 1}
            if sid is not None:
                row["season_id"] = sid
            row.update(flat)
            rows.append(row)

            fetched += 1
            print("OK", eid, "| fetched:", fetched, "| progress:", idx, "/", len(event_ids), "| cols in row:", len(row))

            if fetched % 25 == 0:
                write_rows(rows, OUT_PATH)

            time.sleep(0.8 + random.random() * 0.6)

        write_rows(rows, OUT_PATH)
        print("DONE. saved:", OUT_PATH, "rows:", len(rows))

    finally:
        driver.quit()


if __name__ == "__main__":
    main()


loaded event_ids: 342
OK 11041308 | fetched: 1 | progress: 1 / 342 | cols in row: 91
OK 11041309 | fetched: 2 | progress: 2 / 342 | cols in row: 91
OK 11041310 | fetched: 3 | progress: 3 / 342 | cols in row: 3
OK 11041311 | fetched: 4 | progress: 4 / 342 | cols in row: 91
OK 11041312 | fetched: 5 | progress: 5 / 342 | cols in row: 91
OK 11041313 | fetched: 6 | progress: 6 / 342 | cols in row: 81
OK 11041314 | fetched: 7 | progress: 7 / 342 | cols in row: 3
OK 11041316 | fetched: 8 | progress: 8 / 342 | cols in row: 91
OK 11041317 | fetched: 9 | progress: 9 / 342 | cols in row: 91
OK 11041318 | fetched: 10 | progress: 10 / 342 | cols in row: 89
OK 11041320 | fetched: 11 | progress: 11 / 342 | cols in row: 3
OK 11041321 | fetched: 12 | progress: 12 / 342 | cols in row: 3
OK 11041322 | fetched: 13 | progress: 13 / 342 | cols in row: 89
OK 11041323 | fetched: 14 | progress: 14 / 342 | cols in row: 91
OK 11041324 | fetched: 15 | progress: 15 / 342 | cols in row: 91
OK 11041325 | fetched: 16

In [None]:
# Scrape home/away team names + match date/time from the match page
import os
import re
import csv
import time
import json
import random
import pandas as pd

from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

IN_PATH = "data/spl_match_links_merged.csv"
OUT_PATH = "data/spl_event_teams_3_seasons.csv"
CACHE_DIR = ".sofascore_event_page_cache"

HEADLESS = True
RESUME = True          # if OUT_PATH exists, skip event_ids already done
MAX_EVENTS = None      # set small number to test quickly, e.g. 20

def build_driver(headless=False):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--window-size=1400,900")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--lang=en-US")
    return webdriver.Chrome(options=opts)

def ensure_dir(path: str):
    os.makedirs(path, exist_ok=True)

def parse_event_id(url: str):
    m = re.search(r"#id:(\d+)", str(url))
    return int(m.group(1)) if m else None

def cache_path(event_id: int):
    return os.path.join(CACHE_DIR, f"{event_id}.json")

def load_cached(event_id: int):
    p = cache_path(event_id)
    if os.path.exists(p):
        try:
            with open(p, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return None
    return None

def save_cached(event_id: int, data: dict):
    p = cache_path(event_id)
    try:
        with open(p, "w", encoding="utf-8") as f:
            json.dump(data, f)
    except Exception:
        pass


def scrape_home_away_date_from_match_page(driver, match_url, timeout=20, debug=False):
    """Scrape basic match info from the match page.

    Returns: (home_team, away_team, match_date_ddmmyyyy, match_time_hhmm, ok)

    Assumptions:
    - first team link on the page is the home team, second is away
    - date is shown as dd/mm/yyyy somewhere in a visible <span>
    - time is shown as hh:mm somewhere in a visible <span> (optional)
    """
    driver.get(match_url)
    wait = WebDriverWait(driver, timeout)

    team_sel = "a[href*='/football/team/']"

    try:
        # wait until at least 2 team anchors appear
        wait.until(lambda d: len(d.find_elements("css selector", team_sel)) >= 2)

        # ---- teams ----
        links = driver.find_elements("css selector", team_sel)
        teams = []
        for a in links:
            t = (a.text or "").strip()
            if t and (len(teams) == 0 or t != teams[-1]):
                teams.append(t)
            if len(teams) >= 2:
                break

        home_team = teams[0] if len(teams) >= 1 else None
        away_team = teams[1] if len(teams) >= 2 else None

        # ---- date + time from DOM (no fragile classes) ----
        # We scan spans for dd/mm/yyyy and hh:mm patterns.
        # This matches what you showed: <span>09/01/2026</span> and <span>07:30</span>
        dt = driver.execute_script(r"""
            const spans = Array.from(document.querySelectorAll("span"));
            let dateStr = null;
            let timeStr = null;

            for (const s of spans) {
                const t = (s.innerText || "").trim();
                if (!dateStr && /^\d{2}\/\d{2}\/\d{4}$/.test(t)) dateStr = t;
                if (!timeStr && /^\d{2}:\d{2}$/.test(t)) timeStr = t;
                if (dateStr && timeStr) break;
            }
            return [dateStr, timeStr];
        """)

        date_str = dt[0] if dt else None
        time_str = dt[1] if dt else None

        # normalize date to dd/mm/yyyy (already is, but keep safe)
        match_date = ""
        if date_str:
            try:
                match_date = datetime.strptime(date_str, "%d/%m/%Y").strftime("%d/%m/%Y")
            except ValueError:
                match_date = ""

        match_time = time_str or ""

        ok = 1 if (home_team and away_team and match_date) else 0

        if ok == 0 and debug:
            print("DEBUG URL:", match_url)
            print("  teams:", teams)
            print("  date_str:", date_str, "time_str:", time_str)

        return home_team, away_team, match_date, match_time, ok

    except TimeoutException:
        if debug:
            print("Timeout waiting teams/date. URL:", match_url)
        return None, None, "", "", 0


def write_rows(rows, out_path):
    fieldnames = ["season_id", "event_id", "home_team", "away_team", "match_date", "match_time", "ok", "match_url"]
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)


def main():
    ensure_dir(CACHE_DIR)

    df = pd.read_csv(IN_PATH)

    # ensure event_id exists
    if "event_id" not in df.columns and "match_url" in df.columns:
        df["event_id"] = df["match_url"].str.extract(r"#id:(\d+)").astype("Int64")

    df = df.dropna(subset=["event_id"]).copy()
    df["event_id"] = df["event_id"].astype(int)

    if "season_id" not in df.columns:
        raise ValueError("IN_PATH must contain season_id")

    df = df[["season_id", "event_id", "match_url"]].drop_duplicates()

    if MAX_EVENTS is not None:
        df = df.head(MAX_EVENTS)

    done = set()
    rows = []

    if RESUME and os.path.exists(OUT_PATH):
        old = pd.read_csv(OUT_PATH)

        if "event_id" in old.columns:
            done = set(old["event_id"].dropna().astype(int).tolist())

        rows = old.to_dict(orient="records")
        print("resume ON | already done:", len(done))
    else:
        if os.path.exists(OUT_PATH):
            print("resume OFF | will overwrite:", OUT_PATH)

    driver = build_driver(headless=HEADLESS)

    try:
        total = len(df)
        for i, r in enumerate(df.itertuples(index=False), 1):
            season_id = int(r.season_id)
            event_id = int(r.event_id)
            match_url = str(r.match_url)

            if RESUME and event_id in done:
                continue

            cached = load_cached(event_id)
            if cached:
                home_team = cached.get("home_team")
                away_team = cached.get("away_team")
                match_date = cached.get("match_date", "")
                match_time = cached.get("match_time", "")
                ok = cached.get("ok", 0)
            else:
                debug = (i <= 3)
                home_team, away_team, match_date, match_time, ok = scrape_home_away_date_from_match_page(
                    driver, match_url, timeout=25, debug=debug
                )
                save_cached(event_id, {
                    "home_team": home_team,
                    "away_team": away_team,
                    "match_date": match_date,
                    "match_time": match_time,
                    "ok": ok
                })

            out_row = {
                "season_id": season_id,
                "event_id": event_id,
                "home_team": home_team,
                "away_team": away_team,
                "match_date": match_date,   # dd/mm/yyyy
                "match_time": match_time,   # hh:mm (may be empty)
                "ok": ok,
                "match_url": match_url
            }
            rows.append(out_row)

            if ok == 1:
                print(f"[{i}/{total}] OK  event {event_id} | {home_team} vs {away_team} | {match_date} {match_time}")
            else:
                print(f"[{i}/{total}] FAIL event {event_id} | url={match_url}")

            # periodic save
            if len(rows) % 25 == 0:
                write_rows(rows, OUT_PATH)

            time.sleep(0.8 + random.random() * 0.6)

        write_rows(rows, OUT_PATH)
        print("DONE. saved:", OUT_PATH, "rows:", len(rows))

    finally:
        driver.quit()


if __name__ == "__main__":
    main()


[1/358] OK  event 11041308 | Tanjong Pagar Utd. vs Albirex Niigata | 11/08/2023 07:45
[2/358] OK  event 11041309 | Albirex Niigata vs Lion City Sailors | 16/07/2023 06:00
[3/358] OK  event 11041310 | Balestier Khalsa vs Tampines Rovers | 26/08/2023 06:00
[4/358] OK  event 11041311 | Geylang Int. vs Young Lions | 19/08/2023 06:00
[5/358] OK  event 11041312 | Hougang Utd vs DPMM | 04/08/2023 07:45
[6/358] OK  event 11041313 | Tampines Rovers vs DPMM | 18/08/2023 07:45
[7/358] OK  event 11041314 | Lion City Sailors vs Geylang Int. | 27/08/2023 06:00
[8/358] OK  event 11041316 | Lion City Sailors vs Balestier Khalsa | 20/12/2025 07:30
[9/358] OK  event 11041317 | Albirex Niigata vs Young Lions | 22/07/2023 06:00
[10/358] OK  event 11041318 | Geylang Int. vs Balestier Khalsa | 12/08/2023 06:00
[11/358] OK  event 11041320 | Albirex Niigata vs Hougang Utd | 27/08/2023 06:00
[12/358] OK  event 11041321 | Tanjong Pagar Utd. vs DPMM | 26/08/2023 06:00
[13/358] OK  event 11041322 | Lion City Sail

In [26]:
import pandas as pd

MATCH_PATH = "spl_event_teams_3_seasons.csv"
STATS_PATH = "spl_stats_3_seasons.csv"
OUT_PATH   = "spl_full_data.csv"

# -------------------------
# SPL canonical team list
# -------------------------
SPL_TEAMS = {
    'Albirex Niigata', 'Geylang Int.',
    'Lion City Sailors',
    'Balestier Khalsa', 
    'Tampines Rovers','Hougang Utd',
    'Tanjong Pagar Utd.', 'Young Lions','DPMM'
}

# -------------------------
# Season label map
# -------------------------
SEASON_LABEL = {
    78478: "25/26",
    59708: "2024",
    48772: "2023"
}

# -------------------------
# Load data
# -------------------------
df_match = pd.read_csv(MATCH_PATH)
df_stats = pd.read_csv(STATS_PATH)

print(df_match['home_team'].unique())
# ensure event_id int
df_match["event_id"] = df_match["event_id"].astype(int)
df_stats["event_id"] = df_stats["event_id"].astype(int)

# drop duplicate stats rows
df_stats = df_stats.drop_duplicates(subset=["event_id"])

# -------------------------
# Merge (LEFT JOIN)
# -------------------------
df = df_match.merge(
    df_stats,
    on="event_id",
    how="left",
    suffixes=("", "_stats")
)

# -------------------------
# Add / enforce season_label
# -------------------------
df["season_label"] = df["season_id"].map(SEASON_LABEL)

# -------------------------
# Filter: SPL teams only
# (both home AND away must be SPL)
# -------------------------
df = df[
    df["home_team"].isin(SPL_TEAMS)
].copy()

# -------------------------
# Reorder key columns
# -------------------------
key_cols = [
    "season_id",
    "season_label",
    "event_id",
    "home_team",
    "away_team",
    "fetch_ok",
    "match_url",
]

other_cols = [c for c in df.columns if c not in key_cols]
df = df[key_cols + other_cols]

# -------------------------
# Save
# -------------------------
df.to_csv(OUT_PATH, index=False)

print("saved:", OUT_PATH)
print("rows:", len(df))
print(
    "matches with stats:",
    df["fetch_ok"].fillna(0).astype(int).sum(),
    "/",
    len(df)
)
print(df.head(5))

['Tanjong Pagar Utd.' 'Albirex Niigata' 'Balestier Khalsa' 'Geylang Int.'
 'Hougang Utd' 'Tampines Rovers' 'Lion City Sailors' 'Young Lions' 'DPMM'
 'Everton' 'Man City' 'Newcastle' 'Tottenham' 'Real Madrid' 'Man Utd'
 'Guadalajara' 'CF Talavera']
saved: spl_full_data.csv
rows: 334
matches with stats: 334 / 334
   season_id season_label  event_id           home_team          away_team  \
0      48772         2023  11041308  Tanjong Pagar Utd.    Albirex Niigata   
1      48772         2023  11041309     Albirex Niigata  Lion City Sailors   
2      48772         2023  11041310    Balestier Khalsa    Tampines Rovers   
3      48772         2023  11041311        Geylang Int.        Young Lions   
4      48772         2023  11041312         Hougang Utd               DPMM   

   fetch_ok                                          match_url  match_date  \
0         1  https://www.sofascore.com/football/match/albir...  11/08/2023   
1         1  https://www.sofascore.com/football/match/albir...