### Main

In [None]:
import os
import csv
import time
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Output CSV; it is read at start-up (when present) and rewritten at the end.
CSV_FILE = "./myfootdr_clinics.csv"
# Plain-text lists (one entry per line) of region slugs / clinic URLs that
# could not be fetched during this run.
FAILED_REGIONS_FILE = "./failed_regions.txt"
FAILED_CLINICS_FILE = "./failed_clinics.txt"

# Snapshot identifiers placed in the archive URL by build_url(); they are
# tried in this order until one yields a usable page.
DEFAULT_SNAPSHOTS = [
    "20250708180027",
    "20250517063937",
    "20250516141742",
]

# Per-region override: a region slug found here uses this snapshot list
# instead of DEFAULT_SNAPSHOTS when its region page is fetched.
REGION_SNAPSHOTS = {
    "victoria": ["20250618234903"],
    "western-australia": ["20250707232629"],
}

# Original site and archive prefix; build_url() combines them as
# ARCHIVE_BASE/<snapshot>/BASE_SITE<path>.
BASE_SITE = "https://www.myfootdr.com.au"
# NOTE(review): plain http is used here; presumably the archive also serves
# https -- confirm before switching.
ARCHIVE_BASE = "http://web.archive.org/web"

# CSV column headers, in output order.
FIELDNAMES = [
    "Name of Clinic",
    "Address",
    "Email",
    "Phone",
    "Services",
]

# One shared HTTP session for every request made by get_soup().
session = requests.Session()
# The headers ask for an uncompressed body and a fresh connection per request.
# NOTE(review): presumably a workaround for flaky archive connections -- confirm.
session.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept-Encoding": "identity",
    "Connection": "close",
})

def build_url(snapshot, path):
    """Compose the archived URL of a site path for one snapshot.

    The result has the shape ``ARCHIVE_BASE/<snapshot>/BASE_SITE<path>``.
    *path* is appended to ``BASE_SITE`` verbatim, so callers pass it with a
    leading slash.
    """
    return "{}/{}/{}{}".format(ARCHIVE_BASE, snapshot, BASE_SITE, path)

def get_soup(url, retries=3):
    """Fetch *url* and return it parsed, or ``None`` if no attempt succeeds.

    Up to *retries* attempts are made. Each attempt first sleeps 1.2 s, then
    issues a GET through the shared ``session`` with a 40 s timeout. A
    response is accepted only when its status is 200 and its text is longer
    than 500 characters. ``requests`` exceptions are swallowed and simply
    consume one attempt.
    """
    for _attempt in range(retries):
        try:
            time.sleep(1.2)
            resp = session.get(url, timeout=40)
            if resp.status_code != 200:
                continue
            html = resp.text
            if len(html) <= 500:
                continue
            return BeautifulSoup(html, "html.parser")
        except requests.exceptions.RequestException:
            continue
    return None

def get_soup_with_snapshots(path, snapshots):
    """Try *path* against each snapshot in order.

    Returns ``(soup, snapshot)`` for the first snapshot whose page loads, or
    ``(None, None)`` when none of them does.
    """
    for stamp in snapshots:
        page = get_soup(build_url(stamp, path))
        if not page:
            continue
        return page, stamp
    return None, None

def clean_text(el):
    """Return the text of *el* with ``.i-heartxp`` descendants removed.

    A falsy *el* yields an empty string. Matching descendants have
    ``decompose()`` called on them, so *el* itself is modified. The remaining
    text is joined with single spaces and stripped.
    """
    if el:
        for glyph in el.select(".i-heartxp"):
            glyph.decompose()
        return el.get_text(" ", strip=True)
    return ""

def extract_phone(soup):
    """Return the clinic phone number as digits (optionally with ``+``).

    Lookup order:

    1. The first ``<a href>`` whose href contains ``tel:`` and yields at
       least one digit. The href is percent-decoded first, so ``%20`` and
       ``%2B`` do not leak stray digits into the number, and anything after a
       ``;`` (tel-URI parameters such as ``;ext=``) is dropped.
    2. The first link inside ``.clinic-metabox`` whose visible text contains
       at least 8 digit/plus characters.

    Returns ``"NA"`` when neither lookup finds a number.
    """
    from urllib.parse import unquote

    for a in soup.select("a[href]"):
        href = a.get("href", "")
        if "tel:" not in href:
            continue
        # Decode before stripping: "tel:07%203356" must not become "07203356".
        number = unquote(href.split("tel:", 1)[-1]).split(";", 1)[0]
        phone = re.sub(r"[^0-9+]", "", number)
        if phone:
            return phone

    # Fallback: a number written as link text in the clinic meta box.
    for a in soup.select(".clinic-metabox a"):
        digits = re.sub(r"[^0-9+]", "", a.get_text(strip=True))
        if len(digits) >= 8:
            return digits

    return "NA"

def extract_services(soup):
    """Collect service names from a clinic page as one ``"; "``-joined string.

    Sources, in order: the text of every ``<li>`` under a ``<ul>`` inside
    ``.entry-content``, then the link text of each service card heading.
    Empty texts are skipped and duplicates removed while keeping first-seen
    order. Returns ``"NA"`` when nothing is found.
    """
    body = soup.select_one(".entry-content")
    bullet_texts = [
        item.get_text(strip=True)
        for bullet_list in (body.find_all("ul") if body else [])
        for item in bullet_list.find_all("li")
    ]
    card_texts = [
        link.get_text(strip=True)
        for link in soup.select(".clinic-2020-services .featured-post-content h3 a")
    ]

    # A dict keeps insertion order, which gives an order-preserving dedupe.
    unique = dict.fromkeys(label for label in bullet_texts + card_texts if label)
    return "; ".join(unique) if unique else "NA"

# Load existing CSV (for upsert), keyed by (clinic name, address).
existing = {}

if os.path.exists(CSV_FILE):
    with open(CSV_FILE, newline="", encoding="utf-8") as f:
        for raw in csv.DictReader(f):
            # DictReader yields None for cells missing from a short row and
            # stores surplus cells under a None key. Keep only the known
            # columns so .strip() cannot fail here and the final DictWriter
            # write cannot be handed unexpected fields.
            row = {field: (raw.get(field) or "") for field in FIELDNAMES}
            key = (row["Name of Clinic"].strip(), row["Address"].strip())

            # Normalize empty optional fields to the "NA" placeholder.
            for field in ("Email", "Phone", "Services"):
                row[field] = row[field].strip() or "NA"

            existing[key] = row

print(f"Loaded {len(existing)} existing clinics")

# Region slugs / clinic URLs that could not be fetched in this run.
failed_regions, failed_clinics = [], []

# Discover regions from the clinics index page.
main_soup, _ = get_soup_with_snapshots("/our-clinics/", DEFAULT_SNAPSHOTS)
if not main_soup:
    raise SystemExit("Failed to load main clinics page")

# A region slug is the last path segment of each region link.
unique_slugs = set()
for region_link in main_soup.select('a[href*="/our-clinics/regions/"]'):
    trimmed_href = region_link["href"].rstrip("/")
    unique_slugs.add(trimmed_href.rsplit("/", 1)[-1])
region_slugs = sorted(unique_slugs)

print(f"Regions found: {len(region_slugs)}")

# Process regions & clinics (upsert in memory).
# Ctrl-C stops the crawl but still falls through to the save step below.
try:
    for slug in region_slugs:
        # Regions listed in REGION_SNAPSHOTS use their own snapshot list.
        snapshots = REGION_SNAPSHOTS.get(slug, DEFAULT_SNAPSHOTS)
        region_path = f"/our-clinics/regions/{slug}/"

        region_soup, used_snapshot = get_soup_with_snapshots(region_path, snapshots)

        if not region_soup:
            print(f"REGION FAILED: {slug}")
            failed_regions.append(slug)
            continue

        region_url = build_url(used_snapshot, region_path)

        # Resolve each clinic link against the region URL; the set removes
        # duplicates and sorting fixes the crawl order.
        clinic_links = sorted({
            urljoin(region_url, a["href"])
            for a in region_soup.select(".div-table.regional-clinics .table-row a[href]")
        })
        print(f"REGION {slug}: {len(clinic_links)} clinics")

        # Snapshot that served the region page first, then the defaults.
        # De-duplicated so a failing snapshot is not fetched (with its full
        # retry budget) a second time.
        clinic_snapshots = list(dict.fromkeys([used_snapshot] + DEFAULT_SNAPSHOTS))

        for url in clinic_links:
            print(f"  Scraping {url}")

            # Reduce the archived URL to the original site path.
            path = "/" + url.split(BASE_SITE, 1)[-1].lstrip("/")
            soup, _ = get_soup_with_snapshots(path, clinic_snapshots)

            if not soup:
                print("    FAILED")
                failed_clinics.append(url)
                continue

            name_el = soup.select_one("#clinic-metacard-2020 h1.entry-title")
            name = name_el.get_text(strip=True) if name_el else "NA"

            address_el = soup.select_one(".clinic-metabox .address")
            address = clean_text(address_el)

            # Email is the visible text of the first mailto: link in the card.
            email = "NA"
            for a in soup.select("#clinic-metacard-2020 a[href]"):
                if "mailto:" in a.get("href", ""):
                    email = a.get_text(strip=True)
                    break

            phone = extract_phone(soup)
            # extract_services() already returns "NA" when nothing is found.
            services = extract_services(soup)

            key = (name.strip(), address.strip())
            action = "UPDATED" if key in existing else "ADDED"

            existing[key] = {
                "Name of Clinic": name,
                "Address": address,
                "Email": email,
                "Phone": phone,
                "Services": services,
            }

            print(f"    {action}: {name}")

except KeyboardInterrupt:
    print("\nInterrupted by user, saving progress...")

# Write final CSV (single authoritative write).
with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
    # extrasaction="ignore": rows loaded from an older CSV may carry columns
    # outside FIELDNAMES; dropping them beats losing the whole run to a
    # ValueError at the very last step.
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(existing.values())

# Always rewrite both failure lists, even when empty, so they describe this
# run (no stale entries from an earlier one) and always exist on disk for
# the retry cells, which open them directly.
for list_path, entries in (
    (FAILED_REGIONS_FILE, failed_regions),
    (FAILED_CLINICS_FILE, failed_clinics),
):
    with open(list_path, "w", encoding="utf-8") as f:
        f.writelines(f"{entry}\n" for entry in sorted(set(entries)))

print(f"Saved {len(existing)} clinics")
print(f"Failed regions: {len(set(failed_regions))}")
print(f"Failed clinics: {len(set(failed_clinics))}")
print("Done")


Loaded 0 existing clinics
Regions found: 11
REGION brisbane: 31 clinics
  Scraping http://web.archive.org/web/20250618202752/https://www.myfootdr.com.au/our-clinics/albany-creek-allsports-podiatry/
    ADDED: Allsports Podiatry Albany Creek
  Scraping http://web.archive.org/web/20250618202752/https://www.myfootdr.com.au/our-clinics/aspley-allsports-podiatry/
    ADDED: My FootDr Allsports Podiatry Aspley
  Scraping http://web.archive.org/web/20250618202752/https://www.myfootdr.com.au/our-clinics/brisbane-cbd-podiatry-centre/
    ADDED: My FootDr Brisbane CBD
  Scraping http://web.archive.org/web/20250618202752/https://www.myfootdr.com.au/our-clinics/brookwater-podiatry-centre/
    ADDED: My FootDr Brookwater
  Scraping http://web.archive.org/web/20250618202752/https://www.myfootdr.com.au/our-clinics/calamvale-podiatry-centre-allsports/
    ADDED: Allsports Podiatry Calamvale
  Scraping http://web.archive.org/web/20250618202752/https://www.myfootdr.com.au/our-clinics/camp-hill-podiatry-

### Retry failed clinics

In [None]:
import os
import csv
import time
import requests
import re
from bs4 import BeautifulSoup

# Clinic CSV; this cell reads it and rewrites it with recovered clinics.
CSV_FILE = "./myfootdr_clinics.csv"
# One clinic URL per line; read at start and rewritten with the URLs that
# still fail.
FAILED_CLINICS_FILE = "./failed_clinics.txt"

# Snapshot identifiers placed in the archive URL by build_url(); tried in
# this order.
DEFAULT_SNAPSHOTS = [
    "20250708180027",
    "20250517063937",
    "20250516141742",
]

# Original site and archive prefix; build_url() combines them as
# ARCHIVE_BASE/<snapshot>/BASE_SITE<path>.
BASE_SITE = "https://www.myfootdr.com.au"
ARCHIVE_BASE = "http://web.archive.org/web"

# CSV column headers, in output order.
FIELDNAMES = [
    "Name of Clinic",
    "Address",
    "Email",
    "Phone",
    "Services",
]

# One shared HTTP session for every request made by get_soup().
session = requests.Session()
# The headers ask for an uncompressed body and a fresh connection per request.
# NOTE(review): presumably a workaround for flaky archive connections -- confirm.
session.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept-Encoding": "identity",
    "Connection": "close",
})

def build_url(snapshot, path):
    """Return the archive URL for *path* of the site at *snapshot*.

    *path* is appended to ``BASE_SITE`` as given, so it is expected to start
    with a slash.
    """
    archived_root = f"{ARCHIVE_BASE}/{snapshot}/"
    return f"{archived_root}{BASE_SITE}{path}"

def get_soup(url, retries=3):
    """Download *url* and parse it, giving up after *retries* attempts.

    Every attempt waits 1.2 s, then performs a GET with a 40 s timeout via
    the shared ``session``. Only a 200 response with more than 500 characters
    of text is parsed and returned. ``requests`` errors are ignored and count
    as a used attempt. Returns ``None`` when all attempts fail.
    """
    for _attempt in range(retries):
        try:
            time.sleep(1.2)
            response = session.get(url, timeout=40)
            usable = response.status_code == 200 and len(response.text) > 500
            if usable:
                return BeautifulSoup(response.text, "html.parser")
        except requests.exceptions.RequestException:
            continue
    return None

def get_soup_with_snapshots(path):
    """Return the first page that loads for *path* across DEFAULT_SNAPSHOTS.

    Snapshots are tried lazily in list order. ``None`` is returned when no
    snapshot yields a page.
    """
    pages = (get_soup(build_url(stamp, path)) for stamp in DEFAULT_SNAPSHOTS)
    # The generator is lazy, so fetching stops at the first truthy page.
    return next((page for page in pages if page), None)

def clean_text(el):
    """Return *el*'s text without its ``.i-heartxp`` descendants.

    Returns ``""`` for a falsy *el*. Matching descendants are removed from
    *el* via ``decompose()`` before the text is read, joined with spaces and
    stripped.
    """
    if not el:
        return ""

    unwanted = el.select(".i-heartxp")
    for node in unwanted:
        node.decompose()

    text = el.get_text(" ", strip=True)
    return text

def extract_phone(soup):
    """Return the clinic phone number as digits (optionally with ``+``).

    Lookup order:

    1. The first ``<a href>`` whose href contains ``tel:`` and yields at
       least one digit. The href is percent-decoded first, so ``%20`` and
       ``%2B`` do not leak stray digits into the number, and anything after a
       ``;`` (tel-URI parameters such as ``;ext=``) is dropped.
    2. The first link inside ``.clinic-metabox`` whose visible text contains
       at least 8 digit/plus characters.

    Returns ``"NA"`` when neither lookup finds a number.
    """
    from urllib.parse import unquote

    for a in soup.select("a[href]"):
        href = a.get("href", "")
        if "tel:" not in href:
            continue
        # Decode before stripping: "tel:07%203356" must not become "07203356".
        number = unquote(href.split("tel:", 1)[-1]).split(";", 1)[0]
        phone = re.sub(r"[^0-9+]", "", number)
        if phone:
            return phone

    # Fallback: a number written as link text in the clinic meta box.
    for a in soup.select(".clinic-metabox a"):
        digits = re.sub(r"[^0-9+]", "", a.get_text(strip=True))
        if len(digits) >= 8:
            return digits

    return "NA"

def extract_services(soup):
    """Return the page's services as a ``"; "``-separated string, or ``"NA"``.

    Candidates are the texts of all ``<li>`` items in lists under
    ``.entry-content``, followed by the heading links of the service cards.
    Blank texts are dropped and repeats removed, keeping first occurrence
    order.
    """
    def candidate_texts():
        article = soup.select_one(".entry-content")
        if article:
            for bullet_list in article.find_all("ul"):
                for item in bullet_list.find_all("li"):
                    yield item.get_text(strip=True)
        for heading_link in soup.select(
            ".clinic-2020-services .featured-post-content h3 a"
        ):
            yield heading_link.get_text(strip=True)

    # dict.fromkeys de-duplicates while preserving insertion order.
    ordered = list(dict.fromkeys(label for label in candidate_texts() if label))
    if ordered:
        return "; ".join(ordered)
    return "NA"

# Load existing CSV, keyed by (clinic name, address). A missing file simply
# means there is nothing to merge into yet.
existing = {}

if os.path.exists(CSV_FILE):
    with open(CSV_FILE, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            key = (row["Name of Clinic"].strip(), row["Address"].strip())
            existing[key] = row

# Load failed clinics. The main cell may not have produced this file, so a
# missing file is treated as "nothing to retry" rather than a crash.
failed_urls = []
if os.path.exists(FAILED_CLINICS_FILE):
    with open(FAILED_CLINICS_FILE) as f:
        failed_urls = [line.strip() for line in f if line.strip()]

print(f"Retrying {len(failed_urls)} failed clinics")

still_failed = []

# Retry loop
for idx, url in enumerate(failed_urls, start=1):
    print(f"[{idx}/{len(failed_urls)}] Retrying {url}")

    # Reduce the archived URL to the original site path.
    path = "/" + url.split(BASE_SITE, 1)[-1].lstrip("/")

    # The failed URL records the snapshot it was linked under. Try that one
    # first when it is not among the defaults, then fall back to the defaults.
    soup = None
    stamp = re.search(r"/web/(\d{14})/", url)
    if stamp and stamp.group(1) not in DEFAULT_SNAPSHOTS:
        soup = get_soup(build_url(stamp.group(1), path))
    if not soup:
        soup = get_soup_with_snapshots(path)

    if not soup:
        print("  still failed")
        still_failed.append(url)
        continue

    name_el = soup.select_one("#clinic-metacard-2020 h1.entry-title")
    name = name_el.get_text(strip=True) if name_el else "NA"

    address_el = soup.select_one(".clinic-metabox .address")
    address = clean_text(address_el)

    # Email is the visible text of the first mailto: link in the card.
    email = "NA"
    for a in soup.select("#clinic-metacard-2020 a[href]"):
        if "mailto:" in a.get("href", ""):
            email = a.get_text(strip=True)
            break

    phone = extract_phone(soup)
    # extract_services() already returns "NA" when nothing is found.
    services = extract_services(soup)

    key = (name.strip(), address.strip())

    existing[key] = {
        "Name of Clinic": name,
        "Address": address,
        "Email": email,
        "Phone": phone,
        "Services": services,
    }

    print(f"  recovered: {name}")

# Rewrite CSV
with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
    # extrasaction="ignore": rows read back from the CSV may carry columns
    # outside FIELDNAMES; drop them instead of raising ValueError.
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(existing.values())

# Rewrite failed list with only the URLs that still fail.
with open(FAILED_CLINICS_FILE, "w") as f:
    f.writelines(f"{url}\n" for url in still_failed)

print(f"Recovered: {len(failed_urls) - len(still_failed)}")
print(f"Still failing: {len(still_failed)}")
print("Done")


Retrying 0 failed clinics
Recovered: 0
Still failing: 0
Done


### Retry failed regions

In [None]:
import os
import csv
import time
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Clinic CSV; this cell reads it (when present) and rewrites it.
CSV_FILE = "./myfootdr_clinics.csv"
# One region slug per line; read at start and rewritten with the slugs that
# still fail.
FAILED_REGIONS_FILE = "./failed_regions.txt"
# Clinic URLs that fail during this cell are appended here.
FAILED_CLINICS_FILE = "./failed_clinics.txt"

# Snapshot identifiers placed in the archive URL by build_url(); tried in
# this order.
DEFAULT_SNAPSHOTS = [
    "20250708180027",
    "20250517063937",
    "20250516141742",
]

# Original site and archive prefix; build_url() combines them as
# ARCHIVE_BASE/<snapshot>/BASE_SITE<path>.
BASE_SITE = "https://www.myfootdr.com.au"
ARCHIVE_BASE = "http://web.archive.org/web"

# CSV column headers, in output order.
FIELDNAMES = [
    "Name of Clinic",
    "Address",
    "Email",
    "Phone",
    "Services",
]

# One shared HTTP session for every request made by get_soup().
session = requests.Session()
# The headers ask for an uncompressed body and a fresh connection per request.
# NOTE(review): presumably a workaround for flaky archive connections -- confirm.
session.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept-Encoding": "identity",
    "Connection": "close",
})

def build_url(snapshot, path):
    """Build ``ARCHIVE_BASE/<snapshot>/BASE_SITE<path>``.

    *path* is concatenated directly after ``BASE_SITE``; callers supply the
    leading slash.
    """
    return "%s/%s/%s%s" % (ARCHIVE_BASE, snapshot, BASE_SITE, path)

def get_soup(url, retries=3):
    """Return the parsed page at *url*, or ``None`` after *retries* failures.

    A single attempt sleeps 1.2 s, GETs the URL through the shared
    ``session`` (40 s timeout) and accepts the response only if the status is
    200 and the text exceeds 500 characters. A ``requests`` exception ends
    that attempt silently.
    """
    def fetch_once():
        # One throttled request; None means "not usable".
        time.sleep(1.2)
        reply = session.get(url, timeout=40)
        if reply.status_code == 200 and len(reply.text) > 500:
            return BeautifulSoup(reply.text, "html.parser")
        return None

    for _ in range(retries):
        try:
            parsed = fetch_once()
        except requests.exceptions.RequestException:
            continue
        if parsed is not None:
            return parsed
    return None

def get_soup_with_snapshots(path):
    """Fetch *path* from the first DEFAULT_SNAPSHOTS entry that works.

    Returns ``(soup, snapshot)`` on success and ``(None, None)`` when every
    snapshot fails.
    """
    outcome = (None, None)
    for stamp in DEFAULT_SNAPSHOTS:
        page = get_soup(build_url(stamp, path))
        if page:
            outcome = (page, stamp)
            break
    return outcome

def clean_text(el):
    """Return the space-joined, stripped text of *el*, minus icon elements.

    Descendants matching ``.i-heartxp`` are removed from *el* via
    ``decompose()`` before the text is read. A falsy *el* gives ``""``.
    """
    result = ""
    if el:
        for icon_node in el.select(".i-heartxp"):
            icon_node.decompose()
        result = el.get_text(" ", strip=True)
    return result

def extract_phone(soup):
    """Return the clinic phone number as digits (optionally with ``+``).

    Lookup order:

    1. The first ``<a href>`` whose href contains ``tel:`` and yields at
       least one digit. The href is percent-decoded first, so ``%20`` and
       ``%2B`` do not leak stray digits into the number, and anything after a
       ``;`` (tel-URI parameters such as ``;ext=``) is dropped.
    2. The first link inside ``.clinic-metabox`` whose visible text contains
       at least 8 digit/plus characters.

    Returns ``"NA"`` when neither lookup finds a number.
    """
    from urllib.parse import unquote

    for a in soup.select("a[href]"):
        href = a.get("href", "")
        if "tel:" not in href:
            continue
        # Decode before stripping: "tel:07%203356" must not become "07203356".
        number = unquote(href.split("tel:", 1)[-1]).split(";", 1)[0]
        phone = re.sub(r"[^0-9+]", "", number)
        if phone:
            return phone

    # Fallback: a number written as link text in the clinic meta box.
    for a in soup.select(".clinic-metabox a"):
        digits = re.sub(r"[^0-9+]", "", a.get_text(strip=True))
        if len(digits) >= 8:
            return digits

    return "NA"

def extract_services(soup):
    """Gather unique service names from a clinic page.

    Reads ``<li>`` texts from every ``<ul>`` under ``.entry-content`` and
    then the service-card heading links. Empty texts are ignored and the
    first occurrence of each name wins. The names are returned joined by
    ``"; "``, or ``"NA"`` when there are none.
    """
    ordered = []
    seen = set()

    def remember(node):
        # Record the node's text once, in first-seen order.
        label = node.get_text(strip=True)
        if label and label not in seen:
            seen.add(label)
            ordered.append(label)

    article = soup.select_one(".entry-content")
    if article:
        for bullet_list in article.find_all("ul"):
            for item in bullet_list.find_all("li"):
                remember(item)

    for heading_link in soup.select(".clinic-2020-services .featured-post-content h3 a"):
        remember(heading_link)

    if not ordered:
        return "NA"
    return "; ".join(ordered)

# Load CSV (upsert), keyed by (clinic name, address).
existing = {}

if os.path.exists(CSV_FILE):
    with open(CSV_FILE, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            key = (row["Name of Clinic"].strip(), row["Address"].strip())
            existing[key] = row

# Load failed regions. The main cell may not have produced this file, so a
# missing file is treated as "nothing to retry" rather than a crash.
failed_regions = []
if os.path.exists(FAILED_REGIONS_FILE):
    with open(FAILED_REGIONS_FILE) as f:
        failed_regions = [r.strip() for r in f if r.strip()]

print(f"Retrying {len(failed_regions)} failed regions")

recovered_regions = []
still_failed_regions = []
new_failed_clinics = []

# Retry regions
# NOTE(review): only DEFAULT_SNAPSHOTS are tried here; this cell defines no
# per-region snapshot overrides -- confirm that is intended.
for slug in failed_regions:
    print(f"\nREGION RETRY: {slug}")

    region_path = f"/our-clinics/regions/{slug}/"
    region_soup, snapshot = get_soup_with_snapshots(region_path)

    if not region_soup:
        print("  still failing")
        still_failed_regions.append(slug)
        continue

    # A region counts as recovered once its listing page loads, even if some
    # of its clinics fail below.
    recovered_regions.append(slug)

    region_url = build_url(snapshot, region_path)

    clinic_links = sorted({
        urljoin(region_url, a["href"])
        for a in region_soup.select(".div-table.regional-clinics .table-row a[href]")
    })

    print(f"  clinics found: {len(clinic_links)}")

    for url in clinic_links:
        print(f"    scraping {url}")

        # Reduce the archived URL to the original site path.
        path = "/" + url.split(BASE_SITE, 1)[-1].lstrip("/")
        clinic_soup, _ = get_soup_with_snapshots(path)

        if not clinic_soup:
            print("      FAILED")
            new_failed_clinics.append(url)
            continue

        name_el = clinic_soup.select_one("#clinic-metacard-2020 h1.entry-title")
        name = name_el.get_text(strip=True) if name_el else "NA"

        address_el = clinic_soup.select_one(".clinic-metabox .address")
        address = clean_text(address_el)

        # Email is the visible text of the first mailto: link in the card.
        email = "NA"
        for a in clinic_soup.select("#clinic-metacard-2020 a[href]"):
            if "mailto:" in a.get("href", ""):
                email = a.get_text(strip=True)
                break

        phone = extract_phone(clinic_soup)
        services = extract_services(clinic_soup)

        key = (name.strip(), address.strip())

        existing[key] = {
            "Name of Clinic": name,
            "Address": address,
            "Email": email,
            "Phone": phone,
            "Services": services,
        }

        print(f"      OK: {name}")

# Rewrite CSV
with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
    # extrasaction="ignore": rows read back from the CSV may carry columns
    # outside FIELDNAMES; drop them instead of raising ValueError.
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(existing.values())

# Rewrite failed regions with only the slugs that still fail.
with open(FAILED_REGIONS_FILE, "w") as f:
    f.writelines(f"{r}\n" for r in still_failed_regions)

# Append newly failed clinics, skipping URLs the list already contains so
# repeated runs do not pile up duplicate lines.
added_clinics = []
if new_failed_clinics:
    already_listed = set()
    if os.path.exists(FAILED_CLINICS_FILE):
        with open(FAILED_CLINICS_FILE) as f:
            already_listed = {line.strip() for line in f if line.strip()}

    added_clinics = sorted(set(new_failed_clinics) - already_listed)
    if added_clinics:
        with open(FAILED_CLINICS_FILE, "a") as f:
            f.writelines(f"{url}\n" for url in added_clinics)

print("\nSummary")
print(f"Recovered regions: {len(recovered_regions)}")
print(f"Still failing regions: {len(still_failed_regions)}")
print(f"New failed clinics added: {len(added_clinics)}")
print("Done")


Retrying 0 failed regions

Summary
Recovered regions: 0
Still failing regions: 0
New failed clinics added: 0
Done
