In [1]:
## IMPORTS
import re
import time
from datetime import date
from typing import List, Optional

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException, TimeoutException, StaleElementReferenceException
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select
from webdriver_manager.chrome import ChromeDriverManager

# Page that every helper in this notebook opens via driver.get(BASE_URL).
BASE_URL = "https://tdcplanningsearch.tandridge.gov.uk/Default#TDCInfo"

In [2]:
# UTILITIES
def validate_mmddyyyy(s: str) -> str:
    """Normalise a date string to zero-padded MM/DD/YYYY.

    Not strictly needed for Parish mode but kept for consistency.

    Args:
        s: Date text such as "9/1/2025"; surrounding whitespace is ignored.

    Returns:
        The same date as "MM/DD/YYYY" with month/day padded to two digits and
        the year padded to four.

    Raises:
        ValueError: if ``s`` is empty/None, is not shaped like M/D/YYYY, or
            does not name a real calendar date (e.g. month 13, 30 February).
    """
    m = re.fullmatch(r"\s*(\d{1,2})/(\d{1,2})/(\d{4})\s*", s or "")
    if not m:
        raise ValueError(f"Date must be MM/DD/YYYY: {s}")
    mm, dd, yyyy = (int(part) for part in m.groups())
    try:
        # The regex only checks the shape; let the calendar reject impossible
        # month/day/year combinations.
        date(yyyy, mm, dd)
    except ValueError as exc:
        raise ValueError(f"Date must be a real MM/DD/YYYY calendar date: {s}") from exc
    return f"{mm:02d}/{dd:02d}/{yyyy:04d}"

def wait_for_any(wait: WebDriverWait, locators, timeout_msg: str):
    last_err = None
    for by, value in locators:
        try:
            return wait.until(EC.presence_of_element_located((by, value)))
        except Exception as e:
            last_err = e
    raise TimeoutException(timeout_msg) from last_err

def find_first(driver, locators):
    """Return the element for the first locator that resolves, without waiting.

    Locators are ``(by, value)`` pairs tried in the given order; a locator
    that raises NoSuchElementException is skipped. If every locator is
    skipped (or none are given), NoSuchElementException is raised.
    """
    for locator in locators:
        strategy, selector = locator
        try:
            match = driver.find_element(strategy, selector)
        except NoSuchElementException:
            pass
        else:
            return match
    raise NoSuchElementException("None of the provided locators matched the page.")

def click_safely(driver, element):
    """Scroll ``element`` to the middle of the viewport, pause briefly, then click it."""
    centre_in_view = "arguments[0].scrollIntoView({block:'center'});"
    driver.execute_script(centre_in_view, element)
    # Short pause after the scroll, before the click is issued.
    time.sleep(0.15)
    element.click()

def safe_text(el):
    """Return ``el.text`` stripped of surrounding whitespace, or "" if it cannot be read.

    Any ordinary error while reading the text (missing attribute, ``None``
    text, a failing element lookup) yields an empty string. Interpreter-level
    signals such as KeyboardInterrupt are not swallowed, so a long scrape can
    still be interrupted.
    """
    try:
        return el.text.strip()
    except Exception:
        return ""

def try_select_by_visible_text(select_elem, text):
    """Select a dropdown option by label, tolerating case and partial matches.

    Matching order: the exact label as given, then a case-insensitive exact
    match, then the first option whose label contains ``text``.

    Args:
        select_elem: Either the raw ``<select>`` element or an object that
            already exposes ``options`` and ``select_by_visible_text`` (such
            as a ``Select`` wrapper), which is then used as-is.
        text: Label to look for.

    Raises:
        NoSuchElementException: if no option matches, or ``text`` is blank.
    """
    already_wrapped = hasattr(select_elem, "select_by_visible_text") and hasattr(select_elem, "options")
    sel = select_elem if already_wrapped else Select(select_elem)

    try:
        sel.select_by_visible_text(text)
        return
    except Exception:
        # Deliberate: any failure of the exact lookup falls through to fuzzy matching.
        pass

    needle = (text or "").strip().lower()
    if needle:
        labels = [o.text.strip() for o in sel.options]
        # An exact case-insensitive hit anywhere in the list beats a substring
        # hit on an earlier option (e.g. "parish" vs "All Parishes").
        exact = [label for label in labels if label.lower() == needle]
        partial = [label for label in labels if needle in label.lower()]
        for label in exact + partial:
            sel.select_by_visible_text(label)
            return
    raise NoSuchElementException(f"Option not found in dropdown: {text}")

In [3]:
# ---------- Debug helpers (optional) ----------

def list_search_criteria_options(headless=False):
    """Debug helper: print every option label of the 'Search criteria' dropdown.

    Opens BASE_URL in a fresh driver (visible by default), pauses briefly,
    locates the dropdown with the first locator that matches, prints one
    "- <label>" line per option, and always quits the driver afterwards.
    """
    criteria_locators = [
        (By.XPATH, "//label[contains(.,'Search criteria')]/following::select[1]"),
        (By.XPATH, "//select[contains(@id,'Search') or contains(@id,'Criteria')]"),
        (By.CSS_SELECTOR, "select"),
    ]
    browser = build_driver(headless=headless)
    try:
        browser.get(BASE_URL)
        time.sleep(1.2)
        dropdown = Select(find_first(browser, criteria_locators))
        print("Search criteria options:")
        for option in dropdown.options:
            label = option.text.strip()
            print("-", label)
    finally:
        browser.quit()

def list_parish_options(headless=False):
    """Debug helper: print every option label of the Parish dropdown.

    Opens BASE_URL in a fresh driver (visible by default), switches the
    'Search criteria' dropdown to "Parish" so the parish dropdown appears,
    prints one "- <label>" line per parish option, and always quits the
    driver afterwards.

    Raises:
        NoSuchElementException: if either dropdown, or the "Parish" criteria
            option, cannot be found.
    """
    drv = build_driver(headless=headless)
    try:
        drv.get(BASE_URL)
        time.sleep(1.2)
        # Pass the raw <select> element: try_select_by_visible_text builds its
        # own Select wrapper, and wrapping an existing Select a second time fails.
        criteria_elem = find_first(drv, [
            (By.XPATH, "//label[contains(.,'Search criteria')]/following::select[1]"),
            (By.XPATH, "//select[contains(@id,'Search') or contains(@id,'Criteria')]"),
            (By.CSS_SELECTOR, "select")
        ])
        try_select_by_visible_text(criteria_elem, "Parish")
        time.sleep(0.6)  # give the page time to render the parish dropdown
        parish_elem = find_first(drv, [
            (By.XPATH, "//label[contains(.,'Parish')]/following::select[1]"),
            (By.XPATH, "//select[contains(@id,'Parish')]"),
        ])
        print("Parish options:")
        for o in Select(parish_elem).options:
            print("-", o.text.strip())
    finally:
        drv.quit()


In [14]:
# ---------- Core actions ----------

def _fill_optional_date(driver, locators, raw_value, label):
    """Type a validated MM/DD/YYYY date into the first matching input; report and skip on any problem."""
    try:
        formatted = validate_mmddyyyy(raw_value)
    except ValueError as exc:
        # Report a bad date as a bad date, not as a missing form field.
        print(f"[warn] Invalid '{label}' date; skipping. ({exc})")
        return
    try:
        field = find_first(driver, locators)
        field.clear()
        field.send_keys(formatted)
    except NoSuchElementException:
        print(f"[info] '{label}' date field not present; skipping.")
    except Exception as exc:
        # Field exists but could not be edited; stay best-effort.
        print(f"[warn] Could not fill '{label}' date field; skipping. ({exc})")


def fill_search_form_parish(
    driver,
    wait: WebDriverWait,
    parish: Optional[str],
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    keywords: Optional[List[str]] = None
):
    """Select the 'Parish' search criteria, fill the form, and click Search.

    Only the criteria dropdown and the Search button are mandatory; parish,
    date and keyword inputs are filled on a best-effort basis and a message
    is printed for anything that is skipped.

    Args:
        driver: WebDriver already showing the search page.
        wait: WebDriverWait used to wait for the search pane.
        parish: Parish label to select (matched leniently); a warning is
            printed if it is falsy or cannot be selected.
        start_date: Optional 'From' date, MM/DD/YYYY.
        end_date: Optional 'To' date, MM/DD/YYYY.
        keywords: Optional keywords; blank entries are ignored and the rest
            are joined with single spaces.

    Raises:
        TimeoutException: if the search pane never appears.
        NoSuchElementException: if the criteria dropdown, its "Parish"
            option, or the Search button cannot be found.
    """
    # Ensure page area exists
    wait_for_any(
        wait,
        [
            (By.CSS_SELECTOR, "#TDCInfo"),
            (By.XPATH, "//*[contains(@id,'ContentPlaceHolder1')]"),
        ],
        "Search pane not ready."
    )

    # 1) Select 'Search criteria' = Parish
    criteria_select = find_first(driver, [
        (By.XPATH, "//label[contains(.,'Search criteria')]/following::select[1]"),
        (By.XPATH, "//select[contains(@id,'Search') or contains(@id,'Criteria')]"),
        (By.CSS_SELECTOR, "select")
    ])
    try_select_by_visible_text(criteria_select, "Parish")
    time.sleep(0.6)  # let WebForms render fields for this criteria

    # 2) Select Parish (required for Parish criteria)
    if parish:
        try:
            parish_select = find_first(driver, [
                (By.XPATH, "//label[contains(.,'Parish')]/following::select[1]"),
                (By.XPATH, "//select[contains(@id,'Parish')]"),
            ])
            try_select_by_visible_text(parish_select, parish)
        except Exception as e:
            print(f"[warn] Parish not found/selectable ('{parish}'); continuing. ({e})")
    else:
        print("[warn] Parish criteria selected but no parish provided.")

    # 3) Optional dates (only if present for Parish mode)
    if start_date:
        _fill_optional_date(
            driver,
            [
                (By.XPATH, "//label[contains(.,'From')]/following::input[@type='text'][1]"),
                (By.XPATH, "//input[contains(@placeholder,'From')]"),
                (By.XPATH, "//input[contains(@id,'From') or contains(@id,'DateFrom') or contains(@id,'Start')]"),
            ],
            start_date,
            "From",
        )
    if end_date:
        _fill_optional_date(
            driver,
            [
                (By.XPATH, "//label[contains(.,'To')]/following::input[@type='text'][1]"),
                (By.XPATH, "//input[contains(@placeholder,'To')]"),
                (By.XPATH, "//input[contains(@id,'To') or contains(@id,'DateTo') or contains(@id,'End')]"),
            ],
            end_date,
            "To",
        )

    # 4) Optional keywords (only if present)
    # Clean once; tolerate None entries as well as blank strings.
    terms = [k.strip() for k in (keywords or []) if k and k.strip()]
    if terms:
        try:
            kw = find_first(driver, [
                (By.XPATH, "//label[contains(.,'Keyword') or contains(.,'Document Title')]/following::input[1]"),
                (By.XPATH, "//input[contains(@id,'Keyword') or contains(@id,'DocTitle')]"),
                (By.CSS_SELECTOR, "input[type='text'][name*='Keyword']")
            ])
            kw.clear()
            kw.send_keys(" ".join(terms))
        except Exception as e:
            print(f"[info] Keyword input not present; skipping. ({e})")

    # 5) Click Search
    search_btn = find_first(driver, [
        (By.XPATH, "//input[@type='submit' and (contains(@value,'Search') or contains(@value,'Find'))]"),
        (By.XPATH, "//button[contains(.,'Search') or contains(.,'Find')]"),
        (By.XPATH, "//*[@onclick[contains(.,'__doPostBack')]][contains(.,'Search') or contains(.,'Find')]")
    ])
    click_safely(driver, search_btn)

def go_to_results_anchor(driver):
    """Best-effort: point the URL hash at '#dvRecsFound' and pause briefly.

    The hash is only changed if it is not already '#dvRecsFound'. Ordinary
    script or driver errors are ignored because this step is cosmetic;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    try:
        driver.execute_script("if (location.hash !== '#dvRecsFound') location.hash = '#dvRecsFound';")
        time.sleep(0.2)
    except Exception:
        # Not essential to scraping; carry on without the anchor jump.
        pass

def parse_results_page(driver):
    """Extract result rows from the current page as a list of dicts.

    The first <table> containing at least two <tr> rows is treated as the
    results table. Its first row is skipped as a header when any of its cells
    has text, and each remaining row that has <td> cells becomes one record
    built from the first six cell texts plus the first link's href. When no
    such table exists, every <div> whose class contains 'result' or 'card'
    (or whose id contains 'Item') becomes a record holding the div's text in
    "proposal".

    Every record has the keys application_number, proposal, address, status,
    validated_date, decision and detail_link; missing values are "".

    NOTE(review): cell order is mapped to key names by position only. A later
    cell in this notebook renames "proposal" -> "address" and "address" ->
    "description", which suggests the second and third keys are mislabeled
    for this site — confirm before relying on the names.
    """
    column_keys = (
        "application_number", "proposal", "address",
        "status", "validated_date", "decision",
    )

    def first_href(container):
        # href of the first <a> inside the container, or "" when it has no links
        anchors = container.find_elements(By.TAG_NAME, "a")
        return anchors[0].get_attribute("href") if anchors else ""

    # Prefer table layout: pick the first table with two or more rows.
    results_table = None
    for candidate in driver.find_elements(By.TAG_NAME, "table"):
        try:
            row_count = len(candidate.find_elements(By.TAG_NAME, "tr"))
        except StaleElementReferenceException:
            continue
        if row_count >= 2:
            results_table = candidate
            break

    if results_table:
        table_rows = results_table.find_elements(By.TAG_NAME, "tr")
        header_texts = [
            safe_text(cell)
            for cell in table_rows[0].find_elements(By.XPATH, ".//th|.//td")
        ]
        first_data_index = 1 if any(header_texts) else 0

        table_records = []
        for row in table_rows[first_data_index:]:
            data_cells = row.find_elements(By.TAG_NAME, "td")
            if not data_cells:
                continue
            texts = [safe_text(cell) for cell in data_cells]
            href = first_href(row)
            record = {
                key: (texts[position] if position < len(texts) else "")
                for position, key in enumerate(column_keys)
            }
            record["detail_link"] = href
            table_records.append(record)
        return table_records

    # Fallback: card/list layout
    card_records = []
    card_xpath = "//div[contains(@class,'result') or contains(@class,'card') or contains(@id,'Item')]"
    for card in driver.find_elements(By.XPATH, card_xpath):
        href = first_href(card)
        record = dict.fromkeys(column_keys, "")
        record["proposal"] = safe_text(card)
        record["detail_link"] = href
        card_records.append(record)
    return card_records

def paginate_and_collect(driver, wait, max_pages: int = 30):
    """Collect result rows from the current page and each following page.

    For each page: wait until rows can be parsed or the page source contains
    "No records", parse the rows, then click the first visible and enabled
    "Next" control. Stops when a page yields no rows, no "Next" control can
    be clicked, or ``max_pages`` pages have been read.

    Args:
        driver: WebDriver showing the first results page.
        wait: WebDriverWait used while results render.
        max_pages: Upper bound on the number of pages read.

    Returns:
        List of row dicts as produced by parse_results_page, in page order.
    """
    def _results_ready(d):
        # parse_results_page can hit stale elements while the page re-renders;
        # WebDriverWait does not ignore that error by default, so treat it as
        # "not ready yet" and let the wait poll again.
        try:
            return len(parse_results_page(d)) > 0 or "No records" in d.page_source
        except StaleElementReferenceException:
            return False

    def _parse_settled(d, attempts: int = 3):
        # Retry a few times if the DOM is swapped mid-parse; the last attempt may raise.
        for _ in range(attempts - 1):
            try:
                return parse_results_page(d)
            except StaleElementReferenceException:
                time.sleep(0.3)
        return parse_results_page(d)

    all_rows = []
    page = 1
    while page <= max_pages:
        try:
            wait.until(_results_ready)
        except TimeoutException:
            # Fall through: the parse below decides whether there is anything to read.
            pass

        go_to_results_anchor(driver)
        rows = _parse_settled(driver)
        if rows:
            all_rows.extend(rows)
            print(f"[info] Page {page}: {len(rows)} rows (total {len(all_rows)})")
        else:
            if page == 1:
                print("[info] No results found on first page.")
            break

        # Next button variants
        clicked_next = False
        for by, value in [
            (By.LINK_TEXT, "Next"),
            (By.PARTIAL_LINK_TEXT, "Next"),
            (By.XPATH, "//a[contains(.,'Next') or contains(.,'>>') or contains(.,'>')]"),
            (By.XPATH, "//*[contains(@onclick,'Page$Next')]"),
            (By.CSS_SELECTOR, "a[aria-label='Next']"),
        ]:
            try:
                nxt = driver.find_element(by, value)
                if nxt.is_displayed() and nxt.is_enabled():
                    click_safely(driver, nxt)
                    clicked_next = True
                    time.sleep(0.8)  # give the next page a moment to start loading
                    break
            except (NoSuchElementException, StaleElementReferenceException):
                continue

        if not clicked_next:
            break
        page += 1

    return all_rows


def save_csv(rows, out_path="results.csv"):
    """Write scraped rows to a CSV file, dropping repeated applications.

    Rows are de-duplicated on application_number + detail_link (whichever of
    those columns exist), keeping the first occurrence. Rows whose key
    columns are all blank are never treated as duplicates of each other, so
    records that carry only free text are all kept.

    Args:
        rows: List of row dicts; nothing is written when it is empty.
        out_path: Destination CSV path.
    """
    if not rows:
        print("[info] 0 rows — nothing to save.")
        return
    df = pd.DataFrame(rows)
    # de-dupe if keys present
    key_cols = [c for c in ["application_number", "detail_link"] if c in df.columns]
    if key_cols:
        keys = df[key_cols].fillna("").astype(str)
        # A row only counts as identifiable if at least one key column has content.
        has_key = keys.apply(lambda col: col.str.strip().ne("")).any(axis=1)
        repeated = df.duplicated(subset=key_cols, keep="first")
        df = df[~(repeated & has_key)]
    df.to_csv(out_path, index=False)
    print(f"[done] Saved {len(df)} rows to {out_path}")

def build_driver(headless: bool = True):
    """Create a Chrome WebDriver with this notebook's standard flags.

    The driver binary is resolved through ChromeDriverManager, the window is
    sized 1400x1000, and the page-load timeout is set to 60 seconds. With
    ``headless`` true, "--headless=new" is passed as the first flag.
    """
    chrome_flags = ["--headless=new"] if headless else []
    chrome_flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1400,1000",
    ]

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options,
    )
    browser.set_page_load_timeout(60)
    return browser

def run(
    start_date: Optional[str] = None,   # optional in Parish mode
    end_date: Optional[str] = None,     # optional in Parish mode
    keywords: Optional[List[str]] = None,
    parish: Optional[str] = None,
    headless: bool = True
):
    """Run one Parish-mode search end to end and return the collected rows.

    Builds a driver, opens BASE_URL, submits the search form, waits (without
    failing on timeout) for a results container, pages through the results,
    and always quits the driver. Nothing is written to disk here; the rows
    are returned to the caller.
    """
    results_locators = [
        (By.ID, "dvRecsFound"),
        (By.XPATH, "//*[contains(@id,'RecsFound') or contains(@id,'Results')]"),
        (By.TAG_NAME, "table"),
    ]

    browser = build_driver(headless=headless)
    waiter = WebDriverWait(browser, 20)
    try:
        browser.get(BASE_URL)
        time.sleep(1.2)

        search_terms = keywords if keywords else []
        fill_search_form_parish(
            browser,
            waiter,
            parish=parish,
            start_date=start_date,
            end_date=end_date,
            keywords=search_terms,
        )

        # A missing results container is tolerated; pagination handles the empty case.
        try:
            wait_for_any(waiter, results_locators, "Results container not found.")
        except TimeoutException:
            pass

        return paginate_and_collect(browser, waiter)
    finally:
        browser.quit()


In [15]:
# # Minimal run in Parish mode:
# results = run(
#     parish="All",               
#     start_date="09/01/2025",            
#     end_date="09/05/2025",              
#     keywords=[""],  # optional
#     headless=False # set True for headless
# )

# df = pd.DataFrame(results)

[info] Page 1: 15 rows (total 15)
[done] Saved 15 rows to results_test.csv


In [17]:
# listDF = df.drop(columns=["status", "validated_date", "decision", "detail_link"], errors="ignore")
# listDF = listDF.rename(columns={
#     "proposal": "address",
#     "address": "description"

# })
# listDF
# # listDF.to_csv("01_List.csv", index=False)

Unnamed: 0,application_number,address,description
0,2025/1028/TPO,"Listowel, Osmunda Bank, Dormans Park, East Gri...",G1) - Various tree line (overhanging fence/bou...
1,2025/1031/TPO,"2 Wheeler Avenue, Oxted, Surrey, RH8 9LE",Please refer to photos provided: T1) - Oak - R...
2,2025/1032/TPO,"1 Glendale, Felbridge, East Grinstead, Surrey,...",Please refer to photos provided: T1) - Beech -...
3,2025/923,"187a Croydon Road, Caterham, Surrey, CR3 6PH",Variation of Condition 2 ( approved plans ) of...
4,2025/878,"173 Godstone Road, Whyteleafe, Surrey, CR3 0EH",Erection of a single storey ground floor wrapa...
5,2025/839,"Meadowside, Southfields Road, Woldingham, Cate...",Variation of Condition 2 ( approved plans ) of...
6,2025/1013/TPO,"Church Of St George, Crowhurst Lane, Crowhurst...",T1) - Yew - Prune tertiary branches with secat...
7,2025/1014/TPO,"Cormont Beeches, Lunghurst Road, Woldingham, C...",Please refer to photos provided: T1) - Oak - F...
8,2025/993,"59 Chelsham Road, Chelsham, Warlingham, Surrey...",Erection of a detached garage.
9,2025/978/TPO,"9 St Mary's Mount, Caterham, Surrey, CR3 6SJ",01 - Various - Reduce height by 2m and crown l...
