In [12]:
import os
import re
import time
from urllib.parse import urljoin, urlparse
import pandas as pd

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By


In [13]:
def build_driver(headless=False):
    """Start a Chrome WebDriver configured with this notebook's launch flags.

    Args:
        headless: When true, Chrome is launched with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance. The chromedriver binary path comes
        from ``ChromeDriverManager().install()``.
    """
    chrome_flags = []
    if headless:
        # Newer Chrome accepts "--headless=new"; for an older Chrome use "--headless".
        chrome_flags.append("--headless=new")
    chrome_flags.extend(["--disable-gpu", "--no-sandbox", "--window-size=1400,1000"])

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    service = ChromeService(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def selenium_to_requests_session(driver):
    """Build a ``requests.Session`` carrying the cookies of a Selenium driver.

    Args:
        driver: Object exposing ``get_cookies()``; each cookie is read as a
            mapping with ``name`` and ``value`` keys and optional ``domain``
            and ``path`` keys.

    Returns:
        A new ``requests.Session`` with every driver cookie copied in and a
        fixed desktop-Chrome ``User-Agent`` header set.
    """
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
    })
    for cookie in driver.get_cookies():
        session.cookies.set(
            cookie["name"],
            cookie["value"],
            domain=cookie.get("domain"),
            path=cookie.get("path", "/"),  # default to the site root when no path is given
        )
    return session

def safe_filename(name: str) -> str:
    """Turn an arbitrary label into a string usable as a file name.

    Runs of characters that are illegal in Windows file names
    (``\\ / : * ? " < > |``) are replaced by a single underscore, runs of
    whitespace are collapsed to one space, and the result is limited to
    180 characters.

    Args:
        name: The raw label (for example a link text).

    Returns:
        The sanitised name, at most 180 characters long and never starting or
        ending with whitespace. May be empty if ``name`` was blank.
    """
    name = re.sub(r"[\\/:*?\"<>|]+", "_", name)
    name = re.sub(r"\s+", " ", name).strip()
    # Cutting at 180 characters can land just after a space; strip again so the
    # truncated name never ends in whitespace.
    return name[:180].rstrip()

def is_doc_href(href: str) -> bool:
    """Decide whether a link looks like it points to a downloadable document.

    A link qualifies when it ends with one of ``COMMON_FILE_EXTS`` (checked
    both on the whole URL and on the URL with any ``?query`` / ``#fragment``
    removed), or when it contains one of a few indicator keywords.

    Args:
        href: The link target; ``None`` or an empty string yields ``False``.

    Returns:
        ``True`` if the link is document-like, ``False`` otherwise.
    """
    if not href:
        return False
    hl = href.lower()
    # Drop the fragment and query so "report.pdf?v=2" or "report.pdf#page=3"
    # is still recognised by its extension.
    without_query = hl.split("#", 1)[0].split("?", 1)[0]
    # COMMON_FILE_EXTS is defined outside this block; str.endswith needs it to
    # be a str or a tuple of str.
    if hl.endswith(COMMON_FILE_EXTS) or without_query.endswith(COMMON_FILE_EXTS):
        return True
    # Portals sometimes use no extensions; allow common indicators
    return any(k in hl for k in ("document", "download", "file", "getfile", "attachment"))

In [16]:
import pandas as pd
import time
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By


## first 100 to 200
# app_ids = ['2025/298']
# app_ids = ['2025/975/TPO', '2025/1032/TPO']
# '2025/298', '2025/222', '2024/1032', '2025/284', '2025/291', '2025/279/TPO', '2025/185', '2025/165', '2025/126', '2025/151', '2025/293', '2024/1200', '2025/367/NH', '2025/282', '2025/280', '2025/275', '2025/252', '2025/239', '2025/213', '2025/98', '2025/301/NH', '2025/273/TPO', '2025/271/TPO', '2025/238', '2025/152', '2024/1162', '2025/267/TPO', '2025/156', '2025/260', '2025/258/NC', '2025/229', '2025/195', '2024/1375', '2025/285', '2025/248/TPO', '2025/251/TPO', '2025/225', '2025/81', '2024/1321', '2025/244', '2025/246', '2025/221', '2025/44', '2024/1383', '2025/237/TPO', '2024/348/Cond1', '2025/235', '2025/198', '2025/191', '2025/88', '2024/1373', '2023/186/Cond1', '2025/231', '2025/189', '2025/149', '2024/1326', '2025/228/TPO', '2025/227', '2025/203', '2025/158', '2025/159', '2025/294', '2025/224', '2025/223', '2025/175', '2024/1308', '2025/220/TPO', '2025/122', '2024/1034', '2025/210', '2025/211', '2025/212/TPO', '2025/150', '2025/45', '2024/685', '2025/250/NH', '2024/1259', '2025/265/TCA', '2021/1800/Cond6', '2024/503/Cond2', '2024/697/Cond2', '2025/254/TCA', '2025/199/TPO', '2025/160', '2025/145', '2024/1289', '2024/977/Cond1', '2025/117', '2025/116', '2025/86', '2025/242/NH', '2025/196/NC', '2024/420/Cond2', '2025/120', '2025/70', '2025/331', '2025/247/NH', '2025/240/TCA', '2025/193/TCA', '2025/184/TPO'
# ]
SKIP_WORDS = {"plan", "plans"}  # a file name containing any of these (case-insensitive) is excluded

def should_skip(name: str) -> bool:
    """Return True when a link label should be left out of the results.

    A label is skipped if, ignoring case, it contains the page's
    "skip to main content" accessibility link text or any entry of
    ``SKIP_WORDS`` as a substring. ``None`` is treated as an empty label.
    """
    lowered = name.lower() if name else ""
    if "skip to main content" in lowered:
        return True
    for word in SKIP_WORDS:
        if word in lowered:
            return True
    return False

# Accumulator for every scraped link (and every failure) across all applications.
rows = []  # each row: {"application_number": ..., "file_name": ..., "url": ...}

# NOTE(review): `app_ids` is not assigned anywhere in the visible cells (only
# commented-out examples exist) — confirm it is defined before this cell runs.
for app_id in app_ids:
    # The application number is interpolated into the query string as-is (no URL-encoding).
    APP_URL = f"https://plandocs.tandridge.gov.uk/planning/planning-documents?SDescription={app_id}"
    # A fresh browser is started for every application and closed in `finally` below.
    driver = build_driver(headless=True)  # or False to watch it
    try:
        driver.get(APP_URL)

        # Wait for anchors to stabilize (robust against dynamic loads):
        # poll the number of <a> elements until it stops changing.
        prev_count, stable_iters = -1, 0
        for _ in range(60):  # ~30s max (60 polls, 0.5s sleep each)
            anchors = driver.find_elements(By.TAG_NAME, "a")
            count = len(anchors)
            if count == prev_count:
                stable_iters += 1
            else:
                stable_iters = 0
            prev_count = count
            if stable_iters >= 4:  # stable ~2s (four unchanged polls in a row)
                break
            time.sleep(0.5)

        # Parse a snapshot of the rendered page; the links are read from this
        # snapshot, not from the live DOM.
        soup = BeautifulSoup(driver.page_source, "lxml")

        # collect document-like links
        seen_urls = set()  # reset per application, so de-duplication is per page only
        for a in soup.find_all("a", href=True):
            href = a["href"]
            text = (a.get_text(strip=True) or "").strip()
            # Resolve relative hrefs against the URL the browser actually ended up on.
            full = urljoin(driver.current_url, href)
            if not is_doc_href(full):
                continue
            if full in seen_urls:
                continue
            # The URL is marked as seen before the name filter runs, so a skipped
            # link also suppresses later duplicates of the same URL.
            seen_urls.add(full)
            # Prefer the link text; otherwise use the last path segment, and
            # finally the literal "document".
            name = text or (urlparse(full).path.rsplit("/", 1)[-1] or "document")
            if should_skip(name):
                continue
            rows.append({
                "application_number": app_id,
                "file_name": name,
                "url": full
            })

    except Exception as e:
        # Record the failure so you can inspect later.
        # The row is tagged with file_name "__ERROR__" and carries the
        # exception type and message in the "url" field.
        rows.append({
            "application_number": app_id,
            "file_name": "__ERROR__",
            "url": f"{type(e).__name__}: {e}"
        })
    finally:
        # Runs on success and on failure: close the browser, then pause
        # before the next application.
        driver.quit()
        time.sleep(0.8)  # be polite

# Build the flat DataFrame. The column list is passed explicitly so the frame
# keeps its schema even when `rows` is empty; without it an empty `rows` gives a
# column-less frame and the "file_name" lookup below raises KeyError.
scraped_df = pd.DataFrame(rows, columns=["application_number", "file_name", "url"])

# Separate failures from real results instead of discarding them silently.
# Failure rows are tagged with file_name "__ERROR__" and hold the exception
# text in their "url" field; they are kept in `errors_df` for inspection.
error_mask = scraped_df["file_name"] == "__ERROR__"
errors_df = (
    scraped_df.loc[error_mask, ["application_number", "url"]]
    .rename(columns={"url": "error"})
    .reset_index(drop=True)
)
files_df = scraped_df.loc[~error_mask].reset_index(drop=True)

if not errors_df.empty:
    print(f"WARNING: {len(errors_df)} application(s) failed to scrape; inspect errors_df for details")

# Show without truncation in notebooks/terminals
pd.set_option("display.max_colwidth", None)
print(files_df)

# Save for later use
# files_df.to_csv("02_fileNameAndDocUrl.csv", index=False)
# files_df.to_parquet("02_fileNameAndDocUrl.parquet", index=False)


  application_number                                               file_name  \
0       2025/975/TPO                          Application Details - IMG_1961   
1       2025/975/TPO                          Application Details - IMG_1957   
2       2025/975/TPO                          Application Details - IMG_1960   
3       2025/975/TPO                               Application Details - T25   
4       2025/975/TPO  Statement of reasons for work - tree details and works   
5       2025/975/TPO       Application Details - ApplicationFormRedacted.pdf   
6      2025/1032/TPO       Application Details - ApplicationFormRedacted.pdf   

                                                                            url  
0  https://plandocs.tandridge.gov.uk/my-requests/document-viewer?DocNo=25375147  
1  https://plandocs.tandridge.gov.uk/my-requests/document-viewer?DocNo=25375145  
2  https://plandocs.tandridge.gov.uk/my-requests/document-viewer?DocNo=25375146  
3  https://plandocs.tandridge.g

In [17]:
## Filter FileNames
# search_terms = [
#     "Report", "Decision Notice", "Surrey Highways", "National Trust", "Surrey County Council", "Statutory Correspondence", "Appeal", "Decision", "Rebuttal"
# ]
# pattern = '|'.join(search_terms)  
# filtered_df = files_df[files_df['file_name'].str.contains(pattern, case=False, na=False)]
# print(filtered_df)
# print(f"\nNumber of matching rows: {len(filtered_df)}")
# print(f"Original DataFrame had {len(files_df)} rows")


Empty DataFrame
Columns: [application_number, file_name, url]
Index: []

Number of matching rows: 0
Original DataFrame had 7 rows
