### Scraping BMO Credit Card Homepage info

In [None]:
import requests
import pandas as pd
import bs4 as beautifulsoup4

# Public JSON feed that the notebook treats as the source of card records.
API_URL = "https://www.bmo.com/public-data/api/v2.0/bmo-ca-credit-cards-en.json"
# Minimal browser-like User-Agent sent with the API request.
HEADERS = {"User-Agent": "Mozilla/5.0"}

response = requests.get(API_URL, headers=HEADERS, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
response.raise_for_status()
data = response.json()

# 1) Find the list of cards (quick helper)
def find_card_lists(obj, path="root"):
    """Walk a decoded JSON structure and report lists that look like card lists.

    A list qualifies when its first element is a dict that has a name-like key
    ("name" or "title") and a link-like key ("url", "href", "link", "ctaLink",
    "detailUrl"). Dicts are searched through all of their values; lists are
    searched through their first three elements only.

    Args:
        obj: Any decoded JSON value (dict, list, or scalar).
        path: Dotted/indexed label for ``obj``, used in the reported paths.

    Returns:
        A list of ``(path, list_length, sorted_keys)`` tuples, where
        ``sorted_keys`` holds at most the first 25 keys of the first element.
    """
    name_keys = {"name", "title"}
    link_keys = {"url", "href", "link", "ctaLink", "detailUrl"}
    matches = []

    if isinstance(obj, dict):
        for key, value in obj.items():
            matches.extend(find_card_lists(value, f"{path}.{key}"))
        return matches

    if not isinstance(obj, list):
        # Scalars cannot contain card lists.
        return matches

    if obj and isinstance(obj[0], dict):
        first_keys = set(obj[0])
        # Heuristic: a "card-like" object has a name/title plus some url/link.
        if first_keys & name_keys and first_keys & link_keys:
            matches.append((path, len(obj), sorted(first_keys)[:25]))

    # Only sample the first three elements when descending into a list.
    for index, item in enumerate(obj[:3]):
        matches.extend(find_card_lists(item, f"{path}[{index}]"))
    return matches

# Print up to 15 candidate lists (path, length, sample keys) for inspection.
candidates = find_card_lists(data)
preview = candidates[:15]
for path, n, keys in preview:
    print(path, n, keys)

# NOTE(review): assumes `data` is a dict whose values are the card records -
# confirm against the printed candidates above.
cards = [*data.values()]
df = pd.json_normalize(cards)

df.head(2)

### Scraping the individual pages

In [None]:
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


# Site origin; product-page paths from the DataFrame are resolved against it with urljoin.
BASE_URL = "https://www.bmo.com"
# Folder that receives one .txt dump per scraped card (see save_text_file).
OUT_DIR = Path("bmo_card_text")
# Created as soon as this cell runs; exist_ok makes re-running the cell harmless.
OUT_DIR.mkdir(exist_ok=True)


def slugify(s: str) -> str:
    """Turn a product name into a short, filename-friendly slug.

    The value is lower-cased, HTML tags are removed, and every run of
    characters outside ``a-z0-9`` becomes a single underscore. The result is
    capped at 80 characters and never starts or ends with an underscore.

    Args:
        s: Product name; falsy values (``None``, ``""``) are treated as empty.

    Returns:
        The slug, or ``"card"`` when nothing usable remains.
    """
    s = str(s or "").lower().strip()
    s = re.sub(r"<[^>]+>", "", s)          # strip HTML tags in productName
    s = re.sub(r"[^a-z0-9]+", "_", s)
    # Trim again after truncating: the 80-character cut can land right after a
    # separator and would otherwise leave a dangling "_" at the end.
    return s.strip("_")[:80].rstrip("_") or "card"


def build_driver() -> webdriver.Chrome:
    """Create a headless Chrome WebDriver configured for scraping.

    The driver binary is obtained through ``ChromeDriverManager().install()``
    and Chrome is started with a fixed set of flags plus a desktop-browser
    user-agent string.

    Returns:
        A ready-to-use ``webdriver.Chrome`` instance; the caller must quit it.
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    )
    chrome_flags = (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        user_agent_flag,
    )

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=opts)


def wait_ready(driver: webdriver.Chrome, timeout: int = 30) -> None:
    """Wait until the page's ``document.readyState`` is ``"complete"``.

    Args:
        driver: Browser session whose current page is being loaded.
        timeout: Maximum number of seconds handed to ``WebDriverWait``.

    After the condition holds, sleeps a further 0.8 s before returning.
    """

    def document_is_complete(d) -> bool:
        return d.execute_script("return document.readyState") == "complete"

    WebDriverWait(driver, timeout).until(document_is_complete)
    # Short settle pause after the document reports it is loaded.
    time.sleep(0.8)


def clean_text(text: str) -> str:
    """Normalize whitespace in scraped text.

    Runs of spaces/tabs collapse to one space, trailing spaces before a line
    break are dropped, three or more consecutive newlines collapse to a single
    blank line, and the result is stripped at both ends.

    Args:
        text: Raw text taken from the page.

    Returns:
        The normalized text.
    """
    text = re.sub(r"[ \t]+", " ", text)
    # Lines holding only whitespace would otherwise sit between the newlines
    # and stop the blank-line collapse below from matching.
    text = re.sub(r" +\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def safe_click(driver: webdriver.Chrome, el) -> None:
    """Click ``el``, falling back to a JavaScript click if the normal click fails.

    The element is first scrolled to the vertical center of the viewport and
    given 0.2 s to settle. Any exception raised by ``el.click()`` triggers the
    JavaScript fallback.

    Args:
        driver: Browser session that owns ``el``.
        el: Element to click.
    """
    scroll_script = "arguments[0].scrollIntoView({block:'center'});"
    driver.execute_script(scroll_script, el)
    time.sleep(0.2)

    try:
        el.click()
    except Exception:
        # Best effort: dispatch the click from script when the direct click raised.
        driver.execute_script("arguments[0].click();", el)


def click_tab(driver: webdriver.Chrome, tab_num: int) -> bool:
    """
    Activate the control that opens panel ``#tab-<n>``.

    Candidate CSS selectors are tried in order; the first element of the first
    selector that matches anything is clicked via ``safe_click``. When nothing
    matches, the URL hash is set to ``tab-<n>`` as a last resort.

    Returns True if an element was clicked, False if only the hash fallback ran.
    """
    panel_id = f"tab-{tab_num}"
    anchor = f"#{panel_id}"
    candidate_selectors = (
        f'a[href="{anchor}"]',
        f'[role="tab"][href="{anchor}"]',
        f'[role="tab"][aria-controls="{panel_id}"]',
        f'button[aria-controls="{panel_id}"]',
        f'[data-tab="{panel_id}"]',
        f'[data-target="{anchor}"]',
    )

    for selector in candidate_selectors:
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        if not matches:
            continue
        safe_click(driver, matches[0])
        return True

    # No clickable control found: changing the hash sometimes triggers the tab.
    driver.execute_script(f"window.location.hash = '{panel_id}';")
    return False


def get_tab_panel_text(driver: webdriver.Chrome, tab_num: int, timeout: float = 6) -> str:
    """
    Returns cleaned text from the tab panel element with id="tab-<n>".

    Args:
        driver: Browser session showing the card page.
        tab_num: Panel number ``n`` in ``tab-<n>``.
        timeout: Seconds to wait for the panel to be present in the DOM.

    Returns:
        The panel's text passed through ``clean_text``, or ``""`` if no element
        with that id appears within ``timeout``.
    """
    # Local import: the file's top-level imports do not bring in Selenium's
    # exception classes.
    from selenium.common.exceptions import TimeoutException

    panel_id = f"tab-{tab_num}"
    # One lookup by id is enough: trying several selectors that all target the
    # same id would only repeat the full wait when the panel is missing.
    locator = (By.CSS_SELECTOR, f'[id="{panel_id}"]')

    try:
        panel = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        )
    except TimeoutException:
        # Panel absent on this page; other WebDriver errors propagate to the caller.
        return ""

    return clean_text(panel.text)


def get_rendered_text_with_tabs(
    driver: webdriver.Chrome, url: str, tab_nums: tuple = (1, 2, 3, 4)
) -> str:
    """
    Returns the page's visible text PLUS the text of each requested tab panel.

    Args:
        driver: Browser session used to load the page.
        url: Absolute URL of the card page.
        tab_nums: Panel numbers to capture (``tab-<n>``); defaults to 1 through 4.

    Returns:
        One text blob: a ``URL:`` line, the page body section, then one
        ``===== TAB-<n> =====`` section per requested tab (empty when the
        panel was not found).
    """
    driver.get(url)
    wait_ready(driver)

    # Scroll part-way down to load some lazy content.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.35);")
    time.sleep(0.8)

    # Base visible page text.
    body_text = clean_text(driver.execute_script("return document.body.innerText || ''"))

    sections = [
        f"URL: {url}\n",
        "\n===== PAGE BODY (VISIBLE TEXT) =====\n",
        body_text,
    ]

    # Tab texts are often missing from body_text because inactive tabs are
    # hidden, so each tab is activated before its panel is read.
    for n in tab_nums:
        click_tab(driver, n)
        time.sleep(0.6)  # allow JS to swap content/visibility
        sections.append(f"\n\n===== TAB-{n} =====\n")
        sections.append(get_tab_panel_text(driver, n))

    return "\n".join(sections).strip()


def save_text_file(text: str, filename_prefix: str) -> Path:
    """Write ``text`` to ``<OUT_DIR>/<filename_prefix>.txt`` as UTF-8.

    Args:
        text: Content to store.
        filename_prefix: File name without the ``.txt`` extension.

    Returns:
        Path of the file that was written.
    """
    target = OUT_DIR.joinpath(f"{filename_prefix}.txt")
    target.write_text(text, encoding="utf-8")
    return target


def scrape_all_cards(df: pd.DataFrame) -> pd.DataFrame:
    """Scrape every card page listed in ``df`` and save one text file per card.

    Rows whose ``productPageUrl`` is not a non-blank string are skipped. Each
    remaining page is rendered in a single shared browser session, saved via
    ``save_text_file``, and followed by a 1.5 s pause. A failure on one card is
    recorded and does not stop the run.

    Args:
        df: Card table with ``productId``, ``productName`` and
            ``productPageUrl`` columns.

    Returns:
        A DataFrame with one row per attempted card and the columns
        ``productId``, ``url``, ``chars``, ``file`` and ``status`` (``"OK"`` or
        ``"ERROR: <ExceptionType>"``).

    Raises:
        ValueError: If any required column is missing from ``df``.
    """
    required = {"productId", "productName", "productPageUrl"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"df is missing required columns: {sorted(missing)}")

    driver = build_driver()
    results = []

    try:
        for _, row in df.iterrows():
            product_page = row["productPageUrl"]
            if not isinstance(product_page, str) or not product_page.strip():
                continue

            full_url = urljoin(BASE_URL, product_page)

            pid = str(row["productId"])
            # The column is guaranteed to exist, so a `.get()` default would
            # never apply; a missing name shows up as NaN/None/"" instead.
            # Fall back to the product id explicitly in that case.
            raw_name = row["productName"]
            name = raw_name if isinstance(raw_name, str) and raw_name.strip() else pid
            prefix = f"{pid}_{slugify(name)}"

            try:
                text = get_rendered_text_with_tabs(driver, full_url)
                out_path = save_text_file(text, prefix)

                results.append(
                    {"productId": pid, "url": full_url, "chars": len(text), "file": str(out_path), "status": "OK"}
                )
                print(f"[OK] {pid} -> {len(text)} chars -> {out_path.name}")

            except Exception as e:
                # Per-card boundary: record the failure and continue with the next card.
                results.append(
                    {"productId": pid, "url": full_url, "chars": 0, "file": "", "status": f"ERROR: {type(e).__name__}"}
                )
                print(f"[FAIL] {pid} {full_url} -> {e}")

            # Pause between page loads.
            time.sleep(1.5)

    finally:
        # Always release the browser, even if the loop is interrupted.
        driver.quit()

    return pd.DataFrame(results)