### Scraping BMO Credit Card Homepage info

In [12]:
import requests
import pandas as pd
import bs4 as beautifulsoup4

API_URL = "https://www.bmo.com/public-data/api/v2.0/bmo-ca-credit-cards-en.json"
HEADERS = {"User-Agent": "Mozilla/5.0"}

data = requests.get(API_URL, headers=HEADERS, timeout=30).json()

# 1) Find the list of cards (quick helper)
def find_card_lists(obj, path="root"):
    found = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            found += find_card_lists(v, f"{path}.{k}")
    elif isinstance(obj, list):
        if obj and isinstance(obj[0], dict):
            keys = set(obj[0].keys())
            # Heuristic: a "card-like" object often has a name/title + some url/link
            if any(k in keys for k in ["name", "title"]) and any(k in keys for k in ["url", "href", "link", "ctaLink", "detailUrl"]):
                found.append((path, len(obj), sorted(list(keys))[:25]))
        for i, v in enumerate(obj[:3]):
            found += find_card_lists(v, f"{path}[{i}]")
    return found

candidates = find_card_lists(data)
for path, n, keys in candidates[:15]:
    print(path, n, keys)

cards = list(data.values())
df = pd.json_normalize(cards)

df.head(2)

Unnamed: 0,productName,productId,productPageUrl,offerCode,offerStartDate,fullCardImage,partialCardImage,cardImageAccessibilityCopy,welcomeOffer,annualFee,...,baseCardMid,baseCardLevelTwoPid,baseCardLevelTwoMid,upsellPid,upsellMid,minimumIndividualIncome,minimumHouseholdIncome,monthlySpend,upsellLevelTwoPid,upsellLevelTwoMid
0,<span role='text'><abbr>BMO</abbr> eclipse Vis...,VISDX,/main/personal/credit-cards/bmo-eclipse-visa-i...,RQTSX00008,20251031,/dist/images/personal/credit-cards/infinite/Np...,/dist/images/personal/credit-cards/card-art/bm...,b m o eclipse visa infinite card,<strong><badge text='New' styletype='badge-inl...,120.0,...,3930758,,,VISDY,6011141.0,60000,100000,1250,,
1,<span role='text'><abbr>BMO</abbr> Ascend Worl...,MXSDW,/main/personal/credit-cards/bmo-ascend-world-e...,RQTWE00019,20251031,/dist/images/personal/credit-cards/bmo-rewards...,/dist/images/personal/credit-cards/card-art/bm...,b m o ascend world elite mastercard,<badge text='New' styletype='badge-inline--ult...,150.0,...,3930760,,,,,80000,150000,2100,,


### Scraping the individual pages

In [13]:
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


BASE_URL = "https://www.bmo.com"
OUT_DIR = Path("bmo_card_text")
OUT_DIR.mkdir(exist_ok=True)


def slugify(s: str) -> str:
    s = str(s or "").lower().strip()
    s = re.sub(r"<[^>]+>", "", s)
    s = re.sub(r"[^a-z0-9]+", "_", s)
    return (s.strip("_")[:80] or "card")


def build_driver(headless: bool = True) -> webdriver.Chrome:
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1400,1000")
    opts.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    )
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )


def wait_ready(driver: webdriver.Chrome, timeout: int = 30) -> None:
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    time.sleep(0.8)


def clean_text(text: str) -> str:
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text


def normalize_url(product_page_url: str) -> str:
    if not product_page_url:
        return ""
    u = str(product_page_url).strip()
    if u.startswith("http://") or u.startswith("https://"):
        return u
    return urljoin(BASE_URL, u)


def find_panel_id_for_title(driver: webdriver.Chrome, title: str) -> str | None:
    """
    Find the tab *button/link* whose visible text contains the given title,
    then infer its panel id from aria-controls or href="#id".
    """
    title_l = title.lower()

    # Match *any* element whose text contains the title (case-insensitive)
    xpath = (
        "//*[contains(translate(normalize-space(.), "
        "'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), "
        f"'{title_l}')]"
    )

    candidates = driver.find_elements(By.XPATH, xpath)

    for el in candidates:
        # tab buttons normally have aria-controls pointing at the panel id
        aria = el.get_attribute("aria-controls")
        if aria:
            return aria

        href = el.get_attribute("href") or ""
        if "#tab-" in href:
            return href.split("#")[-1]

    return None


def get_page_and_tabs_text(driver: webdriver.Chrome, url: str) -> str:
    """
    Load the card page once, grab:
    - full body visible text
    - textContent of the 4 logical tab sections:
      Travel coverage / Extended insurance / Security features / Additional perks
    """
    driver.get(url)
    wait_ready(driver)

    # Trigger lazy loading by scrolling a bit
    for frac in (0.25, 0.6, 1.0):
        driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {frac});")
        time.sleep(0.6)

    # 1) Main visible page text
    body_text = driver.execute_script("return document.body.innerText || ''")
    body_text = clean_text(body_text)

    # 2) Tab sections by *semantic* title
    tab_titles = {
        "travel coverage": "TRAVEL COVERAGE",
        "extended insurance": "EXTENDED INSURANCE",
        "security features": "SECURITY FEATURES",
        "additional perks": "ADDITIONAL PERKS",
    }

    tab_sections: dict[str, str] = {}

    for key, label in tab_titles.items():
        panel_id = find_panel_id_for_title(driver, key)
        panel_text = ""

        if panel_id:
            try:
                panel_el = driver.find_element(By.ID, panel_id)
                raw = panel_el.get_attribute("textContent") or ""
                panel_text = clean_text(raw)
            except Exception:
                panel_text = ""

        tab_sections[label] = panel_text

    # Combine everything into one string
    parts = []
    parts.append(f"URL: {url}\n")
    parts.append("===== PAGE BODY (VISIBLE TEXT) =====\n")
    parts.append(body_text)

    for label, content in tab_sections.items():
        parts.append(f"\n\n===== {label} =====\n{content}")

    return "\n".join(parts).strip()


def save_text_file(text: str, filename_prefix: str) -> Path:
    path = OUT_DIR / f"{filename_prefix}.txt"
    path.write_text(text, encoding="utf-8")
    return path


def scrape_all_cards_from_df(df: pd.DataFrame, limit: int | None = None, headless: bool = True) -> pd.DataFrame:
    required = {"productId", "productName", "productPageUrl"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"df is missing required columns: {sorted(missing)}")

    driver = build_driver(headless=headless)
    results = []

    try:
        rows = df if limit is None else df.head(limit)

        for _, row in rows.iterrows():
            pid = str(row.get("productId", "")).strip()
            name = row.get("productName", pid)
            page_url = normalize_url(row.get("productPageUrl", ""))

            if not page_url:
                continue

            prefix = f"{pid}_{slugify(name)}"

            try:
                text = get_page_and_tabs_text(driver, page_url)
                out_path = save_text_file(text, prefix)

                results.append(
                    {"productId": pid, "url": page_url, "chars": len(text), "file": str(out_path), "status": "OK"}
                )
                print(f"[OK] {pid} → {len(text)} chars → {out_path.name}")

            except Exception as e:
                results.append(
                    {
                        "productId": pid,
                        "url": page_url,
                        "chars": 0,
                        "file": "",
                        "status": f"ERROR: {type(e).__name__}: {e}",
                    }
                )
                print(f"[FAIL] {pid} {page_url} → {type(e).__name__}: {e}")

            time.sleep(1.2)

    finally:
        driver.quit()

    return pd.DataFrame(results)


# ---- RUN IT (start with a few) ----
results_df = scrape_all_cards_from_df(df, headless=True)
results_df


[OK] VISDX → 11841 chars → VISDX_bmo_eclipse_visa_infinite_card.txt
[OK] MXSDW → 12955 chars → MXSDW_bmo_ascend_world_elite_mastercard.txt
[OK] VISDY → 13934 chars → VISDY_bmo_eclipse_visa_infinite_privilege_card.txt
[OK] MXSDE → 11471 chars → MXSDE_bmo_cashback_world_nbsp_elite_mastercard.txt
[OK] MXSDT → 12232 chars → MXSDT_bmo_air_nbsp_miles_world_nbsp_elite_mastercard.txt
[OK] MCSDF → 8427 chars → MCSDF_bmo_preferred_rate_mastercard.txt
[OK] MCSDZ → 9756 chars → MCSDZ_bmo_cashback_mastercard.txt
[OK] MCRPZ → 9822 chars → MCRPZ_student_bmo_cashback_mastercard.txt
[OK] VPVDM → 10632 chars → VPVDM_bmo_eclipse_rise_visa_card.txt
[OK] MCSDS → 10308 chars → MCSDS_bmo_air_nbsp_miles_mastercard.txt
[OK] MCSPN → 5135 chars → MCSPN_bmo_prepaid_mastercard.txt
[OK] MCRPS → 8461 chars → MCRPS_student_bmo_air_nbsp_miles_mastercard.txt
[OK] MXROL → 7529 chars → MXROL_bmo_viporter_world_elite_mastercard.txt
[OK] MWROK → 7147 chars → MWROK_bmo_viporter_world_mastercard_card.txt
[OK] MCROQ → 7225 ch

Unnamed: 0,productId,url,chars,file,status
0,VISDX,https://www.bmo.com/main/personal/credit-cards...,11841,bmo_card_text/VISDX_bmo_eclipse_visa_infinite_...,OK
1,MXSDW,https://www.bmo.com/main/personal/credit-cards...,12955,bmo_card_text/MXSDW_bmo_ascend_world_elite_mas...,OK
2,VISDY,https://www.bmo.com/main/personal/credit-cards...,13934,bmo_card_text/VISDY_bmo_eclipse_visa_infinite_...,OK
3,MXSDE,https://www.bmo.com/main/personal/credit-cards...,11471,bmo_card_text/MXSDE_bmo_cashback_world_nbsp_el...,OK
4,MXSDT,https://www.bmo.com/main/personal/credit-cards...,12232,bmo_card_text/MXSDT_bmo_air_nbsp_miles_world_n...,OK
5,MCSDF,https://www.bmo.com/main/personal/credit-cards...,8427,bmo_card_text/MCSDF_bmo_preferred_rate_masterc...,OK
6,MCSDZ,https://www.bmo.com/main/personal/credit-cards...,9756,bmo_card_text/MCSDZ_bmo_cashback_mastercard.txt,OK
7,MCRPZ,https://www.bmo.com/main/personal/credit-cards...,9822,bmo_card_text/MCRPZ_student_bmo_cashback_maste...,OK
8,VPVDM,https://www.bmo.com/en-ca/main/personal/credit...,10632,bmo_card_text/VPVDM_bmo_eclipse_rise_visa_card...,OK
9,MCSDS,https://www.bmo.com/main/personal/credit-cards...,10308,bmo_card_text/MCSDS_bmo_air_nbsp_miles_masterc...,OK
