### Scraping BMO Credit Card Homepage Info

In [None]:
import requests
import pandas as pd
import bs4 as beautifulsoup4

# JSON endpoint queried for the BMO credit-card data, and the headers sent with it.
API_URL = "https://www.bmo.com/public-data/api/v2.0/bmo-ca-credit-cards-en.json"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Fail fast on a 4xx/5xx answer instead of letting an error page reach .json(),
# where it would surface as a confusing decode error or a bogus payload.
response = requests.get(API_URL, headers=HEADERS, timeout=30)
response.raise_for_status()
data = response.json()

# 1) Find the list of cards (quick helper)
def find_card_lists(obj, path="root"):
    """Recursively search a decoded JSON structure for "card-like" lists.

    A list qualifies when its first element is a dict that has a name/title
    key and also one of the known link keys. Only the first element is
    inspected for the heuristic, and only the first three elements of any
    list are descended into.

    Args:
        obj: Any decoded JSON value (dict, list, or scalar).
        path: Dotted/indexed location of ``obj``, used to label matches.

    Returns:
        A list of ``(path, list_length, sorted_keys)`` tuples, where
        ``sorted_keys`` holds at most 25 keys of the list's first element.
    """
    if isinstance(obj, dict):
        matches = []
        for key, child in obj.items():
            matches.extend(find_card_lists(child, f"{path}.{key}"))
        return matches

    if not isinstance(obj, list):
        # Scalars cannot contain card lists.
        return []

    matches = []
    head = obj[0] if obj else None
    if isinstance(head, dict):
        field_names = set(head)
        # Heuristic: a "card-like" object often has a name/title + some url/link
        has_title = not field_names.isdisjoint(("name", "title"))
        has_link = not field_names.isdisjoint(
            ("url", "href", "link", "ctaLink", "detailUrl")
        )
        if has_title and has_link:
            matches.append((path, len(obj), sorted(field_names)[:25]))

    # Sample only the first few elements to keep the search cheap.
    for index, child in enumerate(obj[:3]):
        matches.extend(find_card_lists(child, f"{path}[{index}]"))
    return matches

# Run the heuristic search over the payload and print up to the first 15 hits
# as: path, list length, sample of the first element's keys.
candidates = find_card_lists(data)
for path, n, keys in candidates[:15]:
    print(path, n, keys)

# Flatten the payload's top-level values into a DataFrame.
# NOTE(review): this assumes `data` is a dict whose values are the card
# records; the candidates printed above are not used to choose the list — confirm.
cards = list(data.values())
df = pd.json_normalize(cards)

### Scraping the individual pages

In [None]:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Card detail page loaded by the scraping code below.
URL = "https://www.bmo.com/main/personal/credit-cards/bmo-eclipse-visa-infinite/"

# Command-line arguments for Chrome; the last one sets the browser user agent.
opts = Options()
for chrome_arg in (
    "--headless=new",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    (
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
):
    opts.add_argument(chrome_arg)

# webdriver_manager supplies the chromedriver that the Service wraps.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=opts)

# Shared explicit-wait helper (timeout argument: 30) used by the lookups below.
wait = WebDriverWait(driver, 30)

# Flags text that contains "$" or a percentage such as "21.99%".
VALUE_RE = re.compile(r"(\$|\d+(\.\d+)?%)")

def pick_value_and_label_from_block(block):
    """
    Extract a (label, value) pair from the <p> elements inside one block.

    In these BMO blocks, the clean value is usually in a <p> like "$0" or "21.99%".
    The label is often the next <p> like "annual fee8" or "for purchases8".

    Args:
        block: An element exposing ``find_elements(By.CSS_SELECTOR, ...)``
            whose results carry a ``.text`` attribute.

    Returns:
        ``(label, value, texts)`` where ``value`` is the first non-empty <p>
        text matching ``VALUE_RE`` (or None), ``label`` is the text right
        after it — falling back to the first text that does not match
        ``VALUE_RE`` — (or None), and ``texts`` is every non-empty, stripped
        <p> text in document order.
    """
    # Read .text exactly once per element: each access is a separate
    # WebDriver command, so re-reading it triples the round trips.
    texts = []
    for p in block.find_elements(By.CSS_SELECTOR, "p"):
        text = (p.text or "").strip()
        if text:
            texts.append(text)

    # Index of the first <p> that looks like a value, if any.
    value_idx = next(
        (i for i, t in enumerate(texts) if VALUE_RE.search(t)),
        None,
    )
    value = texts[value_idx] if value_idx is not None else None

    if value_idx is not None and value_idx + 1 < len(texts):
        # Label is usually the next <p> after the value.
        label = texts[value_idx + 1]
    else:
        # Fallback: first non-value text as label (None when there is none).
        label = next((t for t in texts if not VALUE_RE.search(t)), None)

    return label, value, texts

def find_block_by_suffix(suffix):
    """Wait for and return a <div> whose id attribute ends with ``suffix``.

    Uses the module-level ``wait`` object; whatever ``wait.until`` raises
    when no such element becomes present propagates to the caller.
    """
    locator = (By.CSS_SELECTOR, f'div[id$="{suffix}"]')
    element_present = EC.presence_of_element_located(locator)
    return wait.until(element_present)

def scrape_field(suffix):
    """Scrape one label/value pair from the block whose id ends with ``suffix``.

    Best-effort: any exception raised while locating or reading the block is
    reported with a [WARN] line and turned into ``(None, None)``. When the
    block is found but the label or value is missing, a [DEBUG] dump of the
    block's <p> texts is printed and the partial result is still returned.

    Returns:
        ``(label, value)``; either item may be None.
    """
    try:
        label, value, texts = pick_value_and_label_from_block(
            find_block_by_suffix(suffix)
        )

        incomplete = label is None or value is None
        if incomplete:
            print(f"\n[DEBUG] {suffix} extraction incomplete")
            print("All <p> texts:", texts)

        result = (label, value)
    except Exception as e:
        print(f"\n[WARN] Could not find/scrape block {suffix}: {e}")
        result = (None, None)
    return result

try:
    driver.get(URL)

    # Fee and purchase-rate blocks are addressed by the suffix of their div id.
    annual_label, annual_value = scrape_field("--annual-fee")
    purchase_label, purchase_value = scrape_field("--purchase-rate")

    # The cash-advance suffix may differ on this page, so try the known
    # spellings in order and keep the first one that yields a value.
    for cash_suffix in ("--cash-advance-rate", "--cashAdvance-rate"):
        cash_label, cash_value = scrape_field(cash_suffix)
        if cash_value is not None:
            break

    qc_label, qc_value = scrape_field("--quebec-rate")

    # Still nothing for cash advance: list ids that might be the right block
    # so the correct suffix can be picked by hand.
    if cash_value is None:
        print("\n[DEBUG] Cash advance block suffix not found. Candidate IDs containing 'cash' or 'advance':")
        candidates = driver.find_elements(By.CSS_SELECTOR, 'div[id*="cash"], div[id*="advance"]')
        for el in candidates[:50]:
            _id = el.get_attribute("id")
            if not _id:
                continue
            print(_id)

    print("\n====== FINAL RESULTS ======")
    report_rows = (
        ("annual_fee:", annual_label, annual_value),
        ("purchase_rate:", purchase_label, purchase_value),
        ("cash_advance_rate:", cash_label, cash_value),
        ("quebec_cash_advance_rate:", qc_label, qc_value),
    )
    for heading, row_label, row_value in report_rows:
        print(heading, row_label, "=>", row_value)

finally:
    # Always shut the browser down, even if scraping failed part-way.
    driver.quit()


[DEBUG] --annual-fee extraction incomplete
All <p> texts: ['$0']

[WARN] Could not find/scrape block --cash-advance-rate: Message: 
Stacktrace:
0   chromedriver                        0x0000000100b47dfc cxxbridge1$str$ptr + 3031016
1   chromedriver                        0x0000000100b3fcb8 cxxbridge1$str$ptr + 2997924
2   chromedriver                        0x000000010063ab90 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 74192
3   chromedriver                        0x0000000100681ab4 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 364788
4   chromedriver                        0x00000001006c2a28 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 630888
5   chromedriver                        0x000000010067622c _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 317548
6   chromedriver                        0x0000000100b0c194 cxxbridge1$str$ptr + 2786176
7   chromedriver                        0x0000000100b0f9