In [1]:
import asyncio
from pyppeteer import launch
import json
import re
from urllib.parse import urljoin
import requests
import re

In [10]:
import asyncio
from pyppeteer import launch

async def get_autocomplete_links(base_url, search_term):
    """Return the links offered by the search autocomplete for one term.

    Launches a browser, opens ``base_url``, types ``search_term`` into the
    search input and reads the ``href`` of every suggestion anchor once the
    suggestion list has appeared.

    Args:
        base_url: URL of the page that hosts the search bar.
        search_term: Text to type into the search bar.

    Returns:
        A list with the ``href`` of each matched suggestion anchor.

    Raises:
        Any exception raised by pyppeteer while navigating, typing or
        waiting for suggestions (e.g. when no suggestion shows up before
        the timeout). The browser is closed before the exception propagates.
    """
    suggestion_selector = 'ul.m-autocomplete__results li a'

    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(base_url)

        # Type the search term into the search bar
        await page.type('#o-search-bar_query', search_term)

        # Wait for suggestions
        await page.waitForSelector(suggestion_selector, timeout=20000)

        # Extract links
        return await page.querySelectorAllEval(
            suggestion_selector,
            'links => links.map(link => link.href)'
        )
    finally:
        # Always release the browser, even when a step above raised;
        # otherwise every failed term leaves a browser process behind.
        await browser.close()

# Execute link collection
base_url = "https://www.consumerfinance.gov/ask-cfpb/"
search_terms = [
    "consumer", "auto", "bank", "mortgage", "loan", "debt", "credit", "report",
    "fraud", "scam", "complaint", "card", "finance", "interest", "fees",
    "insurance", "payments", "foreclosure", "bankruptcy", "rights", "dispute",
    "settlement", "identity theft", "protection", "policy", "lawsuit",
    "small business", "regulations", "penalty", "lending", "budgeting",
    "fair lending", "student loans", "housing", "renters", "leasing",
    "equity", "APR", "terms", "conditions", "compliance", "assistance",
    "relief", "subsidy", "savings", "overdraft", "digital banking",
    "fintech", "crypto", "mobile banking", "interest rates", "investment",
    "retirement", "scoring", "collections", "repossession", "legal"
]
all_links = []

for term in search_terms:
    links = await get_autocomplete_links(base_url, term)
    all_links.extend(links)

unique_links = list(set(all_links))
print(f"Collected {len(unique_links)} unique links")
for urls in unique_links:
    print(str(urls))

Collected 455 unique links
https://www.consumerfinance.gov/ask-cfpb/can-i-withdraw-money-from-my-credit-card-at-an-atm-en-34/
https://www.consumerfinance.gov/ask-cfpb/what-is-a-loan-estimate-en-1995/
https://www.consumerfinance.gov/ask-cfpb/how-can-i-tell-if-i-am-working-with-a-mortgage-broker-or-a-mortgage-lender-en-131/
https://www.consumerfinance.gov/ask-cfpb/how-quickly-can-i-get-money-after-i-deposit-a-check-into-my-checking-account-what-is-a-deposit-hold-en-1023/
https://www.consumerfinance.gov/ask-cfpb/where-can-i-file-a-financial-aid-or-student-loan-complaint-en-2088/
https://www.consumerfinance.gov/ask-cfpb/how-can-i-find-an-attorney-who-specializes-in-elder-law-en-1159/
https://www.consumerfinance.gov/ask-cfpb/how-long-can-the-card-issuer-take-to-resolve-my-billing-error-or-dispute-en-64/
https://www.consumerfinance.gov/ask-cfpb/what-happens-my-reverse-mortgage-when-i-die-en-2096/
https://www.consumerfinance.gov/ask-cfpb/what-happens-if-my-payment-app-has-an-outage-and-i-cant

In [11]:
async def scrape_page(url):
    """Scrape one page and return its key fields as a dict.

    Launches a browser, opens ``url`` and reads the text content of six
    elements located by fixed CSS selectors.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with the keys ``url``, ``topic``, ``title``,
        ``last_reviewed``, ``language``, ``main_content`` and
        ``key_message``.

    Raises:
        Any exception raised by pyppeteer while navigating or evaluating a
        selector. The browser is closed before the exception propagates.
    """
    # JS snippet shared by every field that only needs its trimmed text.
    trimmed_text = 'el => el.textContent.trim()'

    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)

        # Extract data
        title = await page.querySelectorEval('#main > div > div.u-layout-grid__main > div.block.block--flush-top.block--sub > h1', trimmed_text)
        topic = await page.querySelectorEval('#main > div > div.u-layout-grid__breadcrumbs > nav > a', trimmed_text)
        # The <time> text carries a "last reviewed:" label that is stripped here.
        last_reviewed = await page.querySelectorEval('#main > div > div.u-layout-grid__main > div.block.block--flush-top.block--sub > time', 'el => el.textContent.replace("last reviewed:", "").trim()')
        language = await page.querySelectorEval('#main > div > div.u-layout-grid__main > div.block.block--flush-top.block--sub > div > ul > li:nth-child(1)', trimmed_text)
        main_content = await page.querySelectorEval('#main > div > div.u-layout-grid__main > div:nth-child(2) > div.block.block--sub > div.o-summary.o-summary--mobile > div > div', trimmed_text)
        key_message = await page.querySelectorEval('#main > div > div.u-layout-grid__main > div:nth-child(2) > div.lead-paragraph > p', trimmed_text)
    finally:
        # Close the browser on every path; a page missing any selector raises
        # above, and without this each such page would leave a browser running.
        await browser.close()

    return {
        'url': url,
        'topic': topic,
        'title': title,
        'last_reviewed': last_reviewed,
        'language': language,
        'main_content': main_content,
        'key_message': key_message
    }

In [12]:
async def main(urls=None, output_dir='CFPB_json_outputs'):
    """Scrape every URL and write one JSON file per page.

    Each page is scraped with ``scrape_page``; the resulting dict is written
    to ``<output_dir>/<name>.json`` where ``<name>`` is the URL with every run
    of non-word characters replaced by ``_`` and the Ask CFPB prefix removed.
    A page that fails is reported on stdout and skipped.

    Args:
        urls: Iterable of page URLs with a length. Defaults to the
            module-level ``unique_links`` when omitted.
        output_dir: Directory that receives the JSON files; created if it
            does not exist.
    """
    import os

    if urls is None:
        # Resolved at call time so the default follows the notebook's current list.
        urls = unique_links

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    print(f"Starting to scrape {len(urls)} pages")
    for url in urls:
        try:
            data = await scrape_page(url)
            filename = re.sub(r'[^\w\d]+', '_', url).replace('https_www_consumerfinance_gov_ask_cfpb_', '') + '.json'
            # Add directory to file path
            filepath = os.path.join(output_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                json.dump(data, f, ensure_ascii=False, indent=4)
            print(f"Saved: {filepath}")
        except Exception as e:
            # Best effort: report the page and carry on with the rest.
            print(f"Failed {url}: {str(e)}")
# Run the main function to completion when this code is executed directly.
# NOTE(review): nest_asyncio is presumably applied because the notebook kernel
# already has an event loop running, which asyncio.run() would otherwise
# refuse to nest inside; confirm before removing it.
if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    asyncio.run(main())

Starting to scrape 455 pages
Saved: CFPB_json_outputs/can_i_withdraw_money_from_my_credit_card_at_an_atm_en_34_.json
Saved: CFPB_json_outputs/what_is_a_loan_estimate_en_1995_.json
Saved: CFPB_json_outputs/how_can_i_tell_if_i_am_working_with_a_mortgage_broker_or_a_mortgage_lender_en_131_.json
Saved: CFPB_json_outputs/how_quickly_can_i_get_money_after_i_deposit_a_check_into_my_checking_account_what_is_a_deposit_hold_en_1023_.json
Saved: CFPB_json_outputs/where_can_i_file_a_financial_aid_or_student_loan_complaint_en_2088_.json
Saved: CFPB_json_outputs/how_can_i_find_an_attorney_who_specializes_in_elder_law_en_1159_.json
Saved: CFPB_json_outputs/how_long_can_the_card_issuer_take_to_resolve_my_billing_error_or_dispute_en_64_.json
Saved: CFPB_json_outputs/what_happens_my_reverse_mortgage_when_i_die_en_2096_.json
Saved: CFPB_json_outputs/what_happens_if_my_payment_app_has_an_outage_and_i_cant_access_my_account_en_2145_.json
Saved: CFPB_json_outputs/how_do_i_avoid_atm_fees_en_981_.json
Saved: 