In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import numpy as np
import re
from urllib.parse import urlparse
import pyarrow

import os
from pathlib import Path
# Sets a USER_AGENT environment variable for this process.
# NOTE(review): nothing in the visible cells reads this variable, and the
# requests.get calls below pass no headers - presumably it is consumed by a
# library used later in the notebook; confirm.
os.environ['USER_AGENT'] = 'myagent'

# Working directory for the notebook: the relative paths used by later cells
# ('scraped data files', 'pdf scrape reports/') resolve against it.
# NOTE(review): the path below is absolute and machine-specific - replace it
# with your own location, e.g.:
#os.chdir("/YOUR PATH HERE")
os.chdir("/Users/roamingt/Documents/Medium Posts")
# The same directory captured twice: once as a pathlib.Path, once as a str.
cwd = Path.cwd()
notebook_path = os.getcwd()


In [4]:
"""
List of publications:

https://www.consumerfinance.gov/compliance/supervisory-highlights/
"""
# ---- SCRAPE THE CFPB Supervisory Highlights DATA ----
# Classify the CFPB links that are scraped into categories to easily manipulate them
# PDF is for links that can be hit directly
# RELATIVE_PDF is for links that need the site added
# LANDING_PAGE is for the one useful link to the most recent file page that contains the actual link
# IGNORE and UNKNOWN are what they say on the tin

def classify_link(link):
    """Bucket a scraped href into one of the link categories.

    Args:
        link: The raw ``href`` value taken from an anchor tag (may be empty
            or None).

    Returns:
        One of 'IGNORE', 'PDF', 'RELATIVE_PDF', 'LANDING_PAGE' or 'UNKNOWN'.
    """
    # Nothing to classify.
    if not link:
        return 'IGNORE'

    # "View More" style navigation links carry no report.
    if any(marker in link for marker in ('activity-log', 'view-more')):
        return 'IGNORE'

    # The suffix check is case-sensitive, so '.PDF' does not count here.
    is_pdf = link.endswith('.pdf')

    # Absolute link to the file host: usable as is.
    if is_pdf and link.startswith('https://files.consumerfinance.gov/'):
        return 'PDF'

    # Site-relative PDF: needs the site prepended before it can be fetched.
    if is_pdf and link.startswith('/documents/'):
        return 'RELATIVE_PDF'

    # Report landing page: not a PDF itself, has to be scraped for one.
    if link.startswith('/data-research/research-reports/'):
        return 'LANDING_PAGE'

    return 'UNKNOWN'

# For PDFs, we can use links as is. For RELATIVE_PDF & LANDING_PAGE we need to prepend the site's scheme and host (https://www.consumerfinance.gov)
def normalize_link(link, link_type):
    """Turn a classified href into a URL that can be requested directly.

    Args:
        link: The raw href.
        link_type: The category assigned to ``link`` ('PDF', 'RELATIVE_PDF',
            'LANDING_PAGE', ...).

    Returns:
        ``link`` unchanged for 'PDF'; ``link`` prefixed with
        'https://www.consumerfinance.gov' for 'RELATIVE_PDF' and
        'LANDING_PAGE'; None for every other type.
    """
    # Already absolute.
    if link_type == 'PDF':
        return link

    # Site-relative paths get the site prepended.
    needs_site_prefix = link_type == 'RELATIVE_PDF' or link_type == 'LANDING_PAGE'
    if needs_site_prefix:
        return "https://www.consumerfinance.gov{}".format(link)

    return None


# For PDF links, we want the actual file name
def extract_filename_from_link(url):
    """Return the last path segment of ``url`` as a file name.

    Only the path component of the URL is considered. When that path has no
    final segment (empty path or trailing slash), the placeholder name
    'downloaded_report.pdf' is returned instead.
    """
    url_path = urlparse(url).path
    # basename() yields '' for an empty path or one ending in '/'.
    return os.path.basename(url_path) or 'downloaded_report.pdf'


# This does the actual downloading of the PDFs
def download_pdf(file_url, save_path):
    """Fetch ``file_url`` and write the response body to ``save_path``.

    Args:
        file_url: URL to request (15 second timeout).
        save_path: Local path the response bytes are written to.

    Returns:
        True when the server answered 200 and the file was written;
        False on any other status code or when requests raised a
        RequestException. The outcome is also printed.
    """
    try:
        reply = requests.get(file_url, timeout=15)

        # Anything other than 200 is treated as a failed download.
        if reply.status_code != 200:
            print(f"[FAILED] Download failed: {file_url} | Status: {reply.status_code}")
            return False

        with open(save_path, 'wb') as pdf_file:
            pdf_file.write(reply.content)
        print(f"[DOWNLOADED] {save_path}")
        return True

    except requests.RequestException as err:
        print(f"[ERROR] Exception occurred for {file_url}: {err}")
        return False

# Currently we only have one landing page link, so we could hard code this, but other regulatory bodies might have similar headaches
def get_pdf_link_from_report_page(report_page_url):
    """Scrape a report landing page and return the first direct PDF link on it.

    Args:
        report_page_url: Absolute URL of the landing page to fetch.

    Returns:
        The URL of the first ``<a class="a-link" href=...>`` whose href ends
        in '.pdf' (case-insensitive), made absolute when it starts with '/'.
        None when the page could not be fetched, did not answer 200, or
        contains no such link.
    """
    print(f"Scraping PDF link from: {report_page_url}")

    try:
        # Same timeout as download_pdf: without one, requests can wait
        # indefinitely on an unresponsive server.
        response = requests.get(report_page_url, timeout=15)
    except requests.RequestException as e:
        # One unreachable landing page should not abort the whole scrape.
        print(f"[ERROR] Exception occurred for {report_page_url}: {e}")
        return None

    if response.status_code != 200:
        # Do not parse error pages: any PDF link on them is not the report.
        print(f"[FAILED] Could not load report page: {report_page_url} | Status: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <a> tags with class 'a-link' and an href
    for link_tag in soup.find_all('a', class_='a-link', href=True):
        href = link_tag['href']

        # Only direct PDF links are of interest
        if href.lower().endswith('.pdf'):
            # Site-relative link: prepend the site to make it absolute
            if href.startswith('/'):
                href = 'https://www.consumerfinance.gov' + href

            print(f"Found PDF link: {href}")
            return href

    # No PDF link on the page
    return None

# We're checking if a file to be downloaded already exists locally in our target folder
#      If it does NOT exist, we download and return the save path
#      If it DOES exist, we return the save path of the existing file
def process_download(normalized_link, download_folder):
    """Make sure the PDF behind ``normalized_link`` exists in ``download_folder``.

    A link ending in '.pdf' (case-insensitive) is downloaded directly. Any
    other link is treated as a landing page and scraped for a PDF link first.
    The local file name is taken from the PDF URL; if a file with that name
    already exists in ``download_folder`` nothing is downloaded.

    Args:
        normalized_link: Absolute URL of a PDF or of a report landing page.
        download_folder: Directory the PDF is saved into.

    Returns:
        The local path of the existing or freshly downloaded file, or None
        when no PDF link could be found or the download failed.
    """
    is_direct_pdf = normalized_link.lower().endswith('.pdf')

    # Step 1: resolve the actual PDF URL (landing pages must be scraped).
    if is_direct_pdf:
        pdf_link = normalized_link
    else:
        pdf_link = get_pdf_link_from_report_page(normalized_link)

    if not pdf_link:
        print(f"[FAILED] No PDF found or downloaded for: {normalized_link}")
        return None

    filename = extract_filename_from_link(pdf_link)
    save_path = os.path.join(download_folder, filename)

    # Step 2: skip the download when the file is already present locally.
    if os.path.isfile(save_path):
        origin_note = "" if is_direct_pdf else " (scraped)"
        print(f"[SKIP] File already exists locally{origin_note}: {save_path}")
        return save_path

    # Step 3: download; both link kinds report the same os.path.join path.
    if download_pdf(pdf_link, save_path):
        return save_path

    if not is_direct_pdf:
        print(f"[FAILED] No PDF found or downloaded for: {normalized_link}")
    return None

# Get the most recent mod date of a file
def get_file_modification_date(file_path):
    """Return the modification date of ``file_path`` as a 'YYYYMMDD' string.

    The modification timestamp is converted with ``datetime.fromtimestamp``
    (local time). Any error while reading it is printed and None is
    returned instead.
    """
    try:
        # mtime (Unix timestamp) -> datetime -> 'YYYYMMDD'
        return datetime.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y%m%d')
    except Exception as err:
        print(f"[ERROR] Could not read mod date for {file_path}: {err}")
        return None


In [6]:
# Local folder the downloaded PDFs are stored in (relative to the working directory)
local_download_folder = 'scraped data files'
os.makedirs(local_download_folder, exist_ok=True)

url = "https://www.consumerfinance.gov/compliance/supervisory-highlights/"
# Same 15 second timeout as download_pdf: without one, requests can wait
# indefinitely on an unresponsive server.
response = requests.get(url, timeout=15)
# response.raise_for_status() is deliberately not used; a bad status is
# reported instead so the rest of the cell still runs.
if response.status_code != 200:
    print(f"[WARN] Index page returned status {response.status_code}: {url}")
soup = BeautifulSoup(response.text, 'html.parser')

# The report links on the index page all carry this class combination
# (found by inspecting the page).
link_elements = soup.select('a.a-link.a-link--jump')

# One dict per scraped link; converted to a DataFrame after the loop.
data = []


for link in link_elements:
    title_element = link.select_one('span.a-link__text')

    if not title_element:
        continue  # Skip anything without a proper title

    # Title text and raw href of the anchor
    title = title_element.get_text(strip=True)
    # .get() rather than link['href']: the selector does not require an href
    # attribute, and indexing a missing attribute raises KeyError. A missing
    # href comes back as None, which classify_link maps to 'IGNORE'.
    href = link.get('href')

    # Classify the link type
    link_type = classify_link(href)

    # SKIP garbage links immediately
    if link_type == 'IGNORE':
        print(f"[SKIP] Ignored link: {href}")
        continue

    # Normalize link (None for link types that cannot be fetched)
    normalized_link = normalize_link(href, link_type)
    print(link_type, normalized_link)

    if normalized_link:
        downloaded_filename = process_download(normalized_link, local_download_folder)
    else:
        downloaded_filename = None

    # File location and scrape date (YYYYMMDD) stay empty unless a local file
    # is available for this link.
    # NOTE: scrape_date keeps the value from the last iteration once the
    # loop has finished.
    file_loc = ''
    scrape_date = ''
    if downloaded_filename:
        file_loc = downloaded_filename
        scrape_date = datetime.today().strftime('%Y%m%d')
    elif normalized_link:
        print(f"[WARN] No file downloaded for link: {normalized_link}")

    # Extract year and season from the title
    year_match = re.search(r'\b(20\d{2})\b', title)
    year = year_match.group(1) if year_match else 'Unknown'

    season_match = re.search(r'(Winter|Spring|Summer|Fall)', title, re.IGNORECASE)
    season = season_match.group(1).capitalize() if season_match else 'Unknown'
    clean_filename = title.replace('Download ', '')

    # Append the cleaned data row
    data.append({
        'Title': clean_filename,
        'Link': href,
        'Link Type': link_type,
        'Normalized Link': normalized_link,
        'Year': year,
        'Season': season,
        'Agency': 'CFPB',
        'File Location': file_loc,     # Local path of the PDF, '' if none
        'Drive Link': '',              # Will fill this later in upload
        'Summarized?': False,          # Will update post-summarization
        'Summary': '',
        'Scraped Date': scrape_date
    })

# Convert to DataFrame (single pass complete)
df = pd.DataFrame(data)

print(f"[INFO] Scraped {len(df)} records")


LANDING_PAGE https://www.consumerfinance.gov/data-research/research-reports/supervisory-highlights-advanced-technologies-special-edition-issue-38-winter-2025/
Scraping PDF link from: https://www.consumerfinance.gov/data-research/research-reports/supervisory-highlights-advanced-technologies-special-edition-issue-38-winter-2025/
Found PDF link: https://files.consumerfinance.gov/f/documents/cfpb_supervisory-highlights-advanced-technologies_2025-01.pdf
[DOWNLOADED] scraped data files/cfpb_supervisory-highlights-advanced-technologies_2025-01.pdf
PDF https://files.consumerfinance.gov/f/documents/cfpb_supervisory-highlights-special-ed-student-lending-issue-36-winter_2024-12.pdf
[DOWNLOADED] scraped data files/cfpb_supervisory-highlights-special-ed-student-lending-issue-36-winter_2024-12.pdf
PDF https://files.consumerfinance.gov/f/documents/cfpb_Supervisory-Highlights-Issue-37_Winter-2024.pdf
[DOWNLOADED] scraped data files/cfpb_Supervisory-Highlights-Issue-37_Winter-2024.pdf
PDF https://files

In [8]:
# Save to parquet with pyarrow
data_summary_dir = 'pdf scrape reports/'
# to_parquet does not create missing directories, so make sure it exists.
os.makedirs(data_summary_dir, exist_ok=True)

# Stamp the report with the run date (YYYYMMDD). The loop variable
# `scrape_date` from the scraping cell is '' whenever the last scraped link
# produced no file, which would name the report 'scraped_data_.parquet'.
report_date = datetime.today().strftime('%Y%m%d')
scraped_data_name = data_summary_dir+'scraped_data_'+report_date+'.parquet'
df.to_parquet(scraped_data_name, engine='pyarrow', compression='snappy')

# Read it back
df = pd.read_parquet(scraped_data_name, engine='pyarrow')
print(df)

                                                Title  \
0   Advanced Technologies Special Edition, Issue 3...   
1   Supervisory Highlights: Special Edition Studen...   
2      Supervisory Highlights, Issue 37 (Winter 2024)   
3   Supervisory Highlights: Auto Finance Special E...   
4   Supervisory Highlights: Servicing and Collecti...   
5       Supervisory Highlights, Issue 32, Spring 2024   
6       Supervisory Highlights, Issue 33, Spring 2024   
7   Supervisory Highlights Junk Fees Update Specia...   
8                  Summer 2023 Supervisory Highlights   
9    Supervisory Highlights Special Edition Junk Fees   
10                   Fall 2022 Supervisory Highlights   
11  Supervisory Highlights Student Loan Servicing ...   
12                 Spring 2022 Supervisory Highlights   
13                   Fall 2021 Supervisory Highlights   
14                 Summer 2021 Supervisory Highlights   
15  COVID-19 Prioritized Assessments Special Editi...   
16                 Winter 2020 