In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import numpy as np
import re

import os
from pathlib import Path
# Set a USER_AGENT environment variable for this process.
# NOTE(review): the requests.get(...) calls visible in this notebook pass no
# headers, so this variable is presumably meant for some other library --
# confirm it is actually consumed somewhere.
os.environ['USER_AGENT'] = 'myagent'

# Placeholder path: replace it with a real project directory before running.
# Relative paths used later (e.g. the 'downloads' folder) resolve against it.
os.chdir("/YOUR PATH HERE")
# Both names hold the working directory after the chdir above:
# `cwd` as a pathlib.Path, `notebook_path` as a plain string.
cwd = Path.cwd()
notebook_path = os.getcwd()

In [4]:
"""
List of publications:
CFPB Supervisory Highlights -- target of this program
https://www.consumerfinance.gov/compliance/supervisory-highlights/
"""


'\nList of publications:\nCFPB Supervisory Highlights -- target of this program\nhttps://www.consumerfinance.gov/compliance/supervisory-highlights/\n'

In [6]:
# ---- SCRAPE THE CFPB Supervisory Highlights DATA ----
# Fetches the Supervisory Highlights index page and builds `df`, one row per
# publication link, with the year and season parsed out of the link title.

url = "https://www.consumerfinance.gov/compliance/supervisory-highlights/"
# requests has no default timeout; without one a stalled server would hang
# this cell indefinitely.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# We only keep anchors matching this selector; they are the elements we
# expect to carry the publication title and URL.
link_elements = soup.select('a.a-link.a-link--jump')

# Rows are collected as dicts and turned into a DataFrame once at the end.
data = []

# Current scrape date (local time), recorded on every row.
scrape_date = datetime.now().strftime('%Y-%m-%d')

# Loop through each link and extract fields
for link in link_elements:
    title_element = link.select_one('span.a-link__text')
    href = link.get('href')

    # Skip anchors that lack the title span or have no usable href.
    # link.get('href') returns None for an anchor without that attribute,
    # which would otherwise crash on href.startswith below.
    if title_element is None or not href:
        continue

    title = title_element.get_text(strip=True)

    # Make site-relative links absolute.
    if href.startswith('/'):
        href = 'https://www.consumerfinance.gov' + href

    # Extract Year and Season from the title; 'Unknown' when absent.
    year_match = re.search(r'\b(20\d{2})\b', title)
    year = year_match.group(1) if year_match else 'Unknown'

    season_match = re.search(r'(Winter|Spring|Summer|Fall)', title, re.IGNORECASE)
    season = season_match.group(1).capitalize() if season_match else 'Unknown'

    # Prepare the row of data
    data.append({
        'Title': title,
        'Link': href,
        'Year': year,
        'Season': season,
        'Agency': 'CFPB',
        'File Location': '',           # We'll fill this later
        'Summarized?': False,          # Default to FALSE
        'Summary': '',                 # Empty for now
        'Scraped Date': scrape_date
    })

# Create the DataFrame
df = pd.DataFrame(data)

# Show full cell contents (titles and URLs are long) instead of truncating.
pd.set_option('display.max_colwidth', None)
print(df)


                                                                                        Title  \
0                               Advanced Technologies Special Edition, Issue 38 (Winter 2025)   
1             Supervisory Highlights: Special Edition Student Lending, Issue 36 (Winter 2024)   
2                                              Supervisory Highlights, Issue 37 (Winter 2024)   
3                  Supervisory Highlights: Auto Finance Special Edition, Issue 35 (Fall 2024)   
4   Supervisory Highlights: Servicing and Collection of Consumer Debt, Issue 34 (Summer 2024)   
5                                               Supervisory Highlights, Issue 32, Spring 2024   
6                                               Supervisory Highlights, Issue 33, Spring 2024   
7                Supervisory Highlights Junk Fees Update Special Edition, Issue 31, Fall 2023   
8                                                 Download Summer 2023 Supervisory Highlights   
9                             

In [8]:

def download_pdf(file_url, save_path, timeout=30):
    """Download the PDF from the given URL to a local path.

    Args:
        file_url: URL of the file to fetch.
        save_path: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. requests
            has no default timeout, so omitting it could hang forever.

    Returns:
        True if the server answered 200 and the file was written,
        False on any other status code or on a network-level error.
    """
    try:
        response = requests.get(file_url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failed
        # download so one bad link does not abort the whole batch.
        print(f"Failed to download: {file_url} ({exc})")
        return False

    if response.status_code != 200:
        print(f"Failed to download: {file_url}")
        return False

    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {save_path}")
    return True

def get_pdf_link_from_report_page(report_page_url, timeout=30):
    """Scrape the report page to find any direct PDF link.

    Args:
        report_page_url: URL of the HTML report page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first href on the page (from an <a class="a-link"> tag) that ends
        in '.pdf', made absolute if it was site-relative; None if the page
        could not be fetched or contains no such link.
    """
    print(f"Scraping PDF link from: {report_page_url}")

    try:
        response = requests.get(report_page_url, timeout=timeout)
        # Do not parse an error page as if it were the report.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch report page: {report_page_url} ({exc})")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <a> tags with class 'a-link' that carry an href.
    pdf_links = soup.find_all('a', class_='a-link', href=True)

    for link_tag in pdf_links:
        href = link_tag['href']

        # Only direct PDF links qualify.
        if not href.lower().endswith('.pdf'):
            continue

        # Make site-relative links absolute.
        if href.startswith('/'):
            href = 'https://www.consumerfinance.gov' + href

        print(f"Found PDF link: {href}")
        return href

    print(f"No PDF link found on page: {report_page_url}")
    return None
    
# Local folder to store downloaded PDFs (optional); relative to the current
# working directory.
local_download_folder = 'downloads'
os.makedirs(local_download_folder, exist_ok=True)

# For every scraped row: download the link directly when it is a PDF, or
# look for a PDF link on the report page when the link ends with '/'.
for idx, row in df.iterrows():
    doc_link = row['Link']

    # Build the local filename from the title: spaces -> underscores,
    # slashes -> dashes, commas and colons removed.
    pdf_filename = f"{row['Title'].replace(' ', '_').replace('/', '-').replace(',', '').replace(':', '')}.pdf"
    local_file_path = os.path.join(local_download_folder, pdf_filename)

    if doc_link.lower().endswith('.pdf'):
        # The scraped link is already a direct PDF link.
        downloaded = download_pdf(doc_link, local_file_path)

    elif doc_link.lower().endswith('/'):
        # The link is a report page; find the PDF link on that page first.
        print(f"Link Not PDF found for {row['Title']}")
        actual_pdf = get_pdf_link_from_report_page(doc_link)
        # get_pdf_link_from_report_page returns None when the page has no
        # PDF link; guard against that before calling .lower().
        if actual_pdf and actual_pdf.lower().endswith('.pdf'):
            downloaded = download_pdf(actual_pdf, local_file_path)
    else:
        print(f"Skipped: Link ends with neither / nor .pdf for {row['Title']}")


Link Not PDF found for Advanced Technologies Special Edition, Issue 38 (Winter 2025)
Scraping PDF link from: https://www.consumerfinance.gov/data-research/research-reports/supervisory-highlights-advanced-technologies-special-edition-issue-38-winter-2025/
Found PDF link: https://files.consumerfinance.gov/f/documents/cfpb_supervisory-highlights-advanced-technologies_2025-01.pdf
Downloaded: downloads/Advanced_Technologies_Special_Edition_Issue_38_(Winter_2025).pdf
Downloaded: downloads/Supervisory_Highlights_Special_Edition_Student_Lending_Issue_36_(Winter_2024).pdf
Downloaded: downloads/Supervisory_Highlights_Issue_37_(Winter_2024).pdf
Downloaded: downloads/Supervisory_Highlights_Auto_Finance_Special_Edition_Issue_35_(Fall_2024).pdf
Downloaded: downloads/Supervisory_Highlights_Servicing_and_Collection_of_Consumer_Debt_Issue_34_(Summer_2024).pdf
Downloaded: downloads/Supervisory_Highlights_Issue_32_Spring_2024.pdf
Downloaded: downloads/Supervisory_Highlights_Issue_33_Spring_2024.pdf
Downl