In [None]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# --- Setup params ---
# Crawls the BIS central-bank-speeches index with Selenium, follows each
# "/review/" link found on an index page, and saves the PDF linked from each
# detail page into DOWNLOAD_DIR.
BASE_URL = "https://www.bis.org"
DOWNLOAD_DIR = "downloads"
# Date strings are inserted verbatim as the fromDate / tillDate query parameters.
INITIAL_DATE = "01/01/2000"
FINAL_DATE = "11/08/2025"
PAGE_LENGTH = 10   # value sent as cbspeeches_page_length
MAX_PAGE = 2       # index pages 1..MAX_PAGE are visited
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block indefinitely

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

driver = webdriver.Chrome()

# try/finally guarantees the browser is shut down even if a page load or any
# other step raises, so no orphaned Chrome process is left behind.
try:
    for i in range(1, MAX_PAGE + 1):
        index_url = (
            f"{BASE_URL}/cbspeeches/index.htm?m=256&fromDate={INITIAL_DATE}&tillDate={FINAL_DATE}"
            f"&cbspeeches_page={i}&cbspeeches_page_length={PAGE_LENGTH}"
        )
        print(f"\n=== Processing page {i} ===")

        try:
            driver.get(index_url)
            time.sleep(5)  # Wait for JS

            container = driver.find_element(By.ID, "cbspeeches_list")
            review_links = container.find_elements(By.CSS_SELECTOR, "a.dark[href^='/review/']")
            # Collect the hrefs up front: the link elements become stale once the
            # driver navigates away. Drop links whose href could not be read.
            review_hrefs = [
                href
                for href in (link.get_attribute("href") for link in review_links)
                if href
            ]
            print(f"Found {len(review_hrefs)} review links on page {i}.")
        except Exception as e:
            print(f"Could not find review links on page {i}: {e}")
            continue

        # --- Iterate over each review link ---
        for review_url in review_hrefs:
            print(f"Visiting: {review_url}")

            # Best-effort per page: a failure on one detail page is reported
            # and the crawl moves on to the next link.
            try:
                driver.get(review_url)
                time.sleep(2)  # Wait for detail page JS (adjust if necessary)

                # Look for pdf link on the detail page
                pdf_link = driver.find_element(By.CSS_SELECTOR, "a.pdftitle_link[href$='.pdf']")
                pdf_href = pdf_link.get_attribute("href")
                if not pdf_href.startswith("http"):
                    pdf_href = BASE_URL + pdf_href
                print("PDF found:", pdf_href)

                # Download the PDF. raise_for_status() turns HTTP error responses
                # into exceptions so an error page is never written out as a .pdf.
                response = requests.get(pdf_href, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                filename = os.path.basename(pdf_href)
                save_path = os.path.join(DOWNLOAD_DIR, filename)
                with open(save_path, "wb") as f:
                    f.write(response.content)
                print(f"Downloaded PDF to {save_path}")
            except Exception as e:
                print("No PDF found or error:", e)
finally:
    driver.quit()



=== Processing page 1 ===
Found 10 review links on page 1.
Visiting: https://www.bis.org/review/r250728g.htm
PDF found: https://www.bis.org/review/r250728g.pdf
Downloaded PDF to downloaded_pdfs2/r250728g.pdf
Visiting: https://www.bis.org/review/r250728f.htm
PDF found: https://www.bis.org/review/r250728f.pdf
Downloaded PDF to downloaded_pdfs2/r250728f.pdf
Visiting: https://www.bis.org/review/r250728e.htm
PDF found: https://www.bis.org/review/r250728e.pdf
Downloaded PDF to downloaded_pdfs2/r250728e.pdf
Visiting: https://www.bis.org/review/r250717g.htm
PDF found: https://www.bis.org/review/r250717g.pdf
Downloaded PDF to downloaded_pdfs2/r250717g.pdf
Visiting: https://www.bis.org/review/r250728i.htm
PDF found: https://www.bis.org/review/r250728i.pdf
Downloaded PDF to downloaded_pdfs2/r250728i.pdf
Visiting: https://www.bis.org/review/r250728h.htm
PDF found: https://www.bis.org/review/r250728h.pdf
Downloaded PDF to downloaded_pdfs2/r250728h.pdf
Visiting: https://www.bis.org/review/r250717f.

In [1]:
import os
# ! pip install pyPDF2
import PyPDF2

# Convert every PDF in pdf_folder to a UTF-8 text file of the same base name
# in output_folder. Failures on a single file are printed and skipped.
pdf_folder = 'downloads'
output_folder = 'texts'
os.makedirs(output_folder, exist_ok=True)

for pdf_file in os.listdir(pdf_folder):
    # Ignore anything whose name does not end in ".pdf" (case-insensitive).
    if not pdf_file.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_folder, pdf_file)
    print(f"Extracting: {pdf_path}")
    try:
        with open(pdf_path, "rb") as pdf_handle:
            reader = PyPDF2.PdfReader(pdf_handle)
            # A falsy extract_text() result is replaced by "" so the pieces
            # always concatenate cleanly.
            text = "".join(page.extract_text() or "" for page in reader.pages)

        # Same base name as the PDF, with a .txt extension.
        txt_filename = os.path.splitext(pdf_file)[0] + ".txt"
        txt_path = os.path.join(output_folder, txt_filename)
        with open(txt_path, "w", encoding="utf-8") as txt_handle:
            txt_handle.write(text)
        print(f"Saved text to {txt_path}")
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")


Extracting: downloads/r250714a.pdf
Saved text to texts/r250714a.txt
Extracting: downloads/r250717h.pdf
Saved text to texts/r250717h.txt
Extracting: downloads/r250715b.pdf
Saved text to texts/r250715b.txt
Extracting: downloads/r250715c.pdf
Saved text to texts/r250715c.txt
Extracting: downloads/r250709f.pdf
Saved text to texts/r250709f.txt
Extracting: downloads/r250728g.pdf
Saved text to texts/r250728g.txt
Extracting: downloads/r250728f.pdf
Saved text to texts/r250728f.txt
Extracting: downloads/r250728d.pdf
Saved text to texts/r250728d.txt
Extracting: downloads/r250728e.pdf
Saved text to texts/r250728e.txt
Extracting: downloads/r250728h.pdf
Saved text to texts/r250728h.txt
Extracting: downloads/r250728i.pdf
Saved text to texts/r250728i.txt
Extracting: downloads/r250728k.pdf
Saved text to texts/r250728k.txt
Extracting: downloads/r250728j.pdf
Saved text to texts/r250728j.txt
Extracting: downloads/r250728l.pdf
Saved text to texts/r250728l.txt
Extracting: downloads/r250717f.pdf
Saved text to