<a href="https://colab.research.google.com/github/jesusvillota/DataScience_CemfiMaster/blob/master/Session2/Extra_BCU_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<div style="max-width: 880px; margin: 20px auto 22px; padding: 0px; border-radius: 18px; border: 1px solid #e5e7eb; background: linear-gradient(180deg, #ffffff 0%, #f9fafb 100%); box-shadow: 0 8px 26px rgba(0,0,0,0.06); overflow: hidden;">

  <!-- Banner Header -->
  <div style="padding: 34px 32px 14px; text-align: center; line-height: 1.38;">
    <div style="font-size: 13px; letter-spacing: 0.14em; text-transform: uppercase; color: #6b7280; font-weight: bold; margin-bottom: 5px;">
      Session #2
    </div>
    <div style="font-size: 29px; font-weight: 800; color: #14276c; margin-bottom: 4px;">
      BCU Scraper [Extra]
    </div>
    <div style="font-size: 16.5px; color: #374151; font-style: italic; margin-bottom: 0;">
      Data Science for Economics
    </div>
  </div>

  <!-- Logo Section -->
  <div style="background: none; text-align: center; margin: 30px 0 10px;">
    <img src="https://www.cemfi.es/images/Logo-Azul.png" alt="CEMFI Logo" style="width: 158px; filter: drop-shadow(0 2px 12px rgba(56,84,156,0.05)); margin-bottom: 0;">
  </div>

  <!-- Name -->
  <div style="font-family: 'Times New Roman', Times, serif; color: #38549c; text-align: center; font-size: 1.22em; font-weight: bold; margin-bottom: 0px;">
    Jesus Villota Miranda © 2025
  </div>

  <!-- Contact info -->
  <div style="font-family: 'Times New Roman', Times, serif; color: #38549c; text-align: center; font-size: 1em; margin-top: 7px; margin-bottom: 20px;">
    <a href="mailto:jesus.villota@cemfi.edu.es" style="color: #38549c; text-decoration: none; margin-right:8px;" title="Email">
      <!-- Email logo -->
      <!-- <img src="https://cdn-icons-png.flaticon.com/512/11679/11679732.png" alt="Email" style="width:18px; vertical-align:middle; margin-right:5px;"> -->
      jesus.villota@cemfi.edu.es
    </a>
    <span style="color:#9fa7bd;">|</span>
    <a href="https://www.linkedin.com/in/jesusvillotamiranda/" target="_blank" style="color: #38549c; text-decoration: none; margin-left:7px;" title="LinkedIn">
      <!-- LinkedIn logo -->
      <!-- <img src="https://1.bp.blogspot.com/-onvhHUdW1Us/YI52e9j4eKI/AAAAAAAAE4c/6s9wzOpIDYcAo4YmTX1Qg51OlwMFmilFACLcBGAsYHQ/s1600/Logo%2BLinkedin.png" alt="LinkedIn" style="width:22px; vertical-align:middle; margin-right:5px;"> -->
      LinkedIn
    </a>
  </div>
</div>


**IMPORTANT**: **Are you running this notebook in Google Colab?**

- If so, please make sure that in the cell below `running_in_colab` is set to `True`

- And, of course, make sure to **run the cell**!

In [1]:
# ARE YOU RUNNING THIS IN GOOGLE COLAB? If YES, type True below
running_in_colab = False

# --- Conditional install ---
if running_in_colab:
    # Install selenium if running in Colab
    !pip install bs4 requests

In [2]:
import os

# --- Setup params ---
# Page listing the BCU monetary policy reports; exposed under both names.
url = "https://www.bcu.gub.uy/Politica-Economica-y-Mercados/Paginas/Informe-de-Politica-Monetaria.aspx"
URL = url

# Local output location (relative to the working directory).
OUTPUT_BASE = "output/"
DOWNLOAD_DIR = f"{OUTPUT_BASE}Extra/"

# In Colab, mount Google Drive and place the output folder inside it.
if running_in_colab:
    from google.colab import drive
    drive.mount('/content/gdrive')
    DOWNLOAD_DIR = '/content/gdrive/My Drive/' + DOWNLOAD_DIR
    print(DOWNLOAD_DIR)

# Make sure the target folder exists before anything is written to it.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# **Can we scrape the website?**

In [3]:
import requests
from bs4 import BeautifulSoup

# Step 1. Send a simple HTTP request
# requests applies no timeout by default, so an unresponsive server would
# block this cell forever; bound the wait explicitly (in seconds).
print(f"🔗 Requesting URL: {url}")
response = requests.get(url, timeout=30)
print(f"🌐 HTTP Status: {response.status_code}")
if response.status_code == 200:
    print("✅ Successfully retrieved the webpage.")
else:
    print("❌ Failed to retrieve the webpage.")

# Step 2. Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
print("✅ Parsed the HTML content from the URL")

# (Optional) save the html content to the output directory
with open(os.path.join(DOWNLOAD_DIR, "bcu_speeches.html"), "w", encoding="utf-8") as f:
    f.write(soup.prettify())
print("✅ Saved the HTML content to the output directory.")

🔗 Requesting URL: https://www.bcu.gub.uy/Politica-Economica-y-Mercados/Paginas/Informe-de-Politica-Monetaria.aspx
🌐 HTTP Status: 200
✅ Successfully retrieved the webpage.
✅ Parsed the HTML content from the URL
✅ Saved the HTML content to the output directory.


# **Extract the links to the PDF files**

In [4]:
import re
import time
from urllib.parse import urljoin, urlparse
import os

def extract_pdf_links_from_html(soup, base_url):
    """
    Extract PDF links that match the BCU Reportes pattern.

    Two places in the page are inspected, in this order:
      1. ``data-href`` attributes of ``<tr>`` table rows
      2. ``href`` attributes of ``<a>`` anchor tags

    A link is kept when it contains ``'Reportes'`` and ends with ``'.pdf'``.

    Parameters:
    - soup: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup object)
    - base_url: URL against which relative links are resolved

    Returns:
    - List of absolute PDF URLs, without duplicates, in order of first
      appearance.
    """
    def is_report_pdf(href):
        return bool(href) and 'Reportes' in href and href.endswith('.pdf')

    # Method 1: data-href attributes in table rows
    candidates = [row.get('data-href') for row in soup.find_all('tr', {'data-href': True})]
    # Method 2: direct links in anchor tags
    candidates += [a_tag['href'] for a_tag in soup.find_all('a', href=True)]

    # Resolve every kept link against base_url, whichever method found it.
    # urljoin leaves already-absolute URLs untouched, so the same document
    # found as "/x.pdf" in a row and as "https://host/x.pdf" in an anchor
    # collapses to a single entry below.
    absolute_links = (urljoin(base_url, href) for href in candidates if is_report_pdf(href))

    # dict keys keep insertion order: de-duplicates while preserving order
    return list(dict.fromkeys(absolute_links))

def categorize_pdfs(pdf_links):
    """
    Split PDF links into IPOM reports and Box metodológicos.

    The decision is based on the file-name part of each link:
    names containing 'IPOM' or 'pepmam' are IPOM reports; otherwise names
    containing 'Box' or 'box' are Box metodológicos; anything else is
    treated as an IPOM report.

    Returns a tuple ``(ipom_reports, box_metodologicos)`` of lists that keep
    the input order.
    """
    ipom_reports, box_metodologicos = [], []

    for pdf_url in pdf_links:
        name = os.path.basename(pdf_url)
        looks_like_ipom = any(marker in name for marker in ('IPOM', 'pepmam'))
        looks_like_box = any(marker in name for marker in ('Box', 'box'))

        # IPOM markers take precedence over Box markers; unknown names
        # fall back to the IPOM list.
        if looks_like_box and not looks_like_ipom:
            box_metodologicos.append(pdf_url)
        else:
            ipom_reports.append(pdf_url)

    return ipom_reports, box_metodologicos

In [5]:
# Collect the report PDF links from the parsed page held in `soup`
print("🔍 Extracting PDF links from the HTML content...")
pdf_links = extract_pdf_links_from_html(soup, "https://www.bcu.gub.uy")

print(f"📊 Found {len(pdf_links)} PDF links")

# Split the links into the two report families
ipom_reports, box_metodologicos = categorize_pdfs(pdf_links)

# Print every file name of each category, numbered from 1
for heading, links in (
    ("📈 IPOM Reports", ipom_reports),
    ("📋 Box Metodológicos", box_metodologicos),
):
    print(f"\n{heading}: {len(links)} PDF links")
    for number, link in enumerate(links, start=1):
        print(f"  {number}. {os.path.basename(link)}")

🔍 Extracting PDF links from the HTML content...
📊 Found 108 PDF links

📈 IPOM Reports: 101 PDF links
  1. IPOM 2025.II final con box22.pdf
  2. IPOM 2025.I final con box21.pdf
  3. IPOM 2024.IV final.pdf
  4. IPOM 2024.III final.pdf
  5. IPOM 2024.II Final con box20.pdf
  6. IPOM 2024.I Final, box 18 y 19.pdf
  7. IPOM 2023.IV_Final y Box.pdf
  8. IPOM 2023.III - Final y Box.pdf
  9. IPOM 2023.II y Box final.pdf
  10. IPOM 2023.I_Final2 word.pdf
  11. IPOM-2022.IV.pdf
  12. IPOM 2022.III_final.pdf
  13. IPOM-Informe-de-Politica-Monetaria-2022-II.pdf
  14. IPOM 2022-I.pdf
  15. IPOM_2021.IV.pdf
  16. IPOM 2021.III.pdf
  17. IPOM 2021-II.pdf
  18. pepmam04i0321.pdf
  19. pepmam04i1220.pdf
  20. pepmam04i0920.pdf
  21. pepmam04i0620.pdf
  22. pepmam04i0120.pdf
  23. pepmam04i1219.pdf
  24. pepmam04i0919.pdf
  25. pepmam04i0619.pdf
  26. pepmam04i0319.pdf
  27. pepmam04i1218.pdf
  28. pepmam04i0918.pdf
  29. pepmam04i0618.pdf
  30. pepmam04i0318.pdf
  31. pepmam04i1217.pdf
  32. pepmam04i0

# **Download the PDF files**

In [6]:
# Function to download a single PDF
def download_pdf(url, download_path, session=None, timeout=30):
    """
    Download a PDF file from URL.

    Parameters:
    - url: Address of the PDF to fetch
    - download_path: Existing directory where the file is saved
    - session: Optional session object used for the request; when omitted a
      requests.Session is created for this call and closed afterwards
    - timeout: Seconds to wait for the server before giving up

    Returns:
    - True when the file was fully written, False on any error (the error is
      printed, not raised, so a batch of downloads can keep going).
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    partial_path = None
    try:
        print(f"📄 Downloading: {os.path.basename(url)}")
        # The context manager releases the streamed connection even on error.
        with session.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            filename = os.path.basename(urlparse(url).path)
            if not filename.endswith('.pdf'):
                filename += '.pdf'

            filepath = os.path.join(download_path, filename)
            # Stream into a temporary name first so an interrupted transfer
            # never leaves a truncated file that looks like a valid PDF.
            partial_path = filepath + '.part'

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        os.replace(partial_path, filepath)
        print(f"✅ Successfully downloaded: {filename}")
        return True

    except Exception as e:
        # Deliberately broad: one failing file must not abort a whole batch.
        print(f"❌ Error downloading {url}: {str(e)}")
        if partial_path is not None and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
        return False

    finally:
        # Only close a session this function created; a caller-provided
        # session is left open for reuse.
        if owns_session:
            session.close()

# Function to download a selection of PDFs
def download_pdfs_selection(pdf_list, download_dir, category_name, max_downloads=5, delay=1):
    """
    Download a selection of PDFs with user control

    Parameters:
    - pdf_list: List of PDF URLs to download
    - download_dir: Directory to save PDFs
    - category_name: Name for logging purposes
    - max_downloads: Maximum number to download (None = download all);
      negative values are treated as 0
    - delay: Seconds to pause between consecutive downloads

    Returns:
    - Number of PDFs that were downloaded successfully
    """
    print(f"\n🔽 Downloading {category_name}...")
    print(f"📁 Download directory: {download_dir}")

    # Determine how many PDFs to download
    if max_downloads is None:
        pdfs_to_download = pdf_list
        total_count = len(pdfs_to_download)
        print(f"📊 Downloading ALL {total_count} PDFs...")
    else:
        # Clamp at 0: a negative slice bound would drop items from the END of
        # the list and leave the progress counter inconsistent with the work.
        pdfs_to_download = pdf_list[:max(0, max_downloads)]
        total_count = len(pdfs_to_download)
        print(f"📊 Downloading {total_count} out of {len(pdf_list)} available PDFs...")

    downloaded_count = 0
    # One session for the whole batch (connection reuse), closed on exit.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        })

        for i, pdf_url in enumerate(pdfs_to_download, start=1):
            print(f"\n📄 [{i}/{total_count}] Processing: {os.path.basename(pdf_url)}")

            if download_pdf(pdf_url, download_dir, session):
                downloaded_count += 1

            # Be respectful - small delay between downloads (none needed
            # after the final one)
            if i < total_count:
                time.sleep(delay)

    print(f"\n✅ Downloaded {downloaded_count}/{total_count} {category_name}")
    return downloaded_count

In [7]:
# Create subdirectories for different types of PDFs
ipom_dir = os.path.join(DOWNLOAD_DIR, "IPOM_Reports")
box_dir = os.path.join(DOWNLOAD_DIR, "Box_Metodologicos")

os.makedirs(ipom_dir, exist_ok=True)
os.makedirs(box_dir, exist_ok=True)

# Let's download files from each category
print("🚀 Starting PDF downloads...")

# Start both counters at zero on every run. Probing locals() for the names
# would pick up values left in the kernel by a previous execution and report
# a stale count whenever a category is empty this time.
ipom_downloaded = 0
box_downloaded = 0

# Download IPOM reports
if ipom_reports:
    print(f"\n📊 Available IPOM Reports: {len(ipom_reports)}")
    ipom_downloaded = download_pdfs_selection(pdf_list=ipom_reports,
                                              download_dir=ipom_dir,
                                              category_name="IPOM Reports",
                                              max_downloads=None)

# Download Box metodológicos
if box_metodologicos:
    print(f"\n📋 Available Box Metodológicos: {len(box_metodologicos)}")
    box_downloaded = download_pdfs_selection(pdf_list=box_metodologicos,
                                             download_dir=box_dir,
                                             category_name="Box Metodológicos",
                                             max_downloads=None)

print("\n🎉 Download Summary:")
print(f"📁 Files saved to: {DOWNLOAD_DIR}")
print(f"📈 IPOM Reports: {ipom_downloaded}")
print(f"📋 Box Metodológicos: {box_downloaded}")

🚀 Starting PDF downloads...

📊 Available IPOM Reports: 101

🔽 Downloading IPOM Reports...
📁 Download directory: output/Extra/IPOM_Reports
📊 Downloading ALL 101 PDFs...

📄 [1/101] Processing: IPOM 2025.II final con box22.pdf
📄 Downloading: IPOM 2025.II final con box22.pdf
✅ Successfully downloaded: IPOM 2025.II final con box22.pdf

📄 [2/101] Processing: IPOM 2025.I final con box21.pdf
📄 Downloading: IPOM 2025.I final con box21.pdf
✅ Successfully downloaded: IPOM 2025.I final con box21.pdf

📄 [3/101] Processing: IPOM 2024.IV final.pdf
📄 Downloading: IPOM 2024.IV final.pdf
✅ Successfully downloaded: IPOM 2024.IV final.pdf

📄 [4/101] Processing: IPOM 2024.III final.pdf
📄 Downloading: IPOM 2024.III final.pdf
✅ Successfully downloaded: IPOM 2024.III final.pdf

📄 [5/101] Processing: IPOM 2024.II Final con box20.pdf
📄 Downloading: IPOM 2024.II Final con box20.pdf
✅ Successfully downloaded: IPOM 2024.II Final con box20.pdf

📄 [6/101] Processing: IPOM 2024.I Final, box 18 y 19.pdf
📄 Downloading: