## Download the title, image, summary, and file for each PDF 

In [1]:
import os
import csv
import json
import time
import requests
from bs4 import BeautifulSoup
from uuid import uuid4
from selenium import webdriver
from dotenv import load_dotenv
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from unidecode import unidecode

In [2]:
# Load the environment variables from the .env file one level above the
# current working directory. The cells below read DOWNLOAD_DIRECTORY and
# STAGE_1_FILENAME from the environment via os.getenv.
load_dotenv("../.env")

True

In [3]:
def download_file(url, filepath, timeout = 60):
    """Helper function to download a file from a given URL.

    Args:
        url: Address of the file to fetch.
        filepath: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server to connect/send data before
            giving up, so a stalled server cannot hang the whole crawl.

    Returns:
        True if the server answered with HTTP 200 and the body was written,
        False otherwise (a message is printed in that case).

    Raises:
        requests.RequestException: On connection errors or timeouts; these are
            deliberately left to the caller to handle.
    """

    # Use the response as a context manager: with stream = True the connection
    # is only released back to the pool once the response is closed.
    with requests.get(url, stream = True, timeout = timeout) as response:

        if response.status_code != 200:
            print(f"Failed to download: {url}")
            return False

        with open(filepath, "wb") as f:
            for chunk in response.iter_content(chunk_size = 8192):
                f.write(chunk)

    return True

In [4]:
def _extract_overview(soup):
    """Return the overview (summary) text scraped from a parsed publication page."""

    def clean(tag, collapse_tabs = False):
        # Flatten the tag's text to a single ASCII line
        text = tag.get_text().strip().replace("\n", "")
        if collapse_tabs:
            text = text.replace("\t\t\t\t\t", " ")
        return unidecode(text)

    pieces = []

    # Preferred source: text inside <div class="article__paragraph">
    for div in soup.find_all('div', class_='article__paragraph'):
        for tag in div.find_all(['p', 'ol', 'ul']):
            pieces.append(clean(tag))

    # Fallback if nothing was found above (Aggressive scraping)
    if not pieces:
        article_body = soup.find('article', class_='grid__item--article-body')
        if article_body:

            # Extract from <span class="overview__content">
            span_content = article_body.find('span', class_='overview__content')
            if span_content:
                for para in span_content.find_all('p'):
                    pieces.append(clean(para))

            # Extract from <div> tags without any class
            for div in article_body.find_all('div', class_=None):
                for tag in div.find_all(['p', 'ol', 'ul'], class_=None):
                    pieces.append(clean(tag, collapse_tabs = True))

    # Every piece is prefixed with a single space (so the result starts with one)
    return "".join(" " + piece for piece in pieces)


def download(title, url):
    """Scrape one publication page and save its PDF, cover image and metadata.

    The page is rendered with headless Chrome (its content needs Javascript),
    parsed with BeautifulSoup, and the results are stored in a new uniquely
    named sub-directory of the directory given by the DOWNLOAD_DIRECTORY
    environment variable, together with a metadata.json file.

    Args:
        title: Title of the publication, stored as-is in metadata.json.
        url: URL of the publication page to scrape.

    Returns:
        True if the page was processed and metadata.json was written,
        False if any step raised an exception (the exception is printed).
    """

    # Local import so this function brings its own dependency into scope
    from urllib.parse import urljoin, urlsplit

    status = False

    # Initialised up front so the finally block is safe even when the
    # webdriver could not be created
    driver = None

    try:
        # Fail fast, before starting a browser, if there is nowhere to save to
        download_dir = os.getenv("DOWNLOAD_DIRECTORY", None)
        if not download_dir:
            raise ValueError("DOWNLOAD_DIRECTORY environment variable is not set")

        # We need Javascript rendering to see the content
        # Ask Selenium to use Google Chrome as the driver

        # Set WebDriver options (headless mode to run without UI)
        options = Options()
        options.add_argument("--headless=new")

        # Ensure you download the right chrome driver for your OS
        # Create a folder named 'chromedriver' (next to the current working
        # directory's parent) and store the chromedriver in it

        # The driver binary is named chromedriver.exe only on Windows
        chromedriver_name = "chromedriver.exe" if os.name == "nt" else "chromedriver"
        chromedriver_path = os.path.join(os.path.dirname(os.getcwd()), "chromedriver", chromedriver_name)
        service = Service(chromedriver_path)

        # Load the driver from Google Chrome
        driver = webdriver.Chrome(options = options, service = service)

        # Make a GET request
        driver.get(url)

        # Sleep for 10 seconds to ensure the page is completely loaded before proceeding
        time.sleep(10)

        # Fetch the source code of the webpage so we can process it
        page_source_code = driver.page_source

        # Call BeautifulSoup to parse the HTML content
        soup = BeautifulSoup(page_source_code, "html.parser")

        # Domain prefix for URLs
        url_prefix = "https://rpc.cfainstitute.org"

        # Extract the PDF download url (urljoin copes with both site-relative
        # and already-absolute links; a missing href leaves the url empty)
        download_url = ""
        download_content = soup.find('a', class_="content-asset--primary")
        if download_content:
            href = download_content.get("href", "")
            if href:
                download_url = urljoin(url_prefix, href)

        # Extract the book cover image (query string dropped from the src)
        cover_image_url = ""
        cover_image_content = soup.find('img', class_="article-cover")
        if cover_image_content:
            src = cover_image_content.get("src", "").split('?')[0]
            if src:
                cover_image_url = urljoin(url_prefix, src)

        # Extract the overview (which will be used as summary)
        overview = _extract_overview(soup)

        # Save all downloads to download directory
        os.makedirs(download_dir, exist_ok = True)

        # Create a directory with a unique name
        document_id = uuid4().hex
        directory = os.path.join(download_dir, document_id)
        os.makedirs(directory, exist_ok = False)

        # Download the PDF file; the name stays empty when the page has no PDF
        # link so the metadata below can still be written
        pdf_name = ""
        if download_url != "":
            # Name the file after the URL path only (no query string)
            pdf_name = os.path.basename(urlsplit(download_url).path) or "document.pdf"
            download_file(download_url, os.path.join(directory, pdf_name))

        # Download the cover image
        if cover_image_url != "":
            cover_image_filename = os.path.join(directory, "cover_image.jpg")
            download_file(cover_image_url, cover_image_filename)

        # Create metadata.json and store relevant details
        metadata = {
            "document_id"       : document_id,
            "title"             : title,
            "pdf_filename"      : pdf_name,
            "cover_image_url"   : cover_image_url,
            "pdf_download_url"  : download_url,
            "overview"          : overview
        }

        metadata_file = os.path.join(directory, "metadata.json")
        with open(metadata_file, "w") as f:
            json.dump(metadata, f, indent = 4)

        status = True

    except Exception as exception:
        print(exception)

    finally:
        # Stop the webdriver (only if it was actually started)
        if driver is not None:
            driver.quit()

    return status

In [5]:
def crawl():
    """Download every publication listed in the stage 1 CSV file.

    The CSV path is read from the STAGE_1_FILENAME environment variable. Each
    row must hold exactly two columns, (title, url); any other row is reported
    and skipped. download() is called for every valid row and the outcome is
    printed. Errors while opening or reading the CSV are printed, not raised.
    """

    csv_file = os.getenv("STAGE_1_FILENAME", None)

    # Report a missing setting clearly instead of failing inside open()
    if not csv_file:
        print("Error occurred: STAGE_1_FILENAME environment variable is not set")
        return

    try:
        # newline = "" lets the csv module handle line endings itself, which
        # is required for quoted fields that contain embedded newlines
        with open(csv_file, 'r', newline = "") as file:
            reader = csv.reader(file)

            for row in reader:
                if len(row) != 2:
                    print(f"Skipping invalid row: {row}")
                    continue

                title, url = row
                print(f"Downloading: {title}")

                # Call download for each title and URL
                success = download(title, url)

                if success:
                    print(f"Downloaded: {title}")
                else:
                    print(f"Failed to download: {title}")

    except Exception as exception:
        print("Error occurred: ", exception)

In [None]:
# Run the full crawl: one download() call per valid (title, url) row of the CSV
crawl()

In [22]:
# Manually run download() for a single publication page; it returns True on success
download("Defined Contribution Plans: Challenges and Opportunities for Plan Sponsors", "https://rpc.cfainstitute.org/research/foundation/2021/defined-contribution-plans")

True

In [7]:
# Manually run download() for a single publication page; it returns True on success
download("Investor Risk Profiling: An Overview", "https://rpc.cfainstitute.org/research/foundation/2015/investor-risk-profiling-an-overview")

True

In [6]:
# Manually run download() for a single publication page; it returns True on success
download("Lifetime Financial Advice: A Personalized Optimal Multilevel Approach", "https://rpc.cfainstitute.org/research/foundation/2024/lifetime-financial-advice-a-personalized-optimal-multilevel-approach")

True