In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os

def extract_pdf_urls(url):
    """
    Extracts PDF URLs from a given URL, handling redirects and using a browser-like user agent.

    Args:
        url (str): The URL to fetch and extract URLs from.

    Returns:
        list: A list of extracted PDF URLs.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers, allow_redirects=True)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all anchor tags with the desired href attribute
        pdf_links = []
        for link in soup.find_all('a', href=lambda href: href and 'pdf=' in href):
            # Extract the content after 'pdf='
            pdf_link = link['href'].split('pdf=')[1]
            # Decode the URL-encoded string
            pdf_link = urllib.parse.unquote(pdf_link)
            # Replace the first '&' with '?'
            pdf_link = pdf_link.replace('&', '?', 1)
            pdf_links.append(pdf_link)

        return pdf_links

    else:
        print(f"Failed to fetch the page. Status code: {response.status_code} for URL: {url}")
        return []

def download_pdf(url, filename):
    """
    Downloads a PDF file from a given URL and saves it to a file, using browser-like headers.

    Args:
        url (str): The URL of the PDF file.
        filename (str): The desired filename for the downloaded PDF.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {url} to {filename}")
    else:
        print(f"Failed to download {url}. Status code: {response.status_code}")

def download_pdfs_from_url(input_url, output_folder, prefix):
    """
    Downloads PDF files from a given URL, using a prefix for filenames.

    Args:
        input_url (str): The URL to fetch and extract PDF URLs from.
        output_folder (str): The folder to save the downloaded PDFs.
        prefix (str): A prefix to add to the filenames.
    """

    pdf_urls = extract_pdf_urls(input_url)

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for i, pdf_url in enumerate(pdf_urls):
        filename = f"{prefix}_{i+1}.pdf"  # Add prefix and index to filename
        filepath = os.path.join(output_folder, filename)
        download_pdf(pdf_url, filepath)

In [2]:
# Example usage:
input_url = "https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=J8F4M&univ=U"
output_folder = "downloaded_pdfs"
prefix = "fund_j8f4m"

download_pdfs_from_url(input_url, output_folder, prefix)

Downloaded https://documentscdn.financialexpress.net/Literature/FF097F28977D792E9EF0B94E6C5C8D2A/214147324.pdf?citicode=J84N&universe=O to downloaded_pdfs/fund_j8f4m_1.pdf
Downloaded https://www2.trustnet.com//Factsheets/FundFactsheetPDF.aspx?fundCode=J8F4M?univ=O&citicode=J84N&Universe=O&FundCode=J8F4M&SectorCode=O:VOM&citicode=J84N&universe=O to downloaded_pdfs/fund_j8f4m_2.pdf


In [3]:
import pandas as pd

def process_funds_managed_column(csv_file):
    """
    Processes the "Funds managed" column in a CSV file, extracts individual URLs,
    downloads the HTML content, and extracts PDF URLs in parallel, using batch processing.

    Args:
        csv_file (str): The path to the CSV file.
        output_dir (str): The directory to save the downloaded HTML files.
        batch_size (int): The number of URLs to process in each batch.
    """

    df = pd.read_csv(csv_file)

    # Split "Funds managed" column into a list of URLs
    df['Funds managed'] = df['Funds managed'].str.split('https://')
    df['Funds managed'] = df['Funds managed'].apply(lambda x: [f'https://{url}' for url in x if url])
    
    return df

In [5]:
csv_file = 'sara_funds/all_funds_1_1835.csv'
all_urls = process_funds_managed_column(csv_file)