In [2]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os
import pandas as pd
from multiprocessing import Pool

 ----------------------------------------------------------------------------
  PDF DOWNLOAD AND EXTRACTION FUNCTIONS
 ----------------------------------------------------------------------------

In [3]:
def extract_pdf_urls(url):
    """
    Extracts PDF URLs from a given URL, handling redirects and using a browser-like user agent.

    This function is designed to mimic a web browser by sending a user-agent header. 
    This can be helpful to avoid being blocked by websites that restrict access 
    from automated scripts.

    Args:
        url (str): The URL to fetch and extract URLs from.

    Returns:
        list: A list of extracted PDF URLs.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers, allow_redirects=True)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all anchor tags with the desired href attribute
        pdf_links = []
        for link in soup.find_all('a', href=lambda href: href and 'pdf=' in href):
            # Extract the content after 'pdf='
            pdf_link = link['href'].split('pdf=')[1]
            # Decode the URL-encoded string
            pdf_link = urllib.parse.unquote(pdf_link)
            # Replace the first '&' with '?'
            pdf_link = pdf_link.replace('&', '?', 1)

            # Encode the URL
            pdf_link = urllib.parse.quote(pdf_link, safe='/:?=&') 

            pdf_links.append(pdf_link)

        return "Sucess: Extracted pdf links", pdf_links

    else:
        error = f"Error: Failed to fetch the page. Status code: {response.status_code} for URL: {url}"
        return error, []

In [4]:
def download_pdf(url, filename):
    """
    Downloads a PDF file from a given URL and saves it to a file, using browser-like headers.

    Args:
        url (str): The URL of the PDF file.
        filename (str): The desired filename for the downloaded PDF.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        return f"Success: Downloaded {url} to {filename}"
    else:
        return f"Error: Failed to download {url}. Status code: {response.status_code}"

In [105]:
def download_pdfs_from_url(input_url, output_folder, prefix):
    """
    Downloads PDF files from a given URL, using a prefix for filenames.

    Args:
        input_url (str): The URL to fetch and extract PDF URLs from.
        output_folder (str): The folder to save the downloaded PDFs.
        prefix (str): A prefix to add to the filenames.
    """

    # create a manager.txt file to add informations such as the pdf_url, input_url and filepath
    with open(f"{output_folder}/{prefix}.csv", "w") as f:
        f.write(f"input_url, prefix, pdf_index, pdf_url, filepath, result_extract_pdf_urls, result_download_pdf\n")

        try:
            error_extract_pdfs, pdf_urls = extract_pdf_urls(input_url)
        except Exception as e:
            error_extract_pdfs = f"Error: Failed to extract PDF URLs for page {input_url}. {str(e)}"
            pdf_urls = []
            print("Error: ", error_extract_pdfs)
            print("Manager: ", prefix)

        if len(pdf_urls) == 0:
            return

        for i, pdf_url in enumerate(pdf_urls):
            filename = output_folder +  f"/{prefix}_{i+1}.pdf"  # Add prefix and index to filename

            result_download_pdf = download_pdf(pdf_url, filename)

            # write to `f` the input_url, prefix, pdf_url and filename
            f.write(f"{input_url}, {prefix}, {i}, {pdf_url}, {filename}, \"{error_extract_pdfs}\", \"{result_download_pdf}\"\n")

 ----------------------------------------------------------------------------
  DATAFRAME PROCESSING FUNCTIONS
 ----------------------------------------------------------------------------

In [155]:
import pandas as pd
from urllib.parse import urlparse

def process_funds_managed_column(csv_file):
  """Processes the "Funds managed" column in a CSV file to extract individual URLs.

  Args:
      csv_file (str): The path to the CSV file.

  Returns:
      pandas.DataFrame: The DataFrame with the "Funds managed" column processed.
  """

  df = pd.read_csv(csv_file)

  # Split "Funds managed" column into a list of URLs
  df['Funds managed'] = df['Funds managed'].str.split('https://')
  df['Funds managed'] = df['Funds managed'].apply(lambda x: [f'https://{url}' for url in x if url])

  # Verify that each element in the list is a valid URL
  def validate_urls(row_idx, urls):
    valid_urls = []
    for url in urls:
      try:
        result = urlparse(url)
        # Check for a valid domain name (netloc)
        if result.scheme and result.netloc and '.' in result.netloc:
          valid_urls.append(url)
        else:
          print(f"Warning: Invalid URL in row {row_idx}: {url}")
      except ValueError:
        print(f"Warning: Invalid URL in row {row_idx}: {url}")
    return valid_urls

  # Apply the validation function to each row
  df['Funds managed'] = df.apply(lambda row: validate_urls(row.name, row['Funds managed']), axis=1)

  return df

In [156]:
def process_row(row, output_folder):
    """
    Processes a single row of the DataFrame, downloading PDFs from the URLs in the "Funds managed" column.

    Args:
        row (pandas.Series): A row of the DataFrame.
        output_folder (str): The base output folder for downloaded PDFs.
    """
    row_index = row.name
    manager_name = row['Manager']
    manager_name = manager_name.strip()
    manager_name = str(row_index) + "_manager_" + manager_name.split('/')[-2]
    for i, url in enumerate(row['Funds managed']):
        download_pdfs_from_url(url, output_folder, f"{manager_name}_{i+1}")

 ----------------------------------------------------------------------------
  MAIN EXECUTION
 ----------------------------------------------------------------------------

In [178]:
csv_file = 'sara_funds/all_funds_1_1835.csv'
output_folder = 'downloaded_pdfs'

# create output_folder if it does not exist
os.makedirs(output_folder, exist_ok=True)

df = process_funds_managed_column(csv_file)



In [179]:
df.head()

Unnamed: 0,AlphaRating,Property,Manager,Funds managed,Fund Focus,Files
0,,,https://www2.trustnet.com/managers/factsheet/-...,[https://www2.trustnet.com/Factsheets/Factshee...,Equity,
1,,,https://www2.trustnet.com/managers/factsheet/2...,[https://www2.trustnet.com/Factsheets/Factshee...,Fixed Interest,
2,,,https://www2.trustnet.com/managers/factsheet/7...,[https://www2.trustnet.com/Factsheets/Factshee...,Mixed AssetMixed AssetMixed AssetMixed AssetMi...,https://www2.trustnet.com/images/managerPerfWa...
3,,,https://www2.trustnet.com/managers/factsheet/a...,[https://www2.trustnet.com/Factsheets/Factshee...,Equity,https://www2.trustnet.com/images/managerPerfWa...
4,,,https://www2.trustnet.com/managers/factsheet/a...,[https://www2.trustnet.com/Factsheets/Factshee...,Fixed Interest,https://www2.trustnet.com/images/managerPerfWa...


In [180]:
rows = [(row, output_folder) for _, row in df.iterrows()]

In [181]:
for row in rows:
    print(row[0].name, row[0]['Manager'],row[0]['Funds managed'])

0 https://www2.trustnet.com/managers/factsheet/--asian-equity-investment-team/utoeic/U/MC4064563/ ['https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=QFF1R&univ=U']
1 https://www2.trustnet.com/managers/factsheet/24-asset-management/utoeic/U/MC9980/ ['https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=S9F30&univ=U']
2 https://www2.trustnet.com/managers/factsheet/7im-investment-team/utoeic/U/MC4223935/ ['https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=BFFA4&univ=U', 'https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=BFFA8&univ=U', 'https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=BFFB2&univ=U', 'https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=BFFB6&univ=U', 'https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=Q7F29&univ=U', 'https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=QIF83&univ=U', 'https://www2.trustnet.com/Factsheets/Factsheet.aspx?fundCode=QIF88&univ=U', 'https://www2.trustnet.com/Factshee

In [162]:
process_row(*rows[3])

In [171]:
# Parallel Processing
with Pool(processes=16) as pool:  # Adjust the number of processes as needed
    pool.starmap(process_row, rows[:50])

In [190]:
# Parallel Processing
num_rows = len(rows)
start_index = 1726
chunk_size = 32  # Adjust chunk size as needed

for i in range(start_index, num_rows, chunk_size):
    print("Processing chunk size: ", i, ":", i + chunk_size)
    with Pool(processes=16) as pool:  # Adjust the number of processes as needed
        pool.starmap(process_row, rows[i:min(i + chunk_size, num_rows)])

Processing chunk size:  1726 : 1758


Processing chunk size:  1758 : 1790
Error:  Error: Failed to extract PDF URLs for page https://Vanguard FTSE UK Equity Income IndexVanguard FTSE Developed World ex-UK Equity IndexVanguard FTSE Developed Europe ex-UK Equity IndexVanguard US Equity IndexVanguard FTSE U.K. All Share Index Unit TrustVanguard LifeStrategy 20% EquityVanguard LifeStrategy 40% EquityVanguard LifeStrategy 60% EquityVanguard LifeStrategy 80% EquityVanguard LifeStrategy 100% EquityVanguard Target Retirement 2015Vanguard Target Retirement 2020Vanguard Target Retirement 2025Vanguard Target Retirement 2030Vanguard Target Retirement 2035Vanguard Target Retirement 2040Vanguard Target Retirement 2045Vanguard Target Retirement 2050Vanguard Target Retirement 2055Vanguard FTSE 100 Index Unit TrustVanguard FTSE Global All Cap IndexVanguard Target Retirement 2060Vanguard Target Retirement 2065. Failed to parse: https://Vanguard FTSE UK Equity Income IndexVanguard FTSE Developed World ex-UK Equity IndexVanguard FTSE Develope

In [None]:
# Parallel Processing
num_rows = len(rows)
start_index = 0
chunk_size = 32  # Adjust chunk size as needed
for i in range(start_index, num_rows, chunk_size):
    print("Processing chunk size: ", i, ":", i + chunk_size)
    with Pool(processes=16) as pool:  # Adjust the number of processes as needed
        pool.starmap(process_row, rows[i:min(i + chunk_size, num_rows)])

In [197]:
import os

folder_path = "downloaded_pdfs" 

pdf_count = 0
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_count += 1

print(f"There are {pdf_count} PDF files in the folder.")

There are 2084 PDF files in the folder.
