In [7]:
import os
import pandas as pd
import requests
from PyPDF2 import PdfReader
from io import BytesIO

In [8]:

# Load the index of IMFC statements; later cells read its Year, Title and Link columns.
df = pd.read_csv('imf_statements_links.csv')

# Collapse one or more consecutive copies of the host prefix into a single copy.
# The dots are escaped so they only match a literal '.', and the group is
# non-capturing because the matched text is never referenced.
df['Link'] = df['Link'].str.replace(r'(?:https://meetings\.imf\.org)+', 'https://meetings.imf.org', regex=True)

In [9]:
def update_link(row):
    """Return the download URL to use for one row of the statements table.

    Rows whose ``Year`` lies in 2012-2015 (inclusive) get a URL rebuilt under
    ``https://www.imf.org/External/AM/<Year>/imfc/statement/eng/`` that keeps
    only the final path segment of the stored ``Link``. Every other row keeps
    its ``Link`` unchanged.
    """
    year = row['Year']
    if not (2012 <= year <= 2015):
        return row['Link']

    # Everything after the last '/' (the whole string when there is no '/').
    _, _, ending_text = row['Link'].rpartition('/')
    return f"https://www.imf.org/External/AM/{year}/imfc/statement/eng/{ending_text}"

# Call update_link on every row (axis=1) and overwrite the Link column with the returned URLs
df['Link'] = df.apply(update_link, axis=1)

In [10]:
# Discard rows whose Title splits into exactly one whitespace-separated token
df = df[df['Title'].str.split().str.len().ne(1)]

In [11]:
# Display the current state of the link table (pandas truncates the rendering for long frames)
df

Unnamed: 0,Year,Title,Link
0,2024,"IMFC Statement by Christine Lagarde, President...",https://meetings.imf.org/-/media/AMSM/Files/AM...
1,2024,"IMFC Statement by HE Haitham Al Ghais, Secreta...",https://meetings.imf.org/-/media/AMSM/Files/AM...
2,2024,"IMFC Statement by Ayman Al-Sayari, Governor of...",https://meetings.imf.org/-/media/AMSM/Files/AM...
3,2024,"IMFC Statement by Antoine Armand, Minister of ...",https://meetings.imf.org/-/media/AMSM/Files/AM...
4,2024,"IMFC Statement by Luis Caputo, Minister of Eco...",https://meetings.imf.org/-/media/AMSM/Files/AM...
...,...,...,...
571,2004,IMFC Statement by the Honorable Domenico Sinis...,https://www.imf.org/External/AM/2004/imfc/stat...
572,2004,"IMFC Statement by the Honorable John W. Snow, ...",https://www.imf.org/External/AM/2004/imfc/stat...
573,2004,IMFC Statement by H.E. Sadakazu Tanigaki Minis...,https://www.imf.org/External/AM/2004/imfc/stat...
574,2004,"IMFC Statement By James D. Wolfensohn, Preside...",https://www.imf.org/External/AM/2004/imfc/stat...


In [16]:
import requests
import os
from io import BytesIO
from PyPDF2 import PdfReader
import pandas as pd

# Function to download PDF
def download_pdf(link, timeout=30):
    """Fetch ``link`` over HTTP and return the raw response body.

    Args:
        link: URL to request.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds,
            or a ``(connect, read)`` tuple). Defaults to 30.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status). Failures are
        reported on stdout rather than raised.
    """
    try:
        # requests applies no timeout by default, so a stalled server would
        # block the whole download loop forever. A timeout turns that into a
        # RequestException, which is handled below like any other failure.
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx and 5xx)
        return response.content
    except requests.RequestException as e:
        print(f"Error downloading {link}: {e}")
        return None

# Function to extract text from PDF content
def extract_text_from_pdf(content):
    """Return the concatenated text of every page of a PDF supplied as bytes.

    Pages whose extraction yields nothing (``None`` or an empty string) are
    skipped, and page texts are joined with no separator. Any error raised
    while parsing or extracting is printed and ``None`` is returned instead.
    """
    try:
        reader = PdfReader(BytesIO(content))
        per_page = (page.extract_text() for page in reader.pages)
        return ''.join(chunk for chunk in per_page if chunk)
    except Exception as e:
        print(f"Error reading PDF content: {e}")
        return None

# Function to save PDF to a specified directory
def save_pdf(content, year, title, data_directory="data"):
    """Write ``content`` to ``<data_directory>/<year>/<sanitised title>.pdf``.

    The year directory is created if needed. Every character of ``title`` that
    is not alphanumeric, a space, ``_`` or ``-`` is replaced by ``_`` to form
    the file name.

    The bytes are first written to a sibling ``.part`` file and then moved
    onto the final name with ``os.replace``. A failed write therefore never
    leaves a truncated file at the final path, where a later "file already
    exists" check would mistake it for a finished download.

    Args:
        content: Bytes to write.
        year: Used (via ``str``) as the sub-directory name.
        title: Statement title the file name is derived from.
        data_directory: Root directory for the saved files.

    Returns:
        The path of the saved file, or ``None`` if anything went wrong (the
        error is printed, not raised).
    """
    tmp_path = None
    try:
        directory = os.path.join(data_directory, str(year))
        os.makedirs(directory, exist_ok=True)
        # Replace invalid characters in the title to prevent errors
        safe_title = "".join(c if c.isalnum() or c in (' ', '_', '-') else "_" for c in title)
        file_path = os.path.join(directory, f"{safe_title}.pdf")
        tmp_path = f"{file_path}.part"
        with open(tmp_path, 'wb') as f:
            f.write(content)
        # Publish the file only once it has been written completely.
        os.replace(tmp_path, file_path)
        return file_path
    except Exception as e:
        print(f"Error saving file for year {year}, title {title}: {e}")
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup: the original error was already reported
                # and a leftover .part file is never mistaken for a PDF.
                pass
        return None

# Function to download PDFs and store file paths
def download_files(df, data_directory="data"):
    """Make sure every row's PDF is on disk and record where it lives.

    For each row the target is ``<data_directory>/<Year>/<sanitised Title>.pdf``.
    An existing file is reused without any network access; otherwise the row's
    ``Link`` is downloaded and stored through ``save_pdf``. The resulting path
    (``None`` when the download produced nothing) is written to a new
    ``File_Path`` column on ``df``, which is modified in place and returned.
    """
    allowed_punctuation = (' ', '_', '-')
    collected_paths = []

    for _, record in df.iterrows():
        # Ensure the year directory exists
        target_dir = os.path.join(data_directory, str(record['Year']))
        os.makedirs(target_dir, exist_ok=True)

        # Replace invalid characters in the title to prevent errors
        cleaned_title = "".join(
            ch if ch.isalnum() or ch in allowed_punctuation else "_"
            for ch in record['Title']
        )
        target = os.path.join(target_dir, f"{cleaned_title}.pdf")

        # Reuse a file that is already on disk
        if os.path.exists(target):
            print(f"Skipping download for {target}: file already exists.")
            collected_paths.append(target)
            continue

        payload = download_pdf(record['Link'])
        if not payload:
            print(f"Failed to download or save file for {record['Title']}.")
            collected_paths.append(None)  # keep the list aligned with the rows
            continue

        collected_paths.append(
            save_pdf(payload, record['Year'], record['Title'], data_directory)
        )

    df['File_Path'] = collected_paths
    return df

# Function to read and extract text from downloaded PDF files
def read_files(df):
    """Extract the text of each row's PDF into a new ``Extracted_Text`` column.

    Rows with a falsy ``File_Path`` get ``None``. If the file is missing on
    disk the row's ``Link`` is downloaded again and parsed from memory (the
    bytes are not written back to disk). Any other error while opening or
    reading the file is printed and yields ``None``. ``df`` is modified in
    place and returned.
    """
    def _text_for(record):
        path = record['File_Path']
        if not path:
            return None  # file is not available
        try:
            with open(path, 'rb') as handle:
                raw = handle.read()
            return extract_text_from_pdf(raw)
        except FileNotFoundError:
            print(f"File not found: {record['File_Path']}. Attempting to redownload.")
            refetched = download_pdf(record['Link'])
            return extract_text_from_pdf(refetched) if refetched else None
        except Exception as e:
            print(f"Error reading file {record['File_Path']}: {e}")
            return None

    df['Extracted_Text'] = [_text_for(record) for _, record in df.iterrows()]
    return df

# Example of usage with your DataFrame
def process_dataframe(df, data_directory="data"):
    """Run the two pipeline stages in order and return the resulting frame.

    ``download_files`` is called first with ``df`` and ``data_directory``;
    its result is then passed to ``read_files``.
    """
    return read_files(download_files(df, data_directory))


# Run the pipeline on the link table with the default "data" directory:
# download (or reuse) each PDF, then extract its text.
processed_df = process_dataframe(df)

# Save the updated dataframe (index=False keeps the row index out of the CSV)
processed_df.to_csv("processed_data.csv", index=False)

Skipping download for data\2024\IMFC Statement by Christine Lagarde_ President of the ECB.pdf: file already exists.
Skipping download for data\2024\IMFC Statement by HE Haitham Al Ghais_ Secretary General_ OPEC.pdf: file already exists.
Skipping download for data\2024\IMFC Statement by Ayman Al-Sayari_ Governor of the Saudi Central Bank _SAMA_.pdf: file already exists.
Skipping download for data\2024\IMFC Statement by Antoine Armand_ Minister of the Economy_ Finance and Industry_ France.pdf: file already exists.
Skipping download for data\2024\IMFC Statement by Luis Caputo_ Minister of Economy_ Argentina.pdf: file already exists.
Skipping download for data\2024\IMFC Statement By The Hon_ Jim Chalmers_ Treasurer_ Australia.pdf: file already exists.
Skipping download for data\2024\IMFC Statement by Mathias Cormann_ Secretary-General_ OECD.pdf: file already exists.
Skipping download for data\2024\IMFC Statement by Carlos Cuerpo_ Minister of Economy_ Trade and Business_ Spain.pdf: file alr

In [17]:
# Drop the bookkeeping columns. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the columns are already gone.
processed_df = processed_df.drop(columns=['Link', 'File_Path'], errors='ignore')

In [20]:
# Print the extracted text of the first row as plain text
print(processed_df['Extracted_Text'].iloc[0])

 
 
 INTERNATIONAL MONETARY AND FINANCIAL COMMITTEE 
 
 
 
Fiftieth Meeting 
October 24–25, 2024 
Statement No. 50-25 
 
   
 
  
Statement by Ms. Lagarde 
European Central Bank  
 
 European Central Bank  
Directorate General Communications 
Sonnemannstrasse 20, 60314 Frankfurt am Main, Germany Tel.: +49 69 1344 7455, email: media@ecb.europa.eu, website: www.ecb.europa.eu 
Repr
oduction is permitted provided that the source is acknowledged. Speech  
IMF Annual  Meetings,  25 October 2024  
IMFC Statement  
Statement by Christine Lagarde, President of the ECB, at the fiftieth 
meeting of the International Monetary and Financial Committee 
Introduction 
Since our last meeting in April, the global growth outlook has  remained broadly unchanged. While 
global growth is projected to expand at a moderate pace, risks to the outlook have shifted to the 
downside, reflecting rising economic policy uncertainty against a back drop of heightened geopolitical 
tensions. Global headline inflation c