In [25]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def _extract_statement_links(html, page_url, search_pattern):
    """Return (title, absolute_url) pairs for anchors whose href contains *search_pattern*."""
    soup = BeautifulSoup(html, 'html.parser')
    return [
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the page, instead of blindly prefixing the host.
        (anchor.get_text(strip=True), urljoin(page_url, anchor['href']))
        for anchor in soup.find_all('a', href=True)
        if search_pattern in anchor['href']
    ]


def check_imf_statements(
    start_year: int = 2024,
    end_year: int = 2004,
    output_path: str = 'imf_statements_links.csv',
    timeout: float = 30,
) -> None:
    """Collect IMF Annual Meetings statement links and save them to a CSV file.

    For every year from ``start_year`` down to ``end_year`` (inclusive) the
    matching statements index page is downloaded, every anchor whose ``href``
    contains the era-specific path fragment is kept, and the resulting
    ``Year``/``Title``/``Link`` rows are written to ``output_path``.

    Years from 2012 onwards use the ``meetings.imf.org`` URL layout and the
    ``/AMSM/`` fragment; earlier years use the ``www.imf.org`` layout and the
    ``/AM/`` fragment.

    Args:
        start_year: Most recent year to fetch (iteration runs downwards).
        end_year: Oldest year to fetch, inclusive. If it is greater than
            ``start_year`` no page is requested and the CSV has headers only.
        output_path: Destination of the CSV file.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    A year whose page is missing, returns an HTTP error, or cannot be reached
    is reported on stdout and skipped; the remaining years are still processed.
    """
    url_format_2012_onwards = "https://meetings.imf.org/en/{year}/Annual/Statements"
    url_format_2004_to_2011 = "https://www.imf.org/external/am/{year}/imfc/index.aspx"
    rows = []

    for year in range(start_year, end_year - 1, -1):
        if year >= 2012:
            url = url_format_2012_onwards.format(year=year)
            search_pattern = "/AMSM/"
        else:
            url = url_format_2004_to_2011.format(year=year)
            search_pattern = "/AM/"

        try:
            # Without a timeout a stalled server would hang the whole run.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable year should not discard the years already collected.
            print(f"Request failed for the year {year}: {exc}")
            continue

        if response.status_code == 404:
            print(f"Page does not exist for the year {year}")
            continue
        if not response.ok:
            # Any other 4xx/5xx body is an error page, not a statements index.
            print(f"Request failed for the year {year} (HTTP {response.status_code})")
            continue

        rows.extend(
            # Plain str.replace is a literal substitution; pandas' Series.str.replace
            # treated '.ashx' as a regex (with '.' matching any character) before 2.0.
            (year, title, link.replace('.ashx', '.pdf'))
            for title, link in _extract_statement_links(
                response.content, url, search_pattern
            )
        )

    # Create a DataFrame and save it to a CSV file
    df = pd.DataFrame(rows, columns=['Year', 'Title', 'Link'])
    df.to_csv(output_path, index=False)
    print(f"Dataset created and saved to '{output_path}'")

# Run the scraper as soon as this cell is executed; it prints a status line
# and writes its results to a CSV file rather than returning them.
check_imf_statements()


Dataset created and saved to 'imf_statements_links.csv'
