In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime

# Prefix map so Atom elements can be looked up as "atom:tag" instead of
# repeating the full namespace URI in every find()/findall() call.
_ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}


def _atom_text(parent, tag, default="N/A"):
    """Return the stripped text of Atom child `tag`, or `default` if the child is missing or has no text."""
    node = parent.find(f"atom:{tag}", _ATOM_NS)
    if node is None or node.text is None:
        return default
    return node.text.strip()


def _parse_arxiv_entry(entry):
    """Flatten one Atom <entry> element into the dict returned to callers."""
    author_names = []
    for author in entry.findall("atom:author", _ATOM_NS):
        # An <author> without a usable <name> is skipped rather than
        # contributing an empty item to the joined string.
        name = _atom_text(author, "name", default="")
        if name:
            author_names.append(name)

    return {
        "Title": _atom_text(entry, "title"),
        "Published": _atom_text(entry, "published"),
        "Authors": ", ".join(author_names),
        "Abstract": _atom_text(entry, "summary"),
        "Link": _atom_text(entry, "id"),
    }


def fetch_arxiv_papers_by_year(query="climate change", year=2020, max_results=10, timeout=120):
    """
    Fetches academic papers related to a query from arXiv that were submitted in a given year.

    A single GET request is sent to the arXiv export API, and every Atom
    <entry> in the response is converted to a flat dictionary.

    Parameters:
    - query (str): The search term (e.g., "climate change"). It is inserted
      verbatim after the "all:" field prefix.
    - year (int): The year to filter papers by submission date.
    - max_results (int): Maximum number of papers to retrieve for that year.
      NOTE(review): the arXiv API documentation describes a cap on how many
      results one request may return; very large values may need paging via
      the "start" parameter -- confirm against the API manual.
    - timeout (float or tuple): Passed to requests.get so a stalled
      connection cannot block the notebook forever.

    Returns:
    - List of dictionaries containing Title, Published date, Authors, Abstract, and Link.
      Any text field missing from an entry is reported as "N/A"; Authors is a
      comma-separated string (empty if no author names are present).
      An empty list is returned when the request fails or the HTTP status is
      not 200.

    Raises:
    - xml.etree.ElementTree.ParseError: if the response body is not well-formed XML.
    """
    base_url = "http://export.arxiv.org/api/query"

    # Construct date range for the year: YYYY01010000 to YYYY12312359
    date_from = f"{year}01010000"
    date_to = f"{year}12312359"

    # Construct the search query with the date range.
    search_query = f"all:{query} AND submittedDate:[{date_from} TO {date_to}]"

    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "relevance",
        "sortOrder": "descending"
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures (DNS, refused connection, timeout) the same
        # way as a non-200 status: report and return no papers.
        print(f"Failed to fetch data from arXiv for year {year}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data from arXiv for year {year}")
        return []

    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document instead of relying on requests' guessed text decoding.
    root = ET.fromstring(response.content)

    return [_parse_arxiv_entry(entry) for entry in root.findall("atom:entry", _ATOM_NS)]

def fetch_and_save_by_year(years, query="climate change", max_results=10,
                           filename_template="climate_papers_{year}.csv"):
    """
    For each year provided, fetch papers from arXiv and save the results
    into a CSV file, by default named "climate_papers_{year}.csv".

    Years for which the fetch returns no papers produce no file; a message is
    printed instead. Progress is reported on stdout for every year.

    Parameters:
    - years (iterable of int): Years to process, in order.
    - query (str): The search query for the papers.
    - max_results (int): The maximum number of papers per year.
    - filename_template (str): str.format template for the output path; it
      receives the keyword `year`. Override it when using a different query so
      the files are not labelled (or overwritten) as climate results.

    Returns:
    - None. The CSV files and printed messages are the only outputs.
    """
    for year in years:
        print(f"Fetching papers for {year}...")
        papers_data = fetch_arxiv_papers_by_year(query=query, year=year, max_results=max_results)

        if not papers_data:
            print(f"No data fetched for {year}.")
            continue

        filename = filename_template.format(year=year)
        # index=False keeps the synthetic row index out of the CSV.
        pd.DataFrame(papers_data).to_csv(filename, index=False)
        print(f"Data for {year} saved to {filename}")

In [7]:
# Download the 2019-2021 "climate change" results; each year with data is written to its own CSV.
fetch_and_save_by_year(
    years=[2019, 2020, 2021],
    query="climate change",
    max_results=10000,
)

Fetching papers for 2019...
Data for 2019 saved to climate_papers_2019.csv
Fetching papers for 2020...
Data for 2020 saved to climate_papers_2020.csv
Fetching papers for 2021...
Data for 2021 saved to climate_papers_2021.csv


In [8]:
# Download the 2022-2025 "climate change" results; each year with data is written to its own CSV.
fetch_and_save_by_year(
    years=[2022, 2023, 2024, 2025],
    query="climate change",
    max_results=10000,
)

Fetching papers for 2022...
Data for 2022 saved to climate_papers_2022.csv
Fetching papers for 2023...
Data for 2023 saved to climate_papers_2023.csv
Fetching papers for 2024...
Data for 2024 saved to climate_papers_2024.csv
Fetching papers for 2025...
Data for 2025 saved to climate_papers_2025.csv
