In [10]:
import os
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

In [12]:
def _child_text(entry, tag, default=""):
    """Return the raw text of child ``tag`` of ``entry``, or ``default`` if the child or its text is missing."""
    node = entry.find(tag)
    if node is None or node.text is None:
        return default
    return node.text


def _paper_id_from_url(id_url):
    """Extract the arXiv identifier from an Atom ``<id>`` URL, keeping any category prefix (e.g. ``cs/0001001v1``)."""
    marker = "/abs/"
    if marker in id_url:
        return id_url.split(marker, 1)[1]
    # Unknown URL layout: fall back to the last path segment.
    return id_url.rsplit("/", 1)[-1]


def fetch_and_save_papers(save_dir="papers/ai", batch_size=2000, search_term="all:AI",
                          max_results=30000, request_delay=3.0):
    """Download arXiv metadata page by page and write one text file per paper.

    Each Atom entry returned by the arXiv query API is written to
    ``<save_dir>/<paper id>_metadata.txt`` (any ``/`` in the id is replaced by
    ``_`` in the file name). Paging stops when ``max_results`` is reached, when
    a page comes back without entries, or when a request/parse/write error
    occurs (the error is printed, not raised).

    Args:
        save_dir: Directory that receives the metadata files; created if missing.
        batch_size: Number of results requested per API call. Must be positive.
        search_term: Value of the ``search_query`` API parameter. It is
            URL-encoded here, so it may contain spaces or other reserved characters.
        max_results: Upper bound on the total number of results requested.
        request_delay: Seconds to wait between consecutive API calls; ``0``
            disables the pause.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    base_url = "http://export.arxiv.org/api/query"
    atom_ns = "{http://www.w3.org/2005/Atom}"
    arxiv_ns = "{http://arxiv.org/schemas/atom}"

    print(f"API query to be processed: {base_url}?search_query={search_term}")

    if batch_size <= 0:
        # A non-positive page size can never make progress through the result set.
        print(f"Error fetching or saving papers: batch_size must be positive, got {batch_size}")
        return

    # Create directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    start = 0  # Start index
    while start < max_results:
        # Never request past max_results on the final page.
        page_size = min(batch_size, max_results - start)

        # urlencode escapes spaces and reserved characters in the search term;
        # ':' is kept literal so field prefixes such as "all:" stay readable.
        query = urllib.parse.urlencode(
            {"search_query": search_term, "start": start, "max_results": page_size},
            safe=":",
        )
        url = f"{base_url}?{query}"
        print(f"Fetching papers {start + 1} to {start + page_size}...")

        try:
            # The context manager closes the HTTP response even if read() fails.
            with urllib.request.urlopen(url) as response:
                data = response.read().decode('utf-8')

            # Parse the XML response
            root = ET.fromstring(data)
            entries = root.findall(f"{atom_ns}entry")

            if not entries:
                # NOTE(review): an empty page is taken as the end of the result
                # set; confirm the API never returns a transiently empty feed.
                print("No more results found.")
                break

            saved = 0
            for entry in entries:
                raw_id = _child_text(entry, f"{atom_ns}id").strip()
                if not raw_id:
                    # Without an id there is no file name to write to.
                    print("Skipping an entry without an id.")
                    continue
                paper_id = _paper_id_from_url(raw_id)

                title = _child_text(entry, f"{atom_ns}title").strip()
                summary = _child_text(entry, f"{atom_ns}summary").strip()
                published_date = _child_text(entry, f"{atom_ns}published", "N/A")
                updated_date = _child_text(entry, f"{atom_ns}updated", "N/A")
                name_nodes = (
                    author.find(f"{atom_ns}name")
                    for author in entry.findall(f"{atom_ns}author")
                )
                authors = [
                    node.text for node in name_nodes
                    if node is not None and node.text
                ]
                journal_ref = _child_text(entry, f"{arxiv_ns}journal_ref", "N/A")
                doi = _child_text(entry, f"{arxiv_ns}doi", "N/A")

                # Prepare content
                content = (
                    f"Paper ID: {paper_id}\n"
                    f"Title: {title}\n"
                    f"Authors: {', '.join(authors)}\n"
                    f"Abstract: {summary}\n"
                    f"Published Date: {published_date}\n"
                    f"Updated Date: {updated_date}\n"
                    f"Journal Reference: {journal_ref}\n"
                    f"DOI: {doi}\n"
                )

                # Save metadata. Old-style ids contain '/', which must not
                # become a directory separator in the file name.
                file_name = f"{paper_id.replace('/', '_')}_metadata.txt"
                file_path = os.path.join(save_dir, file_name)
                with open(file_path, "w", encoding="utf-8") as file:
                    file.write(content)
                saved += 1

            print(f"Saved {saved} papers to the folder '{save_dir}'.")
            start += page_size

        except Exception as e:
            # Deliberately broad: any network, parse or filesystem failure ends
            # the run with a message instead of propagating to the caller.
            print(f"Error fetching or saving papers: {e}")
            break

        # Pause between consecutive requests, but not after the final page.
        if start < max_results and request_delay > 0:
            time.sleep(request_delay)

# Run the harvest with the default settings, spelled out for the reader.
fetch_and_save_papers(save_dir="papers/ai", batch_size=2000, search_term="all:AI")

API query to be processed: http://export.arxiv.org/api/query?search_query=all:AI
Fetching papers 1 to 2000...
Saved 2000 papers to the folder 'papers/ai'.
Fetching papers 2001 to 4000...
Saved 2000 papers to the folder 'papers/ai'.
Fetching papers 4001 to 6000...
Saved 2000 papers to the folder 'papers/ai'.
Fetching papers 6001 to 8000...
Saved 2000 papers to the folder 'papers/ai'.
Fetching papers 8001 to 10000...
Saved 2000 papers to the folder 'papers/ai'.
Fetching papers 10001 to 12000...
Saved 2000 papers to the folder 'papers/ai'.
Fetching papers 12001 to 14000...
Saved 2000 papers to the folder 'papers/ai'.
Fetching papers 14001 to 16000...
No more results found.
