In [11]:
import urllib.request  # Correct import for urlopen
import time
import feedparser
import os
import json
import requests
import fitz  # PyMuPDF
from pathlib import Path

In [None]:
def fetch_and_save_ai_papers(search_query='all:AI', start=0, total_results=30000, results_per_iteration=100, wait_time=3, output_dir='papers/metadata'):
    """
    Function to fetch AI-related papers metadata from arXiv, including abstract, and save them in JSON format.

    Args:
    - search_query (str): The query string for searching arXiv. Default is 'all:AI'.
    - start (int): The starting index for results. Default is 0.
    - total_results (int): The total number of results to fetch. Default is 30000.
    - results_per_iteration (int): The number of results per API request. Default is 100.
    - wait_time (int): The time (in seconds) to wait between requests to avoid overloading the API. Default is 3.
    - output_dir (str): Directory to store the JSON files. Default is 'papers/metadata'.
    """
    
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Base arXiv API query URL
    base_url = 'http://export.arxiv.org/api/query?'
    
    saved_count = 0
    print(f'Searching arXiv for AI papers: {search_query}')
    
    # Iterate through results in batches
    for i in range(start, total_results, results_per_iteration):
        print(f"Results {i} - {i + results_per_iteration}")
        
        # Build the query string
        query = f'search_query={search_query}&start={i}&max_results={results_per_iteration}'
        
        # Perform the GET request using urllib.request.urlopen
        response = urllib.request.urlopen(base_url + query).read()
        
        # Parse the response using feedparser
        feed = feedparser.parse(response)
        
        # Iterate over each entry in the response
        for entry in feed.entries:
            # Extract metadata including the abstract (summary) and link
            paper_metadata = {
                'arxiv_id': entry.id.split('/abs/')[-1],
                'title': entry.title,
                'summary': entry.summary,  # This contains the abstract of the paper
                'published': entry.published,
                'updated': entry.updated,
                'authors': [author.name for author in entry.authors],
                'link': entry.link,  # Link to the paper
                'doi': entry.get('doi', 'N/A'),
                'abstract': entry.summary,  # Saving abstract explicitly
            }
            
            # Extract PDF link from the entry's links
            pdf_link = None
            for link in entry.links:
                if 'application/pdf' in link.type:
                    pdf_link = link.href
                    break
            paper_metadata['pdf_link'] = pdf_link if pdf_link else 'N/A'  # If no PDF link, set it as 'N/A'
            
            # Save metadata to a JSON file
            paper_id = paper_metadata['arxiv_id']
            paper_filename = os.path.join(output_dir, f'{paper_id}.json')
            
            try:
                with open(paper_filename, 'w') as f:
                    json.dump(paper_metadata, f, indent=4)
                saved_count += 1  # Increment the saved papers counter
                print(f'Saved metadata and abstract for paper {paper_id}')
            except Exception as e:
                print(f'Failed to save paper {paper_id}: {e}')
                continue  # Skip this paper if an error occurs

        # Display how many papers have been saved so far after each iteration
        print(f'Finished processing batch. {saved_count} papers saved so far.')

        # Wait to avoid hitting the API too quickly
        print(f'Sleeping for {wait_time} seconds')
        time.sleep(wait_time)

    # Final count display after all iterations
    print(f'Finished saving {saved_count} AI papers in total.')

In [None]:
# fetch_and_save_ai_papers(search_query='all:AI', total_results=500, results_per_iteration=50)

Searching arXiv for AI papers: all:AI
Results 0 - 100
Saved metadata and abstract for paper 2409.12922v1
Saved metadata and abstract for paper 2406.11563v3
Saved metadata and abstract for paper 2402.07632v3
Saved metadata and abstract for paper 2211.05075v1
Saved metadata and abstract for paper 2403.15481v2
Saved metadata and abstract for paper 2103.15294v1
Saved metadata and abstract for paper 2403.05551v1
Saved metadata and abstract for paper 2305.15922v1
Saved metadata and abstract for paper 2405.16424v1
Saved metadata and abstract for paper 2303.11508v1
Saved metadata and abstract for paper 2406.16696v1
Saved metadata and abstract for paper 2206.00335v2
Saved metadata and abstract for paper 2405.16355v1
Saved metadata and abstract for paper 1906.10418v1
Saved metadata and abstract for paper 2103.16168v1
Saved metadata and abstract for paper 2405.03999v1
Saved metadata and abstract for paper 2407.01557v1
Saved metadata and abstract for paper 2307.10057v1
Saved metadata and abstract 

In [22]:
# Directories
metadata_dir = 'papers/metadata'
texts_dir = 'papers/texts'
temp_dir = 'papers/temp'  # Temporary directory for storing downloaded PDFs

# Ensure the directories exist
Path(texts_dir).mkdir(parents=True, exist_ok=True)
Path(temp_dir).mkdir(parents=True, exist_ok=True)  # Create the temp directory if it doesn't exist

def extract_pdf_text(pdf_url, paper_id):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # Download PDF
        response = requests.get(pdf_url, headers=headers)
        
        # Check if the response is not a PDF
        if 'application/pdf' not in response.headers.get('Content-Type', ''):
            print(f"Failed to download PDF for {paper_id}: Returned content is not a PDF.")
            return None
        
        # Save the PDF to a temporary file
        pdf_path = os.path.join(temp_dir, f'temp_{paper_id}.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(response.content)
        
        # Open the PDF with PyMuPDF (fitz)
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)  # Load each page
            text += page.get_text()  # Extract text from each page

        # Clean up the downloaded PDF file
        os.remove(pdf_path)
        
        return text
    except Exception as e:
        print(f"Error extracting text from PDF for {paper_id}: {e}")
        return None

# Process each JSON file and extract the text
def process_metadata_and_extract_text():
    processed_count = 0  # Counter for successfully processed papers
    failed_count = 0     # Counter for papers that failed to process
    
    for json_filename in os.listdir(metadata_dir):
        if json_filename.endswith('.json'):
            paper_id = json_filename.split('.')[0]  # Extract arXiv ID from filename
            json_path = os.path.join(metadata_dir, json_filename)

            # Load metadata from JSON file
            with open(json_path, 'r') as f:
                metadata = json.load(f)

            pdf_link = metadata.get('pdf_link', 'N/A')
            if pdf_link != 'N/A':
                print(f"Extracting text for {paper_id}...")

                # Extract text from PDF
                text = extract_pdf_text(pdf_link, paper_id)
                
                if text:
                    # Save extracted text into a .txt file
                    text_filename = os.path.join(texts_dir, f"{paper_id}.txt")
                    with open(text_filename, 'w', encoding='utf-8') as text_file:
                        text_file.write(text)
                    print(f"Saved extracted text for {paper_id}")
                    processed_count += 1  # Increment the counter for successfully processed papers
                else:
                    print(f"Failed to extract text for {paper_id}")
                    failed_count += 1  # Increment the counter for failed papers
            else:
                print(f"No PDF link found for {paper_id}")
                failed_count += 1  # Increment the counter for failed papers

    # Print the final counts
    print(f"Total papers processed: {processed_count}")
    print(f"Total papers failed to process: {failed_count}")

In [None]:
process_metadata_and_extract_text()

Extracting text for 0704...
Failed to download PDF for 0704: Returned content is not a PDF.
Failed to extract text for 0704
Extracting text for 0704...
Failed to download PDF for 0704: Returned content is not a PDF.
Failed to extract text for 0704
Extracting text for 0704...
Failed to download PDF for 0704: Returned content is not a PDF.
Failed to extract text for 0704
Extracting text for 0705...
Failed to download PDF for 0705: Returned content is not a PDF.
Failed to extract text for 0705
Extracting text for 0707...
Failed to download PDF for 0707: Returned content is not a PDF.
Failed to extract text for 0707
Extracting text for 0708...
Failed to download PDF for 0708: Returned content is not a PDF.
Failed to extract text for 0708
Extracting text for 0708...
Failed to download PDF for 0708: Returned content is not a PDF.
Failed to extract text for 0708
Extracting text for 0709...
Failed to download PDF for 0709: Returned content is not a PDF.
Failed to extract text for 0709
Extracti

KeyboardInterrupt: 