In [1]:
import os
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import requests
import time


In [2]:
def fetch_metadata(arxiv_id, max_retries=10, initial_delay=2, timeout=30):
    """
    Fetch the Semantic Scholar record for one arXiv paper.

    Issues a GET request against the Semantic Scholar v1 paper endpoint.
    An HTTP 429 (rate limit) response is retried with exponential backoff;
    any other non-200 status is printed and treated as a permanent failure.

    Args:
        arxiv_id: arXiv identifier interpolated into the request URL.
        max_retries: Maximum number of requests to send before giving up.
        initial_delay: Seconds to wait after the first 429; the wait doubles
            after every further 429.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded JSON body when the server answers 200, otherwise None
        (non-retryable status, or still rate limited after ``max_retries``
        requests).

    Raises:
        Exceptions raised by ``requests.get`` or ``response.json()`` (for
        example connection errors and timeouts) are not caught here and
        propagate to the caller.
    """
    url = f"https://api.semanticscholar.org/v1/paper/arXiv:{arxiv_id}"

    for attempt in range(max_retries):
        # Without an explicit timeout a stalled connection would block the
        # calling worker thread indefinitely.
        response = requests.get(url, timeout=timeout)

        if response.status_code == 200:
            return response.json()

        if response.status_code != 429:
            print(f"Failed :{arxiv_id}. Status code: {response.status_code}, Response: {response.text}")
            return None

        # 429 Too Many Requests: back off before the next attempt. There is
        # no next attempt after the last one, so sleeping there would only
        # delay reporting the failure.
        if attempt < max_retries - 1:
            delay = initial_delay * (2 ** attempt)  # Exponential backoff
            print(f"Rate limit hit, waiting {delay} seconds...")
            time.sleep(delay)

    print(f"Max retries exceeded for {arxiv_id}")
    return None

In [3]:
def process_files(metadata_dir, max_workers, max_retries=10, initial_delay=2,
                  report_path='fetch_report.json'):
    """
    Refresh every metadata JSON file in a directory and write a summary report.

    Each ``*.json`` file name in ``metadata_dir`` is turned into an arXiv ID
    (extension removed, ``_`` replaced by ``/``), the ID is looked up with
    ``fetch_metadata`` and, when a record comes back, the file is overwritten
    with that record. Lookups run on a thread pool while a tqdm bar tracks
    progress.

    Args:
        metadata_dir: Directory whose ``.json`` files are processed in place.
        max_workers: Number of threads handed to ``ThreadPoolExecutor``.
        max_retries: Forwarded to ``fetch_metadata``.
        initial_delay: Forwarded to ``fetch_metadata``.
        report_path: Where the JSON summary (totals plus failed IDs) is
            written.

    Returns:
        None. Results are reported through the rewritten files, the report
        file and printed totals.
    """
    suffix = '.json'

    # Only regular ``.json`` files are work items. os.listdir also returns
    # sub-directories (e.g. Jupyter's ``.ipynb_checkpoints``) and unrelated
    # files, which would otherwise be fetched and counted as failures.
    files = sorted(
        name for name in os.listdir(metadata_dir)
        if name.endswith(suffix)
        and os.path.isfile(os.path.join(metadata_dir, name))
    )

    # Track successful and failed papers (both lists hold arXiv IDs).
    successful = []
    failed = []

    def process_and_update(file, pbar):
        # Strip only the trailing extension; presumably file names use "_"
        # where the arXiv ID contains "/" -- verify against the code that
        # created these files.
        arxiv_id = file[:-len(suffix)].replace('_', '/')
        try:
            response = fetch_metadata(
                arxiv_id,
                max_retries=max_retries,
                initial_delay=initial_delay
            )

            if response:
                # Serialize before opening the target so a serialization
                # error cannot leave the existing file truncated.
                payload = json.dumps(response, indent=4)
                with open(os.path.join(metadata_dir, file), 'w') as f:
                    f.write(payload)
                successful.append(arxiv_id)
            else:
                failed.append(arxiv_id)

        except Exception as e:
            # Best effort: one bad paper must not abort the whole batch.
            print(f"Error processing {file}: {str(e)}")
            # Record the arXiv ID (not the file name) so failed_ids is uniform.
            failed.append(arxiv_id)
        finally:
            pbar.update(1)

    # The context managers close the bar and join the pool even if an
    # unexpected error escapes.
    with tqdm(total=len(files), desc="Fetching metadata", unit="papers") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_and_update, file, pbar) for file in files]
            for future in futures:
                future.result()

    # Save progress report
    report = {
        'total_processed': len(files),
        'successful': len(successful),
        'failed': len(failed),
        'failed_ids': failed
    }

    with open(report_path, 'w') as f:
        json.dump(report, f, indent=4)

    print("\nProcessing complete!")
    print(f"Successfully processed: {len(successful)}")
    print(f"Failed: {len(failed)}")
    if failed:
        print(f"Failed IDs saved to {report_path}")

In [4]:
# Refresh the metadata files under 'papers/metadata_lg/1' using two worker threads.
process_files(metadata_dir='papers/metadata_lg/1', max_workers=2)

Fetching metadata:   1%|          | 79/10000 [00:42<1:27:43,  1.88papers/s]