In [1]:
import requests 
import os  
from typing import Literal 
from pydantic import BaseModel, Field
import tyro
from dotenv import load_dotenv

In [2]:
load_dotenv()

False

In [3]:
class Config(BaseModel):
    query: str = Field(..., description="Search query term, e.g., 'natural language processing'")
    venue: Literal["ACL Anthology"] = Field("ACL Anthology", description="Publication venue to filter papers from")
    num_papers: int = Field(10, gt=0, description="Number of papers to retrieve")
    output_dir: str = Field("acl_papers", description="Directory to save downloaded PDFs")
    api_key: str | None = Field(os.getenv("SEMANTIC_SCHOLAR_API_KEY"), description="API key for Semantic Scholar")

In [5]:
def fetch_acl_bulk(config) -> list[str]:
    # Base URL for Semantic Scholar bulk search API
    url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"
    
    # Query parameters
    params = {
        "query": config.query,
        "fields": "paperId,title,venue",
        "limit": 1000,
        "venue": "ACL Anthology"  # Add other venue names if needed
    }
    
    headers = {"x-api-key": config.api_key} if config.api_key else {}

    all_acl_ids = []
    token = None  # For pagination

    while True:
        if token:
            params["token"] = token  # Use pagination token if present

        try:
            # Make the request
            response = requests.get(url, params=params, headers=headers)
            response.raise_for_status()

            # Parse the response
            data = response.json()
            for paper in data.get("data", []):
                if "ACL" in paper.get("venue", ""):
                    all_acl_ids.append(paper["paperId"])

            # Print out progress and handle pagination
            print(f"Fetched {len(data.get('data', []))} papers, Total ACL papers: {len(all_acl_ids)}")
            
            # Check for a continuation token to fetch more results
            token = data.get("token")
            if not token:
                break  # Stop if there is no token, meaning we're done with all pages

        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            break

    return all_acl_ids

In [6]:
# Test script for fetch_acl_ids

# Sample configuration
config = Config(
    query="natural language processing",
    venue="ACL Anthology",
    num_papers=5,  # Set a small number for testing
    output_dir="acl_papers"  # This will be used later when saving PDFs
)

# Fetch ACL paper metadata
acl_ids = fetch_acl_bulk(config)
# Print the results to verify
print("Retrieved Papers:")
for paper in acl_ids:
    print(f"Title: {paper['title']}")
    print(f"Abstract: {paper['abstract'][:150]}...")  # Print only the first 150 chars of the abstract for brevity
    print(f"ACL ID: {paper['acl_id']}")
    print()


Fetched 0 papers, Total ACL papers: 0
Retrieved Papers:


In [9]:
import requests
import os

# Set up the API key and endpoint
api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"

# Parameters for testing with a strict limit
params = {
    "query": "natural language processing",
    "fields": "paperId,title,venue,externalIds",
    "limit": 5  # Strictly fetch only 5 results
}

# Headers for API authentication
headers = {"x-api-key": api_key} if api_key else {}

# Make the API request
try:
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()  # Raises an error for bad status codes
    data = response.json()       # Parse JSON response

    # Print the raw response to inspect structure and content
    print("Raw Response Data:", data)

    # Check each paper for the presence of ACL ID in externalIds
    for paper in data.get("data", []):
        external_ids = paper.get("externalIds", {})
        if "ACL" in external_ids:
            print(f"Paper ID: {paper.get('paperId')}")
            print(f"Title: {paper.get('title')}")
            print(f"Venue: {paper.get('venue')}")
            print(f"ACL ID: {external_ids['ACL']}\n")

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


Paper ID: 00088b7e7739bd82dccd0eb30405bdba4f22e139
Title: Proper Name Extraction from Web Pages for Finding People in Internet
Venue: ROCLING/IJCLCLP
ACL ID: O97-1010

Paper ID: 0012a2a14d69cf6cea974503f90affcb32b96a7a
Title: TamilATIS: Dataset for Task-Oriented Dialog in Tamil
Venue: DRAVIDIANLANGTECH
ACL ID: 2022.dravidianlangtech-1.4

Paper ID: 0019c072a19c6296e3ed90056fb8caa79b10ab02
Title: Embedding Words and Senses Together via Joint Knowledge-Enhanced Training
Venue: Conference on Computational Natural Language Learning
ACL ID: K17-1012

Paper ID: 0021b3beb2ee0906b425eef7c0f453623c1c6a03
Title: Certified Robustness to Word Substitution Attack with Differential Privacy
Venue: North American Chapter of the Association for Computational Linguistics
ACL ID: 2021.naacl-main.87

Paper ID: 0025b0c655e678688ebd204b24f4d1be8490bdbc
Title: EMNLP versus ACL: Analyzing NLP research over time
Venue: Conference on Empirical Methods in Natural Language Processing
ACL ID: D15-1235

Paper ID: 00

In [11]:
import requests
import os

# Set up the API key and endpoint
api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
url = "https://api.semanticscholar.org/graph/v1/paper/search"

# Query parameters without venue to test
params = {
    "query": "natural language processing",
    "fields": "paperId,title,venue,externalIds",
    "limit": 5  # Set a small limit to inspect a few results
}

# Headers for API authentication
headers = {"x-api-key": api_key} if api_key else {}

# Make the API request
try:
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()  # Raises an error for bad status codes
    data = response.json()       # Parse JSON response

    # Print the raw response to inspect structure and content
    print("Raw Response Data:", data)

    # Check and display all available venue names
    for paper in data.get("data", []):
        print(f"Paper ID: {paper.get('paperId')}")
        print(f"Title: {paper.get('title')}")
        print(f"Venue: {paper.get('venue')}")
        print(f"External IDs: {paper.get('externalIds')}\n")

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


Raw Response Data: {'total': 1279299, 'offset': 0, 'next': 5, 'data': [{'paperId': '2f5102ec3f70d0dea98c957cc2cab4d15d83a2da', 'externalIds': {'ACL': 'P14-5010', 'MAG': '2123442489', 'DBLP': 'conf/acl/ManningSBFBM14', 'DOI': '10.3115/v1/P14-5010', 'CorpusId': 14068874}, 'title': 'The Stanford CoreNLP Natural Language Processing Toolkit', 'venue': 'Annual Meeting of the Association for Computational Linguistics'}, {'paperId': '28692beece311a90f5fa1ca2ec9d0c2ce293d069', 'externalIds': {'DBLP': 'journals/csur/LiuYFJHN23', 'ArXiv': '2107.13586', 'DOI': '10.1145/3560815', 'CorpusId': 236493269}, 'title': 'Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing', 'venue': 'ACM Computing Surveys'}, {'paperId': 'bc1022b031dc6c7019696492e8116598097a8c12', 'externalIds': {'MAG': '2158899491', 'DBLP': 'journals/jmlr/CollobertWBKKK11', 'ArXiv': '1103.0398', 'DOI': '10.5555/1953048.2078186', 'CorpusId': 351666}, 'title': 'Natural Language Processing (