In [34]:
import requests 
import os  
from typing import Literal, Any
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import stamina
import structlog

In [35]:
load_dotenv()
logger = structlog.get_logger()

In [36]:
class Config(BaseModel):
    query: str = Field(..., description="Search query term, e.g., 'natural language processing'")
    venue: Literal["ACL Anthology"] = Field("ACL Anthology", description="Publication venue to filter papers from")
    num_papers: int = Field(10, gt=0, description="Number of papers to retrieve")
    output_dir: str = Field("acl_papers", description="Directory to save downloaded PDFs")
    api_key: str | None = Field(os.getenv("SEMANTIC_SCHOLAR_API_KEY"), description="API key for Semantic Scholar")

In [37]:
def fetch_acl_papers(query: str, max_results: int = 100) -> dict[str, dict[str, Any]]:
    """
    Fetches ACL papers based on a query, using pagination to retrieve more results if available.
    
    Parameters:
    - query (str): The search term to query papers.
    - max_results (int): The maximum number of papers to fetch in total.
    
    Returns:
    - dict[str, dict[str, Any]]: A dictionary with ACL IDs as keys and dictionaries containing
      the title and paperId as values.
    """
    api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    
    params = {
        "query": query,
        "fields": "paperId,title,externalIds,openAccessPdf",
        "limit": 100
    }
    headers = {"x-api-key": api_key} if api_key else {}

    acl_papers = {}
    total_fetched = 0
    token = None

    while total_fetched < max_results:
        if token:
            params["token"] = token

        try:
            response = requests.get(url, params=params, headers=headers)
            response.raise_for_status()
            data = response.json()

            # Process and store ACL papers
            for paper in data.get("data", []):
                external_ids = paper.get("externalIds", {})
                if "ACL" in external_ids:
                    acl_id = external_ids["ACL"]
                    acl_papers[acl_id] = {
                        "title": paper.get("title"),
                        "paperId": paper.get("paperId"),
                        "openAccessPdf": paper.get("openAccessPdf", {}).get("url") if paper.get("openAccessPdf") else None
                    }
                    total_fetched += 1
                    if total_fetched >= max_results:
                        break

            # Get the next token for pagination, or break if no more pages
            token = data.get("token")
            if not token:
                break

        except requests.exceptions.RequestException as e:
            logger.error(f"An error occurred: {e}")
            break
    
    logger.info(f"Retrieved {len(acl_papers)} ACL papers.")
    return acl_papers

In [38]:
acl_papers = fetch_acl_papers(query="natural language processing", max_results=20)
for acl_id, details in acl_papers.items():
    print(f"ACL ID: {acl_id}, Title: {details['title']}, Paper ID: {details['paperId']}")


[2m2024-10-31 21:29:54[0m [[32m[1minfo     [0m] [1mRetrieved 20 ACL papers.      [0m
ACL ID: P14-5010, Title: The Stanford CoreNLP Natural Language Processing Toolkit, Paper ID: 2f5102ec3f70d0dea98c957cc2cab4d15d83a2da
ACL ID: J00-2011, Title: Book Reviews: Foundations of Statistical Natural Language Processing, Paper ID: 084c55d6432265785e3ff86a2e900a49d501c00a
ACL ID: 2020.acl-demos.14, Title: Stanza: A Python Natural Language Processing Toolkit for Many Human Languages, Paper ID: 641a9749fe546a02bbab9a86bfc91492db1c3bc5
ACL ID: 2023.nlposs-1.4, Title: PyThaiNLP: Thai Natural Language Processing in Python, Paper ID: 17fd7b820b0879734a2c08c20a890ddc526cd83d
ACL ID: D15-1075, Title: A large annotated corpus for learning natural language inference, Paper ID: f04df4e20a18358ea2f689b4c129781628ef7fc1
ACL ID: J96-1002, Title: A Maximum Entropy Approach to Natural Language Processing, Paper ID: fb486e03369a64de2d5b0df86ec0a7b55d3907db
ACL ID: J00-4006, Title: Book Reviews: Speech and

In [None]:
@stamina.retry(on=requests.exceptions.RequestException, attempts=3)
def download_pdf(pdf_url: str, pdf_filename: str) -> bool:
    """
    Attempts to download a PDF file from the given URL and save it to the specified filename.
    Retries up to 3 times on network-related exceptions.

    Parameters:
    - pdf_url (str): The URL of the PDF to download.
    - pdf_filename (str): The path where the PDF will be saved.

    Returns:
    - bool: True if download is successful, False otherwise.
    """
    response = requests.get(pdf_url, stream=True)
    response.raise_for_status()
    
    with open(pdf_filename, "wb") as pdf_file:
        for chunk in response.iter_content(chunk_size=8192):
            pdf_file.write(chunk)
    
    return True

def get_pdfs_from_acl_id(acl_papers: dict[str, dict[str, Any]], output_dir: str = "acl_papers") -> None:
    """
    Downloads PDFs for the given ACL papers if a PDF URL is available, with retry capability,
    and provides a summary of results.

    Parameters:
    - acl_papers (Dict[str, Dict[str, Any]]): Dictionary of ACL papers with ACL IDs as keys and metadata as values.
    - output_dir (str): Directory to save downloaded PDFs.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Track successful and failed downloads
    fetched_count = 0
    unable_to_fetch = []

    for acl_id, details in acl_papers.items():
        pdf_url = details.get("openAccessPdf")
        
        if pdf_url:
            pdf_filename = os.path.join(output_dir, f"{acl_id}.pdf")
            try:
                success = download_pdf(pdf_url, pdf_filename)
                if success:
                    logger.info(f"Downloaded PDF for {acl_id}: {details['title']}")
                    fetched_count += 1
            except requests.exceptions.RequestException as e:
                logger.error(f"Failed to download PDF for {acl_id}: {details['title']} after 3 attempts. Error: {e}")
                unable_to_fetch.append(f"{acl_id}: {details['title']}")
        else:
            logger.error(f"No PDF available for {acl_id}: {details['title']}")
            unable_to_fetch.append(f"{acl_id}: {details['title']}")

    # Summary of download results
    logger.info(f"\nSummary:\nFetched {fetched_count} PDFs.\n")
    logger.warning(f"Unable to fetch {len(unable_to_fetch)} PDFs:\n" + "\n".join(unable_to_fetch))

In [42]:
acl_papers = fetch_acl_papers(query="natural language processing", max_results=20)
get_pdfs_from_acl_id(acl_papers, output_dir="./acl_papers")

[2m2024-10-31 21:30:38[0m [[32m[1minfo     [0m] [1mRetrieved 20 ACL papers.      [0m
[2m2024-10-31 21:30:38[0m [[32m[1minfo     [0m] [1mDownloaded PDF for P14-5010: The Stanford CoreNLP Natural Language Processing Toolkit[0m
[2m2024-10-31 21:30:38[0m [[31m[1merror    [0m] [1mNo PDF available for J00-2011: Book Reviews: Foundations of Statistical Natural Language Processing[0m
[2m2024-10-31 21:30:40[0m [[31m[1merror    [0m] [1mFailed to download PDF for 2020.acl-demos.14: Stanza: A Python Natural Language Processing Toolkit for Many Human Languages after 3 attempts. Error: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/2020.acl-demos.14.pdf[0m
[2m2024-10-31 21:30:40[0m [[32m[1minfo     [0m] [1mDownloaded PDF for 2023.nlposs-1.4: PyThaiNLP: Thai Natural Language Processing in Python[0m
[2m2024-10-31 21:30:43[0m [[31m[1merror    [0m] [1mFailed to download PDF for D15-1075: A large annotated corpus for learning natural la

In [43]:
import requests
import os
from typing import Literal, Any
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import stamina
import structlog

load_dotenv()
logger = structlog.get_logger()

class Config(BaseModel):
    query: str = Field(..., description="Search query term, e.g., 'natural language processing'")
    venue: Literal["ACL Anthology"] = Field("ACL Anthology", description="Publication venue to filter papers from")
    num_papers: int = Field(10, gt=0, description="Number of papers to retrieve")
    output_dir: str = Field("acl_papers", description="Directory to save downloaded PDFs")
    api_key: str | None = Field(None, description="Optional API key for Semantic Scholar")
    retry_attempts: int = Field(3, description="Number of retry attempts for downloading PDFs")

def fetch_acl_papers(query: str, max_results: int = 100) -> dict[str, dict[str, Any]]:
    """
    Fetches ACL papers based on a query, using pagination to retrieve more results if available.
    
    Parameters:
    - query (str): The search term to query papers.
    - max_results (int): The maximum number of papers to fetch in total.
    
    Returns:
    - dict[str, dict[str, Any]]: A dictionary with ACL IDs as keys and dictionaries containing
      the title and paperId as values.
    """
    api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    
    params = {
        "query": query,
        "fields": "paperId,title,externalIds,openAccessPdf",
        "limit": 100
    }
    headers = {"x-api-key": api_key} if api_key else {}

    acl_papers = {}
    total_fetched = 0
    token = None

    while total_fetched < max_results:
        if token:
            params["token"] = token

        try:
            response = requests.get(url, params=params, headers=headers)
            response.raise_for_status()
            data = response.json()

            # Process and store ACL papers
            for paper in data.get("data", []):
                external_ids = paper.get("externalIds", {})
                if "ACL" in external_ids:
                    acl_id = external_ids["ACL"]
                    acl_papers[acl_id] = {
                        "title": paper.get("title"),
                        "paperId": paper.get("paperId"),
                        "openAccessPdf": paper.get("openAccessPdf", {}).get("url") if paper.get("openAccessPdf") else None
                    }
                    total_fetched += 1
                    if total_fetched >= max_results:
                        break

            # Get the next token for pagination, or break if no more pages
            token = data.get("token")
            if not token:
                break

        except requests.exceptions.RequestException as e:
            logger.error(f"An error occurred: {e}")
            break
    
    logger.info(f"Retrieved {len(acl_papers)} ACL papers.")
    return acl_papers


@stamina.retry(on=requests.exceptions.RequestException, attempts=3)
def download_pdf(pdf_url: str, pdf_filename: str) -> bool:
    """
    Attempts to download a PDF file from the given URL and save it to the specified filename.
    Retries up to 3 times on network-related exceptions.

    Parameters:
    - pdf_url (str): The URL of the PDF to download.
    - pdf_filename (str): The path where the PDF will be saved.

    Returns:
    - bool: True if download is successful, False otherwise.
    """
    response = requests.get(pdf_url, stream=True)
    response.raise_for_status()
    
    with open(pdf_filename, "wb") as pdf_file:
        for chunk in response.iter_content(chunk_size=8192):
            pdf_file.write(chunk)
    
    return True

def get_pdfs_from_acl_id(acl_papers: dict[str, dict[str, Any]], output_dir: str = "acl_papers") -> None:
    """
    Downloads PDFs for the given ACL papers if a PDF URL is available, with retry capability,
    and provides a summary of results.

    Parameters:
    - acl_papers (Dict[str, Dict[str, Any]]): Dictionary of ACL papers with ACL IDs as keys and metadata as values.
    - output_dir (str): Directory to save downloaded PDFs.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Track successful and failed downloads
    fetched_count = 0
    unable_to_fetch = []

    for acl_id, details in acl_papers.items():
        pdf_url = details.get("openAccessPdf")
        
        if pdf_url:
            pdf_filename = os.path.join(output_dir, f"{acl_id}.pdf")
            try:
                success = download_pdf(pdf_url, pdf_filename)
                if success:
                    logger.info(f"Downloaded PDF for {acl_id}: {details['title']}")
                    fetched_count += 1
            except requests.exceptions.RequestException as e:
                logger.error(f"Failed to download PDF for {acl_id}: {details['title']} after 3 attempts. Error: {e}")
                unable_to_fetch.append(f"{acl_id}: {details['title']}")
        else:
            logger.error(f"No PDF available for {acl_id}: {details['title']}")
            unable_to_fetch.append(f"{acl_id}: {details['title']}")

    # Summary of download results
    logger.info(f"\nSummary:\nFetched {fetched_count} PDFs.\n")
    logger.warning(f"Unable to fetch {len(unable_to_fetch)} PDFs:\n" + "\n".join(unable_to_fetch))

def main(config: Config) -> None:
    """
    Main function to fetch and download ACL papers.
    """
    # Set API key from config if provided
    if config.api_key:
        os.environ["SEMANTIC_SCHOLAR_API_KEY"] = config.api_key

    # Fetch ACL papers
    acl_papers = fetch_acl_papers(query=config.query, max_results=config.num_papers)

    # Download PDFs for fetched papers
    get_pdfs_from_acl_id(acl_papers, output_dir=config.output_dir)

# Run the script using tyro to parse CLI arguments into Config class directly

In [46]:
config = Config(query="vision language retrieval" , num_papers=10)

In [47]:
main(config)

[2m2024-10-31 21:45:51[0m [[32m[1minfo     [0m] [1mRetrieved 1 ACL papers.       [0m
[2m2024-10-31 21:45:51[0m [[32m[1minfo     [0m] [1mDownloaded PDF for 2022.emnlp-main.488: mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections[0m
[2m2024-10-31 21:45:51[0m [[32m[1minfo     [0m] [1m
Summary:
Fetched 1 PDFs.
    [0m
      [0m
