In [118]:
import requests
from bs4 import BeautifulSoup


def search_paper(domain: str, limit: int = 3, timeout: float = 30.0):
    """Scrape the arXiv search page and return the first matching papers.

    The request asks arXiv for all-field matches with abstracts shown, ordered
    by ``-announced_date_first`` and with a page size of 50. Only that single
    page is fetched, so at most 50 papers can be returned.

    Args:
        domain (str): Free-text search query, e.g. ``"AI"``. It is passed as a
            query parameter, so ``requests`` URL-encodes spaces and characters
            such as ``&`` or ``#``.
        limit (int, optional): Maximum number of papers to return. Values
            less than or equal to 0 return an empty list without contacting
            arXiv. Defaults to 3.
        timeout (float, optional): Seconds to wait for arXiv before
            ``requests`` gives up. Defaults to 30.0.

    Returns:
        list[dict]: One dict per paper with the string keys ``title``,
        ``authors``, ``abstract``, ``source_url`` and ``pdf_url``. Text fields
        have runs of whitespace collapsed to single spaces. A field whose
        markup is missing from a result entry is returned as ``""`` instead of
        aborting the whole search.

    Raises:
        requests.HTTPError: If arXiv answers with an error status
            (via ``raise_for_status``).
        requests.RequestException: On connection failures or timeouts.
    """
    # A negative limit would otherwise slice "all but the last N" results.
    if limit <= 0:
        return []

    def _clean(node) -> str:
        # Collapse newlines/indentation from the HTML into single spaces.
        return " ".join(node.text.split()) if node is not None else ""

    def _href(node) -> str:
        # First link inside ``node``; "" when the node or the link is absent.
        link = node.find("a") if node is not None else None
        return link.get("href", "") if link is not None else ""

    response = requests.get(
        "https://arxiv.org/search/",
        params={
            "query": domain,
            "searchtype": "all",
            "abstracts": "show",
            "order": "-announced_date_first",
            "size": 50,
        },
        timeout=timeout,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    results = []
    for paper in soup.find_all("li", class_="arxiv-result")[:limit]:
        list_title = paper.find("p", class_="list-title")

        authors = _clean(paper.find("p", class_="authors"))
        if authors.startswith("Authors:"):
            authors = authors[len("Authors:"):].strip()

        results.append({
            "title": _clean(paper.find("p", class_="title")),
            "authors": authors,
            "abstract": _clean(paper.find("span", class_="abstract-full")),
            # The abstract-page link is the first anchor of the list-title
            # paragraph; the PDF link is the first anchor inside its <span>.
            "source_url": _href(list_title),
            "pdf_url": _href(list_title.find("span") if list_title is not None else None),
        })

    return results

In [119]:
# Reduce every search hit to its two link fields, trimming stray whitespace.
pdf_links = [
    {key: paper[key].strip() for key in ("source_url", "pdf_url")}
    for paper in search_paper("AI")
]
pdf_links

[{'source_url': 'https://arxiv.org/abs/2512.19683',
  'pdf_url': 'https://arxiv.org/pdf/2512.19683'},
 {'source_url': 'https://arxiv.org/abs/2512.19644',
  'pdf_url': 'https://arxiv.org/pdf/2512.19644'},
 {'source_url': 'https://arxiv.org/abs/2512.19632',
  'pdf_url': 'https://arxiv.org/pdf/2512.19632'}]

In [120]:
import io
from pypdf import PdfReader

def review_paper(pdf_urls: str, source_urls: str, timeout: float = 30.0):
    """Download each PDF and collect its document metadata.

    Args:
        pdf_urls (str): PDF URLs as one comma-separated string, e.g.
            ``"url1, url2, url3"``. A list or tuple of URLs is also accepted.
            Blank entries (empty input, trailing comma) are skipped.
        source_urls (str): Abstract-page URLs in the same order and format as
            ``pdf_urls``. They are only used in the fallback messages; when the
            entry for a position is missing or blank, the PDF URL is shown
            instead.
        timeout (float, optional): Seconds to wait for each download.
            Defaults to 30.0.

    Returns:
        list: One item per non-blank PDF URL, in input order. Each item is
        either the ``PdfReader.metadata`` value of the downloaded file or a
        human-readable string explaining that the PDF was unavailable or could
        not be parsed.
        NOTE(review): pypdf documents ``metadata`` as optional, so an item may
        be ``None`` for a PDF without document information; confirm.

        The string ``"Unable to process the urls"`` is returned instead of a
        list when an argument is neither a string nor a list/tuple.
    """

    def _split(value):
        # Normalise either accepted input form into a list of trimmed URLs.
        if isinstance(value, str):
            return [part.strip() for part in value.split(",")]
        if isinstance(value, (list, tuple)):
            return [str(part).strip() for part in value]
        raise TypeError("expected a comma-separated string or a list of URLs")

    try:
        pdf_list = _split(pdf_urls)
        source_list = _split(source_urls)
    except TypeError:
        return "Unable to process the urls"

    pdfs = []
    for i, url in enumerate(pdf_list):
        if not url:
            continue

        # Index into the unfiltered list so PDF/source positions stay aligned,
        # and fall back to the PDF URL when no source URL was supplied.
        reference = source_list[i] if i < len(source_list) and source_list[i] else url

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # One unreachable URL must not discard the results of the others.
            pdfs.append(f"PDF Unavailable | Take a look at {reference}")
            continue

        if response.status_code != 200:
            pdfs.append(f"PDF Unavailable | Take a look at {reference}")
            continue

        try:
            reader = PdfReader(io.BytesIO(response.content))
            pdfs.append(reader.metadata)
        except Exception:
            # A 200 response is not guaranteed to be a parsable PDF.
            pdfs.append(f"Unable to process the PDF | take a look at {reference}")

    return pdfs

In [121]:
# review_paper takes comma-separated strings, so flatten each link column first.
review_paper(
    ",".join([entry["pdf_url"] for entry in pdf_links]),
    ",".join([entry["source_url"] for entry in pdf_links]),
)

['PDF Unavailable | Take a look at https://arxiv.org/abs/2512.19683',
 'PDF Unavailable | Take a look at https://arxiv.org/abs/2512.19644',
 {'/Author': 'Da Tan; Michael Beck; Christopher P. Bidinosti; Robert H. Gulden; Christopher J. Henry',
  '/Creator': 'arXiv GenPDF (tex2pdf:57610bf)',
  '/DOI': 'https://doi.org/10.48550/arXiv.2512.19632',
  '/License': 'http://creativecommons.org/licenses/by/4.0/',
  '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1',
  '/Producer': 'pikepdf 8.15.1',
  '/Title': 'Generative diffusion models for agricultural AI: plant image generation, indoor-to-outdoor translation, and expert preference alignment',
  '/Trapped': '/False',
  '/arXivID': 'https://arxiv.org/abs/2512.19632v1'}]