In [4]:
# 📌 Task 2: Extract Text from arXiv PDFs (No Poppler, No OCR if possible)
# ----------------------------------------------------------
# Uses PyMuPDF (fitz) to extract text directly from arXiv PDFs
# Saves both PDFs and TXT outputs

import os
import requests
import xml.etree.ElementTree as ET
import fitz  # PyMuPDF

# ------------ Config ------------
# Query settings for the arXiv API.
CATEGORY = "cs.CL"       # arXiv subcategory
MAX_RESULTS = 20         # test with 20; set to 200 for full run

# Output locations, relative to the current working directory.
PDF_DIR = "pdf_downloads"
TXT_DIR = "pdf_text"

# Create both output folders up front; exist_ok makes re-running a no-op.
for directory in (PDF_DIR, TXT_DIR):
    os.makedirs(directory, exist_ok=True)

# ------------ Fetch papers from arXiv API ------------
def fetch_arxiv_papers(category, max_results):
    """Query the arXiv Atom API for the newest submissions in a category.

    Args:
        category: arXiv category identifier placed in the ``cat:`` search
            query (for example ``"cs.CL"``).
        max_results: Maximum number of entries requested from the API.

    Returns:
        A list of dicts, one per feed entry, with the keys ``"url"`` (the
        entry's Atom id), ``"title"`` (whitespace-normalized), ``"authors"``
        (list of author names) and ``"date"`` (first 10 characters of the
        published timestamp). Entries without an id are skipped.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": f"cat:{category}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    # A timeout keeps a stalled connection from hanging the run forever.
    response = requests.get(base_url, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    # Parse the raw bytes so the XML declaration decides the encoding,
    # rather than whatever charset requests guessed for ``.text``.
    root = ET.fromstring(response.content)
    ns = {"atom": "http://www.w3.org/2005/Atom"}

    def _field(node, tag):
        # Missing elements and empty elements both come back as "".
        return (node.findtext(tag, default="", namespaces=ns) or "").strip()

    papers = []
    for entry in root.findall("atom:entry", ns):
        url = _field(entry, "atom:id")
        if not url:
            # Without an id there is nothing to derive a PDF link from.
            continue
        # Titles are wrapped across lines in the feed; collapse every run of
        # whitespace so the title is a clean single line.
        title = " ".join(_field(entry, "atom:title").split())
        authors = [
            name
            for name in (
                _field(author, "atom:name")
                for author in entry.findall("atom:author", ns)
            )
            if name
        ]
        date = _field(entry, "atom:published")[:10]
        papers.append({"url": url, "title": title, "authors": authors, "date": date})
    return papers

# ------------ Download PDF ------------
def download_pdf(paper):
    """Download the PDF for one paper into ``PDF_DIR`` unless already present.

    The PDF address is derived from ``paper["url"]`` by replacing ``/abs/``
    with ``/pdf/`` and appending ``.pdf``; the local file name is the last
    path component of that address.

    Args:
        paper: Dict with at least a ``"url"`` key holding the abstract URL.

    Returns:
        Path of the local PDF on success (or if it already existed),
        otherwise ``None``. A failed or non-success HTTP response never
        yields a path, so callers can rely on the file existing.
    """
    pdf_url = paper["url"].replace("/abs/", "/pdf/") + ".pdf"
    pdf_filename = os.path.basename(pdf_url)
    pdf_path = os.path.join(PDF_DIR, pdf_filename)

    if os.path.exists(pdf_path):
        return pdf_path

    # Stream into a temporary sibling file and rename at the end, so an
    # interrupted transfer never leaves a truncated file under the final name
    # (which later runs would mistake for a completed download).
    partial_path = pdf_path + ".part"
    try:
        with requests.get(pdf_url, stream=True, timeout=60) as r:
            # Turn 4xx/5xx responses into an exception instead of silently
            # returning a path to a file that was never written.
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
        os.replace(partial_path, pdf_path)
    except (requests.RequestException, OSError) as e:
        print(f"⚠️ Failed to download {pdf_url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written yet, or cleanup is impossible; either way
            # the download is already reported as failed.
            pass
        return None
    return pdf_path

# ------------ Extract Text from PDF ------------
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Each page's text is preceded by a ``--- Page N ---`` marker (1-based),
    and leading/trailing whitespace of the combined result is stripped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The combined text, or ``""`` if the file could not be opened or read
        (the failure is printed, not raised).
    """
    try:
        # The context manager closes the document even if a page fails
        # midway through extraction.
        with fitz.open(pdf_path) as doc:
            sections = [
                f"\n\n--- Page {i} ---\n\n{page.get_text()}"
                for i, page in enumerate(doc, start=1)
            ]
    except Exception as e:
        # Best-effort boundary: one unreadable PDF must not abort the batch.
        print(f"⚠️ Text extraction failed for {pdf_path}: {e}")
        return ""
    # Join once instead of growing a string page by page.
    return "".join(sections).strip()

# ------------ Main Workflow ------------
# Fetch metadata for the newest papers, then download, extract and save each.
papers = fetch_arxiv_papers(CATEGORY, MAX_RESULTS)
print(f"Fetched {len(papers)} papers from arXiv category {CATEGORY}")

for paper in papers:
    pdf_path = download_pdf(paper)
    if not pdf_path:
        # Download failed; move on to the next paper.
        continue

    text = extract_text_from_pdf(pdf_path)
    if not text:
        # Nothing extractable, so no text file is written for this paper.
        continue

    # The text file shares the PDF's base name, with the extension swapped.
    txt_basename = os.path.basename(pdf_path).replace(".pdf", ".txt")
    txt_filename = os.path.join(TXT_DIR, txt_basename)

    with open(txt_filename, "w", encoding="utf-8") as f:
        # Metadata header first, then the extracted body.
        f.writelines(
            [
                f"Title: {paper['title']}\n",
                f"Authors: {', '.join(paper['authors'])}\n",
                f"Date: {paper['date']}\n",
                f"URL: {paper['url']}\n\n",
                text,
            ]
        )

print(f"✅ Completed: PDFs in '{PDF_DIR}', text files in '{TXT_DIR}'")


Fetched 20 papers from arXiv category cs.CL
✅ Completed: PDFs in 'pdf_downloads', text files in 'pdf_text'
