In [1]:
# Jupyter Notebook: Batch OCR + TXT Export for arXiv Abstracts
# -----------------------------------------------------------

# Cell 1: Imports and Configuration
import os
import json
from PIL import Image
import pytesseract

# -----------------------------
# Configuration
# -----------------------------
# Input/output locations and the OCR switch used by the cells below.
TASK1_JSON = "arxiv_clean.json"  # JSON file produced by Task 1
SCREENSHOT_DIR = "screenshots"   # optional folder holding screenshots
OUTPUT_DIR = "papers_txt"        # destination folder, one .txt per paper
OCR_ENABLED = True               # switch to False when there are no screenshots

# Create both working folders up front; folders that already exist are kept.
for folder in (SCREENSHOT_DIR, OUTPUT_DIR):
    os.makedirs(folder, exist_ok=True)

print("Configuration set. Ready to process papers.")

# -----------------------------------------------------------
# Cell 2: Load Task 1 JSON
# Parse the whole Task 1 export in one go; the result is kept in papers_data.
with open(TASK1_JSON, mode="r", encoding="utf-8") as task1_file:
    papers_data = json.load(task1_file)

paper_count = len(papers_data)
print(f"Loaded {paper_count} papers from {TASK1_JSON}")

# -----------------------------------------------------------
# Cell 3: Define function to combine abstract + OCR
def _ocr_screenshot(screenshot_file):
    """Return the OCR text of one screenshot with blank lines removed."""
    # The context manager closes the image's file handle once OCR is done;
    # a bare Image.open() would keep one handle open per processed paper.
    with Image.open(screenshot_file) as img:
        raw_text = pytesseract.image_to_string(
            img, lang="eng", config="--oem 1 --psm 3"
        )
    # Keep the line structure, strip trailing whitespace, drop blank lines.
    return "\n".join(
        line.rstrip() for line in raw_text.splitlines() if line.strip()
    )


def process_paper(paper, screenshot_dir, output_dir, ocr_enabled=True):
    """
    Write one paper's metadata and abstract (plus optional OCR text) to a
    .txt file.

    Args:
        paper: Mapping read through the keys "url", "title", "authors",
            "date" and "abstract"; missing keys are treated as empty.
        screenshot_dir: Folder searched for "<arxiv_id>.png".
        output_dir: Folder in which "<arxiv_id>.txt" is written.
        ocr_enabled: When False, the screenshot lookup and OCR are skipped.

    Returns:
        Path of the written .txt file.
    """
    url = paper.get("url", "")
    # The last path segment of the URL doubles as the file stem.
    arxiv_id = url.split("/")[-1]

    # `or ""` also covers an explicit null abstract in the JSON, which
    # .get()'s default alone would pass through as None.
    abstract_text = (paper.get("abstract") or "").strip()

    # A list of author names is written comma-separated instead of as a
    # Python list repr; a plain string is written unchanged.
    authors = paper.get("authors", "")
    if isinstance(authors, (list, tuple)):
        authors = ", ".join(str(author) for author in authors)

    # -----------------------------
    # OCR from screenshot if enabled
    # -----------------------------
    if ocr_enabled:
        screenshot_file = os.path.join(screenshot_dir, f"{arxiv_id}.png")
        if os.path.exists(screenshot_file):
            try:
                ocr_text = _ocr_screenshot(screenshot_file)
            except Exception as e:
                # Best effort: an OCR failure must not stop the batch, so
                # it is reported and the paper is saved without OCR text.
                print(f"OCR failed for {url}: {e}")
            else:
                if ocr_text:
                    # Separator marks where the OCR portion begins.
                    abstract_text += "\n\n[OCR Text]\n\n" + ocr_text

    # -----------------------------
    # Save to .txt file
    # -----------------------------
    txt_filename = os.path.join(output_dir, f"{arxiv_id}.txt")
    with open(txt_filename, "w", encoding="utf-8") as f:
        # Header block first, then the abstract section.
        f.write(f"Title: {paper.get('title','')}\n")
        f.write(f"Authors: {authors}\n")
        f.write(f"Date: {paper.get('date','')}\n")
        f.write(f"URL: {url}\n")
        f.write("\n=== Abstract ===\n\n")
        f.write(abstract_text)

    return txt_filename

# -----------------------------------------------------------
# Cell 4: Process all papers and save as individual .txt
total_papers = len(papers_data)
for i, paper in enumerate(papers_data, start=1):
    txt_file = process_paper(
        paper,
        SCREENSHOT_DIR,
        OUTPUT_DIR,
        OCR_ENABLED,
    )
    # Report progress after every tenth paper and after the final one.
    at_checkpoint = i % 10 == 0
    is_last = i == total_papers
    if not (at_checkpoint or is_last):
        continue
    print(f"Processed {i}/{total_papers} papers -> last saved: {txt_file}")

# -----------------------------------------------------------
# Cell 5: Optional: inspect a paper's txt
# Preview the first paper's .txt. The guard keeps an empty input list from
# raising IndexError on papers_data[0].
if papers_data:
    # Same file-stem derivation as process_paper: last URL path segment,
    # with a missing "url" key treated as an empty string.
    sample_paper_id = papers_data[0].get("url", "").split("/")[-1]
    sample_txt_path = os.path.join(OUTPUT_DIR, f"{sample_paper_id}.txt")

    with open(sample_txt_path, "r", encoding="utf-8") as f:
        # Read at most the first 1000 characters instead of loading the
        # whole file and slicing it.
        print(f.read(1000))
else:
    print("No papers were loaded; nothing to inspect.")

print("All papers processed. Check the folder:", OUTPUT_DIR)


Configuration set. Ready to process papers.
Loaded 50 papers from arxiv_clean.json
Processed 10/50 papers -> last saved: papers_txt/2510.26575.txt
Processed 20/50 papers -> last saved: papers_txt/2510.26345.txt
Processed 30/50 papers -> last saved: papers_txt/2510.26202.txt
Processed 40/50 papers -> last saved: papers_txt/2510.26020.txt
Processed 50/50 papers -> last saved: papers_txt/2510.25816.txt
Title: Gistify! Codebase-Level Understanding via Runtime Execution
Authors: Hyunji Lee, Minseon Kim, Chinmay Singh, Matheus Pereira, Atharv Sonwane, Isadora White, Elias Stengel-Eskin, Mohit Bansal, Zhengyan Shi, Alessandro Sordoni, Marc-Alexandre Côté, Xingdi Yuan, Lucas Caccia
Date: [Submitted on 30 Oct 2025]
URL: https://arxiv.org/abs/2510.26790

=== Abstract ===


All papers processed. Check the folder: papers_txt
