# PDF parsing evaluation

## Step 1: Create our dataset and download the PDFs

Let's download a few research papers as PDFs (mostly from arXiv) and build a dataset holding their expected title, authors, and GitHub link.

In [5]:
import httpx
import pandas as pd
import os

# Ground-truth dataset: one row per paper, holding the PDF URL and the metadata
# we expect the parsing pipeline to extract. An empty "github_link" means the
# paper has no link recorded here.
df = pd.DataFrame(
    [
        (
            "https://arxiv.org/pdf/2505.19443",
            "Vibe Coding vs. Agentic Coding: Fundamentals and Practical Implications of Agentic AI",
            "Ranjan Sapkota, Konstantinos I. Roumeliotis, Manoj Karkee",
            "",
        ),
        (
            "https://arxiv.org/pdf/2506.23253",
            "Vibe coding: programming through conversation with artificial intelligence",
            "Advait Sarkar, Ian Drosos",
            "",
        ),
        (
            "https://arxiv.org/pdf/2506.11162v1",
            "VIBE: Can a VLM Read the Room?*",
            "Tania Chakraborty, Eylon Caplan, Dan Goldwasser",
            "",
        ),
        (
            "https://arxiv.org/pdf/2507.00951v1",
            "Thinking Beyond Tokens: From Brain-Inspired Intelligence to Cognitive Foundations for Artificial General Intelligence and its Societal Impact",
            "Rizwan Qureshi, Ranjan Sapkota, Abbas Shah, Amgad Muneer, Anas Zafar, Ashmal Vayani, Maged Shoman, Abdelrahman B. M. Eldaly, Kai Zhang, Ferhat Sadak, Shaina Raza, Xinqi Fan, Ravid Shwartz-Ziv, Hong Yan, Vinjia Jain, Aman Chadha, Manoj Karkee, Jia Wu, Philip Torr, Seyedali Mirjalili",
            "",
        ),
        (
            "https://arxiv.org/pdf/2505.17810",
            "VIBE: Vector Index Benchmark for Embeddings",
            "Elias Jääsaari, Ville Hyvönen, Matteo Ceccarello, Teemu Roos, Martin Aumüller",
            "https://github.com/vector-index-bench/vibe",
        ),
        (
            "https://arxiv.org/pdf/2407.12787v2",
            "GameVibe: a multimodal affective game corpus",
            "Matthew Barthet, Maria Kaselimi, Kosmas Pinitas, Konstantinos Makantasis, Antonios Liapis, Georgios N. Yannakakis",
            "",
        ),
        (
            "https://arxiv.org/pdf/2411.10867",
            "ViBe: A Text-to-Video Benchmark for Evaluating Hallucination in Large Multimodal Models",
            "Vipula Rawte, Sarthak Jain, Aarush Sinha, Garv Kaushik, Aman Bansal, Prathiksha Rumale Vishwanath, Samyak Rajesh Jain, Aishwarya Naresh Reganti, Vinija Jain, Aman Chadha, Amit Sheth, Amitava Das",
            "https://vibe-t2v-bench.github.io/",
        ),
        (
            "https://arxiv.org/pdf/2410.12851v1",
            "VibeCheck: Discover & Quantify Qualitative Differences in Large Language Models",
            "Lisa Dunlap, Krishna Mandal, Trevor Darrell, Jacob Steinhardt, Joseph Gonzalez",
            "",
        ),
        (
            "https://metr.org/Early_2025_AI_Experienced_OS_Devs_Study.pdf",
            "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
            "Joel Becker, Nate Rush, Beth Barnes, David Rein",
            "",
        ),
    ],
    columns=["pdf", "title", "author_names", "github_link"],
)

# Download every PDF in the dataset into ".files/" (skipping files already on
# disk) and record each local path in a new "file" column of `df`.
print("Downloading PDFs...")
os.makedirs(".files/", exist_ok=True)
for index, row in df.iterrows():
    # Local name = last URL segment, normalized to end in ".pdf" exactly once
    # (arXiv URLs have no extension, the metr.org URL already has one).
    name = row["pdf"].split("/")[-1].replace(".pdf", "") + ".pdf"
    path = f".files/{name}"
    if os.path.exists(path):
        print(f"{name} already downloaded")
    else:
        # httpx does not follow redirects unless asked to. Without this and
        # the status check below, a redirect or error body would be saved as
        # the "PDF" and then treated as already downloaded on every later run.
        response = httpx.get(row["pdf"], follow_redirects=True)
        response.raise_for_status()
        with open(path, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {name}")
    df.at[index, "file"] = path
print("Done")

df

Downloading PDFs...
2505.19443.pdf already downloaded
2506.23253.pdf already downloaded
2506.11162v1.pdf already downloaded
2507.00951v1.pdf already downloaded
2505.17810.pdf already downloaded
2407.12787v2.pdf already downloaded
2411.10867.pdf already downloaded
2410.12851v1.pdf already downloaded
Early_2025_AI_Experienced_OS_Devs_Study.pdf already downloaded
Done


Unnamed: 0,pdf,title,author_names,github_link,file
0,https://arxiv.org/pdf/2505.19443,Vibe Coding vs. Agentic Coding: Fundamentals a...,"Ranjan Sapkota, Konstantinos I. Roumeliotis, M...",,.files/2505.19443.pdf
1,https://arxiv.org/pdf/2506.23253,Vibe coding: programming through conversation ...,"Advait Sarkar, Ian Drosos",,.files/2506.23253.pdf
2,https://arxiv.org/pdf/2506.11162v1,VIBE: Can a VLM Read the Room?*,"Tania Chakraborty, Eylon Caplan, Dan Goldwasser",,.files/2506.11162v1.pdf
3,https://arxiv.org/pdf/2507.00951v1,Thinking Beyond Tokens: From Brain-Inspired In...,"Rizwan Qureshi, Ranjan Sapkota, Abbas Shah, Am...",,.files/2507.00951v1.pdf
4,https://arxiv.org/pdf/2505.17810,VIBE: Vector Index Benchmark for Embeddings,"Elias Jääsaari, Ville Hyvönen, Matteo Ceccarel...",https://github.com/vector-index-bench/vibe,.files/2505.17810.pdf
5,https://arxiv.org/pdf/2407.12787v2,GameVibe: a multimodal affective game corpus,"Matthew Barthet, Maria Kaselimi, Kosmas Pinita...",,.files/2407.12787v2.pdf
6,https://arxiv.org/pdf/2411.10867,ViBe: A Text-to-Video Benchmark for Evaluating...,"Vipula Rawte, Sarthak Jain, Aarush Sinha, Garv...",https://vibe-t2v-bench.github.io/,.files/2411.10867.pdf
7,https://arxiv.org/pdf/2410.12851v1,VibeCheck: Discover & Quantify Qualitative Dif...,"Lisa Dunlap, Krishna Mandal, Trevor Darrell, J...",,.files/2410.12851v1.pdf
8,https://metr.org/Early_2025_AI_Experienced_OS_...,Measuring the Impact of Early-2025 AI on Exper...,"Joel Becker, Nate Rush, Beth Barnes, David Rein",,.files/Early_2025_AI_Experienced_OS_Devs_Study...


## Step 2: Define our unstructured data parsing pipeline

In [6]:
import dspy
import langwatch
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_text

# Configure DSPy to use OpenAI's gpt-4o-mini as its language model.
# NOTE(review): presumably the OpenAI API key comes from the environment
# (the cell output shows a .env file being loaded) — confirm.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))


@langwatch.trace()
def extract_pdf_info(filename):
    """Extract the title, author names and GitHub link from a PDF file.

    The PDF is partitioned into elements with `unstructured`, flattened to
    plain text, and handed to a DSPy predictor.

    Args:
        filename: Path to the PDF file on disk.

    Returns:
        The DSPy prediction, with `title`, `author_names` and `github_link`
        fields (`github_link` is declared optional in the signature).
    """
    # Turn on DSPy auto-tracking for the trace opened by the decorator
    # (presumably so the predictor call below is captured — confirm).
    current_trace = langwatch.get_current_trace()
    current_trace.autotrack_dspy()

    pdf_text = elements_to_text(elements=partition_pdf(filename=filename))

    signature = "pdf -> title: str, author_names: str, github_link: Optional[str]"
    extractor = dspy.Predict(signature)
    return extractor(pdf=pdf_text)


# Smoke-test the pipeline on the first PDF of the dataset (row label 0) and
# display the resulting prediction as the cell output.
extract_pdf_info(df["file"][0])

2025-07-11 12:25:50 - Loaded .env file


Prediction(
    title='Vibe Coding vs. Agentic Coding: Fundamentals and Practical Implications of Agentic AI',
    author_names='Ranjan Sapkota, Konstantinos I. Roumeliotis, Manoj Karkee',
    github_link=None
)

## Step 3: Run the evaluation

In [8]:
# Start (or attach to) the LangWatch experiment that collects the results.
evaluation = langwatch.evaluation.init("pdf-parsing-evaluation")

for index, row in evaluation.loop(df.iterrows()):

    def evaluate(index, row):
        """Run the extraction pipeline on one dataset row and log its score.

        Logs "author_names_accuracy" as passed only when the extracted author
        string is exactly equal to the ground-truth string for that row.
        Returns the pipeline's prediction.
        """
        response = extract_pdf_info(row["file"])

        evaluation.log(
            "author_names_accuracy",
            index=index,
            passed=response.author_names == row["author_names"],
        )

        return response

    # Hand the per-row work to the evaluation runner; index and row are passed
    # as arguments so each call is bound to its own row.
    evaluation.submit(evaluate, index, row)

2025-07-11 12:26:58 - HTTP Request: POST https://app.langwatch.ai/api/experiment/init "HTTP/1.1 200 OK"
Follow the results at: https://app.langwatch.ai/demo/experiments/pdf-parsing-evaluation?runId=stirring-lemming-of-maturity


Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

2025-07-11 12:27:03 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results "HTTP/1.1 200 OK"
2025-07-11 12:27:05 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results "HTTP/1.1 200 OK"
2025-07-11 12:27:07 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results "HTTP/1.1 200 OK"
2025-07-11 12:27:08 - Cannot set gray non-stroke color because /'p26' is an invalid float value
2025-07-11 12:27:09 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results "HTTP/1.1 200 OK"
2025-07-11 12:27:11 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results "HTTP/1.1 200 OK"
2025-07-11 12:27:15 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results "HTTP/1.1 200 OK"
2025-07-11 12:27:16 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results "HTTP/1.1 200 OK"
2025-07-11 12:27:16 - HTTP Request: POST https://app.langwatch.ai/api/evaluati