In [1]:
import os
from PyPDF2 import PdfReader, PdfWriter
from pdf2image import convert_from_path


def extract_page_to_png(pdf_path, page_number, output_png_path):
    """
    Extract a specific page from a PDF and convert it to PNG.
    
    Args:
        pdf_path (str): Path to the input PDF file.
        page_number (int): The page number to extract (1-based).
        output_png_path (str): Path to save the output PNG file.
    """
    # Step 1: Extract the specific page using PyPDF2
    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    # Check if the page number is valid
    total_pages = len(reader.pages)
    if page_number < 1 or page_number > total_pages:
        raise ValueError(f"Invalid page number: {page_number}. PDF only has {total_pages} pages.")

    # Add the specific page to a new PDF
    writer.add_page(reader.pages[page_number - 1])  # 0-based index
    extracted_pdf_path = "temp_page.pdf"
    with open(extracted_pdf_path, "wb") as temp_pdf:
        writer.write(temp_pdf)

    # Step 2: Convert the extracted page to PNG using pdf2image
    images = convert_from_path(extracted_pdf_path, dpi=300)

    # Save the first image (should be the single-page PDF output)
    if images:
        images[0].save(output_png_path, "PNG")
        print(f"Page {page_number} saved as PNG: {output_png_path}")

    # Clean up temporary files
    os.remove(extracted_pdf_path)


# Example usage
pdf_file = "sample_tables.pdf"          # Replace with your PDF file path
page_to_extract = 1             # Page number (1-based index)
output_png = "sample_tables.png"  # Output PNG file path

extract_page_to_png(pdf_file, page_to_extract, output_png)

Page 1 saved as PNG: sample_tables.png


# Analyze final table

In [19]:
import pandas as pd
pdf_df = pd.read_csv("pdf_results.csv")
pdf_df['client_name'] = pdf_df['calls'].str.extract(r'client_name=([a-zA-Z0-9_]+)')
pdf_df[["result","timing_duration","input_tokens","output_tokens",'client_name']]

Unnamed: 0,result,timing_duration,input_tokens,output_tokens,client_name
0,"('tables', [Table(name='Table 1', content='| C...",6751,1966,499,Sonnet4
1,"('tables', [Table(name='Table 1', content='| C...",7390,1966,504,Sonnet45
2,"('tables', [Table(name='Table 1', content='| C...",15514,1966,494,Opus41
3,"('tables', [Table(name='Table 1', content='| C...",5304,1966,507,Haiku45


In [96]:
import pandas as pd
img_df = pd.read_csv("img_results.csv")
img_df['client_name'] = img_df['calls'].str.extract(r'client_name=([a-zA-Z0-9_]+)')
img_df[["result","timing_duration","input_tokens","output_tokens",'client_name']]

Unnamed: 0,result,timing_duration,input_tokens,output_tokens,client_name
0,"('tables', [Table(name='Table 1', content='| C...",11494,1588,499,Sonnet4
1,"('tables', [Table(name='Table 1', content='| C...",12975,1588,570,Sonnet45
2,"('tables', [Table(name='Table 1', content='| C...",15285,1588,486,Opus41
3,"('tables', [Table(name='Table 1', content='| C...",5492,1588,506,Haiku45


In [97]:
import re
import ast
from pydantic import BaseModel

class Table(BaseModel):
    name: str
    content: str

tbl_pattern = re.compile(
    r"Table\(\s*name=(?P<q1>['\"])(?P<name>.*?)(?P=q1)\s*,\s*content=(?P<q2>['\"])(?P<content>.*?)(?P=q2)\s*\)",
    re.S,
)

def extract_tables(body):
    tables = []
    for mm in tbl_pattern.finditer(body):
        name_raw = mm.group("q1") + mm.group("name") + mm.group("q1")
        content_raw = mm.group("q2") + mm.group("content") + mm.group("q2")
        # ast.literal_eval decodes escape sequences safely
        name = ast.literal_eval(name_raw)
        content = ast.literal_eval(content_raw)
        tables.append(Table(name=name, content=content))
    return(tables)

tables0 = extract_tables(pdf_df['result'].values[0][10:-1].strip())
tables1 = extract_tables(pdf_df['result'].values[1][10:-1].strip())
tables2 = extract_tables(pdf_df['result'].values[2][10:-1].strip())
tables3 = extract_tables(pdf_df['result'].values[3][10:-1].strip())

itables0 = extract_tables(img_df['result'].values[0][10:-1].strip())
itables1 = extract_tables(img_df['result'].values[1][10:-1].strip())
itables2 = extract_tables(img_df['result'].values[2][10:-1].strip())
itables3 = extract_tables(img_df['result'].values[3][10:-1].strip())

In [98]:
itables3

[Table(name='Table 1', content='| Column header (TH) | Column header (TH) | Column header (TH) |\n|---|---|---|\n| Row header (TH) | Data cell (TD) | Data cell (TD) |\n| Row header (TH) | Data cell (TD) | Data cell (TD) |'),
 Table(name='Table 2: example of footnotes referenced from within a table', content='| Expenditure by function £ million | 2009/10 | 2010/11 ¹ |\n|---|---|---|\n| **Policy functions** | **Financial** | 22.5 | 30.57 |\n| | **Information** ² | 10.2 | 14.8 |\n| | **Contingency** | 2.6 | 1.2 |\n| **Remunerated functions** | **Agency services** ³ | 44.7 | 35.91 |\n| | **Payments** | 22.41 | 19.88 |\n| | **Banking** | 22.90 | 44.23 |\n| | **Other** | 12.69 | 10.32 |\n\n**Footnotes:**\n1. Provisional total as of publication date.\n2. Costs associated with on-going information programmes.\n3. From the management accounts, net of recoveries, including interest charges.'),
 Table(name='Table 3: "film credits" style layout', content='| Main character | Daniel Radcliffe |\n|--

In [60]:
table1_content_ground_truth = """| Column header (TH) | Column header (TH) | Column header (TH) |
|-------------------|-------------------|-------------------|
| Row header (TH) | Data cell (TD) | Data cell (TD) |
| Row header(TH) | Data cell (TD) | Data cell (TD) |"""

table2_content_ground_truth = """| Expenditure by function £ million |  | 2009/10 | 2010/11 ¹ |
|----------------------------------|--|---------|----------|
| Policy functions | Financial | 22.5 | 30.57 |
|  | Information ² | 10.2 | 14.8 |
|  | Contingency | 2.6 | 1.2 |
| Remunerated functions | Agency services ³ | 44.7 | 35.91 |
|  | Payments | 22.41 | 19.88 |
|  | Banking | 22.90 | 44.23 |
|  | Other | 12.69 | 10.32 |

\n(1) Provisional total as of publication date. 
\n(2) Costs associated with on-going information programmes. 
\n(3) From the management accounts, net of recoveries, including interest charges."""


table3_content_ground_truth = """| Main character | Daniel Radcliffe |
|----------------|------------------|
| Sidekick 1 | Rupert Grint |
| Sidekick 2 | Emma Watson |
| Lovable ogre | Robbie Coltrane |
| Professor | Maggie Smith |
| Headmaster | Richard Harris |"""

## PDFs

In [84]:
from fuzzywuzzy import fuzz

gts = [table1_content_ground_truth, table2_content_ground_truth, table3_content_ground_truth]
tables = [tables0, tables1, tables2,tables3]
ratios = {}

for ix in range(4): # loop over models/sub-tables
    for ii in range(3): # loop over tables in pdf
        table = tables[ix] 
        pred = table[ii].content
        gt = gts[ii]
        ratio = fuzz.ratio(gt, pred)
        client_name = pdf_df['client_name'].values[ix]
        print(f"Table {ii+1} ratio between prediction and ground truth for {client_name}: {ratio}")
        ratios.setdefault(client_name, []).append(ratio)

# for ii in range(3): # loop over tables in pdf
#     for ix in range(4): # loop over models/sub-tables
#         table = tables[ix] 
#         pred = table[ii].content
#         gt = gts[ii]
#         ratio = fuzz.ratio(gt, pred)
#         client_name = pdf_df['client_name'].values[ix]
#         print(f"Table {ii+1} ratio between prediction and ground truth for {client_name}: {ratio}")

Table 1 ratio between prediction and ground truth for Sonnet4: 100
Table 2 ratio between prediction and ground truth for Sonnet4: 100
Table 3 ratio between prediction and ground truth for Sonnet4: 100
Table 1 ratio between prediction and ground truth for Sonnet45: 100
Table 2 ratio between prediction and ground truth for Sonnet45: 93
Table 3 ratio between prediction and ground truth for Sonnet45: 100
Table 1 ratio between prediction and ground truth for Opus41: 100
Table 2 ratio between prediction and ground truth for Opus41: 98
Table 3 ratio between prediction and ground truth for Opus41: 100
Table 1 ratio between prediction and ground truth for Haiku45: 88
Table 2 ratio between prediction and ground truth for Haiku45: 89
Table 3 ratio between prediction and ground truth for Haiku45: 86


In [85]:
ratios

{'Sonnet4': [100, 100, 100],
 'Sonnet45': [100, 93, 100],
 'Opus41': [100, 98, 100],
 'Haiku45': [88, 89, 86]}

In [93]:
import numpy as np
avgs = {key: np.average(values) for key, values in ratios.items()}
avgs

{'Sonnet4': np.float64(100.0),
 'Sonnet45': np.float64(97.66666666666667),
 'Opus41': np.float64(99.33333333333333),
 'Haiku45': np.float64(87.66666666666667)}

In [94]:
pdf_df['median_fuzz_ratio'] = list(avgs.values())
pdf_df.drop(columns=['calls'], inplace=False)

Unnamed: 0,result,timing_duration,input_tokens,output_tokens,client_name,median_fuzz_ratio
0,"('tables', [Table(name='Table 1', content='| C...",6751,1966,499,Sonnet4,100.0
1,"('tables', [Table(name='Table 1', content='| C...",7390,1966,504,Sonnet45,97.666667
2,"('tables', [Table(name='Table 1', content='| C...",15514,1966,494,Opus41,99.333333
3,"('tables', [Table(name='Table 1', content='| C...",5304,1966,507,Haiku45,87.666667


In [81]:
print(tables3[2].content)
print()
print(table3_content_ground_truth)

| Role | Actor |
|---|---|
| Main character | Daniel Radcliffe |
| Sidekick 1 | Rupert Grint |
| Sidekick 2 | Emma Watson |
| Lovable ogre | Robbie Coltrane |
| Professor | Maggie Smith |
| Headmaster | Richard Harris |

| Main character | Daniel Radcliffe |
|----------------|------------------|
| Sidekick 1 | Rupert Grint |
| Sidekick 2 | Emma Watson |
| Lovable ogre | Robbie Coltrane |
| Professor | Maggie Smith |
| Headmaster | Richard Harris |


In [82]:
display(Markdown(tables3[2].content))
display(Markdown(table3_content_ground_truth))

| Role | Actor |
|---|---|
| Main character | Daniel Radcliffe |
| Sidekick 1 | Rupert Grint |
| Sidekick 2 | Emma Watson |
| Lovable ogre | Robbie Coltrane |
| Professor | Maggie Smith |
| Headmaster | Richard Harris |

| Main character | Daniel Radcliffe |
|----------------|------------------|
| Sidekick 1 | Rupert Grint |
| Sidekick 2 | Emma Watson |
| Lovable ogre | Robbie Coltrane |
| Professor | Maggie Smith |
| Headmaster | Richard Harris |

In [77]:
display(Markdown(tables3[0].content))
display(Markdown(table1_content_ground_truth))

| Column header (TH) | Column header (TH) | Column header (TH) |
|---|---|---|
| Row header (TH) | Data cell (TD) | Data cell (TD) |
| Row header (TH) | Data cell (TD) | Data cell (TD) |

| Column header (TH) | Column header (TH) | Column header (TH) |
|-------------------|-------------------|-------------------|
| Row header (TH) | Data cell (TD) | Data cell (TD) |
| Row header(TH) | Data cell (TD) | Data cell (TD) |

In [99]:
from IPython.display import display, Markdown
for table in tables0:
    print(table.name)
    display(Markdown(table.content))

Table 1


| Column header (TH) | Column header (TH) | Column header (TH) |
|-------------------|-------------------|-------------------|
| Row header (TH) | Data cell (TD) | Data cell (TD) |
| Row header(TH) | Data cell (TD) | Data cell (TD) |

Table 2: example of footnotes referenced from within a table


| Expenditure by function £ million |  | 2009/10 | 2010/11 ¹ |
|----------------------------------|--|---------|----------|
| Policy functions | Financial | 22.5 | 30.57 |
|  | Information ² | 10.2 | 14.8 |
|  | Contingency | 2.6 | 1.2 |
| Remunerated functions | Agency services ³ | 44.7 | 35.91 |
|  | Payments | 22.41 | 19.88 |
|  | Banking | 22.90 | 44.23 |
|  | Other | 12.69 | 10.32 |

(1) Provisional total as of publication date.
(2) Costs associated with on-going information programmes.
(3) From the management accounts, net of recoveries, including interest charges.

Table 3: "film credits" style layout


| Main character | Daniel Radcliffe |
|----------------|------------------|
| Sidekick 1 | Rupert Grint |
| Sidekick 2 | Emma Watson |
| Lovable ogre | Robbie Coltrane |
| Professor | Maggie Smith |
| Headmaster | Richard Harris |

## IMAGES

In [100]:
gts = [table1_content_ground_truth, table2_content_ground_truth, table3_content_ground_truth]
tables = [itables0, itables1, itables2,itables3]
iratios = {}

for ix in range(4): # loop over models/sub-tables
    for ii in range(3): # loop over tables in pdf
        table = tables[ix] 
        pred = table[ii].content
        gt = gts[ii]
        ratio = fuzz.ratio(gt, pred)
        client_name = pdf_df['client_name'].values[ix]
        print(f"Table {ii+1} ratio between prediction and ground truth for {client_name}: {ratio}")
        iratios.setdefault(client_name, []).append(ratio)

Table 1 ratio between prediction and ground truth for Sonnet4: 100
Table 2 ratio between prediction and ground truth for Sonnet4: 100
Table 3 ratio between prediction and ground truth for Sonnet4: 100
Table 1 ratio between prediction and ground truth for Sonnet45: 59
Table 2 ratio between prediction and ground truth for Sonnet45: 55
Table 3 ratio between prediction and ground truth for Sonnet45: 88
Table 1 ratio between prediction and ground truth for Opus41: 88
Table 2 ratio between prediction and ground truth for Opus41: 94
Table 3 ratio between prediction and ground truth for Opus41: 93
Table 1 ratio between prediction and ground truth for Haiku45: 88
Table 2 ratio between prediction and ground truth for Haiku45: 84
Table 3 ratio between prediction and ground truth for Haiku45: 93


In [101]:
import numpy as np
iavgs = {key: np.average(values) for key, values in iratios.items()}
iavgs

{'Sonnet4': np.float64(100.0),
 'Sonnet45': np.float64(67.33333333333333),
 'Opus41': np.float64(91.66666666666667),
 'Haiku45': np.float64(88.33333333333333)}

In [104]:
img_df['median_fuzz_ratio'] = list(iavgs.values())
img_df.drop(columns=['calls','result'], inplace=False)

Unnamed: 0,timing_duration,input_tokens,output_tokens,client_name,median_fuzz_ratio
0,11494,1588,499,Sonnet4,100.0
1,12975,1588,570,Sonnet45,67.333333
2,15285,1588,486,Opus41,91.666667
3,5492,1588,506,Haiku45,88.333333


In [112]:
from IPython.display import display, Markdown
display(Markdown(img_df.drop(columns=['calls','result'], inplace=False).to_markdown()))

|    |   timing_duration |   input_tokens |   output_tokens | client_name   |   median_fuzz_ratio |
|---:|------------------:|---------------:|----------------:|:--------------|--------------------:|
|  0 |             11494 |           1588 |             499 | Sonnet4       |            100      |
|  1 |             12975 |           1588 |             570 | Sonnet45      |             67.3333 |
|  2 |             15285 |           1588 |             486 | Opus41        |             91.6667 |
|  3 |              5492 |           1588 |             506 | Haiku45       |             88.3333 |

In [105]:
img_df.drop(columns=['calls','result'], inplace=False).describe()

Unnamed: 0,timing_duration,input_tokens,output_tokens,median_fuzz_ratio
count,4.0,4.0,4.0,4.0
mean,11311.5,1588.0,515.25,86.833333
std,4181.539629,0.0,37.428821,13.89511
min,5492.0,1588.0,486.0,67.333333
25%,9993.5,1588.0,495.75,83.083333
50%,12234.5,1588.0,502.5,90.0
75%,13552.5,1588.0,522.0,93.75
max,15285.0,1588.0,570.0,100.0


In [106]:
pdf_df.drop(columns=['calls','result'], inplace=False).describe()

Unnamed: 0,timing_duration,input_tokens,output_tokens,median_fuzz_ratio
count,4.0,4.0,4.0,4.0
mean,8739.75,1966.0,501.0,96.166667
std,4599.702626,0.0,5.715476,5.751006
min,5304.0,1966.0,494.0,87.666667
25%,6389.25,1966.0,497.75,95.166667
50%,7070.5,1966.0,501.5,98.5
75%,9421.0,1966.0,504.75,99.5
max,15514.0,1966.0,507.0,100.0


In [107]:
pdf_df.drop(columns=['calls','result'], inplace=False)

Unnamed: 0,timing_duration,input_tokens,output_tokens,client_name,median_fuzz_ratio
0,6751,1966,499,Sonnet4,100.0
1,7390,1966,504,Sonnet45,97.666667
2,15514,1966,494,Opus41,99.333333
3,5304,1966,507,Haiku45,87.666667


In [111]:
from IPython.display import display, Markdown
display(Markdown(pdf_df.drop(columns=['calls','result'], inplace=False).to_markdown()))

|    |   timing_duration |   input_tokens |   output_tokens | client_name   |   median_fuzz_ratio |
|---:|------------------:|---------------:|----------------:|:--------------|--------------------:|
|  0 |              6751 |           1966 |             499 | Sonnet4       |            100      |
|  1 |              7390 |           1966 |             504 | Sonnet45      |             97.6667 |
|  2 |             15514 |           1966 |             494 | Opus41        |             99.3333 |
|  3 |              5304 |           1966 |             507 | Haiku45       |             87.6667 |

# Archive

In [44]:
import re
import ast
from pydantic import BaseModel

class Table(BaseModel):
    name: str
    content: str

tbl_pattern = re.compile(
    r"Table\(\s*name=(?P<q1>['\"])(?P<name>.*?)(?P=q1)\s*,\s*content=(?P<q2>['\"])(?P<content>.*?)(?P=q2)\s*\)",
    re.S,
)

tables = []
for mm in tbl_pattern.finditer(pdf_df['result'].values[0][10:-1].strip()):
    name_raw = mm.group("q1") + mm.group("name") + mm.group("q1")
    content_raw = mm.group("q2") + mm.group("content") + mm.group("q2")
    # ast.literal_eval decodes escape sequences safely
    name = ast.literal_eval(name_raw)
    content = ast.literal_eval(content_raw)
    tables.append(Table(name=name, content=content))
tables

[Table(name='Table 1', content='| Column header (TH) | Column header (TH) | Column header (TH) |\n|-------------------|-------------------|-------------------|\n| Row header (TH) | Data cell (TD) | Data cell (TD) |\n| Row header(TH) | Data cell (TD) | Data cell (TD) |'),
 Table(name='Table 2: example of footnotes referenced from within a table', content='| Expenditure by function £ million |  | 2009/10 | 2010/11 ¹ |\n|----------------------------------|--|---------|----------|\n| Policy functions | Financial | 22.5 | 30.57 |\n|  | Information ² | 10.2 | 14.8 |\n|  | Contingency | 2.6 | 1.2 |\n| Remunerated functions | Agency services ³ | 44.7 | 35.91 |\n|  | Payments | 22.41 | 19.88 |\n|  | Banking | 22.90 | 44.23 |\n|  | Other | 12.69 | 10.32 |\n\n(1) Provisional total as of publication date.\n(2) Costs associated with on-going information programmes.\n(3) From the management accounts, net of recoveries, including interest charges.'),
 Table(name='Table 3: "film credits" style layout'

In [32]:
import re
import ast
from pydantic import BaseModel

class Table(BaseModel):
    name: str
    content: str

input_string = "Table(name='Table 1', content='| Column header (TH) | Column header (TH) | Column header (TH) |\\n|-------------------|-------------------|-------------------|\\n| Row header (TH) | Data cell (TD) | Data cell (TD) |\\n| Row header(TH) | Data cell (TD) | Data cell (TD) |')"

# Parse safely: capture the quoted name and content, then use ast.literal_eval to interpret escape sequences
m = re.search(
    r"Table\(\s*name=(?P<q1>['\"])(?P<name>.*?)(?P=q1)\s*,\s*content=(?P<q2>['\"])(?P<content>.*?)(?P=q2)\s*\)\s*$",
    input_string,
    re.S,
)
if not m:
    raise ValueError("Input string not in expected format")

name = ast.literal_eval(m.group("q1") + m.group("name") + m.group("q1"))
content = ast.literal_eval(m.group("q2") + m.group("content") + m.group("q2"))

table = Table(name=name, content=content)

print(table)
print("Name:", table.name)
print("Content:")
print(table.content)

name='Table 1' content='| Column header (TH) | Column header (TH) | Column header (TH) |\n|-------------------|-------------------|-------------------|\n| Row header (TH) | Data cell (TD) | Data cell (TD) |\n| Row header(TH) | Data cell (TD) | Data cell (TD) |'
Name: Table 1
Content:
| Column header (TH) | Column header (TH) | Column header (TH) |
|-------------------|-------------------|-------------------|
| Row header (TH) | Data cell (TD) | Data cell (TD) |
| Row header(TH) | Data cell (TD) | Data cell (TD) |


In [33]:
from IPython.display import display, Markdown
display(Markdown(table.content))

| Column header (TH) | Column header (TH) | Column header (TH) |
|-------------------|-------------------|-------------------|
| Row header (TH) | Data cell (TD) | Data cell (TD) |
| Row header(TH) | Data cell (TD) | Data cell (TD) |