
# Grobid / PDFAlto evaluation







In [1]:
# tabulate is required by pandas' DataFrame.to_markdown, which the export cell
# near the end of this notebook uses to write the result tables.
!pip install tabulate



In [2]:
from tqdm import tqdm
from xml.etree.ElementTree import ElementTree
import os


def _read_text(path):
    """Return the full content of the text file at *path*, decoded as UTF-8."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def _write_text_if_missing(path, text):
    """Write *text* to *path* unless a file already exists there."""
    if not os.path.exists(path):
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)


def _load_grobid_text(tei_path, cache_path):
    """Return the plain text of a Grobid TEI file, using *cache_path* as a cache.

    When the cache file exists it is returned as-is; otherwise every text node
    of the TEI XML is joined with a single space, stored in the cache file and
    returned.
    """
    if os.path.exists(cache_path):
        return _read_text(cache_path)

    tei_root = ElementTree().parse(tei_path)
    text = ' '.join(tei_root.itertext())
    with open(cache_path, "w", encoding="utf-8") as f:
        f.write(text)
    return text


def _non_empty_stripped_lines(text):
    """Return *text* with each line stripped and blank lines removed."""
    return "\n".join(line.strip() for line in text.split("\n") if line.strip())


def _percentage_lost(reference_len, candidate_len):
    """Return by how many percent the candidate is shorter than the reference.

    A reference of length zero yields 0.0 instead of dividing by zero.
    """
    if reference_len > 0:
        return (1 - (candidate_len / reference_len)) * 100
    return 0.0


def compute_statistics(input_pdfalto, input_grobid):
    """Compare the text extracted by PDFAlto with the text extracted by Grobid.

    Every sub-directory present under both inputs is treated as a corpus. For
    each ``*.txt`` file found (recursively) in a PDFAlto corpus directory, the
    Grobid file ``<same relative path>.grobid.tei.xml`` is looked up; documents
    without it are recorded as skipped.

    Side effects, all inside the Grobid corpus directory next to the TEI file:
    ``.grobid.txt`` (cached plain text of the TEI), ``.diff.grobid.txt`` and
    ``.diff.pdfalto.txt`` (both texts, one stripped non-empty line per line).
    Existing files are never overwritten.

    Args:
        input_pdfalto: directory containing one sub-directory per corpus with
            the PDFAlto ``.txt`` files.
        input_grobid: directory containing one sub-directory per corpus with
            the Grobid ``.grobid.tei.xml`` files.

    Returns:
        A dict ``{corpus: {'processed': {...}, 'skipped': [...],
        'average': {...}}}``. ``processed`` maps a document id (relative path
        without ``.txt``) to its lengths and statistics, ``skipped`` is a list
        of ``(document id, missing Grobid path)`` tuples and ``average`` is a
        zero-initialised placeholder.
    """
    common_subdirs = extract_common_repositories(input_pdfalto, input_grobid)
    print("Found the following corpus directories: ", common_subdirs)

    documents = {}
    for corpus in tqdm(common_subdirs, desc="Processing corpora", unit="corpus"):
        skipped = []
        processed = {}
        documents[corpus] = {
            'processed': processed,
            'skipped': skipped,
            'average': {
                'pdfalto': 0,
                'grobid': 0,
                'diff': 0,
                'perc': 0
            }
        }

        corpus_pdfalto_dir = os.path.join(input_pdfalto, corpus)
        # The walk variable must keep pointing at the current directory for
        # every file in it, so it is never reused for anything else.
        for current_dir, _dirs, files in os.walk(corpus_pdfalto_dir):
            for file in files:
                if not file.endswith(".txt"):
                    continue

                pdfalto_file = os.path.join(current_dir, file)
                # Document id: path relative to the corpus directory, with
                # only the trailing ".txt" removed.
                file_id = os.path.splitext(os.path.relpath(pdfalto_file, corpus_pdfalto_dir))[0]

                grobid_base = os.path.join(input_grobid, corpus, file_id)
                related_grobid_file = grobid_base + ".grobid.tei.xml"
                if not os.path.exists(related_grobid_file):
                    skipped.append((file_id, related_grobid_file))
                    continue

                txt_pdfalto = _read_text(pdfalto_file)
                txt_grobid = _load_grobid_text(related_grobid_file, grobid_base + ".grobid.txt")

                # Whitespace-separated tokens; the character counts ignore
                # every space, tab and newline.
                pdfalto_tokens = txt_pdfalto.split()
                grobid_tokens = txt_grobid.split()

                txt_pdfalto_len = len(''.join(pdfalto_tokens))
                txt_grobid_len = len(''.join(grobid_tokens))
                txt_pdfalto_token_len = len(pdfalto_tokens)
                txt_grobid_token_len = len(grobid_tokens)

                # Line-oriented copies of both texts, kept for manual diffing.
                _write_text_if_missing(grobid_base + ".diff.grobid.txt",
                                       _non_empty_stripped_lines(txt_grobid))
                _write_text_if_missing(grobid_base + ".diff.pdfalto.txt",
                                       _non_empty_stripped_lines(txt_pdfalto))

                processed[file_id] = {
                    # NOTE: 'pdfalto' and 'grobid' are raw lengths (whitespace
                    # included), while 'diff' and 'perc' are computed on the
                    # whitespace-free lengths.
                    'pdfalto': len(txt_pdfalto),
                    'pdfalto_tokens': txt_pdfalto_token_len,
                    'grobid': len(txt_grobid),
                    'grobid_tokens': txt_grobid_token_len,
                    'stats': {
                        'diff': txt_pdfalto_len - txt_grobid_len,
                        'diff_tokens': txt_pdfalto_token_len - txt_grobid_token_len,
                        'perc': _percentage_lost(txt_pdfalto_len, txt_grobid_len),
                        'perc_tokens': _percentage_lost(txt_pdfalto_token_len, txt_grobid_token_len)
                    }
                }

    for corpus, docs in documents.items():
        print(f"Corpus: {corpus}, Processed {len(docs['processed'])}, skipped: {len(docs['skipped'])}")

    return documents


def _list_subdirectories(path):
    """Return the names of the directories directly under *path*, in listing order."""
    return [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]


def extract_common_repositories(input_pdfalto, input_grobid):
    """Return the corpus directories present under both input directories.

    Only directories are considered: stray files (for example ``.DS_Store``)
    that exist on both sides are not corpora and would otherwise show up as
    empty corpora downstream.

    Args:
        input_pdfalto: directory holding one sub-directory per PDFAlto corpus.
        input_grobid: directory holding one sub-directory per Grobid corpus.

    Returns:
        The names of the sub-directories found in both inputs, in the order
        in which they are listed under *input_grobid*.

    Raises:
        FileNotFoundError: if either input directory does not exist.
    """
    subdirs_pdfalto = _list_subdirectories(input_pdfalto)
    print(f"Found the following corpus directories for PDFAlto: {subdirs_pdfalto}")
    subdirs_grobid = _list_subdirectories(input_grobid)
    print(f"Found the following corpus directories for Grobid: {subdirs_grobid}")

    # Set lookup keeps the intersection linear in the number of entries.
    pdfalto_names = set(subdirs_pdfalto)
    return [name for name in subdirs_grobid if name in pdfalto_names]

#
# negative_diff_documents = list(filter(lambda d: d[1]['stats']['diff'] < 0, documents[corpus].items()))
# print(f"Negative diff documents {len(negative_diff_documents)}")
# print(json.dumps(
#     negative_diff_documents, indent=4
# ))

In [3]:
# Root directory of the PDFAlto text output; compute_statistics treats each of
# its sub-directories as one corpus.
INPUT_CORPORA_PDFALTO = "/Volumes/ExtremePro/sciencialab/dimensions/pdfalto-evaluation/pdfalto/lin64-0.5"
# Statistics collected by the cells below, keyed by the label of each Grobid run.
documents_output = {}

In [4]:
# Root directory of the Grobid outputs; the cells below append one
# sub-directory name per Grobid configuration being evaluated.
INPUT_CORPORA_GROBID = "/Volumes/ExtremePro/sciencialab/dimensions/pdfalto-evaluation/grobid.v6_2"

## Results Grobid (normal)

In [5]:
# Compare PDFAlto with the Grobid output stored under "normal-sentences" and
# keep the statistics under the "standard" label.
# NOTE(review): the output recorded below for this cell is a FileNotFoundError
# for the "normal-sentences" directory, yet the result tables further down
# contain a "standard" row -- the recorded outputs look stale; re-run to confirm.
documents_output["standard"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/normal-sentences")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/ExtremePro/sciencialab/dimensions/pdfalto-evaluation/grobid.v6_2/normal-sentences'

In [6]:
# Same comparison against the Grobid output stored under "normal-paragraphs".
documents_output["standard (paragraphs)"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/normal-paragraphs")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for Grobid: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


Processing corpora: 100%|██████████| 4/4 [00:48<00:00, 12.21s/corpus]

Corpus: PLOS_1000, Processed 1000, skipped: 0
Corpus: eLife_984, Processed 984, skipped: 0
Corpus: biorxiv-10k-test-2000, Processed 2000, skipped: 0
Corpus: PMC_sample_1943, Processed 1943, skipped: 0





## Results Grobid (light process)


In [7]:
# Compare PDFAlto with the Grobid output stored under "light-sentences".
documents_output["light"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/light-sentences")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for Grobid: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


Processing corpora: 100%|██████████| 4/4 [00:52<00:00, 13.11s/corpus]

Corpus: PLOS_1000, Processed 1000, skipped: 0
Corpus: eLife_984, Processed 984, skipped: 0
Corpus: biorxiv-10k-test-2000, Processed 2000, skipped: 0
Corpus: PMC_sample_1943, Processed 1943, skipped: 0





In [8]:
# Same comparison against the Grobid output stored under "light-paragraphs".
documents_output["light (paragraphs)"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/light-paragraphs")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for Grobid: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


Processing corpora: 100%|██████████| 4/4 [00:48<00:00, 12.17s/corpus]

Corpus: PLOS_1000, Processed 1000, skipped: 0
Corpus: eLife_984, Processed 984, skipped: 0
Corpus: biorxiv-10k-test-2000, Processed 2000, skipped: 0
Corpus: PMC_sample_1943, Processed 1943, skipped: 0





## Results Grobid (normal + collectDiscardedText)

In [9]:
# Compare PDFAlto with the Grobid output stored under "normal-discarded-sentences".
documents_output["standard + collectDiscardedText"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/normal-discarded-sentences")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for Grobid: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


Processing corpora: 100%|██████████| 4/4 [00:57<00:00, 14.43s/corpus]

Corpus: PLOS_1000, Processed 1000, skipped: 0
Corpus: eLife_984, Processed 984, skipped: 0
Corpus: biorxiv-10k-test-2000, Processed 2000, skipped: 0
Corpus: PMC_sample_1943, Processed 1943, skipped: 0





In [10]:
# Same comparison against the Grobid output stored under "normal-discarded-paragraphs".
documents_output["standard + collectDiscardedText (paragraphs)"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/normal-discarded-paragraphs")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for Grobid: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


Processing corpora: 100%|██████████| 4/4 [00:49<00:00, 12.41s/corpus]

Corpus: PLOS_1000, Processed 1000, skipped: 0
Corpus: eLife_984, Processed 984, skipped: 0
Corpus: biorxiv-10k-test-2000, Processed 2000, skipped: 0
Corpus: PMC_sample_1943, Processed 1943, skipped: 0





## Results Grobid (light process + collectDiscardedText)

In [11]:
# Compare PDFAlto with the Grobid output stored under "light-discarded-sentences".
documents_output["light + collectDiscardedText"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/light-discarded-sentences")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for Grobid: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


Processing corpora: 100%|██████████| 4/4 [01:22<00:00, 20.60s/corpus]

Corpus: PLOS_1000, Processed 1000, skipped: 0
Corpus: eLife_984, Processed 984, skipped: 0
Corpus: biorxiv-10k-test-2000, Processed 2000, skipped: 0
Corpus: PMC_sample_1943, Processed 1943, skipped: 0





In [12]:
# Same comparison against the Grobid output stored under "light-discarded-paragraphs".
documents_output["light + collectDiscardedText (paragraphs)"] = compute_statistics(INPUT_CORPORA_PDFALTO, f"{INPUT_CORPORA_GROBID}/light-discarded-paragraphs")

Found the following corpus directories for PDFAlto: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for Grobid: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']


Processing corpora: 100%|██████████| 4/4 [00:56<00:00, 14.07s/corpus]

Corpus: PLOS_1000, Processed 1000, skipped: 0
Corpus: eLife_984, Processed 984, skipped: 0
Corpus: biorxiv-10k-test-2000, Processed 2000, skipped: 0
Corpus: PMC_sample_1943, Processed 1943, skipped: 0





In [13]:
def compute_averages(documents_):
    """Fill in the per-corpus ``'average'`` entry of every process type.

    For each corpus, the arithmetic mean over its processed documents is
    computed for ``pdfalto``, ``grobid``, ``diff``, ``diff_tokens``, ``perc``
    and ``perc_tokens``. Note that the percentages are averaged per document
    (mean of percentages), not derived from the summed lengths.

    A corpus without any processed document gets 0.0 for every average instead
    of raising ``ZeroDivisionError``.

    Args:
        documents_: mapping ``{process type: {corpus: {'processed': {...},
            'average': {...}, ...}}}``. It is updated in place.

    Returns:
        The same ``documents_`` object, for convenience.
    """
    top_level_keys = ('pdfalto', 'grobid')
    stats_keys = ('diff', 'diff_tokens', 'perc', 'perc_tokens')

    for process in documents_.values():
        for documents in process.values():
            processed = list(documents['processed'].values())
            count = len(processed)
            average = documents['average']

            if count == 0:
                # Nothing to average: keep the entry well-formed so that the
                # reporting code can still read every key.
                for key in top_level_keys + stats_keys:
                    average[key] = 0.0
                continue

            for key in top_level_keys:
                average[key] = sum(d[key] for d in processed) / count
            for key in stats_keys:
                average[key] = sum(d['stats'][key] for d in processed) / count

    return documents_


# compute_averages updates documents_output in place and returns that same
# object, so both names refer to the same data from here on.
documents_output_with_average = compute_averages(documents_output)


## Results

### Character level evaluation
#### Average percentage difference in characters

In [14]:
import pandas as pd

# Every result table built in this notebook, keyed by metric name.
results = {}

# One row per process type, one column per corpus (in first-seen order),
# filled with the average 'perc' value of that corpus.
columns = ["Process type"]
aggregated_data = []
for process_type, process in documents_output_with_average.items():
    for corpus in process:
        if corpus not in columns:
            columns.append(corpus)
    aggregated_data.append(
        [process_type] + [documents['average']['perc'] for documents in process.values()]
    )

df = pd.DataFrame(aggregated_data, columns=columns)
results['character_level'] = df

df


Unnamed: 0,Process type,PLOS_1000,eLife_984,biorxiv-10k-test-2000,PMC_sample_1943
0,standard,11.821514,9.639087,9.269665,6.451962
1,standard (paragraphs),11.820248,9.633689,9.230569,6.450445
2,light,12.459632,6.857511,8.837437,8.516626
3,light (paragraphs),12.459843,6.856247,8.80008,8.515261
4,standard + collectDiscardedText,8.966189,4.817297,3.148627,4.103599
5,standard + collectDiscardedText (paragraphs),8.964434,4.815603,3.104529,4.102097
6,light + collectDiscardedText,3.174446,3.333342,4.081236,2.13362
7,light + collectDiscardedText (paragraphs),3.174235,3.32793,4.039645,2.132294


#### Average difference in character count

In [15]:
import pandas as pd

# One row per process type, one column per corpus (in first-seen order),
# filled with the average 'diff' value of that corpus.
columns = ["Process type"]
aggregated_data = []
for process_type, process in documents_output_with_average.items():
    for corpus in process:
        if corpus not in columns:
            columns.append(corpus)
    aggregated_data.append(
        [process_type] + [documents['average']['diff'] for documents in process.values()]
    )

df = pd.DataFrame(aggregated_data, columns=columns)
results['character_level_diff'] = df

df

Unnamed: 0,Process type,PLOS_1000,eLife_984,biorxiv-10k-test-2000,PMC_sample_1943
0,standard,5042.305,7621.560976,4870.395,2533.489449
1,standard (paragraphs),5041.687,7619.377033,4861.6145,2532.909933
2,light,4893.161,5109.507114,4026.266,2988.691199
3,light (paragraphs),4893.237,5108.392276,4015.9255,2988.205867
4,standard + collectDiscardedText,3844.632,3867.505081,1654.9635,1669.83016
5,standard + collectDiscardedText (paragraphs),3844.061,3866.181911,1644.55,1669.277921
6,light + collectDiscardedText,1647.508,2568.152439,2011.3505,906.528049
7,light + collectDiscardedText (paragraphs),1647.44,2564.443089,2000.91,906.061245


### Token level evaluation
#### Average percentage difference in tokens


In [16]:
import pandas as pd

# One row per process type, one column per corpus (in first-seen order),
# filled with the average 'perc_tokens' value of that corpus.
columns = ["Process type"]
aggregated_data = []
for process_type, process in documents_output.items():
    for corpus in process:
        if corpus not in columns:
            columns.append(corpus)
    aggregated_data.append(
        [process_type] + [documents['average']['perc_tokens'] for documents in process.values()]
    )

df = pd.DataFrame(aggregated_data, columns=columns)
results['token_level'] = df

df


Unnamed: 0,Process type,PLOS_1000,eLife_984,biorxiv-10k-test-2000,PMC_sample_1943
0,standard,8.079996,5.460726,7.451197,3.640059
1,standard (paragraphs),8.073511,5.454855,7.377015,3.6392
2,light,8.931686,2.749526,5.66344,5.121684
3,light (paragraphs),8.932267,2.748054,5.591514,5.125644
4,standard + collectDiscardedText,5.163359,-0.16436,0.702553,0.998441
5,standard + collectDiscardedText (paragraphs),5.156289,-0.167808,0.619746,0.997513
6,light + collectDiscardedText,-0.16709,-0.529563,1.38579,-0.797124
7,light + collectDiscardedText (paragraphs),-0.166914,-0.535557,1.310566,-0.793043


#### Average difference in token count


In [17]:
import pandas as pd

# One row per process type, one column per corpus (in first-seen order),
# filled with the average 'diff_tokens' value of that corpus.
columns = ["Process type"]
aggregated_data = []
for process_type, process in documents_output.items():
    for corpus in process:
        if corpus not in columns:
            columns.append(corpus)
    aggregated_data.append(
        [process_type] + [documents['average']['diff_tokens'] for documents in process.values()]
    )

df = pd.DataFrame(aggregated_data, columns=columns)
results['token_level_diff'] = df
df

Unnamed: 0,Process type,PLOS_1000,eLife_984,biorxiv-10k-test-2000,PMC_sample_1943
0,standard,562.686,788.930894,735.182,251.120947
1,standard (paragraphs),562.103,788.402439,730.7985,251.030365
2,light,552.945,345.103659,412.3385,294.220278
3,light (paragraphs),552.99,344.844512,407.733,294.476068
4,standard + collectDiscardedText,346.402,11.170732,40.245,71.429748
5,standard + collectDiscardedText (paragraphs),345.828,10.676829,35.247,71.339166
6,light + collectDiscardedText,10.675,-66.486789,60.0085,-53.407617
7,light + collectDiscardedText (paragraphs),10.695,-67.210366,55.369,-53.145651


In [18]:

from datetime import datetime

# Render the two percentage tables as Markdown (to_markdown needs `tabulate`).
cl = results['character_level']
cl_markdown = cl.to_markdown(index=False)
tl = results['token_level']
tl_markdown = tl.to_markdown(index=False)

# Get current date and time
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

# Create filename with date and time
filename = f"output_{current_time}.md"

# The output directory may not exist yet (e.g. on a fresh checkout); create it
# instead of failing with FileNotFoundError when opening the report.
os.makedirs("results", exist_ok=True)

# Save markdown tables to file; UTF-8 is explicit so the report does not
# depend on the platform's default encoding.
with open(os.path.join("results", filename), "w", encoding="utf-8") as file:
    file.write("# Characters level evaluation\n")
    file.write("\n")
    file.write(cl_markdown)
    file.write("\n\n")
    file.write("# Tokens level evaluation\n")
    file.write("\n")
    file.write(tl_markdown)


## Analysis of specific documents

In [19]:
# For every process type and corpus, print the two documents with the smallest
# and the two with the largest character 'diff' value.
for process_type, process in documents_output.items():
    print(process_type)
    for corpus, documents in process.items():
        print(f"\t{corpus}")
        entries = documents['processed'].items()

        sorted_by_diff = sorted(entries, key=lambda entry: entry[1]['stats']['diff'])
        # Token-level ordering, kept available for manual inspection.
        sorted_by_diff_tokens = sorted(entries, key=lambda entry: entry[1]['stats']['diff_tokens'])

        for extremes in (sorted_by_diff[:2], sorted_by_diff[-2:]):
            print("".join(
                f"\n\t\t\t -{doc_id}: {doc['stats']['diff']}" for doc_id, doc in extremes
            ))
        # print(f"Files to check:\n {[item[0] for item in sorted_by_diff_tokens[:2]]}")


standard
	PLOS_1000

			 -pgen.1002782/pgen.1002782: -9513
			 -pone.0002784/pone.0002784: -4406

			 -pone.0270278/pone.0270278: 19209
			 -pone.0278380/pone.0278380: 19477
	eLife_984

			 -83628/elife-83628-v1: -19977
			 -63910/elife-63910-v2: -5693

			 -36495/elife-36495-v1: 71005
			 -73679/elife-73679-v2: 112692
	biorxiv-10k-test-2000

			 -392563v1/392563v1: -43153
			 -030122v1/030122v1: -5648

			 -413708v1/413708v1: 56583
			 -454355v1/454355v1: 96222
	PMC_sample_1943

			 -Mucosal_Immunol_2011_Jul_30_4(4)_468-478/mi20118a: -4401
			 -Protein_Sci_2010_Nov_10_19(11)_2131-2140/pro0019-2131: -3903

			 -Mol_Phylogenet_Evol_2010_Oct_57(1)_266-284/main: 21114
			 -J_Adv_Nurs_2011_Feb_67(2)_228-250/jan0067-0228: 23014
standard (paragraphs)
	PLOS_1000

			 -pgen.1002782/pgen.1002782: -9513
			 -pone.0002784/pone.0002784: -4406

			 -pone.0270278/pone.0270278: 19209
			 -pone.0278380/pone.0278380: 19477
	eLife_984

			 -83628/elife-83628-v1: -20000
			 -63910/elife-63910-v2: -5693

