# PDFAlto versions comparison and evaluation







In [7]:
from tqdm import tqdm
import os


def _measure_text(text):
    """Return ``(chars_with_spaces, chars_without_spaces, token_count)`` for *text*.

    The text is split on whitespace with ``str.split()``. The first length is
    that of the tokens re-joined with single spaces, the second that of the
    tokens concatenated, and the third is the number of tokens.
    """
    tokens = text.split()
    return len(' '.join(tokens)), len(''.join(tokens)), len(tokens)


def compute_statistics(input_pdfalto_A, input_pdfalto_B):
    """Compare the ``.txt`` files found under two output directories.

    For every corpus sub-directory present in both inputs, each ``.txt`` file
    found (recursively) under ``input_pdfalto_A/<corpus>`` is paired with the
    file at the same relative path under ``input_pdfalto_B/<corpus>``.

    Args:
        input_pdfalto_A: Root directory of the first output (the "A" side).
        input_pdfalto_B: Root directory of the second output (the "B" side).

    Returns:
        A dict keyed by corpus name. Each value holds:

        * ``'processed'``: dict keyed by file id (path relative to the corpus
          directory, without the ``.txt`` suffix) with the raw text lengths
          (``'pdfalto_A'``, ``'pdfalto_B'``), the token counts and, under
          ``'stats'``, the A-minus-B differences in characters (with and
          without whitespace) and in tokens.
        * ``'skipped'``: list of ``(file_id, expected_B_path)`` tuples for
          files that exist only on the A side.
        * ``'pdfalto_A'`` / ``'pdfalto_B'``: the two input roots.
        * ``'average'``: placeholder values, all initialised to 0.
    """
    txt_suffix = ".txt"

    common_subdirs = extract_common_repositories(input_pdfalto_A, input_pdfalto_B)
    print("Found the following corpus directories: ", common_subdirs)

    documents = {}
    for corpus in common_subdirs:
        print(corpus)
        skipped = []
        processed = {}
        documents[corpus] = {
            'processed': processed,
            'skipped': skipped,
            'pdfalto_A': input_pdfalto_A,
            'pdfalto_B': input_pdfalto_B,
            'average': {
                'pdfalto_A': 0,
                'pdfalto_B': 0,
                'diff': 0,
                'perc': 0
            }
        }

        corpus_dir_A = os.path.join(input_pdfalto_A, corpus)
        for root, _dirs, files in tqdm(os.walk(corpus_dir_A)):
            for file in files:
                if not file.endswith(txt_suffix):
                    continue

                path_A = os.path.join(root, file)
                # File id = path relative to the corpus directory, minus the
                # trailing ".txt". Only the suffix is removed: a ".txt"
                # occurring elsewhere in the path must be kept, otherwise the
                # B-side path built below would point to the wrong place.
                file_id = os.path.relpath(path_A, corpus_dir_A)[:-len(txt_suffix)]

                related_B_file = os.path.join(input_pdfalto_B, corpus, file_id + txt_suffix)
                if not os.path.exists(related_B_file):
                    # No counterpart on the B side: nothing to compare, so
                    # the A file is not even read.
                    skipped.append((file_id, related_B_file))
                    continue

                # Explicit encoding so that the character counts do not depend
                # on the platform's locale default.
                with open(path_A, "r", encoding="utf-8") as f:
                    txt_pdfalto_A = f.read()
                with open(related_B_file, "r", encoding="utf-8") as f:
                    txt_pdfalto_B = f.read()

                spaces_len_A, no_spaces_len_A, token_len_A = _measure_text(txt_pdfalto_A)
                spaces_len_B, no_spaces_len_B, token_len_B = _measure_text(txt_pdfalto_B)

                processed[file_id] = {
                    # Raw lengths, i.e. before any whitespace normalisation.
                    'pdfalto_A': len(txt_pdfalto_A),
                    'pdfalto_A_tokens': token_len_A,
                    'pdfalto_B': len(txt_pdfalto_B),
                    'pdfalto_B_tokens': token_len_B,
                    # All differences are A minus B: a negative value means
                    # the B side contains more characters/tokens.
                    'stats': {
                        'diff_chars_no_spaces': no_spaces_len_A - no_spaces_len_B,
                        'diff_chars_spaces': spaces_len_A - spaces_len_B,
                        'diff_tokens': token_len_A - token_len_B
                    }
                }
        print(f"Processed documents {len(processed)}")
        print(f"Skipped documents {len(skipped)}")

    return documents


def extract_common_repositories(input_pdfalto, input_grobid):
    """Return the names of the sub-directories present in both input roots.

    Only directories are considered: plain files sitting next to the corpus
    directories (for example ``.DS_Store``) are ignored, since they cannot
    hold any document and would otherwise be reported as empty corpora.

    Args:
        input_pdfalto: Root directory of the first output (PDFAlto A).
        input_grobid: Root directory of the second output (PDFAlto B).

    Returns:
        The common sub-directory names, in the order in which
        ``os.listdir(input_grobid)`` returned them.
    """
    # Find the subdirectories in the input corpora
    subdirs_pdfalto = [x for x in os.listdir(input_pdfalto)
                       if os.path.isdir(os.path.join(input_pdfalto, x))]
    print(f"Found the following corpus directories for PDFAlto A: {subdirs_pdfalto}")
    subdirs_grobid = [x for x in os.listdir(input_grobid)
                      if os.path.isdir(os.path.join(input_grobid, x))]
    print(f"Found the following corpus directories for PDFAlto B: {subdirs_grobid}")
    # find the common subdirectories (set lookup instead of scanning the list
    # once per candidate)
    known_in_pdfalto = set(subdirs_pdfalto)
    common_subdirs = [x for x in subdirs_grobid if x in known_in_pdfalto]
    return common_subdirs

# Results of every comparison run, keyed by a process-type label; the values
# are the dicts returned by compute_statistics().
documents_output = {}


## Comparison

In [8]:
# Root directories of the two outputs to compare; both are listed for corpus
# sub-directories by compute_statistics().
# NOTE(review): judging by the directory names, A looks like a 0.5 release
# build and B like a build of commit 8cf749a — confirm.
pdfalto_A = "/Volumes/ExtremePro/sciencialab/dimensions/pdfalto-evaluation/output_pdfalto/lin64-0.5"
pdfalto_B = "/Volumes/ExtremePro/sciencialab/dimensions/pdfalto-evaluation/output_pdfalto/output-pdfalto-8cf749a-250505"

# Store the comparison under the "standard" process-type label.
documents_output["standard"] = compute_statistics(pdfalto_A, pdfalto_B)

Found the following corpus directories for PDFAlto A: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories for PDFAlto B: ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
Found the following corpus directories:  ['PLOS_1000', 'eLife_984', 'biorxiv-10k-test-2000', 'PMC_sample_1943']
PLOS_1000


1001it [00:11, 88.46it/s]


Processed documents 1000
Skipped documents 0
eLife_984


985it [00:21, 46.00it/s]


Processed documents 984
Skipped documents 0
biorxiv-10k-test-2000


2001it [00:30, 65.76it/s]


Processed documents 2000
Skipped documents 0
PMC_sample_1943


1944it [00:24, 78.09it/s] 


Processed documents 1943
Skipped documents 0


In [9]:
def _mean(values):
    """Return the arithmetic mean of *values*, or 0 when *values* is empty."""
    return sum(values) / len(values) if values else 0


def compute_averages(documents_):
    """Fill in the ``'average'`` entry of every corpus, in place.

    Args:
        documents_: Dict of process type -> corpus -> corpus record. Each
            corpus record must provide a ``'processed'`` dict (file id ->
            per-document figures, including a ``'stats'`` dict with the
            ``diff_*`` values) and an ``'average'`` dict to update.

    Returns:
        The same ``documents_`` object. For each corpus, ``'average'`` now
        holds the mean raw lengths (``'pdfalto_A'``, ``'pdfalto_B'``), the
        mean differences (``'diff_chars_no_spaces'``, ``'diff_chars_spaces'``,
        ``'diff_tokens'``) and the number of documents whose difference is
        exactly zero (``'matching_documents_*'``). A corpus without any
        processed document gets 0 for every figure instead of raising
        ``ZeroDivisionError``.
    """
    for process in documents_.values():
        for documents in process.values():
            processed = list(documents['processed'].values())

            diffs_no_spaces = [d['stats']['diff_chars_no_spaces'] for d in processed]
            diffs_spaces = [d['stats']['diff_chars_spaces'] for d in processed]
            diffs_tokens = [d['stats']['diff_tokens'] for d in processed]

            average = documents['average']
            average['pdfalto_A'] = _mean([d['pdfalto_A'] for d in processed])
            average['pdfalto_B'] = _mean([d['pdfalto_B'] for d in processed])
            average['diff_chars_no_spaces'] = _mean(diffs_no_spaces)
            average['diff_chars_spaces'] = _mean(diffs_spaces)
            average['diff_tokens'] = _mean(diffs_tokens)
            # These three are counts, not averages: documents for which both
            # sides agree exactly on the given measure.
            average['matching_documents_chars_spaces'] = diffs_spaces.count(0)
            average['matching_documents_chars_no_spaces'] = diffs_no_spaces.count(0)
            average['matching_documents_tokens'] = diffs_tokens.count(0)

    return documents_


# compute_averages() updates its argument in place and returns it, so this name
# refers to the same dict as documents_output.
documents_output_with_average = compute_averages(documents_output)

## Results

## Matching documents


In [10]:
import pandas as pd

# One table row per measure: (row label, key to read from documents['average']).
matching_rows = (
    ("Chars (no spaces)", 'matching_documents_chars_no_spaces'),
    ("Chars (spaces)", 'matching_documents_chars_spaces'),
    ("Tokens", 'matching_documents_tokens'),
)

columns = ["Chars/Tokens"]
aggregated_data = []
for process_type, process in documents_output_with_average.items():
    # Register each corpus as a column the first time it is seen.
    for corpus in process:
        if corpus not in columns:
            columns.append(corpus)

    # Number of documents on which both versions agree, per corpus.
    for row_label, average_key in matching_rows:
        process_data = [row_label]
        for corpus, documents in process.items():
            process_data.append(documents['average'][average_key])
        aggregated_data.append(process_data)

pd.DataFrame(aggregated_data, columns=columns)

Unnamed: 0,Chars/Tokens,PLOS_1000,eLife_984,biorxiv-10k-test-2000,PMC_sample_1943
0,Chars (no spaces),748,511,1548,1451
1,Chars (spaces),391,187,740,868
2,Tokens,404,214,761,892


## Average difference

In [11]:
import pandas as pd

# One table row per measure: (row label, key to read from documents['average']).
difference_rows = (
    ("Chars (no spaces)", 'diff_chars_no_spaces'),
    ("Chars (spaces)", 'diff_chars_spaces'),
    ("Tokens", 'diff_tokens'),
)

columns = ["Process type"]
aggregated_data = []
for process_type, process in documents_output_with_average.items():
    # Register each corpus as a column the first time it is seen.
    for corpus in process:
        if corpus not in columns:
            columns.append(corpus)

    # Mean per-document difference (A minus B), per corpus.
    for row_label, average_key in difference_rows:
        process_data = [row_label]
        for corpus, documents in process.items():
            process_data.append(documents['average'][average_key])
        aggregated_data.append(process_data)

pd.DataFrame(aggregated_data, columns=columns)

Unnamed: 0,Process type,PLOS_1000,eLife_984,biorxiv-10k-test-2000,PMC_sample_1943
0,Chars (no spaces),-3.196,-7.197154,-4.0065,-2.457025
1,Chars (spaces),-4.641,-12.759146,-8.4745,-3.888832
2,Tokens,-1.445,-5.561992,-4.468,-1.431806


## Analysis of specific documents

In [12]:
# For every corpus, print the two documents with the lowest and the two with the
# highest character difference (whitespace excluded).
for process_type, process in documents_output.items():
    print(process_type)
    for corpus, documents in process.items():
        print(f"\t{corpus}")
        entries = list(documents['processed'].items())

        sorted_by_diff = sorted(
            entries,
            key=lambda entry: entry[1]['stats']['diff_chars_no_spaces']
        )

        # Not part of the printed report; only needed by the optional check
        # at the bottom of the loop.
        sorted_by_diff_tokens = sorted(
            entries,
            key=lambda entry: entry[1]['stats']['diff_tokens']
        )

        # Lowest two first, then highest two; each group is printed as one string.
        for extreme in (sorted_by_diff[:2], sorted_by_diff[-2:]):
            lines = [
                f"\n\t\t\t -{doc_id}: {doc['stats']['diff_chars_no_spaces']}"
                for doc_id, doc in extreme
            ]
            print("".join(lines))
        # Optional check of the documents with the lowest token difference:
        # print(f"Files to check:\n {[item[0] for item in sorted_by_diff_tokens[:2]]}")


standard
	PLOS_1000

			 -pone.0278112/pone.0278112: -317
			 -pone.0278819/pone.0278819: -217

			 -pone.0278186/pone.0278186: 0
			 -pone.0278971/pone.0278971: 0
	eLife_984

			 -63910/elife-63910-v2: -169
			 -08954/elife-08954-v1: -144

			 -44795/elife-44795-v2: 0
			 -32143/elife-32143-v2: 0
	biorxiv-10k-test-2000

			 -172486v1/172486v1: -422
			 -187518v1/187518v1: -290

			 -286617v1/286617v1: 0
			 -438572v1/438572v1: 0
	PMC_sample_1943

			 -Comput_Math_Methods_Med_2011_Mar_8_2011_790721/CMMM2011-790721: -60
			 -BMC_Clin_Pathol_2011_May_10_11_6/1472-6890-11-6: -59

			 -Clin_Res_Cardiol_2011_May_4_100(5)_433-438/392_2010_Article_261: 0
			 -Scand_J_Food_Nutr_2007_Sep_51(3)_91-99/FNR-51-091: 0
