In [61]:
import os
import re
import time
import warnings

import pandas as pd

from pathlib import Path
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    PyPDFium2Loader,
    PDFMinerLoader,
    PDFPlumberLoader,
)

warnings.filterwarnings("ignore")

In [57]:
# similarity score functions

def levenshtein_similarity(ground_truth, pred):
    return SequenceMatcher(None, ground_truth, pred).ratio()


def jaccard_similarity(ground_truth, pred):
    words_ground_truth, words_pred = set(ground_truth.split()), set(pred.split())
    intersection = words_ground_truth.intersection(words_pred)
    union = words_ground_truth.union(words_pred)
    return len(intersection) / len(union)


def cosine_similarity_score(ground_truth, pred):
    vectorizer = TfidfVectorizer().fit_transform([ground_truth, pred])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1]


similarity_functions = {
    'Levenshtein': levenshtein_similarity,
    'Jaccard': jaccard_similarity,
    'Cosine': cosine_similarity_score
}

In [80]:
def process_articles(loader_class):
    article_scores = {name: [] for name in similarity_functions.keys()}
    path = Path('articles')
    num_articles = 0
    start_time = time.time()

    for article in path.rglob('*'):
        if loader_class.__name__ == 'PDFMinerLoader':
            loader = loader_class(article, concatenate_pages=False)
        else:
            loader = loader_class(article)
        pages = loader.load()
        num_pages = len(pages)

        pattern = r'([^/\\]+)\.[^/\\]+$'
        filename = re.search(pattern, str(article)).group(1)

        total_scores = {name: 0 for name in similarity_functions.keys()}

        for i in range(num_pages):
            ground_truth_path = os.path.join(f'ground_truths/{filename}', f'{i+1}.txt')
            with open(ground_truth_path, 'r') as f:
                ground_truth = f.read()

            pred = pages[i].page_content

            for name, func in similarity_functions.items():
                page_score = func(ground_truth, pred)
                total_scores[name] += page_score

        for name in total_scores.keys():
            article_scores[name].append(total_scores[name] / num_pages)
        num_articles += 1

    elapsed_time = time.time() - start_time
    avg_scores = {name: sum(scores) / num_articles for name, scores in article_scores.items()}
    avg_scores['Time (s)'] = elapsed_time
    avg_scores['Loader'] = loader_class.__name__
    return avg_scores

In [86]:
loader_classes = [PyPDFLoader, PyMuPDFLoader, PyPDFium2Loader, PDFMinerLoader, PDFPlumberLoader]

results = []
for loader_class in loader_classes:
    avg_scores = process_articles(loader_class)
    results.append(avg_scores)

df = pd.DataFrame(results)
df.set_index('Loader', inplace=True)
df

Unnamed: 0_level_0,Levenshtein,Jaccard,Cosine,Time (s)
Loader,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PyPDFLoader,0.818865,0.707674,0.883303,9.180178
PyMuPDFLoader,0.850488,0.797807,0.90422,1.378215
PyPDFium2Loader,0.851828,0.805883,0.907525,1.620844
PDFMinerLoader,0.756942,0.78681,0.886505,23.804049
PDFPlumberLoader,0.788298,0.750844,0.876181,32.255733
