In [2]:
import spacy
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sentence_transformers import SentenceTransformer, util

In [3]:
# Load the pickled evaluation set and preview it.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
dataset_path = "../data/docs-dataset.pkl"
dataset = pd.read_pickle(dataset_path)
print(f"Number of documents to compare: {len(dataset)}")
dataset.head()  # head() shows the first five rows by default

Number of documents to compare: 18


Unnamed: 0,summary_id,source_text,target_text,sids
0,C00-2123,[The authors in this paper describe a search p...,[Word Re-ordering and DP-based Search in Stati...,"[(1, 2), (2, 3), (3, 166), (4, 167), (5, 36), ..."
1,C02-1025,[This paper presents a maximum entropy-based n...,[Named Entity Recognition: A Maximum Entropy A...,"[(1, 2), (2, 7), (3, 205), (4, 63), (4, 64), (..."
2,C10-1045,[This paper offers a broad insight into of Ara...,"[Better Arabic Parsing: Baselines, Evaluations...","[(1, 2), (2, 27), (3, 8), (4, 8), (5, 23), (5,..."
3,D10-1044_swastika,[Foster et all describe a new approach to SMT ...,[Discriminative Instance Weighting for Domain ...,"[(1, 2), (2, 3), (3, 4), (4, 145), (5, 146), (..."
4,D10-1083,"[In this paper, the authors are of the opinion...","[Simple Type-Level Unsupervised POS Tagging, P...","[(1, 15), (2, 17), (3, 20), (4, 35), (5, 38), ..."


In [None]:
# Jaccard index (size of the intersection over size of the union) of two sets
def intersection(source, target):
    """Return the Jaccard index of two sets.

    Despite its name, this does not return the intersection itself: it returns
    the number of shared elements divided by the number of distinct elements
    across both inputs.

    Args:
        source: A set of hashable elements.
        target: A set (or any iterable) of hashable elements.

    Returns:
        A float between 0 and 1; ``0.0`` when both inputs are empty.

    Raises:
        AttributeError: If ``source`` does not provide the set methods used
            here. Such errors used to be silently reported as a score of 0.
    """
    union = source.union(target)
    # An empty union is the only legitimate "no score" case (it is what used to
    # raise ZeroDivisionError). Any other failure is a caller bug and must
    # surface instead of being turned into a zero score.
    if not union:
        return float(0)
    return len(source.intersection(target)) / len(union)

# Feature calculation
def cos_sim(source_sentence, target_sentence, model):
    """Cosine similarity of two sentence embeddings, rescaled to [0, 1].

    Args:
        source_sentence: Input handed to ``model.encode`` for the first sentence.
        target_sentence: Input handed to ``model.encode`` for the second sentence.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.

    Returns:
        ``(cosine + 1) / 2`` as a Python float.
    """
    embeddings = [
        model.encode(sentence, convert_to_tensor=True)
        for sentence in (source_sentence, target_sentence)
    ]
    raw_cosine = util.cos_sim(embeddings[0], embeddings[1]).item()
    # Shift and halve so the [-1, 1] cosine range maps onto [0, 1].
    return (raw_cosine + 1) / 2

def jaccard_similarity(source_sentence, target_sentence):
    """Score the overlap of the non-stop-word token texts of two sentences.

    Both arguments are iterated as sequences of tokens exposing ``text`` and
    ``is_stop``; the score itself is computed by ``intersection``.
    """
    def content_words(sentence):
        # Distinct surface forms of every token that is not a stop word.
        return {token.text for token in sentence if not token.is_stop}

    return intersection(
        content_words(source_sentence),
        content_words(target_sentence),
    )

def entity_overlap(source_sentence, target_sentence):
    """Score the overlap of the entity texts found in two sentences.

    Both arguments must expose an ``ents`` iterable whose items have a ``text``
    attribute; the score itself is computed by ``intersection``.
    """
    source_entities = {entity.text for entity in source_sentence.ents}
    target_entities = {entity.text for entity in target_sentence.ents}
    return intersection(source_entities, target_entities)

def pos_overlap(source_sentence, target_sentence):
    """Score the overlap of the part-of-speech tags used in two sentences.

    Only tokens that are not stop words contribute their ``pos_`` tag; the
    score itself is computed by ``intersection``.
    """
    tag_sets = [
        set(token.pos_ for token in sentence if not token.is_stop)
        for sentence in (source_sentence, target_sentence)
    ]
    return intersection(tag_sets[0], tag_sets[1])

In [None]:
# Create sentence pairs from document & calculate weighted similarities
def pairwise(source_text, target_text, source_tokenized, target_tokenized, sids,
             model=None, weights=None):
    """Score every (source sentence, target sentence) pair of one document.

    Args:
        source_text: Sequence of source sentences (inputs to ``model.encode``).
        target_text: Sequence of target sentences (inputs to ``model.encode``).
        source_tokenized: Tokenized sentences, index-aligned with ``source_text``.
        target_tokenized: Tokenized sentences, index-aligned with ``target_text``.
        sids: Iterable of hashable ``(source_index, target_index)`` tuples,
            1-based, naming the pairs that are labelled 1.
        model: Sentence encoder exposing ``encode(..., convert_to_tensor=True)``.
            Defaults to the notebook-level ``sentence_tf``.
        weights: Optional mapping with the keys ``"cosine"``, ``"jaccard"``,
            ``"entities"`` and ``"pos"``. Defaults to 0.75 / 0.15 / 0.05 / 0.05.

    Returns:
        A tuple ``(pairs, labels, feature_scores, weighted_sums)`` of four
        lists with one entry per pair, ordered source-major (every target for
        the first source sentence, then every target for the second, ...).
    """
    if model is None:
        # Looked up at call time, so the encoder may be created in a later cell.
        model = sentence_tf
    if weights is None:
        weights = {"cosine": 0.75, "jaccard": 0.15, "entities": 0.05, "pos": 0.05}

    # Set membership is O(1); scanning the sids list was repeated for every pair.
    positive_pairs = set(sids)

    # Encode each sentence once instead of once per pair. Sentences are still
    # encoded one at a time (not batched), exactly as cos_sim does, so the
    # resulting scores should be unchanged.
    source_embeddings = [model.encode(s, convert_to_tensor=True) for s in source_text]
    target_embeddings = [model.encode(p, convert_to_tensor=True) for p in target_text]

    pairs = []
    labels = []
    feature_scores = []
    weighted_sums = []

    for i, s in enumerate(source_text):
        s_tok = source_tokenized[i]
        s_emb = source_embeddings[i]
        for j, p in enumerate(target_text):
            p_tok = target_tokenized[j]

            pairs.append((s, p))
            # sids are 1-based while the loop indices are 0-based.
            labels.append(1 if (i + 1, j + 1) in positive_pairs else 0)

            # Same formula as cos_sim(s, p, model): cosine rescaled from
            # [-1, 1] to [0, 1], computed from the cached embeddings.
            cos = (util.cos_sim(s_emb, target_embeddings[j]).item() + 1) / 2
            jac = jaccard_similarity(s_tok, p_tok)
            ent = entity_overlap(s_tok, p_tok)
            pos = pos_overlap(s_tok, p_tok)

            feature_scores.append({
                "cosine": cos,
                "jaccard": jac,
                "entities": ent,
                "pos": pos,
            })

            # Summed in a fixed order so the default weights reproduce the
            # previous floating-point result exactly.
            weighted_sums.append(
                cos * weights["cosine"]
                + jac * weights["jaccard"]
                + ent * weights["entities"]
                + pos * weights["pos"]
            )

    return pairs, labels, feature_scores, weighted_sums

In [None]:
# Load the spaCy pipeline used for tokenization and the sentence encoder used
# for the embedding similarity.
spacy_pipe = spacy.load("en_core_web_md")
sentence_tf = SentenceTransformer('all-MiniLM-L6-v2')

summary_ids = dataset.summary_id.to_list()
source_docs = dataset.source_text.to_list()
target_docs = dataset.target_text.to_list()
sids_all = dataset.sids.to_list()

results = []

# Evaluate one document at a time: score all sentence pairs, predict, and
# compare the predictions with the gold labels.
for summary_id, source_doc, target_doc, sids in zip(
    summary_ids, source_docs, target_docs, sids_all
):
    source_tokenized = [spacy_pipe(s) for s in source_doc]
    target_tokenized = [spacy_pipe(s) for s in target_doc]

    pairs, labels, feature_scores, weighted_sums = pairwise(
        source_doc, target_doc, source_tokenized, target_tokenized, sids
    )

    # Compute the maximum once; it used to be recomputed for every element of
    # the comprehension (quadratic in the number of pairs). default=None keeps
    # a document without pairs from raising here, matching the old behavior.
    best_score = max(weighted_sums, default=None)
    # Only pairs whose score equals the document-wide maximum are predicted 1.
    raw_predictions = [1 if ws == best_score else 0 for ws in weighted_sums]
    f1 = f1_score(labels, raw_predictions)
    acc = accuracy_score(labels, raw_predictions)

    results.append({
        "summary_id": summary_id,
        "labels": labels,
        "predictions": raw_predictions,
        "f1": f1,
        "accuracy": acc,
    })

In [None]:
# One row per document: id, gold labels, predictions, F1 and accuracy.
df = pd.DataFrame(data=results)
df.head()  # head() shows the first five rows by default