In [1]:
# imports
import distance
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import ratio
import spacy
nlp = spacy.load("en_core_web_lg")

from sentence_transformers import SentenceTransformer
model=SentenceTransformer('paraphrase-MiniLM-L6-v2')
from scipy.spatial.distance import cosine as cos_dist

from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

import textstat
import subprocess


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [2]:
# manually selected preprint-publication pairs
from manual_pairs import pairs

In [3]:
polarize = lambda d:((d['label']=='POSITIVE')*2-1) * d['score']
mean = lambda l:sum(l)/len(l) if len(l)>0 else 0
def length_sim(s1, s2):
    l1 = len(s1)
    l2 = len(s2)
    longer = max(l1, l2)
    score = 1 - abs(l1-l2)/longer
    return score


def jaccard_sim(s1, s2):
    return 1 - distance.jaccard(s1, s2)

def sorensen_sim(s1, s2):
    return 1 - distance.sorensen(s1, s2)

def levenshtein_sim(s1, s2):
    return ratio(s1, s2)

# it's not entirely obvious how the cosine similarity was done

def cosine_sim(s1, s2):
    # the article is using stopword removal, Porter stemming, then scikit-learn pairwise cosine similarity
    # no Porter stemming here, just using CountVectorizer with English stopwords
    vect1 = CountVectorizer(stop_words='english')
    
    x1 = vect1.fit_transform([s1,s2])

    # vect2 = CountVectorizer(stop_words='english')

    # x2 = vect2.fit_transform([s2])

    return cosine_similarity(x1[0], x1[1])
    # return x1

def new_sim1(s1, s2):
    e1 = model.encode(s1)
    e2 = model.encode(s2)
    return 1 - cos_dist(e1, e2)

def ent_sim(s1, s2):
    n1 = nlp(s1)
    n2 = nlp(s2)
    ent1_num = [ent.text for ent in n1.ents if ent.label_ in ['CARDINAL','PERCENT']]
    ent2_num = [ent.text for ent in n2.ents if ent.label_ in ['CARDINAL','PERCENT']]

    ## intersection over union metric
    ent1_num_set = set(ent1_num)
    ent2_num_set = set(ent2_num)
    try:
        intersection_union = len(ent1_num_set.intersection(ent2_num_set)) / len(ent1_num_set.union(ent2_num_set))
        num_sim = intersection_union
    except ZeroDivisionError:
        num_sim = None
    
    ent1 = [ent.text for ent in n1.ents]
    ent2 = [ent.text for ent in n2.ents]
    ent1_set = set(ent1)
    ent2_set = set(ent2)
    try:
        intersection_union = len(ent1_set.intersection(ent2_set)) / len(ent1_set.union(ent2_set))
        ent_sim = intersection_union
    except ZeroDivisionError:
        ent_sim = None
    return num_sim, ent_sim

def match(s1, s2):
    
    # process using spacy
    n1 = nlp(s1)
    n2 = nlp(s2)
    # break into sentences
    sent1 = list(n1.sents)
    sent2 = list(n2.sents)
    m = len(sent1)
    n = len(sent2)

    # comparing sentence similarity
    def sent_sim(i, j):
        s1 = sent1[i-1].text
        s2 = sent2[j-1].text

        cos_sim = 1-cos_dist(model.encode(s1), model.encode(s2))
        set1 = set(s1.split())
        set2 = set(s2.split())
        u = len(set1.union(set2))
        if u:
            iu = len(set1.intersection(set2))/u
        else:
            iu = 0

        lemmas1 = set([tok.lemma_ for tok in sent1[i-1]])
        lemmas2 = set([tok.lemma_ for tok in sent2[j-1]])
        lemma_u = len(lemmas1.union(lemmas2))
        if lemma_u:
            lemma_iu = len(lemmas1.intersection(lemmas2))/lemma_u
        else:
            lemma_iu = 0


        return cos_sim + 0.7*iu + 0.5*(lemma_iu**2)
    # dynamic programming: approach is similar to LCS
    dp = [[0 for _ in range(n+1)] for _ in range(m+1)]
    for i in range(1, m+1):
        for j in range(1, n+1):
            print(i,j)
            m1 = sent_sim(i, j) + dp[i-1][j-1]
            m2 = dp[i][j-1]
            m3 = dp[i-1][j]
            dp[i][j] = max(m1, m2, m3)

    i = m
    j = n
    value = dp[i][j]
    matching = []
    while i>0 and j>0:
        if value == dp[i-1][j]:
            matching.append((sent1[i-1].text, None, None))
            i-=1
        elif value == dp[i][j-1]:
            matching.append((None, None, sent2[j-1].text))
            j-=1
        else:
            matching.append((sent1[i-1].text, dp[i][j]-dp[i-1][j-1], sent2[j-1].text))
            i-=1
            j-=1
    return matching, dp

#using neosca isn't really currently recommended. It takes a long time to run and has a number of dependencies. It's not entirely clear what the results from it mean.
def get_sca(s):
    res = subprocess.run(["nsca", "--text", s])
    if res.returncode == 0:
        with open('result.csv', 'r') as f:
            values_string = f.read().split('\n')[1]
            values = tuple(map(float, values_string.split(',')[1:]))
        return values
    else:
        print("Something went wrong when running the command line neosca", res)
        return res



In [4]:
for pair in pairs:
    descr = pair['description']
    pre = pair['pre']
    pub = pair['pub']
    print(descr, pre, pub)
    preSentences = [sent.text for sent in nlp(pre).sents]
    pubSentences = [sent.text for sent in nlp(pub).sents]

    pair['length similarity'] = length_sim(pre, pub)
    pair['jaccard similarity'] = jaccard_sim(pre, pub)
    pair['sorensen similarity'] = sorensen_sim(pre, pub)
    pair['levenshtein similarity'] = levenshtein_sim(pre, pub)
    pair['cosine'] = cosine_sim(pre, pub)
    pair['sentence transformers'] = new_sim1(pre, pub)
    pair['num'], pair['ent'] = ent_sim(pre, pub)
    pair['pre sentiment'] = mean(list(map(polarize, sentiment_pipeline(preSentences))))
    pair['pub sentiment'] = mean(list(map(polarize, sentiment_pipeline(pubSentences))))
    pair['pre reading ease'] = textstat.flesch_reading_ease(pre)
    pair['pub reading ease'] = textstat.flesch_reading_ease(pub)
    pair['pre reading grade level'] = textstat.flesch_kincaid_grade(pre)
    pair['pub reading grade level'] = textstat.flesch_kincaid_grade(pub)


Test 1: near identical This is the first part of a testing pair, where the second part will be almost identical. This is the second part of a testing pair, where the first part was almost identical.
title of first COVID NMA study 10.1101/2020.01.30.20019844 The impact of transmission control measures during the first 50 days of the COVID-19 epidemic in China An investigation of transmission control measures during the first 50 days of the COVID-19 epidemic in China
abstract of first COVID NMA study 10.1101/2020.01.30.20019844 Respiratory illness caused by a novel coronavirus (COVID-19) appeared in China during December 2019. Attempting to contain infection, China banned travel to and from Wuhan city on 23 January and implemented a national emergency response. Here we evaluate the spread and control of the epidemic based on a unique synthesis of data including case reports, human movement and public health interventions. The Wuhan shutdown slowed the dispersal of infection to other citi

In [6]:
df = pd.DataFrame(pairs)
print(df)

                                         description  \
0                             Test 1: near identical   
1  title of first COVID NMA study 10.1101/2020.01...   
2  abstract of first COVID NMA study 10.1101/2020...   
3  COVID NMA 'Association between 2019-nCoV trans...   
4  COVID NMA: Abstract of 'Clinical Features of C...   
5  COVID NMA: Abstract of 'Therapeutic effects of...   
6  COVID NMA: Abstract of 'Impact assessment of n...   

                                                 pre  \
0  This is the first part of a testing pair, wher...   
1  The impact of transmission control measures du...   
2  Respiratory illness caused by a novel coronavi...   
3  Cases of a novel type of contagious pneumonia ...   
4  BACKGROUND A recent outbreak of SARS-CoV-2 inf...   
5  The human coronavirus HCoV-19 infection can ca...   
6  \nBackground A range of public health measures...   

                                                 pub  length similarity  \
0  This is the second part 