In [1]:
import pandas as pd
import numpy as np
import spacy
import re
#from __future__ import unicode_literals, print_function
from spacy.lang.en import English 
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizerFast, DistilBertModel, pipeline
from tqdm.auto import tqdm

# Raw article table; `a1` keeps the unmodified frame.
a1 = pd.read_csv('articles1.csv', sep=',')

# Keep only the article text and restrict the demo to the first 10 rows.
articles = a1[['content']].iloc[0:10, :]
articles.head()

Unnamed: 0,content
0,WASHINGTON — Congressional Republicans have...
1,"After the bullet shells get counted, the blood..."
2,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Death may be the great equalizer, but it isn’t..."
4,"SEOUL, South Korea — North Korea’s leader, ..."


In [2]:
# Sentence splitting: blank English pipeline with the 'sentencizer' component added.
nlp = English()
nlp.add_pipe('sentencizer')


def sentence_split(input_text):
    """Split `input_text` into sentences with the notebook-level `nlp` pipeline.

    Returns a list holding one whitespace-stripped string per detected sentence.
    """
    sentences = []
    for span in nlp(input_text).sents:
        sentences.append(span.text.strip())
    return sentences

In [4]:
# Work on one article at a time: split the article at label 0 into sentences
# and preview the first three.
first_article_text = articles['content'][0]
list_of_str = sentence_split(first_article_text)
list_of_str[:3]

['WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win.',
 'The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues.',
 'But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement.']

In [5]:
# One row per sentence of the selected article, in a single 'content' column.
sentences_df = pd.DataFrame({'content': list_of_str})
sentences_df.head()

Unnamed: 0,content
0,WASHINGTON — Congressional Republicans have...
1,The incoming Trump administration could choose...
2,But a sudden loss of the disputed subsidies co...
3,That could lead to chaos in the insurance mark...
4,"To stave off that outcome, Republicans could f..."


In [6]:
# Load the encoder and its matching tokenizer from the same checkpoint.
MODEL_NAME = "distilbert-base-uncased"
model = DistilBertModel.from_pretrained(MODEL_NAME)
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
"""# batch size test - don't think necessary here
for batch_size in [1, 8, 64, 256]:
    print("-" * 30)
    print(f"Streaming batch_size={batch_size}")
    for out in tqdm(pipe(list_of_str, batch_size=batch_size), total=len(list_of_str)):
        pass"""

In [8]:
# Feature-extraction pipeline built from the already-loaded model and tokenizer.
batch_size = 1
pipe = pipeline(
    'feature-extraction',
    model=model,
    tokenizer=tokenizer,
    return_tensors='pt',
    padding=True,
    truncation=True,
    batch_size=batch_size,
)

# Run every sentence of the article through the pipeline.
test = pipe(list_of_str)

In [9]:
# Build one embedding vector per article sentence.
# NOTE(review): `[0][-1]` takes the last entry along the second axis of each
# pipeline result - presumably the final token's vector, assuming the result is
# shaped (1, n_tokens, hidden_size); confirm against the pipeline documentation.
empty = [features[0][-1] for features in test]
document_embeddings = np.array(empty)  # encoded vectors, one row per sentence

In [10]:
# Terminal highlighter
def highlight(text):
    """Return `str(text)` wrapped in the ANSI sequences `\\033[44m` ... `\\033[m`."""
    return f"\033[44m{text!s}\033[m"


print(highlight('Hello World!'))

[44mHello World![m


In [None]:
# Sample input: the first 5 sentences of article 1, written as five adjacent
# literals (one per sentence) that Python concatenates into a single string.
first5 = (
    'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. '
    'The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. '
    'But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. '
    'That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. '
    'To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been demanding an end to the law for years.'
)

In [24]:
def similarity_by_sentence(
    user_input: str,
    df: pd.DataFrame = sentences_df,
    orig_doc: np.ndarray = document_embeddings,
    sim: object = cosine_similarity,
) -> list:
    """
    Returns similarity vectors for each sentence in input.

    Every sentence of ``user_input`` is embedded with the notebook-level
    ``pipe``, stacked underneath ``orig_doc`` and passed to ``sim``; the last
    row of the resulting matrix (input sentence vs. every stacked row) is kept
    after taking absolute values.

    Args:
        1. user_input
            - input string to test for plagiarism
        2. df
            - dataframe of article split into sentences; kept in the signature
              for backward compatibility only, it does not affect the scores
        3. orig_doc
            - embedding matrix the input is compared against (by default the
              article embeddings, one row per article sentence)
        4. sim (function)
            - called with one 2-D array and expected to return its pairwise
              similarity matrix (cosine similarity by default)
    Returns:
        1. scores
            - list with one vector per input sentence; each vector holds one
              score per row of ``orig_doc`` followed by a final entry that is
              the input sentence compared with itself
    """
    scores = []
    for sentence in sentence_split(user_input):
        # Same `[0][-1]` selection that was used to build `document_embeddings`,
        # so input and article vectors are comparable.
        input_embedding = pipe(sentence)[0][-1]

        # The input vector goes last, so its similarities are the last row.
        new_embedding = np.vstack([orig_doc, input_embedding])
        adj_pairwise_similarities = abs(sim(new_embedding))

        scores.append(adj_pairwise_similarities[-1])
    return scores

# test
similarity_by_sentence(first5)[0] # similarity vector of the first input sentence

array([1.        , 0.40768299, 0.40755288, 0.67590143, 0.4246948 ,
       0.40793986, 0.40041024, 0.33731479, 0.39235208, 0.40756263,
       0.38408862, 0.29963826, 0.37592479, 0.33262396, 0.40496517,
       0.87660168, 0.30394477, 0.82082471, 0.33819567, 0.41830157,
       0.39739546, 0.34681845, 0.38619247, 0.55832353, 0.42772886,
       0.97172558, 1.        ])

In [18]:
def top_sentences(
    user_input: str,
    df: pd.DataFrame = sentences_df,
    similarity_by_sentence: object = similarity_by_sentence,
) -> list:
    """
    Returns closest match for each sentence.

    Args:
        1. user_input
            - input string to test for plagiarism
        2. df
            - dataframe of article split into sentences (column "content")
        3. similarity_by_sentence (function)
            - returns list of similarity vectors for each sentence in input
    Returns:
        1. top
            - list with the article sentence most similar to each input
              sentence, in input order; empty when the input yields no
              sentences. An input sentence with no candidate to match (only
              possible when ``df`` has no rows) contributes nothing.
    """
    # Each similarity vector ends with the input sentence scored against itself;
    # with one df row per article sentence that entry sits at index len(df).
    doc_id = df.shape[0]
    top = []

    for vec in similarity_by_sentence(user_input):
        # Highest score first; same ordering rule as before so ties resolve identically.
        ranked = np.argsort(vec)[::-1]
        best = next((index for index in ranked if index != doc_id), None)
        if best is None:
            continue
        top.append(df.iloc[best]["content"])

    return top

top_sentences(first5)

['WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win.',
 'The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues.',
 'But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement.',
 'That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government.',
 'To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative vote

In [20]:
def output(
    user_input,
    sim_threshold = 0.95,
    similarity_by_sentence: object = similarity_by_sentence,
    df: pd.DataFrame = sentences_df,
    top_sentences: object = top_sentences,
    list_of_str: list = list_of_str,
    highlight: object = highlight,
) -> str:
    """
    Returns whole article with sentences similar to user_input highlighted.

    The threshold is applied to every input sentence separately: an article
    sentence is highlighted only when it is the closest match of some input
    sentence AND that match scores above ``sim_threshold``. (Previously only
    the first input sentence's best score was tested and the verdict was
    applied to all closest matches.)

    Args:
        1. user_input
            - input string to test for plagiarism
        2. sim_threshold
            - cosine similarity threshold (strictly greater-than)
        3. similarity_by_sentence (function)
            - returns list of similarity vectors for each sentence in input
        4. df
            - dataframe of article split into sentences
        5. top_sentences (function)
            - returns list of most similar sentences to user_input, aligned
              with the similarity vectors
        6. list_of_str
            - list of article split into sentences
        7. highlight (function)
            - highlights text
    Returns:
        1. result
            - whole article/doc as one space-joined string, with highlighted
              sentences where plagiarism is detected; the unmodified article
              when the input contains no sentences
    """
    scores = similarity_by_sentence(user_input) # similarity vectors
    if len(scores) == 0:
        # Nothing to compare against: show the article without any highlight.
        return ' '.join(list_of_str)

    top = top_sentences(user_input) # most similar sentence per input sentence
    # Index of the input sentence's self-similarity inside each vector.
    doc_id = df.shape[0]

    # Article sentences whose match score clears the threshold.
    flagged = set()
    for vec, match in zip(scores, top):
        best_score = max(
            (vec[index] for index in range(len(vec)) if index != doc_id),
            default=None,
        )
        if best_score is not None and best_score > sim_threshold:
            flagged.add(match)

    result = [
        str(highlight(sentence)) if sentence in flagged else sentence
        for sentence in list_of_str
    ]
    return ' '.join(result) # output as string

In [21]:
"""Sample text for testing"""
# Active case: the first 5 sentences of article 1 (`first5`).
out = output(first5)
# Alternative inputs, left disabled below.
# NOTE(review): they call `str_input`, which is not defined in the visible cells -
# presumably an earlier name for `output`; confirm before re-enabling.
# - sentence 1 from article 1:
#out = str_input('WASHINGTON  —   Congressional Republicans have a new fear when it comes to their health care lawsuit against the Obama administration: They might win.')
# - mix of sentences from the same article:
#out = str_input('Eager to avoid an ugly political pileup, Republicans on Capitol Hill and the Trump transition team are gaming out how to handle the lawsuit, which, after the election, has been put in limbo until at least late February by the United States Court of Appeals for the District of Columbia Circuit. But on spending power and standing, the Trump administration may come under pressure from advocates of presidential authority to fight the House no matter their shared views on health care, since those precedents could have broad repercussions.')
# - unrelated text:
#out = str_input('somewhere over the rainbow')

out 

'\x1b[44mWASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win.\x1b[m \x1b[44mThe incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues.\x1b[m \x1b[44mBut a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement.\x1b[m \x1b[44mThat could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government.\x1b[m \x1b[44mTo stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Ob