In [3]:
import pandas as pd
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import re
from transformers import DistilBertTokenizerFast, DistilBertModel, pipeline
from tqdm.auto import tqdm

# Raw article dumps; each CSV is read into its own frame (a1, a2, a3).
article_csv_paths = (
    's3://gitlab-sagemaker-init/Data/Raw/plagiarism_detection/archive (1)/articles1.csv',
    's3://gitlab-sagemaker-init/Data/Raw/plagiarism_detection/archive (1)/articles2.csv',
    's3://gitlab-sagemaker-init/Data/Raw/plagiarism_detection/archive (1)/articles3.csv',
)
a1, a2, a3 = [pd.read_csv(csv_path, sep=',') for csv_path in article_csv_paths]

# Stack the three frames and keep only the article body column.
articles = pd.concat([a1, a2, a3])
articles = articles[['content']]

# Demo subset: the first 100 rows by position.
articles = articles.iloc[:100, :]

In [4]:
# Load the tokenizer and the encoder from the same pretrained checkpoint name.
checkpoint_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)
model = DistilBertModel.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
# Plain Python list of the article texts, in row order.
list_of_str = articles['content'].tolist()

In [139]:
# batch size test
# for example, sample of 100 yields fastest iteration with no batching
#
# The pipeline is built in this cell (from the `model` / `tokenizer` loaded
# above) so the benchmark does not rely on a `pipe` object that is only
# created in a later cell.
bench_pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer, return_tensors='pt', padding=True, truncation=True)

for batch_size in [1, 8, 64, 256]:
    print("-" * 30)
    print(f"Streaming batch_size={batch_size}")
    # Feed a generator rather than the list itself: given a list, the pipeline
    # processes every item and returns a finished list before tqdm sees the
    # first element, so the progress bar would time nothing. A generator input
    # makes the pipeline yield results lazily, one per input text.
    text_stream = (text for text in list_of_str)
    for out in tqdm(bench_pipe(text_stream, batch_size=batch_size), total=len(list_of_str)):
        pass

------------------------------
Streaming batch_size=1


  0%|          | 0/100 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=8


  0%|          | 0/100 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=64


  0%|          | 0/100 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=256


  0%|          | 0/100 [00:00<?, ?it/s]

In [140]:
# Feature-extraction pipeline applied to every article text.
batch_size = 1
pipe = pipeline(
    'feature-extraction',
    model=model,
    tokenizer=tokenizer,
    return_tensors='pt',
    padding=True,
    truncation=True,
    batch_size=batch_size,
)
test = pipe(list_of_str)

# For each document, keep the vector at the last sequence position ([-1])
# of the first element ([0]) of its pipeline output.
empty = [doc_features[0][-1] for doc_features in test]

# One row per document.
document_embeddings = np.array(empty)

In [44]:
def most_similar(doc_id, similarity_matrix, df, top_n=5):
    """
    This function returns the strings/documents (from our database)
    most similar to the document at position `doc_id`.

    Args:
        1. doc_id (obj: int)
            - positional index of the query document; selects the row of
              `similarity_matrix` to rank and is excluded from the results
        2. similarity_matrix (obj: NumPy array)
            - 2-D array of pairwise similarity scores, indexed as
              similarity_matrix[doc_id][other_doc]
        3. df (obj: Pandas dataframe)
            - dataframe with a "content" column; rows are looked up by
              position (iloc) using the same indices as similarity_matrix
        4. top_n (obj: int, default 5)
            - maximum number of results to return
    Returns:
        1. top5 (obj: list)
            - list of at most `top_n` tuples
              ('Excerpt: <first 100 chars>...', 'Cosine Similarity : <score>'),
              ordered from most to least similar. Shorter than `top_n` when
              fewer other documents exist (previously this raised IndexError).
    """
    top5 = []
    if top_n <= 0:
        return top5

    scores = similarity_matrix[doc_id]
    # argsort is ascending; reverse it to walk from the highest score down.
    for ix in np.argsort(scores)[::-1]:
        if ix == doc_id:
            # Never report the query document as its own match.
            continue
        top5.append((f'Excerpt: {df.iloc[ix]["content"][0:100]}...', f'Cosine Similarity : {scores[ix]}'))
        # Stop as soon as enough results are collected instead of formatting
        # an excerpt for every document in the corpus.
        if len(top5) == top_n:
            break
    return top5

In [134]:
def str_input(sentence: str, df: pd.DataFrame = articles, orig_doc: np.ndarray = document_embeddings, sim = cosine_similarity, func = most_similar):
    """
    This function takes input string to test for similarity to other strings/documents
    in our database.

    The input is embedded with the module-level `pipe` feature-extraction
    pipeline, using the same indexing (`[0][-1]`) that produced
    `document_embeddings`.

    Args:
        1. sentence (obj: string)
            - user input string to test for plagiarism
        2. df (obj: Pandas dataframe)
            - contains articles to compare; not modified
        3. orig_doc (obj: NumPy array)
            - vectorized string data, one row per row of `df`, in the same order
        4. sim (obj: function)
            - cosine similarity - compares similarity of vectors; called with the
              stacked embeddings and expected to return a pairwise matrix
        5. func (obj: function)
            - returns display of similar strings/documents; called as
              func(input_row_position, similarity_matrix, dataframe)
    Returns:
        1. top5 (obj: list)
            - whatever `func` returns; with the default `most_similar`, a list of the
              top 5 documents most similar to input string, along with similarity scores
    Raises:
        ValueError: if `df` and `orig_doc` do not have the same number of rows.
    """
    if len(orig_doc) != len(df.index):
        # A mismatch would make row positions in the similarity matrix point
        # at the wrong documents, so fail loudly instead of returning garbage.
        raise ValueError(
            f'df has {len(df.index)} rows but orig_doc has {len(orig_doc)} embeddings'
        )

    # add string to df
    # Work on a re-numbered copy (labels 0..n-1) so that label n is guaranteed
    # to be new. Assigning to .loc[len(df.index)] on the caller's original
    # index overwrites an existing row whenever that label is already present
    # (e.g. df is a slice that does not start at 0).
    articles_plus_input = df.reset_index(drop=True)
    articles_plus_input.loc[len(articles_plus_input.index)] = sentence

    # encode
    input_embedding = pipe(sentence)[0][-1]

    # compute cosine similarities
    new_embedding = np.vstack([orig_doc, input_embedding]) # append encoded vector
    adj_pairwise_similarities = abs(sim(new_embedding))

    # The input sentence is the last row of both the frame and the matrix.
    return func(articles_plus_input.shape[0]-1, adj_pairwise_similarities, articles_plus_input)

In [141]:
# Demo: query the corpus with a short ad-hoc sentence (default df / embeddings).
str_input('here is a test sentence')

[('Excerpt: This article is part of a series aimed at helping you navigate life’s opportunities and challenges. ...',
  'Cosine Similarity : 0.23963339064016137'),
 ('Excerpt: When the Green Bay Packers lost to the Washington Redskins in Week 11, dropping to   Aaron Rodgers v...',
  'Cosine Similarity : 0.23776744939233005'),
 ('Excerpt: For the 12th straight year, the Travel section presents its annual Places to Go issue. You will like...',
  'Cosine Similarity : 0.23546178172064744'),
 ('Excerpt:  (Want to get this briefing by email? Here’s the  .) Good morning. Here’s what you need to know: • C...',
  'Cosine Similarity : 0.23312710792647662'),
 ('Excerpt: After the bullet shells get counted, the blood dries and the votive candles burn out, people peer do...',
  'Cosine Similarity : 0.2321069741634577')]

In [142]:
# NOTE: `long_text` is assigned in the "strings for testing" cell further down
# in this notebook; that cell must be run before this one.
str_input(long_text) # full excerpt

[('Excerpt: WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care laws...',
  'Cosine Similarity : 1.0000000000000004'),
 ('Excerpt: WASHINGTON  —   Congress opened for battle over the Affordable Care Act on Wednesday as Republicans ...',
  'Cosine Similarity : 0.9708710681910544'),
 ('Excerpt: WASHINGTON  —   Vice   Mike Pence and the top Republicans in Congress made clear on Wednesday, more ...',
  'Cosine Similarity : 0.9639289438086232'),
 ('Excerpt: WASHINGTON  —   It was supposed to be a triumphant morning for Republicans on Capitol Hill  —   a mo...',
  'Cosine Similarity : 0.9608567073816159'),
 ('Excerpt: WASHINGTON  —   Majorities in Congress often overreach, but usually not on the very first day. House...',
  'Cosine Similarity : 0.9604713496482427')]

In [137]:
# mid-size excerpt: a passage that also appears inside `long_text`
str_input('Obama administration had been distributing the health insurance subsidies, in violation of the Constitution, without approval from Congress. The Justice Department, confident that Judge Collyer’s decision would be reversed, quickly appealed, and the subsidies have remained in place during the appeal. In successfully seeking a temporary halt in the proceedings after Mr. Trump won, House Republicans last month told the court that they “and the  ’s transition team currently are discussing potential options for resolution of this matter, to take effect after the  ’s inauguration on Jan. 20, 2017. ” The suspension of the case, House lawyers said, will “provide the')

[('Excerpt: WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care laws...',
  'Cosine Similarity : 0.9558768026297194'),
 ('Excerpt: WASHINGTON  —   Congress opened for battle over the Affordable Care Act on Wednesday as Republicans ...',
  'Cosine Similarity : 0.935531523691726'),
 ('Excerpt: WASHINGTON  —   It was supposed to be a triumphant morning for Republicans on Capitol Hill  —   a mo...',
  'Cosine Similarity : 0.9220396312133845'),
 ('Excerpt: WASHINGTON  —   Vice   Mike Pence and the top Republicans in Congress made clear on Wednesday, more ...',
  'Cosine Similarity : 0.9183812507816784'),
 ('Excerpt: WASHINGTON  —     Donald J. Trump lashed out at Democrats on Thursday over their efforts to preserve...',
  'Cosine Similarity : 0.9137772488365332')]

In [50]:
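# strings for testing (long_text is used by the str_input(long_text) cell earlier in the notebook, so run this cell first)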
short_text = "hello"
long_text = 'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been demanding an end to the law for years. In another twist, Donald J. Trump’s administration, worried about preserving executive branch prerogatives, could choose to fight its Republican allies in the House on some central questions in the dispute. Eager to avoid an ugly political pileup, Republicans on Capitol Hill and the Trump transition team are gaming out how to handle the lawsuit, which, after the election, has been put in limbo until at least late February by the United States Court of Appeals for the District of Columbia Circuit. They are not yet ready to divulge their strategy. “Given that this pending litigation involves the Obama administration and Congress, it would be inappropriate to comment,” said Phillip J. Blando, a spokesman for the Trump transition effort. “Upon taking office, the Trump administration will evaluate this case and all related aspects of the Affordable Care Act. ” In a potentially   decision in 2015, Judge Rosemary M. 
Collyer ruled that House Republicans had the standing to sue the executive branch over a spending dispute and that the Obama administration had been distributing the health insurance subsidies, in violation of the Constitution, without approval from Congress. The Justice Department, confident that Judge Collyer’s decision would be reversed, quickly appealed, and the subsidies have remained in place during the appeal. In successfully seeking a temporary halt in the proceedings after Mr. Trump won, House Republicans last month told the court that they “and the  ’s transition team currently are discussing potential options for resolution of this matter, to take effect after the  ’s inauguration on Jan. 20, 2017. ” The suspension of the case, House lawyers said, will “provide the   and his future administration time to consider whether to continue prosecuting or to otherwise resolve this appeal. ” Republican leadership officials in the House acknowledge the possibility of “cascading effects” if the   payments, which have totaled an estimated $13 billion, are suddenly stopped. Insurers that receive the subsidies in exchange for paying    costs such as deductibles and   for eligible consumers could race to drop coverage since they would be losing money. Over all, the loss of the subsidies could destabilize the entire program and cause a lack of confidence that leads other insurers to seek a quick exit as well. Anticipating that the Trump administration might not be inclined to mount a vigorous fight against the House Republicans given the  ’s dim view of the health care law, a team of lawyers this month sought to intervene in the case on behalf of two participants in the health care program. 
In their request, the lawyers predicted that a deal between House Republicans and the new administration to dismiss or settle the case “will produce devastating consequences for the individuals who receive these reductions, as well as for the nation’s health insurance and health care systems generally. ” No matter what happens, House Republicans say, they want to prevail on two overarching concepts: the congressional power of the purse, and the right of Congress to sue the executive branch if it violates the Constitution regarding that spending power. House Republicans contend that Congress never appropriated the money for the subsidies, as required by the Constitution. In the suit, which was initially championed by John A. Boehner, the House speaker at the time, and later in House committee reports, Republicans asserted that the administration, desperate for the funding, had required the Treasury Department to provide it despite widespread internal skepticism that the spending was proper. The White House said that the spending was a permanent part of the law passed in 2010, and that no annual appropriation was required  —   even though the administration initially sought one. Just as important to House Republicans, Judge Collyer found that Congress had the standing to sue the White House on this issue  —   a ruling that many legal experts said was flawed  —   and they want that precedent to be set to restore congressional leverage over the executive branch. But on spending power and standing, the Trump administration may come under pressure from advocates of presidential authority to fight the House no matter their shared views on health care, since those precedents could have broad repercussions. It is a complicated set of dynamics illustrating how a quick legal victory for the House in the Trump era might come with costs that Republicans never anticipated when they took on the Obama White House.'
short_list_of_str = ['first! str%ing', 'sec-ond', 'and the ?third']

In [42]:
"""# non-pipeline model implementation
encoded_input = tokenizer(list_of_str, return_tensors='pt', padding=True, truncation=True)
output = model(**encoded_input)

input_array = encoded_input['input_ids'].numpy()
output_array = output['last_hidden_state'].detach().numpy()

enc_list = []
for i in range(len(output_array)):
     enc_list.append(output_array[i][-1])

print('encoded input: \n', input_array)
#print('output: \n', enc_list)"""

encoded input: 
 [[  101  2899  1517 ...  1998  2010   102]
 [  101  2044  1996 ...  6439  1010   102]
 [  101  2043 10598 ...  8571  1010   102]
 ...
 [  101  1037  2146 ...  3345  1012   102]
 [  101  5626  1517 ...  2015  2000   102]
 [  101  8084  2007 ...  3811 11126   102]]
