# Plagiarism Detection using Transformers based models   
Author: [Zoumana KEITA](https://medium.com/@zoumanakeita)

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
#source_data = pd.read_csv("cord19_df.csv", low_memory=False)

In [3]:
def preprocess_data(data_path, sample_size):

  # Read the data from specific path
  data = pd.read_csv(data_path, low_memory=False)

  # Drop articles without Abstract
  data = data.dropna(subset = ['abstract']).reset_index(drop = True)

  # Get "sample_size" random articles
  data = data.sample(sample_size)[['abstract', 'paper_id']]

  return data


data_path = "cord19_df.csv"
source_data = preprocess_data(data_path, 100)

In [4]:
source_data.sample(5)

Unnamed: 0,abstract,paper_id
39063,BACKGROUND: In children suffering from severe ...,94a14012df753e9d47400bf1b660d7559e4976d2
30713,Summary Background The ResPOC study demonstrat...,9c6e8475fcce3358d74c1d01f229ae3a6ab54907
25911,Abstract Viral pathogenesis is a field in rapi...,9b72a84c32d2c8e88183038483b0648be006dee1
362,Background: Human infections with zoonotic cor...,a67012609fad77c2a1dc55f139b044c546cd13a8
29735,The spike glycoprotein is a major neutralizing...,fb35ca21196de0d62cae1d2d8fc0677ca1e04677


## Load BERT Model

In [5]:
!pip -q install transformers
import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer,  AutoModelForSequenceClassification

# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
model_path = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_path, 
                                          do_lower_case=True)

model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                          output_attentions=False,
                                                          output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
def create_vector_from_text(tokenizer, model, text, MAX_LEN = 510):
    
    input_ids = tokenizer.encode(
                        text, 
                        add_special_tokens = True, 
                        max_length = MAX_LEN,                           
                   )    

    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long", 
                              truncating="post", padding="post")
    
    # Remove the outer list.
    input_ids = results[0]

    # Create attention masks    
    attention_mask = [int(i>0) for i in input_ids]
    
    # Convert to tensors.
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    # Add an extra dimension for the "batch" (even though there is only one 
    # input in this batch.)
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)
    
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    
    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():        
        logits, encoded_layers = model(
                                    input_ids = input_ids, 
                                    token_type_ids = None, 
                                    attention_mask = attention_mask,
                                    return_dict=False)

    layer_i = 12 # The last BERT layer before the classifier.
    batch_i = 0 # Only one input in the batch.
    token_i = 0 # The first token, corresponding to [CLS]
        
    # Extract the embedding.
    vector = encoded_layers[layer_i][batch_i][token_i]

    # Move to the CPU and convert to numpy ndarray.
    vector = vector.detach().cpu().numpy()

    return(vector)

# Create vector Database

In [8]:
import numpy as np

def create_vector_database(data):
    
    # The list of all the vectors
    vectors = []
    
    # Get overall text data
    source_data = data.abstract.values
    
    # Loop over all the comment and get the embeddings
    for text in tqdm(source_data):
        
        # Get the embedding 
        vector = create_vector_from_text(tokenizer, model, text)
        
        #add it to the list
        vectors.append(vector)
    
    data["vectors"] = vectors
    data["vectors"] = data["vectors"].apply(lambda emb: np.array(emb))
    data["vectors"] = data["vectors"].apply(lambda emb: emb.reshape(1, -1))
    
    return data

In [9]:
vector_database = create_vector_database(source_data)

  0%|          | 0/100 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 100/100 [03:21<00:00,  2.01s/it]


In [10]:
vector_database.sample(5)

Unnamed: 0,abstract,paper_id,vectors
15295,Ecofriendly N-heterocyclic carbene (NHC) organ...,74f8669398e63312feb0b386234ce36ee8d873f6,"[[-1.0229445, -0.3536779, -0.012334654, 0.0671..."
33408,BACKGROUND: It has been postulated that geneti...,c6008b68c8b16e3a6a48a2cb892bac5c9353df86,"[[-0.66390014, -0.8412495, -0.85710824, -0.526..."
32333,BACKGROUND: Continuous outbreaks of the highly...,dca1ffcfcd6a2b7c8495e77c438d6b91d065bbf5,"[[-0.8390897, -0.9009724, -0.60617423, -0.6275..."
17103,Summary The burden of pneumonia in Australian ...,f9b0a5c1f3dd04fc1b61a0c6ab55974b4f451533,"[[-0.19816953, 0.23041387, -1.0591033, 0.09286..."
362,Background: Human infections with zoonotic cor...,a67012609fad77c2a1dc55f139b044c546cd13a8,"[[-0.39718565, -0.6395205, -0.28191552, -0.506..."


# Language detector and translation

In [11]:
!pip -q install sentencepiece
from transformers import MarianMTModel, MarianTokenizer

**Note**: 
Make sure to restart your kernel after running the previous command to be able to use sentencepiece module. 

In [12]:
"""
Candidate Languages
de: German
fr: French      el: Greek         
ja: Japan       ru: Russian
"""
language_list = ['de', 'fr', 'el', 'ja', 'ru']

In [13]:
# Install the library
!pip -q install langdetect
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

In [14]:
def translate_text(text, text_lang, target_lang='en'):

  # Get the name of the model
  model_name = f"Helsinki-NLP/opus-mt-{text_lang}-{target_lang}"

  # Get the tokenizer
  tokenizer = MarianTokenizer.from_pretrained(model_name)

  # Instantiate the model
  model = MarianMTModel.from_pretrained(model_name)

  # Translation of the text
  formated_text = ">>{}<< {}".format(text_lang, text)

  translation = model.generate(**tokenizer([formated_text], return_tensors="pt", padding=True))

  translated_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translation][0]

  return translated_text

In [28]:
def process_document(text):
    """
    Create a vector for given text and adjust it for cosine similarity search
    """
    text_vect = create_vector_from_text(tokenizer, model, text)
    text_vect = np.array(text_vect)
    text_vect = text_vect.reshape(1, -1)

    return text_vect

    
def is_plagiarism(similarity_score, plagiarism_threshold):

  is_plagiarism = False

  if(similarity_score >= plagiarism_threshold):
    is_plagiarism = True

  return is_plagiarism


def check_incoming_document(incoming_document):

  text_lang = detect(incoming_document)
  language_list = ['de', 'fr', 'el', 'ja', 'ru']

  final_result = ""

  if(text_lang == 'en'):
    final_result = incoming_document 

  elif(text_lang not in language_list):
    final_result = None

  else:
    # Translate in English
    final_result = translate_text(incoming_document, text_lang)

  return final_result


def run_plagiarism_analysis(query_text, data, plagiarism_threshold=0.8):

    top_N=3

    # Check the language of the query/incoming text and translate if required. 
    document_translation = check_incoming_document(query_text)

    if(document_translation is None):
      print("Only the following languages are supported: English, French, Russian, German, Greek and Japanese")
      exit(-1)

    else:
      # Preprocess the document to get the required vector for similarity analysis
      query_vect = process_document(document_translation)
      
      # Run similarity Search
      data["similarity"] = data["vectors"].apply(lambda x: cosine_similarity(query_vect, x))
      data["similarity"] = data["similarity"].apply(lambda x: x[0][0])

      similar_articles = data.sort_values(by='similarity', ascending=False)[0:top_N+1]
      formated_result = similar_articles[["abstract", "paper_id", "similarity"]].reset_index(drop = True)

      similarity_score = formated_result.iloc[0]["similarity"] 
      most_similar_article = formated_result.iloc[0]["abstract"] 
      is_plagiarism_bool = is_plagiarism(similarity_score, plagiarism_threshold)

      plagiarism_decision = {'similarity_score': similarity_score, 
                             'is_plagiarism': is_plagiarism_bool,
                             'most_similar_article': most_similar_article, 
                             'article_submitted': query_text
                            }

      return plagiarism_decision

In [29]:
# Select an existing article from the database
new_incoming_text = source_data.iloc[0]['abstract']

# Run the plagiarism detection
analysis_result = run_plagiarism_analysis(new_incoming_text, vector_database, plagiarism_threshold=0.8)

In [30]:
analysis_result

{'similarity_score': 1.0,
 'is_plagiarism': True,
 'most_similar_article': 'Zoonotic viruses of wildlife origin have caused the majority of recent emerging infectious diseases (EIDs) that have had significant impacts on human health or economies. Animal consumption-based food systems, ranging from the harvest of free-ranging wild species (hereafter, wild harvest systems) to the in situ stocking of domestic or farmed wild animals (hereafter, animal production systems), have been implicated in the emergence of many of these viruses, including HIV, Ebola, SARS, and highly pathogenic avian influenza (HPAI).',
 'article_submitted': 'Zoonotic viruses of wildlife origin have caused the majority of recent emerging infectious diseases (EIDs) that have had significant impacts on human health or economies. Animal consumption-based food systems, ranging from the harvest of free-ranging wild species (hereafter, wild harvest systems) to the in situ stocking of domestic or farmed wild animals (hereaf

In [39]:
french_article_to_check = """
Les Réseaux d’Innovation et de Transfert Agricole (RITA) ont été créés en 2011 pour mieux connecter la recherche et le développement agricole, intra et inter-DOM, avec un objectif d’accompagnement de la diversification des productions locales. Le CGAAER a été chargé d'analyser ce dispositif et de proposer des pistes d'action pour améliorer la chaine Recherche – Formation – Innovation – Développement – Transfert dans les outre-mer dans un contexte d'agriculture durable, au profit de l'accroissement de l'autonomie alimentaire.
"""

In [40]:
analysis_result = run_plagiarism_analysis(french_article_to_check, vector_database, plagiarism_threshold=0.8)
analysis_result



{'similarity_score': 0.783689,
 'is_plagiarism': False,
 'most_similar_article': 'Abstract It is not widely known that quite a few researchers are faced with difficulties in using various resources of disaster management research in Korea. The article aims to assess how rigorously the Korean field of disaster management research resources has been managed or how it can be improved for the ultimate goal of disaster management. Descriptive content analysis has been used as the major methodology by referring to the Johari window. In doing so, electronic research resources have been systematically compared with integrated research resources via the perspective of Korean-speaking researchers and that of English-speaking researchers. The conclusion is that two researchers have to be integrated with all four research resources (open, blind, hidden, and unknown resources) by implementing assigned responsibilities as well as freely asking questions. Ultimately, this will be conducive to reducin

In [42]:
german_article_to_check = """Derzeit ist eine Reihe strukturell und funktionell unterschiedlicher temperaturempfindlicher Elemente wie RNA-Thermometer bekannt, die eine Vielzahl biologischer Prozesse in Bakterien, einschließlich der Virulenz, steuern. Auf der Grundlage einer Computer- und thermodynamischen Analyse der vollständig sequenzierten Genome von 25 Salmonella enterica-Isolaten wurden ein Algorithmus und Kriterien für die Suche nach potenziellen RNA-Thermometern entwickelt. Er wird es ermöglichen, die Suche nach potentiellen Riboschaltern im Genom anderer gesellschaftlich wichtiger Krankheitserreger durchzuführen. Für S. enterica wurden neben dem bekannten 4U-RNA-Thermometer vier Hairpin-Loop-Strukturen identifiziert, die wahrscheinlich als weitere RNA-Thermometer fungieren. Sie erfüllen die notwendigen und hinreichenden Bedingungen für die Bildung von RNA-Thermometern und sind hochkonservative nichtkanonische Strukturen, da diese hochkonservativen Strukturen im Genom aller 25 Isolate von S. enterica gefunden wurden. Die Hairpins, die eine kreuzförmige Struktur in der supergewickelten pUC8-DNA bilden, wurden mit Hilfe der Rasterkraftmikroskopie sichtbar gemacht."""




analysis_result = run_plagiarism_analysis(german_article_to_check, vector_database, plagiarism_threshold=0.8)
analysis_result

Downloading:   0%|          | 0.00/797k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/768k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]



{'similarity_score': 0.9701626,
 'is_plagiarism': True,
 'most_similar_article': 'Currently, a number of structurally and functionally different temperature-sensitive elements such as RNA thermometers which control a variety of biological processes in bacteria, including virulence, are known. Based on computer and thermodynamic analysis of completely sequenced genomes of 25 Salmonella enterica isolates, the algorithm and criteria for the search of potential RNA thermometers were developed. It will make it possible to carry out the search for potential riboswitches in the genome of other socially important pathogens. For S. enterica, apart from the known 4U RNA thermometer, four hairpin-loop structures were identified which may probably act as additional RNA thermometers. They satisfy the necessary and sufficient conditions for formation of RNA thermometers and are highly conservative uncanonical structures, since these highly conservative structures were found in the genome of all 25 i