In [1]:
import os
import pandas as pd
from tqdm import tqdm
from re import sub
import numpy as np
from thefuzz import fuzz
import shortuuid
import xml.etree.ElementTree as ET
import re
import sys
from gensim.utils import simple_preprocess
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex, SoftCosineSimilarity, Similarity
from setup import *
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the formatted sentence pairs written by an earlier step; the JSON file
# is keyed by row label, hence orient="index".
formatted_data_path = os.path.join(OUT_DIR, FORMATTED_DATA_FILENAME)
df = pd.read_json(formatted_data_path, orient="index")

# Add empty placeholder columns that the embedding step fills in per row.
for result_column in (TEXTEMBED1, TEXTEMBED2, COSINE_DISTANCE):
    df[result_column] = None

df.head()

Unnamed: 0,dataset,id_1,id_2,text_1,text_2,is_paraphrase,text_embedding_1,text_embedding_2,cosine_distance
0,DMoP,LHjpHEXk,gwDCzrDa,Roy of the Rovers,Roy of the Rovers,True,,,
1,DMoP,eTWUFL9g,kcR9wLxs,Roy of the Rovers is a British comic strip abo...,Roy of the Rovers is a British funny cartoon a...,True,,,
2,DMoP,KydYbt3U,M354NU7E,"The weekly strip ran until 1993, following Roy...",The week after week strip kept running until 1...,True,,,
3,DMoP,UiBxZ7ux,j5A8hRzD,Football-themed stories were a staple of Briti...,Football-themed stories were a staple of Briti...,True,,,
4,DMoP,W2NyueS6,Q3kK3v3Z,"The stock media phrase ""real 'Roy of the Rover...","The stock media express ""genuine 'Roy of the R...",True,,,


In [21]:
# The tokenizer and the model must come from the same pre-trained checkpoint.
checkpoint_name = 'bert-base-uncased'

# Tokenizer that maps text to the checkpoint's vocabulary.
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# Pre-trained weights. output_hidden_states=True makes the forward pass also
# return the hidden states of every layer, not only the final one.
model = BertModel.from_pretrained(checkpoint_name, output_hidden_states=True)

# Switch to evaluation mode (inference only, dropout layers inactive).
# eval() returns the model, so the cell displays its architecture.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [28]:
# Embed both texts of every pair with BERT and score how similar they are.
#
# For each processed row the two texts are marked with BERT special tokens,
# tokenized, run through the model and reduced to one vector per text (mean of
# the token vectors of the second-to-last hidden layer). The two vectors and
# their cosine score are written back into `df`.

# Rows whose index label is greater than this are not processed (sample run).
MAX_ROW_LABEL = 500
# Texts that tokenize to more than this many tokens are skipped; the loaded
# model only has 512 position embeddings.
MAX_BERT_TOKENS = 512

# Never appended to in this cell; kept only in case a later cell refers to them.
t1_embeddings = []
t2_embeddings = []


def _mark_sentences(text):
    """Wrap ``text`` in BERT special tokens and return the marked string.

    The text is prefixed with "[CLS] " and every "." is followed by
    " [SEP][CLS]". A dangling "[CLS]" at the very end is dropped and a closing
    " [SEP]" is appended when the string does not already end with one.
    NOTE(review): the split is on every "." character, so decimals and
    abbreviations also start a new "[SEP][CLS]" segment.
    """
    marked = "[CLS] " + text.replace(".", ". [SEP][CLS]")
    if marked.endswith("[CLS]"):
        marked = marked[:-5]
    if not marked.endswith("[SEP]"):
        marked = marked + " [SEP]"
    return marked


def _embed_tokens(tokens):
    """Return one vector for ``tokens``: the mean over all token vectors of
    the second-to-last hidden layer, as a 1-D torch tensor."""
    # Map tokens to vocabulary indices; the outer list is the batch dimension.
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    # The original code built an all-ones "segment ids" tensor and passed it as
    # the second positional argument, which BertModel.forward interprets as
    # `attention_mask`. Passing it by keyword keeps the exact same inputs
    # (attend to every token, default token_type_ids) but says what it is.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        # Index 2 of the output holds the hidden states of all layers
        # (available because the model was loaded with output_hidden_states=True).
        hidden_states = model(input_ids, attention_mask=attention_mask)[2]

    # hidden_states[-2][0]: second-to-last layer, first (only) batch entry.
    return torch.mean(hidden_states[-2][0], dim=0)


print("Creating embeddings for each sentence (text1 % text2) and calculating their distances ...")
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    if i > MAX_ROW_LABEL:
        break

    # tokenize both texts with the BERT tokenizer
    t1_tokenized = tokenizer.tokenize(_mark_sentences(row[TEXT1]))
    t2_tokenized = tokenizer.tokenize(_mark_sentences(row[TEXT2]))

    # skip the pair if either text is too long for the model; its result
    # columns stay None
    if len(t1_tokenized) > MAX_BERT_TOKENS or len(t2_tokenized) > MAX_BERT_TOKENS:
        continue

    # sentence vector representations (average of all token vectors)
    text1_embedding = _embed_tokens(t1_tokenized)
    text2_embedding = _embed_tokens(t2_tokenized)

    # scipy's cosine() is the cosine *distance*; 1 - distance is the cosine
    # similarity (1 for identical vectors). NOTE: the value is stored under the
    # COSINE_DISTANCE column even though it is a similarity.
    cos_similarity = 1 - cosine(text1_embedding, text2_embedding)

    df.at[i, TEXTEMBED1] = text1_embedding
    df.at[i, TEXTEMBED2] = text2_embedding
    df.at[i, COSINE_DISTANCE] = cos_similarity

Checking for paraprhases with the fuzzy-based method. Dataframe rows to process: 179661


  0%|          | 501/179661 [04:25<26:20:25,  1.89it/s]


In [30]:
# Show the first 500 rows to inspect the embedding and cosine columns.
df.head(500)

Unnamed: 0,dataset,id_1,id_2,text_1,text_2,is_paraphrase,text_embedding_1,text_embedding_2,cosine_distance
0,DMoP,LHjpHEXk,gwDCzrDa,Roy of the Rovers,Roy of the Rovers,True,"[tensor(-0.4072), tensor(0.0276), tensor(-0.27...","[tensor(-0.4072), tensor(0.0276), tensor(-0.27...",1
1,DMoP,eTWUFL9g,kcR9wLxs,Roy of the Rovers is a British comic strip abo...,Roy of the Rovers is a British funny cartoon a...,True,"[tensor(-0.7420), tensor(-0.1898), tensor(0.26...","[tensor(-0.6134), tensor(-0.1051), tensor(0.30...",0.979295
2,DMoP,KydYbt3U,M354NU7E,"The weekly strip ran until 1993, following Roy...",The week after week strip kept running until 1...,True,"[tensor(-0.7921), tensor(-0.3342), tensor(0.19...","[tensor(-0.9006), tensor(-0.3374), tensor(0.20...",0.958392
3,DMoP,UiBxZ7ux,j5A8hRzD,Football-themed stories were a staple of Briti...,Football-themed stories were a staple of Briti...,True,"[tensor(-0.6070), tensor(-0.3094), tensor(-0.0...","[tensor(-0.4892), tensor(-0.1072), tensor(0.19...",0.960228
4,DMoP,W2NyueS6,Q3kK3v3Z,"The stock media phrase ""real 'Roy of the Rover...","The stock media express ""genuine 'Roy of the R...",True,"[tensor(0.1027), tensor(0.2486), tensor(0.1573...","[tensor(-0.1071), tensor(0.3589), tensor(0.311...",0.97198
...,...,...,...,...,...,...,...,...,...
495,MSRP,1967578,1967664,The decision to issue new guidance has been pr...,Scotland Yard's decision to issue new guidance...,True,"[tensor(-0.1433), tensor(-0.5085), tensor(-0.0...","[tensor(-0.1010), tensor(-0.7359), tensor(-0.1...",0.959953
496,MSRP,317570,317290,The memo on protecting sales of Windows and ot...,"The memo specifically mentioned Linux, a still...",True,"[tensor(-0.4673), tensor(0.0027), tensor(0.329...","[tensor(-0.3790), tensor(-0.3315), tensor(0.00...",0.96594
497,MSRP,2047034,2046820,"Unable to find a home for him, a judge told me...",The judge had told the state Department of Men...,True,"[tensor(-0.0604), tensor(0.0570), tensor(-0.19...","[tensor(0.0215), tensor(-0.0550), tensor(-0.27...",0.92392
498,MSRP,84518,84541,"Also Tuesday, the United States also released ...","Meanwhile in southern Iraq, the United States ...",True,"[tensor(-0.1560), tensor(-0.0777), tensor(0.10...","[tensor(0.1307), tensor(-0.2524), tensor(-0.28...",0.947375
