In [107]:
import os
import pandas as pd
from tqdm import tqdm
from re import sub
import numpy as np
from thefuzz import fuzz
import shortuuid
import xml.etree.ElementTree as ET
import re
import sys
from gensim.utils import simple_preprocess
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex, SoftCosineSimilarity, Similarity

In [108]:
# --- Input / output locations -------------------------------------------
# Folder that contains the dataset directories to read in; the directories
# inside it should be named like the entries of DATASETS.
DATASETS_FOLDER = "datasets"
DATASETS = ["DMoP", "MSRP", "ETPC"]
# Name of the file that contains the formatted data to read in.
FORMATTED_DATA_FILENAME = "true_data.json"
# Directory the formatted JSON is written to.
OUT_DIR = "output"

# --- Names of the result columns added to the dataframe ------------------
FUZZY = "fuzzy_based_result"
SEMANTIC = "semantic_based_result"

# Tokens that are filtered out of every document during preprocessing.
STOPWORDS = ['the', 'and', 'are', 'a']

# Do not truncate long text columns when a dataframe is displayed.
pd.set_option("display.max_colwidth", None)

In [109]:
# Read the formatted sentence pairs (one JSON object per dataframe row)
# from the output directory and preview the first rows.
formatted_data_path = os.path.join(OUT_DIR, FORMATTED_DATA_FILENAME)
df = pd.read_json(formatted_data_path, orient="index")
df.head()

Unnamed: 0,dataset,id_1,id_2,text_1,text_2,is_paraphrase
0,DMoP,HiDc8dkT,3n3Na3BD,Roy of the Rovers,Roy of the Rovers,True
1,DMoP,5KfcxDBb,QUzQei9u,"Roy of the Rovers is a British comic strip about the life and times of a fictional footballer named Roy Race, who played for Melchester Rovers. The strip first appeared in the ""Tiger"" in 1954, before giving its name to a weekly (and later monthly) comic magazine, published by IPC and Fleetway from 1976 until 1995, in which it was the main feature.","Roy of the Rovers is a British funny cartoon about the life and times of an anecdotal footballer named Roy Race, who played for Melchester Rovers. The strip originally showed up in the ""Tiger"" in 1954, preceding giving its name to a week after week (and later month to month) comic magazine, distributed by IPC and Fleetway from 1976 until 1995, in which it was the principle include.",True
2,DMoP,gxShcSRt,5nSAkBfd,"The weekly strip ran until 1993, following Roy's playing career until its conclusion after he lost his left foot in a helicopter crash. When the monthly comic was launched later that year the focus switched to Roy's son Rocky, who also played for Melchester. This publication was short-lived, and folded after only 19 issues. The adventures of the Race family were subsequently featured in the monthly ""Match of the Day"" football magazine, in which father and son were reunited as manager and player respectively. These strips began in 1997 and continued until the magazine's closure in May 2001.","The week after week strip kept running until 1993, after Roy's playing vocation until its decision after he lost his left foot in a helicopter crash. At the point when the month to month comic was propelled soon thereafter the center changed to Roy's child Rocky, who additionally played for Melchester. This production was brief, and collapsed after just 19 issues. The undertakings of the Race family were in this manner highlighted in the month to month ""Match of the Day"" football magazine, in which father and child were brought together as chief and player separately. These strips started in 1997 and proceeded until the magazine's conclusion in May 2001.",True
3,DMoP,ERwDq4GC,mFG5GR3n,"Football-themed stories were a staple of British comics for boys from the 1950s onwards, and Roy of the Rovers was the most popular. To keep the strip exciting, Melchester was almost every year either competing for major honours or struggling against relegation to a lower division; a normal, uneventful season of mid-table mediocrity was unknown at Melchester Rovers. The strip followed the structure of the actual English football season, thus there were several months each year in summer when there was no league football. By far the most common summer storyline saw Melchester touring a fictional country in an exotic part of the world, often South America, where they would invariably be kidnapped and held to ransom. The average reader probably stayed with the comic regularly for only three or four years, therefore storylines were sometimes recycled; during the first ten years of his playing career, Roy was kidnapped at least four times. Roy also made numerous appearances for England, depicted playing alongside actual players such as Malcolm Macdonald and Trevor Francis.","Football-themed stories were a staple of British funnies for young men from the 1950s onwards, and Roy of the Rovers was the most prevalent. To keep the strip energizing, Melchester was pretty much consistently either seeking real distinctions or battling against transfer to a lower division; a typical, uneventful period of mid-table unremarkableness was obscure at Melchester Rovers. The strip pursued the structure of the real English football season, along these lines there were a while every year in summer when there was no association football. By a long shot the most well-known summer storyline saw Melchester visiting an anecdotal nation in a fascinating piece of the world, regularly South America, where they would perpetually be grabbed and held to recover. 
The normal peruser presumably remained with the comic consistently for just three or four years, in this way storylines were now and then reused; amid the initial ten years of his playing vocation, Roy was seized something like multiple times. Roy likewise shown up for England, delineated playing nearby real players, for example, Malcolm Macdonald and Trevor Francis.",True
4,DMoP,Fq6uMo6z,RWeiMaNw,"The stock media phrase ""real 'Roy of the Rovers' stuff"" is often used by football writers, commentators and fans when describing displays of great skill, or surprising results that go against the odds, in reference to the dramatic storylines that were the strip's trademark.","The stock media express ""genuine 'Roy of the Rovers' stuff"" is regularly utilized by football journalists, observers and fans while portraying presentations of extraordinary ability, or amazing outcomes that conflict with the chances, in reference to the sensational storylines that were the strip's trademark.",True


In [126]:
# Fuzzy-based paraphrase check: score every (text_1, text_2) pair with
# fuzz.ratio and rescale the 0-100 score to the 0-1 range.
print("Checking for paraphrases with the fuzzy-based method. Dataframe rows to process: " + str(len(df)))

# Iterate over the two text columns directly instead of df.iterrows(), which
# builds a Series per row only to read two values from it.
# `total` is passed explicitly because a zip object has no length; without it
# tqdm can only show a running counter instead of a progress bar with an ETA.
text_pairs = zip(df["text_1"], df["text_2"])
fuzzy_results = [
    fuzz.ratio(text_1, text_2) / 100  # true division already yields a float
    for text_1, text_2 in tqdm(text_pairs, total=len(df))
]

df[FUZZY] = fuzzy_results

Checking for paraprhases with the fuzzy-based method. Dataframe rows to process: 11972


11972it [00:00, 12090.20it/s]


In [111]:
def preprocess(doc):
    """Clean ``doc`` and split it into a list of tokens.

    The cleaning steps are applied in order:

    1. ``<img ...>`` tags are replaced by the placeholder ``image_token``.
    2. Any remaining ``<...>`` tag is replaced by a space.
    3. ``[img_assist ...]`` markers are replaced by a space.
    4. ``http://`` / ``https://`` URLs are replaced by the placeholder
       ``url_token``.

    The cleaned text is tokenized with gensim's ``simple_preprocess``; the
    ``min_len=0`` / ``max_len=inf`` arguments are passed so that tokens are
    not filtered by length. Tokens listed in ``STOPWORDS`` are dropped.

    Args:
        doc: The raw text of one document (a string).

    Returns:
        list: The remaining tokens, in document order.
    """
    # (pattern, replacement) pairs; the order matters because the generic
    # tag pattern would otherwise swallow the <img> tags first.
    substitutions = (
        (r'<img[^<>]+(>|$)', " image_token "),
        (r'<[^<>]+(>|$)', " "),
        (r'\[img_assist[^]]*?\]', " "),
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token "),
    )

    cleaned = doc
    for pattern, replacement in substitutions:
        cleaned = sub(pattern, replacement, cleaned)

    tokens = simple_preprocess(cleaned, min_len=0, max_len=float("inf"))
    return [token for token in tokens if token not in STOPWORDS]


In [112]:
# Single-entry cache for the SoftCosineSimilarity index built by
# check_semantic, stored together with the objects it was built from.
_semantic_index_cache = {}


def check_semantic(corpus, string_2, similarity_matrix):
    """Return the soft-cosine similarities between ``string_2`` and ``corpus``.

    ``string_2`` is preprocessed, converted to a TF-IDF weighted bag of words
    and looked up in a ``SoftCosineSimilarity`` index over ``corpus``.

    The index depends only on ``corpus``, ``similarity_matrix`` and the
    module-level ``tfidf`` / ``dictionary`` objects, not on ``string_2``.
    It is therefore built once and reused for as long as those four objects
    are the very same objects (identity check), instead of being rebuilt on
    every call.

    Note: a change made to ``corpus`` *in place* is not detected; pass a new
    list (or re-run this cell) to force a rebuild.

    Args:
        corpus: List of tokenized documents (lists of tokens) to compare
            against.
        string_2: Raw query text.
        similarity_matrix: Term similarity matrix handed to
            ``SoftCosineSimilarity``.

    Returns:
        Whatever the ``SoftCosineSimilarity`` lookup returns for the query.
        NOTE(review): the recorded output of this notebook shows that the
        lookup can also return a 0-dimensional result (value 0.0) instead of
        an array with one entry per document - presumably when no query
        token is in the dictionary; callers should handle both shapes.
    """
    query = preprocess(string_2)
    query_tf = tfidf[dictionary.doc2bow(query)]

    built_from = (corpus, similarity_matrix, tfidf, dictionary)
    cached_inputs = _semantic_index_cache.get("inputs")
    is_stale = cached_inputs is None or any(
        current is not cached for current, cached in zip(built_from, cached_inputs)
    )
    if is_stale:
        # Loop-invariant and expensive: transform the whole corpus and build
        # the index only when one of the inputs has been replaced.
        bow_corpus = [dictionary.doc2bow(document) for document in corpus]
        _semantic_index_cache["index"] = SoftCosineSimilarity(tfidf[bow_corpus], similarity_matrix)
        _semantic_index_cache["inputs"] = built_from

    return _semantic_index_cache["index"][query_tf]

In [113]:
# Tokenized version of the text_1 column, one token list per dataframe row
# and in the same order as the rows.
corpus = [preprocess(text) for text in df["text_1"]]

# Word embeddings used for the term similarities. A pre-trained model is used:
# https://huggingface.co/fse/glove-wiki-gigaword-50 , https://nlp.stanford.edu/pubs/glove.pdf
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)

In [114]:
# Term dictionary over the tokenized corpus, and the TF-IDF model derived
# from that dictionary.
dictionary = Dictionary(corpus)
tfidf = TfidfModel(dictionary=dictionary)

# Term similarity matrix built from the word-embedding similarity index.
# This step takes a long time.
print("Creating the similarity matrix...")
similarity_matrix = SparseTermSimilarityMatrix(
    similarity_index,
    dictionary=dictionary,
    tfidf=tfidf,
)


Creating the similarity matrix...


100%|██████████| 15821/15821 [04:12<00:00, 62.65it/s] 


In [132]:
# Semantic-based paraphrase check: for every row, compare text_2 against the
# corpus built from text_1 and keep the similarity to the text_1 of the same row.
print("Processing texts...")
semantic_results = []
# `corpus` holds one entry per dataframe row in row order, so the matching
# document is found by *position*. enumerate() provides that position
# independently of the dataframe's index labels.
for position, text_2 in enumerate(tqdm(df["text_2"], total=df.shape[0])):
    sim = np.asarray(check_semantic(corpus, text_2, similarity_matrix))
    if sim.ndim == 0:
        # The lookup can return a single 0-dimensional value instead of one
        # similarity per document (seen in earlier runs with value 0.0).
        # Handle that shape explicitly rather than through a broad `except`,
        # which would also hide unrelated errors.
        score = float(sim.item())
        # tqdm.write keeps the progress bar intact, unlike print().
        tqdm.write("Row " + str(position) + ": result is 0-dimensional. Appending the only result value: " + str(score))
    else:
        score = float(sim[position])
    semantic_results.append(score)

df[SEMANTIC] = semantic_results

Processing texts...


  1%|▏         | 172/11972 [07:17<6:21:22,  1.94s/it] 

result is 0.0. Appending the only result value: 0.0
too many indices for array: array is 0-dimensional, but 1 were indexed


  3%|▎         | 308/11972 [13:03<5:34:34,  1.72s/it] 

result is 0.0. Appending the only result value: 0.0
too many indices for array: array is 0-dimensional, but 1 were indexed


100%|██████████| 11972/11972 [5:11:10<00:00,  1.56s/it] 


In [133]:
# Number of leading rows to display with both result columns.
PREVIEW_ROWS = 175
df.head(PREVIEW_ROWS)

Unnamed: 0,dataset,id_1,id_2,text_1,text_2,is_paraphrase,fuzzy_based_result,semantic_based_result
0,DMoP,HiDc8dkT,3n3Na3BD,Roy of the Rovers,Roy of the Rovers,True,1.00,1.000000
1,DMoP,5KfcxDBb,QUzQei9u,"Roy of the Rovers is a British comic strip about the life and times of a fictional footballer named Roy Race, who played for Melchester Rovers. The strip first appeared in the ""Tiger"" in 1954, before giving its name to a weekly (and later monthly) comic magazine, published by IPC and Fleetway from 1976 until 1995, in which it was the main feature.","Roy of the Rovers is a British funny cartoon about the life and times of an anecdotal footballer named Roy Race, who played for Melchester Rovers. The strip originally showed up in the ""Tiger"" in 1954, preceding giving its name to a week after week (and later month to month) comic magazine, distributed by IPC and Fleetway from 1976 until 1995, in which it was the principle include.",True,0.83,0.777395
2,DMoP,gxShcSRt,5nSAkBfd,"The weekly strip ran until 1993, following Roy's playing career until its conclusion after he lost his left foot in a helicopter crash. When the monthly comic was launched later that year the focus switched to Roy's son Rocky, who also played for Melchester. This publication was short-lived, and folded after only 19 issues. The adventures of the Race family were subsequently featured in the monthly ""Match of the Day"" football magazine, in which father and son were reunited as manager and player respectively. These strips began in 1997 and continued until the magazine's closure in May 2001.","The week after week strip kept running until 1993, after Roy's playing vocation until its decision after he lost his left foot in a helicopter crash. At the point when the month to month comic was propelled soon thereafter the center changed to Roy's child Rocky, who additionally played for Melchester. This production was brief, and collapsed after just 19 issues. The undertakings of the Race family were in this manner highlighted in the month to month ""Match of the Day"" football magazine, in which father and child were brought together as chief and player separately. These strips started in 1997 and proceeded until the magazine's conclusion in May 2001.",True,0.75,0.756583
3,DMoP,ERwDq4GC,mFG5GR3n,"Football-themed stories were a staple of British comics for boys from the 1950s onwards, and Roy of the Rovers was the most popular. To keep the strip exciting, Melchester was almost every year either competing for major honours or struggling against relegation to a lower division; a normal, uneventful season of mid-table mediocrity was unknown at Melchester Rovers. The strip followed the structure of the actual English football season, thus there were several months each year in summer when there was no league football. By far the most common summer storyline saw Melchester touring a fictional country in an exotic part of the world, often South America, where they would invariably be kidnapped and held to ransom. The average reader probably stayed with the comic regularly for only three or four years, therefore storylines were sometimes recycled; during the first ten years of his playing career, Roy was kidnapped at least four times. Roy also made numerous appearances for England, depicted playing alongside actual players such as Malcolm Macdonald and Trevor Francis.","Football-themed stories were a staple of British funnies for young men from the 1950s onwards, and Roy of the Rovers was the most prevalent. To keep the strip energizing, Melchester was pretty much consistently either seeking real distinctions or battling against transfer to a lower division; a typical, uneventful period of mid-table unremarkableness was obscure at Melchester Rovers. The strip pursued the structure of the real English football season, along these lines there were a while every year in summer when there was no association football. By a long shot the most well-known summer storyline saw Melchester visiting an anecdotal nation in a fascinating piece of the world, regularly South America, where they would perpetually be grabbed and held to recover. 
The normal peruser presumably remained with the comic consistently for just three or four years, in this way storylines were now and then reused; amid the initial ten years of his playing vocation, Roy was seized something like multiple times. Roy likewise shown up for England, delineated playing nearby real players, for example, Malcolm Macdonald and Trevor Francis.",True,0.74,0.862936
4,DMoP,Fq6uMo6z,RWeiMaNw,"The stock media phrase ""real 'Roy of the Rovers' stuff"" is often used by football writers, commentators and fans when describing displays of great skill, or surprising results that go against the odds, in reference to the dramatic storylines that were the strip's trademark.","The stock media express ""genuine 'Roy of the Rovers' stuff"" is regularly utilized by football journalists, observers and fans while portraying presentations of extraordinary ability, or amazing outcomes that conflict with the chances, in reference to the sensational storylines that were the strip's trademark.",True,0.72,0.678417
...,...,...,...,...,...,...,...,...
170,DMoP,853vvVmc,3dEArB7b,"In contemporary internet culture, individuals and organizations frequently pay homage to the number . For instance, the computer scientist Donald Knuth let the version numbers of his program TeX approach . The versions are 3, 3.1, 3.14, and so forth.","In contemporary web culture, people and associations as often as possible pay praise to the number . For example, the PC researcher Donald Knuth let the rendition quantities of his program TeX approach . The adaptations are 3, 3.1, 3.14, etc.",True,0.72,0.846090
171,DMoP,hZ2wpCEH,RNmNmkqZ,Footnotes,Commentaries,True,0.48,0.000000
172,DMoP,XhsHXfwM,AXGbzc9T,References,References,True,1.00,1.000000
173,DMoP,Szdx2b6G,gn7yPeym,Pirates of the Caribbean: Dead Man's Chest,Privateers of the Caribbean: Dead Man's Chest,True,0.94,1.000000


In [134]:
# Write the dataframe, including both result columns, to a JSON file in the
# output directory (one JSON object per row, keyed by the dataframe index).
results_path = os.path.join(OUT_DIR, "detection_results.json")
df.to_json(results_path, orient="index", index=True, indent=4)