In [48]:
import os
import pandas as pd
from tqdm import tqdm
from re import sub
import numpy as np
from thefuzz import fuzz
import shortuuid
import xml.etree.ElementTree as ET
import re
import sys
from gensim.utils import simple_preprocess
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex, SoftCosineSimilarity, Similarity

In [49]:
# Folder that contains the dataset directories to read in.
DATASETS_FOLDER = "datasets"

# Name of the file that contains the data to read in.
FORMATTED_DATA_FILENAME = "true_data.json"

# The folders inside DATASETS_FOLDER should be named like the datasets here.
DATASETS = ["DMoP", "MSRP", "ETPC"]

# Directory the formatted json is written to.
OUT_DIR = "output"

# Names of the dataframe columns that hold each method's result.
FUZZY, SEMANTIC = "fuzzy_based_result", "semantic_based_result"

# Never truncate long text cells when a dataframe is displayed.
pd.set_option("display.max_colwidth", None)

In [50]:
# Read the formatted pairs file from OUT_DIR; orient="index" tells pandas the
# top-level JSON keys are row labels.
formatted_data_path = os.path.join(OUT_DIR, FORMATTED_DATA_FILENAME)
df = pd.read_json(formatted_data_path, orient="index")
df.head()

Unnamed: 0,dataset,id_1,id_2,text_1,text_2,is_paraphrase
0,DMoP,HiDc8dkT,3n3Na3BD,Roy of the Rovers,Roy of the Rovers,True
1,DMoP,5KfcxDBb,QUzQei9u,"Roy of the Rovers is a British comic strip about the life and times of a fictional footballer named Roy Race, who played for Melchester Rovers. The strip first appeared in the ""Tiger"" in 1954, before giving its name to a weekly (and later monthly) comic magazine, published by IPC and Fleetway from 1976 until 1995, in which it was the main feature.","Roy of the Rovers is a British funny cartoon about the life and times of an anecdotal footballer named Roy Race, who played for Melchester Rovers. The strip originally showed up in the ""Tiger"" in 1954, preceding giving its name to a week after week (and later month to month) comic magazine, distributed by IPC and Fleetway from 1976 until 1995, in which it was the principle include.",True
2,DMoP,gxShcSRt,5nSAkBfd,"The weekly strip ran until 1993, following Roy's playing career until its conclusion after he lost his left foot in a helicopter crash. When the monthly comic was launched later that year the focus switched to Roy's son Rocky, who also played for Melchester. This publication was short-lived, and folded after only 19 issues. The adventures of the Race family were subsequently featured in the monthly ""Match of the Day"" football magazine, in which father and son were reunited as manager and player respectively. These strips began in 1997 and continued until the magazine's closure in May 2001.","The week after week strip kept running until 1993, after Roy's playing vocation until its decision after he lost his left foot in a helicopter crash. At the point when the month to month comic was propelled soon thereafter the center changed to Roy's child Rocky, who additionally played for Melchester. This production was brief, and collapsed after just 19 issues. The undertakings of the Race family were in this manner highlighted in the month to month ""Match of the Day"" football magazine, in which father and child were brought together as chief and player separately. These strips started in 1997 and proceeded until the magazine's conclusion in May 2001.",True
3,DMoP,ERwDq4GC,mFG5GR3n,"Football-themed stories were a staple of British comics for boys from the 1950s onwards, and Roy of the Rovers was the most popular. To keep the strip exciting, Melchester was almost every year either competing for major honours or struggling against relegation to a lower division; a normal, uneventful season of mid-table mediocrity was unknown at Melchester Rovers. The strip followed the structure of the actual English football season, thus there were several months each year in summer when there was no league football. By far the most common summer storyline saw Melchester touring a fictional country in an exotic part of the world, often South America, where they would invariably be kidnapped and held to ransom. The average reader probably stayed with the comic regularly for only three or four years, therefore storylines were sometimes recycled; during the first ten years of his playing career, Roy was kidnapped at least four times. Roy also made numerous appearances for England, depicted playing alongside actual players such as Malcolm Macdonald and Trevor Francis.","Football-themed stories were a staple of British funnies for young men from the 1950s onwards, and Roy of the Rovers was the most prevalent. To keep the strip energizing, Melchester was pretty much consistently either seeking real distinctions or battling against transfer to a lower division; a typical, uneventful period of mid-table unremarkableness was obscure at Melchester Rovers. The strip pursued the structure of the real English football season, along these lines there were a while every year in summer when there was no association football. By a long shot the most well-known summer storyline saw Melchester visiting an anecdotal nation in a fascinating piece of the world, regularly South America, where they would perpetually be grabbed and held to recover. 
The normal peruser presumably remained with the comic consistently for just three or four years, in this way storylines were now and then reused; amid the initial ten years of his playing vocation, Roy was seized something like multiple times. Roy likewise shown up for England, delineated playing nearby real players, for example, Malcolm Macdonald and Trevor Francis.",True
4,DMoP,Fq6uMo6z,RWeiMaNw,"The stock media phrase ""real 'Roy of the Rovers' stuff"" is often used by football writers, commentators and fans when describing displays of great skill, or surprising results that go against the odds, in reference to the dramatic storylines that were the strip's trademark.","The stock media express ""genuine 'Roy of the Rovers' stuff"" is regularly utilized by football journalists, observers and fans while portraying presentations of extraordinary ability, or amazing outcomes that conflict with the chances, in reference to the sensational storylines that were the strip's trademark.",True


In [51]:
#Check for paraphrase with fuzzy based
# Each (text_1, text_2) pair is scored with fuzz.ratio; scores are kept in row
# order so they can be assigned straight back as a new column.
print("Checking for paraprhases with the fuzzy-based method. Dataframe rows to process: " + str(len(df)))

# Iterate the two text columns directly instead of df.iterrows(): no per-row
# Series has to be built, and passing `total` lets tqdm show a percentage/ETA
# (the same way the semantic loop does).
text_pairs = zip(df["text_1"], df["text_2"])
fuzzy_results = [
    fuzz.ratio(text_1, text_2)
    for text_1, text_2 in tqdm(text_pairs, total=len(df))
]

df[FUZZY] = fuzzy_results

Checking for paraprhases with the fuzzy-based method. Dataframe rows to process: 11972


11972it [00:01, 11957.34it/s]


In [52]:
# Tokens that preprocess() filters out of every document.
stopwords = [
    'the',
    'and',
    'are',
    'a',
]

In [53]:
def preprocess(doc):
    """Clean a raw text and split it into tokens.

    The text is passed through four regex substitutions, in order:
    ``<img ...>`` tags become ``image_token``, any remaining ``<...>`` tag
    becomes a space, ``[img_assist ...]`` markers become a space, and
    http/https URLs become ``url_token``. The cleaned text is then tokenized
    with gensim's ``simple_preprocess`` (no minimum or maximum token length)
    and every token listed in the module-level ``stopwords`` is dropped.

    Args:
        doc: the text to process (a string).

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # (pattern, replacement) pairs; the order matters because the generic tag
    # pattern would otherwise swallow <img> tags before they are marked.
    substitutions = (
        (r'<img[^<>]+(>|$)', " image_token "),
        (r'<[^<>]+(>|$)', " "),
        (r'\[img_assist[^]]*?\]', " "),
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token "),
    )
    cleaned = doc
    for pattern, replacement in substitutions:
        cleaned = sub(pattern, replacement, cleaned)

    kept_tokens = []
    for token in simple_preprocess(cleaned, min_len=0, max_len=float("inf")):
        if token not in stopwords:
            kept_tokens.append(token)
    return kept_tokens


In [92]:
def check_semantic(corpus, string_2, similarity_index, similarity_matrix):
    """Query a soft-cosine similarity index over ``corpus`` with ``string_2``.

    ``string_2`` is run through ``preprocess``, converted to a bag-of-words
    with the module-level ``dictionary`` and weighted with the module-level
    ``tfidf`` model; both globals must exist before this function is called.

    Building the ``SoftCosineSimilarity`` index over the whole corpus is the
    expensive step and does not depend on the query, so the index is kept on
    the function object and reused. It is rebuilt only when ``corpus``,
    ``similarity_matrix``, ``dictionary`` or ``tfidf`` is a different object
    than on the previous call (e.g. after re-running the cell that creates
    them). Mutating ``corpus`` in place is NOT detected.

    Args:
        corpus: list of already tokenized documents (lists of tokens).
        string_2: raw text to compare against the corpus.
        similarity_index: unused; kept so existing callers keep working.
        similarity_matrix: term similarity matrix handed to
            ``SoftCosineSimilarity``.

    Returns:
        Whatever ``index[query]`` yields for the index built over ``corpus``;
        the caller in this notebook indexes it by row position, so presumably
        one similarity value per corpus document.
    """
    cache = check_semantic.__dict__.setdefault("_index_cache", {})
    current_inputs = (corpus, similarity_matrix, dictionary, tfidf)
    cached_inputs = cache.get("inputs")

    # Compare by identity: these objects are large and have no cheap equality.
    is_stale = cached_inputs is None or any(
        cached is not current
        for cached, current in zip(cached_inputs, current_inputs)
    )
    if is_stale:
        bows = [dictionary.doc2bow(document) for document in corpus]
        cache["index"] = SoftCosineSimilarity(tfidf[bows], similarity_matrix)
        # Recorded only after a successful build so a failed build is retried.
        cache["inputs"] = current_inputs

    query = preprocess(string_2)
    query_tf = tfidf[dictionary.doc2bow(query)]

    return cache["index"][query_tf]

In [81]:
# Tokenize every text_1 entry; the list keeps the dataframe's row order.
corpus = list(map(preprocess, df["text_1"]))

# Pre-trained GloVe word vectors, used as the source of term similarities:
# https://huggingface.co/fse/glove-wiki-gigaword-50 , https://nlp.stanford.edu/pubs/glove.pdf
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)

In [96]:
semantic_results = []

# Only the first SEMANTIC_ROW_LIMIT rows are scored; every later row gets the
# SEMANTIC_SKIPPED placeholder instead of a similarity value.
SEMANTIC_ROW_LIMIT = 50
SEMANTIC_SKIPPED = -1

# Build the term dictionary, TF-idf model
# NOTE(review): `corpus` holds only the text_1 column, so the dictionary has no
# entries for words that occur solely in text_2 - confirm this is intended.
dictionary = Dictionary(corpus)
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.
print("Creating the similarity matrix...")
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)     #takes a long time

print("Processing texts...")
# `corpus` was built positionally from df["text_1"], so the matching entry of
# the similarity result must be picked by row POSITION. enumerate() gives that
# position regardless of what the dataframe's index labels are.
for position, text_2 in enumerate(tqdm(df["text_2"], total=df.shape[0])):
    if position >= SEMANTIC_ROW_LIMIT:
        semantic_results.append(SEMANTIC_SKIPPED)
        continue
    similarities = check_semantic(corpus, text_2, similarity_index, similarity_matrix)
    semantic_results.append(similarities[position])

df[SEMANTIC] = semantic_results

Creating the similarity matrix...


  2%|▏         | 287/15821 [00:04<03:18, 78.30it/s]

In [None]:
# Display the first 50 rows, including the fuzzy and semantic result columns.
df.iloc[:50]

Unnamed: 0,dataset,id_1,id_2,text_1,text_2,is_paraphrase,fuzzy_based_result,semantic_based_result
0,DMoP,HiDc8dkT,3n3Na3BD,Roy of the Rovers,Roy of the Rovers,True,100,0.0
1,DMoP,5KfcxDBb,QUzQei9u,"Roy of the Rovers is a British comic strip about the life and times of a fictional footballer named Roy Race, who played for Melchester Rovers. The strip first appeared in the ""Tiger"" in 1954, before giving its name to a weekly (and later monthly) comic magazine, published by IPC and Fleetway from 1976 until 1995, in which it was the main feature.","Roy of the Rovers is a British funny cartoon about the life and times of an anecdotal footballer named Roy Race, who played for Melchester Rovers. The strip originally showed up in the ""Tiger"" in 1954, preceding giving its name to a week after week (and later month to month) comic magazine, distributed by IPC and Fleetway from 1976 until 1995, in which it was the principle include.",True,83,0.485433
2,DMoP,gxShcSRt,5nSAkBfd,"The weekly strip ran until 1993, following Roy's playing career until its conclusion after he lost his left foot in a helicopter crash. When the monthly comic was launched later that year the focus switched to Roy's son Rocky, who also played for Melchester. This publication was short-lived, and folded after only 19 issues. The adventures of the Race family were subsequently featured in the monthly ""Match of the Day"" football magazine, in which father and son were reunited as manager and player respectively. These strips began in 1997 and continued until the magazine's closure in May 2001.","The week after week strip kept running until 1993, after Roy's playing vocation until its decision after he lost his left foot in a helicopter crash. At the point when the month to month comic was propelled soon thereafter the center changed to Roy's child Rocky, who additionally played for Melchester. This production was brief, and collapsed after just 19 issues. The undertakings of the Race family were in this manner highlighted in the month to month ""Match of the Day"" football magazine, in which father and child were brought together as chief and player separately. These strips started in 1997 and proceeded until the magazine's conclusion in May 2001.",True,75,0.632704
3,DMoP,ERwDq4GC,mFG5GR3n,"Football-themed stories were a staple of British comics for boys from the 1950s onwards, and Roy of the Rovers was the most popular. To keep the strip exciting, Melchester was almost every year either competing for major honours or struggling against relegation to a lower division; a normal, uneventful season of mid-table mediocrity was unknown at Melchester Rovers. The strip followed the structure of the actual English football season, thus there were several months each year in summer when there was no league football. By far the most common summer storyline saw Melchester touring a fictional country in an exotic part of the world, often South America, where they would invariably be kidnapped and held to ransom. The average reader probably stayed with the comic regularly for only three or four years, therefore storylines were sometimes recycled; during the first ten years of his playing career, Roy was kidnapped at least four times. Roy also made numerous appearances for England, depicted playing alongside actual players such as Malcolm Macdonald and Trevor Francis.","Football-themed stories were a staple of British funnies for young men from the 1950s onwards, and Roy of the Rovers was the most prevalent. To keep the strip energizing, Melchester was pretty much consistently either seeking real distinctions or battling against transfer to a lower division; a typical, uneventful period of mid-table unremarkableness was obscure at Melchester Rovers. The strip pursued the structure of the real English football season, along these lines there were a while every year in summer when there was no association football. By a long shot the most well-known summer storyline saw Melchester visiting an anecdotal nation in a fascinating piece of the world, regularly South America, where they would perpetually be grabbed and held to recover. 
The normal peruser presumably remained with the comic consistently for just three or four years, in this way storylines were now and then reused; amid the initial ten years of his playing vocation, Roy was seized something like multiple times. Roy likewise shown up for England, delineated playing nearby real players, for example, Malcolm Macdonald and Trevor Francis.",True,74,0.733494
4,DMoP,Fq6uMo6z,RWeiMaNw,"The stock media phrase ""real 'Roy of the Rovers' stuff"" is often used by football writers, commentators and fans when describing displays of great skill, or surprising results that go against the odds, in reference to the dramatic storylines that were the strip's trademark.","The stock media express ""genuine 'Roy of the Rovers' stuff"" is regularly utilized by football journalists, observers and fans while portraying presentations of extraordinary ability, or amazing outcomes that conflict with the chances, in reference to the sensational storylines that were the strip's trademark.",True,72,0.521876
5,DMoP,M3W9h4f6,kAdHMifu,"Roy of the Rovers first appeared on 11 September 1954, as a weekly feature in the comic magazine ""Tiger"", debuting on the front page of the first issue. After 22 years of continued popularity, the strip was judged successful enough to sustain its own weekly comic, the eponymous ""Roy of the Rovers"", launched on 25 September 1976. The comic ran for 851 issues, until 20 March 1993, and included other football strips and features. At the peak of the comic's success about 450,000 copies were sold each week. There were also hardback annuals and holiday specials featuring a mix of reprinted and original content, and for a brief period, starting in 1986, Roy of the Rovers was serialised in the now defunct ""Today"" newspaper. These were all-new strips, focusing largely on the relationship between Roy and his wife Penny, rather than the action on the pitch. Between 1988 and 1993, a ""Best of Roy of the Rovers"" monthly comic was published, reprinting older stories.","Roy of the Rovers initially showed up on 11 September 1954, as a week by week highlight in the comic magazine ""Tiger"", appearing on the first page of the main issue. After 22Â years of proceeded with fame, the strip was made a decision about sufficiently fruitful to support its own week by week comic, the eponymous ""Roy of the Rovers"", propelled on 25 September 1976. The comic kept running for 851Â issues, until 20 March 1993, and included other football strips and highlights. At the pinnacle of the comic's prosperity around 450,000 duplicates were sold every week. There were likewise hardback annuals and occasion specials highlighting a blend of republished and unique substance, and for a concise period, beginning in 1986, Roy of the Rovers was serialized in the now outdated ""Today"" paper. These were all-new strips, concentrating generally on the connection among Roy and his better half Penny, instead of the activity on the pitch. 
Somewhere in the range of 1988 and 1993, a ""Best of Roy of the Rovers"" month to month comic was distributed, reproducing more established stories.",True,77,0.713688
6,DMoP,kYtEvKxh,Mxiw42kt,"Following the closure of the weekly title in 1993, the strip appeared in a relaunched monthly publication in September that year, with grittier storylines intended to attract teen and young adult fans who had read the weekly comic in their youth. Between January 1994 and January 1995, the monthly strips were mirrored by a weekly edition in ""Shoot"" magazine, which had in the late 1980s published a parody called Ray of the Rangers.","Following the conclusion of the week by week title in 1993, the strip showed up in a relaunched month to month distribution in September that year, with grittier storylines planned to draw in youngster and youthful grown-up fans who had perused the week after week comic in their childhood. Between January 1994 and January 1995, the month to month strips were reflected by a week by week release in ""Shoot"" magazine, which had in the late 1980s distributed a farce called Ray of the Rangers.",True,79,0.23292
7,DMoP,WAxxujwR,27UoMe8c,"The comic strip was resurrected in July 1997, printed as short (usually two-page) features in the BBC's monthly ""Match of the Day"" magazine. These strips ran until the magazine's demise in May 2001. By then the strip's wholesome tone, often espousing the virtues of fair play and strong moral character, was beginning to seem old-fashioned. The editor of ""Roy of the Rovers"" comic, Barrie Tomlinson, has commented that ""everyone seemed to be growing up a bit more quickly, and they wanted stories that were more realistic"". This series ran until 2001.","The funny cartoon was revived in July 1997, printed as short (generally two-page) includes in the BBC's month to month ""Match of the Day"" magazine. These strips kept running until the magazine's death in May 2001. By then the strip's healthy tone, regularly embracing the ideals of reasonable play and solid good character, was starting to appear to be antiquated. The manager of ""Roy of the Rovers"" comic, Barrie Tomlinson, has remarked that ""everybody appeared to grow up more rapidly, and they needed stories that were progressively practical"". This arrangement kept running until 2001.",True,75,0.610021
8,DMoP,GWHjSRFx,a4TvmTTz,"Then-rights holder Egmont published a 64-page ""collectors edition"" of the comic strip in April 2009, gathering together a number of 1980's era Roy of the Rovers stories in addition to other backup strips from the comic. Two ""Best of Roy of the Rovers"" books, featuring successive runs of strips from the 1980s and 1970s, were published in June 2008 and 2009 respectively.","At that point rights holder Egmont distributed a 64-page ""authorities version"" of the funny cartoon in April 2009, assembling some of 1980's period Roy of the Rovers stories notwithstanding other reinforcement takes from the comic. Two ""Best of Roy of the Rovers"" books, including progressive keeps running of strips from the 1970s, were distributed in June 2008 and 2009 separately.",True,73,0.560794
9,DMoP,YaaF9t3f,kfejKMJB,"In 2016, the rights to Roy of the Rovers and the rest of the Fleetway comics library were acquired by Rebellion Developments, who subsequently relaunched the character in a series of graphic novels, depicting Roy as a teenager.","In 2016, the rights to Roy of the Rovers and the remainder of the Fleetway funnies library were procured by Rebellion Developments, who along these lines relaunched the character in a progression of realistic books, delineating Roy as a youngster.",True,81,0.364025
