In [45]:
import os
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import fuzz
import shortuuid
import xml.etree.ElementTree as ET
import re
import sys

In [46]:
DATASETS_FOLDER = "datasets"
DATASETS = ["DMoP", "MSRP", "ETPC"]

pd.set_option("display.max_colwidth", None)

In [78]:
#Bring datasets to the same format (standardized)

df = pd.DataFrame(columns= ["dataset", "id_1", "id_2", "text_1", "text_2", "is_paraphrase"] )

for dataset in DATASETS:
    path_to_dataset = os.path.join(DATASETS_FOLDER, dataset)
    print("Processing dataset: " + str(path_to_dataset))

    df_tmp = pd.DataFrame(columns= ["dataset", "id_1", "id_2", "text_1", "text_2", "is_paraphrase"] )

    if dataset == "MSRP":
        with open(os.path.join(path_to_dataset, "msr_paraphrase_train.txt"), encoding="utf8", mode = "r") as file:
            fileiter = iter(file)
            next(fileiter)
            for i, line in enumerate(tqdm(fileiter)):
                line_list = line.split("\t")
                df_tmp.loc[i] = [dataset, line_list[1], line_list[2], line_list[3], line_list[4], line_list[0]]

            #lines = [line.rstrip() for line in lines]
    
    if dataset == "DMoP":
        dmop_path = os.path.join(path_to_dataset, "wikipedia_documents_train", "machined")

        for file in tqdm(os.listdir(os.path.join(dmop_path, "og"))):
            with open(os.path.join(dmop_path, "og", file), encoding="utf8", mode = "r") as f1:
                with open(os.path.join(dmop_path, "mg", str(file.split("-")[0])+"-SPUN.txt"), encoding="utf8", mode = "r") as f2:
                    og_lines = f1.readlines()
                    og_lines = [line.rstrip() for line in og_lines]
                    og_lines = [l for l in og_lines if l != ""]
                    mg_lines = f2.readlines()
                    mg_lines = [line.rstrip() for line in mg_lines]
                    mg_lines = [l for l in mg_lines if l != ""]

                    for i, og_line in enumerate(og_lines):
                        if og_line != "\n":
                            df_tmp.loc[i] = [dataset, shortuuid.uuid()[:8], shortuuid.uuid()[:8], og_line, mg_lines[i], 1]
    
    if dataset == "ETPC":
        with open(os.path.join(path_to_dataset, "text_pairs.xml"), encoding='utf-8', mode = "r") as file:
            tree = ET.parse(file)
            root = tree.getroot()
            for i, elem in enumerate(tqdm(root)):
                df_tmp.loc[i] = [dataset, elem[1].text, elem[2].text, elem[3].text, elem[4].text, elem[8].text]


    df = pd.concat([df, df_tmp], ignore_index = True)

Processing dataset: datasets\DMoP


100%|██████████| 4012/4012 [00:46<00:00, 85.50it/s] 


Processing dataset: datasets\MSRP


4076it [00:07, 549.17it/s]


Processing dataset: datasets\ETPC


100%|██████████| 5801/5801 [00:11<00:00, 499.01it/s]


In [79]:
df.head(70000)

Unnamed: 0,dataset,id_1,id_2,text_1,text_2,is_paraphrase
0,DMoP,Lvvhsrzr,7WDtT7UN,Roy of the Rovers,Roy of the Rovers,1
1,DMoP,3iPZ2453,YX88DEmF,"Roy of the Rovers is a British comic strip about the life and times of a fictional footballer named Roy Race, who played for Melchester Rovers. The strip first appeared in the ""Tiger"" in 1954, before giving its name to a weekly (and later monthly) comic magazine, published by IPC and Fleetway from 1976 until 1995, in which it was the main feature.","Roy of the Rovers is a British funny cartoon about the life and times of an anecdotal footballer named Roy Race, who played for Melchester Rovers. The strip originally showed up in the ""Tiger"" in 1954, preceding giving its name to a week after week (and later month to month) comic magazine, distributed by IPC and Fleetway from 1976 until 1995, in which it was the principle include.",1
2,DMoP,hN6XiKcz,CPjdVw83,"The weekly strip ran until 1993, following Roy's playing career until its conclusion after he lost his left foot in a helicopter crash. When the monthly comic was launched later that year the focus switched to Roy's son Rocky, who also played for Melchester. This publication was short-lived, and folded after only 19 issues. The adventures of the Race family were subsequently featured in the monthly ""Match of the Day"" football magazine, in which father and son were reunited as manager and player respectively. These strips began in 1997 and continued until the magazine's closure in May 2001.","The week after week strip kept running until 1993, after Roy's playing vocation until its decision after he lost his left foot in a helicopter crash. At the point when the month to month comic was propelled soon thereafter the center changed to Roy's child Rocky, who additionally played for Melchester. This production was brief, and collapsed after just 19 issues. The undertakings of the Race family were in this manner highlighted in the month to month ""Match of the Day"" football magazine, in which father and child were brought together as chief and player separately. These strips started in 1997 and proceeded until the magazine's conclusion in May 2001.",1
3,DMoP,mZRXWueU,ayKtGsTm,"Football-themed stories were a staple of British comics for boys from the 1950s onwards, and Roy of the Rovers was the most popular. To keep the strip exciting, Melchester was almost every year either competing for major honours or struggling against relegation to a lower division; a normal, uneventful season of mid-table mediocrity was unknown at Melchester Rovers. The strip followed the structure of the actual English football season, thus there were several months each year in summer when there was no league football. By far the most common summer storyline saw Melchester touring a fictional country in an exotic part of the world, often South America, where they would invariably be kidnapped and held to ransom. The average reader probably stayed with the comic regularly for only three or four years, therefore storylines were sometimes recycled; during the first ten years of his playing career, Roy was kidnapped at least four times. Roy also made numerous appearances for England, depicted playing alongside actual players such as Malcolm Macdonald and Trevor Francis.","Football-themed stories were a staple of British funnies for young men from the 1950s onwards, and Roy of the Rovers was the most prevalent. To keep the strip energizing, Melchester was pretty much consistently either seeking real distinctions or battling against transfer to a lower division; a typical, uneventful period of mid-table unremarkableness was obscure at Melchester Rovers. The strip pursued the structure of the real English football season, along these lines there were a while every year in summer when there was no association football. By a long shot the most well-known summer storyline saw Melchester visiting an anecdotal nation in a fascinating piece of the world, regularly South America, where they would perpetually be grabbed and held to recover. The normal peruser presumably remained with the comic consistently for just three or four years, in this way storylines were now and then reused; amid the initial ten years of his playing vocation, Roy was seized something like multiple times. Roy likewise shown up for England, delineated playing nearby real players, for example, Malcolm Macdonald and Trevor Francis.",1
4,DMoP,58eWTXTi,SVKvYtRE,"The stock media phrase ""real 'Roy of the Rovers' stuff"" is often used by football writers, commentators and fans when describing displays of great skill, or surprising results that go against the odds, in reference to the dramatic storylines that were the strip's trademark.","The stock media express ""genuine 'Roy of the Rovers' stuff"" is regularly utilized by football journalists, observers and fans while portraying presentations of extraordinary ability, or amazing outcomes that conflict with the chances, in reference to the sensational storylines that were the strip's trademark.",1
...,...,...,...,...,...,...
10045,ETPC,2685984,2686122,"After Hughes refused to rehire Hernandez, he complained to the Equal Employment Opportunity Commission.",Hernandez filed an Equal Employment Opportunity Commission complaint and sued.,0
10046,ETPC,339215,339172,There are 103 Democrats in the Assembly and 47 Republicans.,Democrats dominate the Assembly while Republicans control the Senate.,0
10047,ETPC,2996850,2996734,Bethany Hamilton remained in stable condition Saturday after the attack Friday morning.,"Bethany, who remained in stable condition after the attack Friday morning, talked of the attack Saturday.",0
10048,ETPC,2095781,2095812,"Last week the power station’s US owners, AES Corp, walked away from the plant after banks and bondholders refused to accept its financial restructuring offer.","The news comes after Drax's American owner, AES Corp. AES.N , last week walked away from the plant after banks and bondholders refused to accept its restructuring offer.",1
