In [1]:
import os
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import fuzz
import shortuuid
import xml.etree.ElementTree as ET
import re
import sys
import string
from setup import *

In [2]:
pd.set_option("display.max_colwidth", None)

# For Debugging:
#DATASETS = ["MPC", "SAv2"]

In [3]:
#Bring datasets to the same format (standardized)

df = pd.DataFrame(columns= [DATASET, PAIR_ID, ID1, ID2, TEXT1, TEXT2, PARAPHRASE] )

for dataset in DATASETS:
    path_to_dataset = os.path.join(DATASETS_FOLDER, dataset)
    print("Processing dataset: " + str(path_to_dataset))

    counter = 0

    df_tmp = pd.DataFrame(columns= [DATASET, PAIR_ID, ID1, ID2, TEXT1, TEXT2, PARAPHRASE] )

    if dataset == "MPC":
        dmop_path = os.path.join(path_to_dataset, "wikipedia_documents_train", "machined")      #read train data
        for file in tqdm(os.listdir(os.path.join(dmop_path, "og"))):
            with open(os.path.join(dmop_path, "og", file), encoding="utf8", mode = "r") as f1:
                with open(os.path.join(dmop_path, "mg", str(file.split("-")[0])+"-SPUN.txt"), encoding="utf8", mode = "r") as f2:
                    og_lines = f1.readlines()
                    og_lines = [line.rstrip() for line in og_lines]
                    og_lines = [l for l in og_lines if l != ""]
                    mg_lines = f2.readlines()
                    mg_lines = [line.rstrip() for line in mg_lines]
                    mg_lines = [l for l in mg_lines if l != ""]

                    if len(og_lines) != len(mg_lines):
                        print("ERROR")

                    for i, og_line in enumerate(og_lines):
                        counter = counter+1
                        if counter > 30:
                            break

                        if og_line != "\n":
                            df_tmp.loc[i] = [dataset, shortuuid.uuid()[:8], shortuuid.uuid()[:8], shortuuid.uuid()[:8], og_line, mg_lines[i], True]
        
        df = pd.concat([df, df_tmp], ignore_index = True)
        df_tmp = pd.DataFrame(columns= [DATASET, PAIR_ID, ID1, ID2, TEXT1, TEXT2, PARAPHRASE] )

        dmop_path = os.path.join(path_to_dataset, "wikipedia_documents_test", "machined")
        for file in tqdm(os.listdir(os.path.join(dmop_path, "og"))):        #read test data (combine as there is no ML process involved)
            with open(os.path.join(dmop_path, "og", file), encoding="utf8", mode = "r") as f1:
                with open(os.path.join(dmop_path, "mg", str(file.split("-")[0])+"-SPUN.txt"), encoding="utf8", mode = "r") as f2:
                    og_lines = f1.readlines()
                    og_lines = [line.rstrip() for line in og_lines]
                    og_lines = [l for l in og_lines if l != ""]
                    mg_lines = f2.readlines()
                    mg_lines = [line.rstrip() for line in mg_lines]
                    mg_lines = [l for l in mg_lines if l != ""]

                    for i, og_line in enumerate(og_lines):
                        counter = counter+1
                        if counter > 30:
                            break
                        if og_line != "\n":
                            df_tmp.loc[i] = [dataset, shortuuid.uuid()[:8], shortuuid.uuid()[:8], shortuuid.uuid()[:8], og_line, mg_lines[i], True]
    
    if dataset == "ETPC":
        with open(os.path.join(path_to_dataset, "text_pairs.xml"), encoding='utf-8', mode = "r") as file:
            tree = ET.parse(file)
            root = tree.getroot()
            for i, elem in enumerate(tqdm(root)):
                counter = counter+1
                if counter > 30:
                    break
                df_tmp.loc[i] = [dataset, shortuuid.uuid()[:8], elem[1].text, elem[2].text, elem[3].text, elem[4].text, bool(int(elem[8].text))]
    
    if dataset == "SAv2":
        asv2_path = os.path.join(path_to_dataset)      #read train data
        with open(os.path.join(asv2_path, "normal.aligned"), encoding="utf8", mode = "r") as f1:
            with open(os.path.join(asv2_path, "simple.aligned"), encoding="utf8", mode = "r") as f2:
                og_lines = f1.readlines()
                og_lines = [line.rstrip() for line in og_lines]
                og_lines = [l for l in og_lines if l != ""]
                mg_lines = f2.readlines()
                mg_lines = [line.rstrip() for line in mg_lines]
                mg_lines = [l for l in mg_lines if l != ""]

                for i, og_line in enumerate(tqdm(og_lines)):
                    counter = counter+1
                    if counter > 30:
                        break
                    if og_line != "\n":
                        df_tmp.loc[i] = [
                            dataset, 
                            shortuuid.uuid()[:8],
                            og_line.split("\t")[0].translate(str.maketrans('', '', string.punctuation+" ")) + "_" + shortuuid.uuid()[:8], 
                            mg_lines[i].split("\t")[0].translate(str.maketrans('', '', string.punctuation+" ")) + "_" + shortuuid.uuid()[:8], 
                            og_line.split("\t")[2], 
                            mg_lines[i].split("\t")[2], 
                            True
                        ]

    df = pd.concat([df, df_tmp], ignore_index = True)   #concat the lastly processed dataset to the combined dataset

Processing dataset: datasets\MPC


100%|██████████| 4012/4012 [00:02<00:00, 1893.18it/s]
100%|██████████| 1990/1990 [00:00<00:00, 2547.28it/s]


Processing dataset: datasets\ETPC


  1%|          | 30/5801 [00:00<00:08, 681.67it/s]


Processing dataset: datasets\SAv2


  0%|          | 30/167689 [00:00<04:28, 624.87it/s]


In [4]:
df.head(70000)

Unnamed: 0,dataset,pair_id,id_1,id_2,text_1,text_2,is_paraphrase
0,MPC,eSSazcLb,dSYrKRgD,YGtGXWQ8,Uriel Sebree,Uriel Sebree,True
1,MPC,NDrJ2EA6,Y4LXcBGq,5TiNENVT,"Uriel Sebree (February 20, 1848 – August 6, 1922) was a career officer in the United States Navy. He entered the Naval Academy during the Civil War and served until 1910, retiring as a rear admiral. He is best remembered for his two expeditions into the Arctic and for serving as acting governor of American Samoa. He was also commander-in-chief of the Pacific Fleet.","Uriel Sebree (FebruaryÂ 20, 1848Â â AugustÂ 6, 1922) was a lifelong officer in the United States Navy. He entered the Naval Academy amid the Civil War and served until 1910, resigning as a back naval commander. He is best associated with his two endeavors into the Arctic and for filling in as acting legislative head of American Samoa. He was additionally president of the Pacific Fleet.",True
2,MPC,DrrF4xbG,LZBi6n6d,86DRMtE8,"After graduating from the U.S. Naval Academy in 1867, Sebree was posted to a number of vessels before being assigned to a rescue mission to find the remaining crew of the missing ""Polaris"" in the Navy's first mission to the Arctic. This attempt was only a partial success—the ""Polaris"" crew was rescued by a British ship rather than the US Navy—but this led to Sebree's selection eleven years later for a second expedition to the Arctic. That mission to rescue Adolphus Greely and the survivors of the Lady Franklin Bay expedition was a success. Sebree was subsequently appointed as the second acting governor of American Samoa. He served in this position for only a year before returning to the United States. In 1907, he was promoted to rear admiral and given command of the Pathfinder Expedition around the South American coast before being appointed commander of the 2nd Division of the Pacific Fleet and then commander-in-chief of the entire fleet. He retired in 1910 and died in Coronado, California, in 1922. Two geographical features in Alaska—Sebree Peak and Sebree Island—are named for Admiral Sebree.","In the wake of moving on from the U.S. Maritime Academy in 1867, Sebree was presented on various vessels before being appointed to a salvage mission to locate the rest of the team of the missing ""Polaris"" in the Navy's first mission to the Arctic. This endeavor was just a fractional achievementâthe ""Polaris"" group was saved by a British ship as opposed to the US Navyâyet this prompted Sebree's choice eleven years after the fact for a second campaign to the Arctic. That mission to safeguard Adolphus Greely and the overcomers of the Lady Franklin Bay endeavor was a triumph. Sebree was in this manner designated as the second acting legislative leader of American Samoa. He served in this situation for just a year prior coming back to the United States. In 1907, he was elevated to raise naval commander and given direction of the Pathfinder Expedition around the South American coast before being named authority of the second Division of the Pacific Fleet and afterward president of the whole armada. He resigned in 1910 and passed on in Coronado, California, in 1922. Two topographical highlights in AlaskaâSebree Peak and Sebree Islandâare named for Admiral Sebree.",True
3,MPC,T2yTozHi,jLQ2bbwa,GoaRoMkd,"""Super Mario 64"" is a 3D platformer in which the player controls Mario through several courses. Each course is an enclosed world in which the player is free to wander in all directions and discover the environment without time limits. The worlds are filled with enemies that attack Mario, as well as friendly creatures that provide assistance, offer information, or ask a favor (such as peaceful pink Bob-omb Buddies). The player gathers stars in each course; some stars only appear after completing certain tasks, often hinted at by the name of the course. These challenges include defeating a boss, solving puzzles, racing an opponent, and gathering coins. As more stars are collected, more areas of the castle hub world become accessible. The player unlocks doors in the castle with keys obtained by defeating Bowser in special courses. There are many hidden mini-courses and other secrets to the game, most containing extra stars required for the full completion of the game.","""Super Mario 64"" is a 3D platformer in which the player controls Mario through a few courses. Each course is an encased world in which the player is allowed to meander every which way and find nature without time limits. The universes are loaded up with foes that assault Mario, just as amicable animals that give help, offer data, or ask some help, (for example, quiet pink Bob-omb Buddies). The player assembles stars in each course; a few stars just show up subsequent to finishing certain assignments, frequently alluded to by the name of the course. These difficulties incorporate overcoming a manager, fathoming perplexes, dashing a rival, and assembling coins. As more stars are gathered, more zones of the stronghold center point world become open. The player opens entryways in the stronghold with keys gotten by overcoming Bowser in uncommon courses. There are many shrouded small scale courses and different insider facts to the diversion, most containing additional stars required for the full finish of the amusement.",True
4,MPC,nNTdFW3G,eA7wL8wN,Pxud2m2C,"There are three special cap power-ups that appear in certain areas on many stages. The Wing Cap allows Mario to fly; the Metal Cap makes him immune to most damage, allows him to withstand wind, walk underwater, and be unaffected by noxious gases; and the Vanish Cap renders him partially immaterial and allows him to walk through some obstacles such as wire mesh, as well as granting invulnerability to some forms of damage. Some courses contain cannons that Mario can access by speaking to a pink Bob-omb Buddy. After entering a cannon, Mario can be shot out to reach distant places. When the player has the Wing Cap equipped, cannons can be used to reach high altitudes or fly across most levels quickly.","There are three exceptional top catalysts that show up in specific zones on numerous stages. The Wing Cap permits Mario to fly; the Metal Cap makes him insusceptible to most harm, enables him to withstand wind, walk submerged, and be unaffected by poisonous gases; and the Vanish Cap renders him halfway unimportant and enables him to stroll through certain impediments, for example, wire work, just as allowing resistance to certain types of harm. A few courses contain guns that Mario can access by addressing a pink Bob-omb Buddy. Subsequent to entering a gun, Mario can be shot out to achieve removed spots. At the point when the player has the Wing Cap prepared, guns can be utilized to achieve high heights or fly crosswise over most dimensions rapidly.",True
...,...,...,...,...,...,...,...
82,SAv2,26o9V3DD,PuymÃ©ras_hFDZagb7,PuymÃ©ras_E3PuTvZR,PuymÃ ras is a commune in the Vaucluse department in the Provence-Alpes-C Ã te d'Azur region in southeastern France .,"PuymÃ ras is a commune of 610 people -LRB- 1999 -RRB- . It is in the Provence-Alpes-C Ã te d'Azur region , in the Vaucluse department in the south of France .",True
83,SAv2,4oSQ4mEe,Gastines_gsGVbKMw,Gastines_HeWm35BD,Gastines is a commune in the Mayenne department in north-western France .,Gastines is a commune of 162 people -LRB- 1999 -RRB- . It is found in the region Pays de la Loire in the Mayenne department in the northwest of France .,True
84,SAv2,Xy4CGBnu,Haitinationalfootballteam_EUgLPHHv,Haitinationalfootballteam_FM76zLim,"The Haiti national football team represents Haiti in association football and is controlled by the FÃ dÃ ration HaÃ tienne de Football , the governing body for football in Haiti .",Haiti national football team is the national football team of Haiti .,True
85,SAv2,FLQ4gUUe,ChenabRiver_Rsk8bkEz,ChenabRiver_XwYzNbYQ,"The Chenab then joins the Indus at Mithankot , Pakistan .",The Chenab then joins the Indus at Mithankot .,True


In [5]:
#Output data to json format
df.to_json(os.path.join(OUT_DIR, "true_data.json"), orient = "index", index = True, indent = 4)
