In [1]:
import os
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import fuzz
import shortuuid
import xml.etree.ElementTree as ET
import re
import sys
import string
from setup import *

In [2]:
pd.set_option("display.max_colwidth", None)

# For Debugging:
#DATASETS = ["MPC", "SAv2"]

In [8]:
#Bring datasets to the same format (standardized)

df = pd.DataFrame(columns= [DATASET, PAIR_ID, ID1, ID2, TEXT1, TEXT2, PARAPHRASE] )

for dataset in DATASETS:
    path_to_dataset = os.path.join(DATASETS_FOLDER, dataset)
    print("Processing dataset: " + str(path_to_dataset))

    counter = 0

    df_tmp = pd.DataFrame(columns= [DATASET, PAIR_ID, ID1, ID2, TEXT1, TEXT2, PARAPHRASE] )

    if dataset == "MPC":
        dmop_path = os.path.join(path_to_dataset, "wikipedia_documents_train", "machined")      #read train data
        for file in tqdm(os.listdir(os.path.join(dmop_path, "og"))):
            with open(os.path.join(dmop_path, "og", file), encoding="utf8", mode = "r") as f1:
                with open(os.path.join(dmop_path, "mg", str(file.split("-")[0])+"-SPUN.txt"), encoding="utf8", mode = "r") as f2:
                    og_lines = f1.readlines()
                    og_lines = [line.rstrip() for line in og_lines]
                    og_lines = [l for l in og_lines if l != ""]
                    mg_lines = f2.readlines()
                    mg_lines = [line.rstrip() for line in mg_lines]
                    mg_lines = [l for l in mg_lines if l != ""]

                    if len(og_lines) != len(mg_lines):
                        print("ERROR")

                    for i, og_line in enumerate(og_lines):
                        #counter = counter+1
                        #if counter > 30:
                        #    break

                        if og_line != "\n":
                            df_tmp.loc[i] = [dataset, shortuuid.uuid()[:8], shortuuid.uuid()[:8], shortuuid.uuid()[:8], og_line, mg_lines[i], True]
        
        df = pd.concat([df, df_tmp], ignore_index = True)
        df_tmp = pd.DataFrame(columns= [DATASET, PAIR_ID, ID1, ID2, TEXT1, TEXT2, PARAPHRASE] )

        dmop_path = os.path.join(path_to_dataset, "wikipedia_documents_test", "machined")
        for file in tqdm(os.listdir(os.path.join(dmop_path, "og"))):        #read test data (combine as there is no ML process involved)
            with open(os.path.join(dmop_path, "og", file), encoding="utf8", mode = "r") as f1:
                with open(os.path.join(dmop_path, "mg", str(file.split("-")[0])+"-SPUN.txt"), encoding="utf8", mode = "r") as f2:
                    og_lines = f1.readlines()
                    og_lines = [line.rstrip() for line in og_lines]
                    og_lines = [l for l in og_lines if l != ""]
                    mg_lines = f2.readlines()
                    mg_lines = [line.rstrip() for line in mg_lines]
                    mg_lines = [l for l in mg_lines if l != ""]

                    for i, og_line in enumerate(og_lines):
                        #counter = counter+1
                        #if counter > 30:
                        #    break
                        if og_line != "\n":
                            df_tmp.loc[i] = [dataset, shortuuid.uuid()[:8], shortuuid.uuid()[:8], shortuuid.uuid()[:8], og_line, mg_lines[i], True]
    
    if dataset == "ETPC":
        with open(os.path.join(path_to_dataset, "text_pairs.xml"), encoding='utf-8', mode = "r") as file:
            tree = ET.parse(file)
            root = tree.getroot()
            for i, elem in enumerate(tqdm(root)):
                #counter = counter+1
                #if counter > 30:
                #    break
                df_tmp.loc[i] = [dataset, shortuuid.uuid()[:8], elem[1].text, elem[2].text, elem[3].text, elem[4].text, bool(int(elem[8].text))]
    
    if dataset == "SAv2":
        asv2_path = os.path.join(path_to_dataset)      #read train data
        with open(os.path.join(asv2_path, "normal.aligned"), encoding="utf8", mode = "r") as f1:
            with open(os.path.join(asv2_path, "simple.aligned"), encoding="utf8", mode = "r") as f2:
                og_lines = f1.readlines()
                og_lines = [line.rstrip() for line in og_lines]
                og_lines = [l for l in og_lines if l != ""]
                mg_lines = f2.readlines()
                mg_lines = [line.rstrip() for line in mg_lines]
                mg_lines = [l for l in mg_lines if l != ""]

                for i, og_line in enumerate(tqdm(og_lines)):
                    #counter = counter+1
                    #if counter > 30:
                    #    break
                    if og_line != "\n":
                        df_tmp.loc[i] = [
                            dataset, 
                            shortuuid.uuid()[:8],
                            og_line.split("\t")[0].translate(str.maketrans('', '', string.punctuation+" ")) + "_" + shortuuid.uuid()[:8], 
                            mg_lines[i].split("\t")[0].translate(str.maketrans('', '', string.punctuation+" ")) + "_" + shortuuid.uuid()[:8], 
                            og_line.split("\t")[2], 
                            mg_lines[i].split("\t")[2], 
                            True
                        ]

    if dataset == "TURL":
        turl_path = os.path.join(path_to_dataset)      #read train data
        with open(os.path.join(turl_path, "Twitter_URL_Corpus_test.txt"), encoding="utf8", mode = "r") as f1:
            with open(os.path.join(turl_path, "Twitter_URL_Corpus_train.txt"), encoding="utf8", mode = "r") as f2:
                test_lines = f1.readlines()
                test_lines = [line.rstrip() for line in test_lines]
                test_lines = [l for l in test_lines if l != ""]
                train_lines = f2.readlines()
                train_lines = [line.rstrip() for line in train_lines]
                train_lines = [l for l in train_lines if l != ""]
                lines = test_lines + train_lines

                for i, line in enumerate(tqdm(lines)):
                    #counter = counter+1
                    #if counter > 30:
                    #    break
                    if line != "\n":
                        # based on the datasets paper, we value a phrase as paraphrase when >=4 out of 6 amazon workers marked it a such
                        is_paraphrase = int(line.split("\t")[2][1]) >= 4
                        print(line)
                        print(is_paraphrase)

                        df_tmp.loc[i] = [
                            dataset, 
                            shortuuid.uuid()[:8],
                            shortuuid.uuid()[:8], 
                            shortuuid.uuid()[:8], 
                            line.split("\t")[0], 
                            line.split("\t")[1], 
                            is_paraphrase
                        ]

    df = pd.concat([df, df_tmp], ignore_index = True)   #concat the lastly processed dataset to the combined dataset

Processing dataset: datasets\MPC
Processing dataset: datasets\ETPC
Processing dataset: datasets\SAv2
Processing dataset: datasets\TURL


  0%|          | 30/56787 [00:00<02:01, 468.65it/s]

How an unverified but explosive dossier became a crisis for Donald Trump 	How a Sensational , Unverified Dossier Became a Crisis for Donald Trump 	(4,6)	https://t.co/l1zXF9Xd05
True
How an unverified but explosive dossier became a crisis for Donald Trump 	Dossier Donald Trump don't forget , DNC hacked to get this same oppo research on Trump . 	(0,6)	https://t.co/l1zXF9Xd05
False
How an unverified but explosive dossier became a crisis for Donald Trump 	a wealthy Republican donor who strongly opposed Mr. Trump put up the money to build 	(0,6)	https://t.co/l1zXF9Xd05
False
How an unverified but explosive dossier became a crisis for Donald Trump 	How an Unverified Dossier Became a Crisis for @realDonaldTrump 	(6,6)	https://t.co/l1zXF9Xd05
True
How an unverified but explosive dossier became a crisis for Donald Trump 	Outstanding piece puts Russian rumors re Trump into context . Read this @ScottShaneNYT @nickconfessore @AllMattNYT 	(0,6)	https://t.co/l1zXF9Xd05
False
How an unverified but ex




In [9]:
df.head(70000)

Unnamed: 0,dataset,pair_id,id_1,id_2,text_1,text_2,is_paraphrase
0,TURL,gagMsbQY,HgNNak2t,HzFPwTWw,How an unverified but explosive dossier became a crisis for Donald Trump,"How a Sensational , Unverified Dossier Became a Crisis for Donald Trump",True
1,TURL,HJLfyVqo,cQtFdD6J,853NTfEa,How an unverified but explosive dossier became a crisis for Donald Trump,"Dossier Donald Trump don't forget , DNC hacked to get this same oppo research on Trump .",False
2,TURL,6kGn5Ac2,BjF8guzS,RCEA5cPQ,How an unverified but explosive dossier became a crisis for Donald Trump,a wealthy Republican donor who strongly opposed Mr. Trump put up the money to build,False
3,TURL,ann2vx74,FtkZNhCM,UCfJ49eY,How an unverified but explosive dossier became a crisis for Donald Trump,How an Unverified Dossier Became a Crisis for @realDonaldTrump,True
4,TURL,cmmbEtyH,8QBcUD2e,ieUThyTe,How an unverified but explosive dossier became a crisis for Donald Trump,Outstanding piece puts Russian rumors re Trump into context . Read this @ScottShaneNYT @nickconfessore @AllMattNYT,False
5,TURL,MKjzxWSg,3dCV94ZN,NyuqK6pm,How an unverified but explosive dossier became a crisis for Donald Trump,The intelligence credentials of the man behind the Trump dossier & gt,False
6,TURL,VjL75dHv,3kJDafV6,hBr5ZLGu,How an unverified but explosive dossier became a crisis for Donald Trump,Not quite as ridiculous as #realDonaldTrump says . But then again he lies all the time .,False
7,TURL,Exc6XF5J,eC4XPYXj,CCnw8gXR,How an unverified but explosive dossier became a crisis for Donald Trump,"How a Sensational , Unverified Dossier Became a Crisis for @realDonaldTrump",False
8,TURL,Mtp4Rox2,CidFxKmD,DbCfShXU,How an unverified but explosive dossier became a crisis for Donald Trump,A wealthy anti-Trump Republican donor . A former British spy . The tale behind an unverified but explosive dossier htt,False
9,TURL,T4LbDMxd,dVhx86GK,bk9zDFGD,How an unverified but explosive dossier became a crisis for Donald Trump,"The consequences of the dossier, put together by a former British spy named Christopher Steele, are incalculable and will play out long past Inauguration Day.",False


In [10]:
#Output data to json format
df.to_json(os.path.join(OUT_DIR, "true_data.json"), orient = "index", index = True, indent = 4)
