In [0]:
# All notebook paths hang off a single data root, relative to the notebook.
_DATA_ROOT = '../data'

# Directory the downloaded ASSIN / ASSIN 2 XML corpora are moved into.
FOLDER = f'{_DATA_ROOT}/dataset'
# Directory that receives the Portuguese TSV files.
INPUT_PT = f'{_DATA_ROOT}/input/pt'
# Directory that receives the English (translated) TSV files.
INPUT_EN = f'{_DATA_ROOT}/input/en'

Get datasets and requirements

In [0]:
# Download the three ASSIN 2 splits and the ASSIN 1 archive.
# NOTE(review): --no-check-certificate disables TLS certificate verification;
# kept as in the original, but confirm whether it is still required.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Q9j1a83CuKzsHCGaNulSkNxBm7Dkn7Ln' -O assin2-train-only.xml
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1J3FpQaHxpM-FDfBUyooh-sZF-B-bM_lU' -O assin2-test.xml
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1kb7xq6Mb3eaqe9cOAo70BaG9ypwkIqEU' -O assin2-dev.xml
# -O pins the output name so a leftover archive from an interrupted run is
# overwritten instead of the new download being saved as assin.tar.gz.1.
!wget http://nilc.icmc.usp.br/assin/assin.tar.gz -O assin.tar.gz

!tar -xzf assin.tar.gz
%rm -f assin.tar.gz

# %pip installs into the environment of the running kernel.
%pip install xmltodict

# -p: create missing parents and do not fail when the folder already exists,
# so the cell can be re-run.
%mkdir -p $FOLDER
%mv *.xml $FOLDER

Import xml files

In [0]:
import pandas as pd
import xmltodict
import os
import re

# Every XML corpus found in FOLDER. Match on the real extension (not on the
# substring 'xml') and sort, because os.listdir returns entries in arbitrary
# order.
files_xml = sorted(f for f in os.listdir(FOLDER) if f.endswith('.xml'))
names = list()
xmls = list()

for filename in files_xml:
    # Read raw bytes so the XML parser applies the encoding declared in the
    # document rather than the platform's default text encoding.
    with open(os.path.join(FOLDER, filename), 'rb') as f:
        xml = xmltodict.parse(f.read())
    # Corpus name is the file name without its '.xml' extension.
    name = os.path.splitext(filename)[0]
    xmls.append(xml)
    names.append(name)

# Corpus name -> parsed XML document.
xml_names = dict(zip(names, xmls))

Generate tsv in Portuguese

In [0]:
# Column labels of the in-memory frames; the TSV files themselves are written
# without a header row.
header = ['id', 'label','premise', 'hipothesis']
output_names = []
output_files = []
for name in names:
    place = xml_names[name]['entailment-corpus']['pair']

    # Drop the '-only' marker (e.g. 'assin2-train-only' -> 'assin2-train');
    # a no-op for names that do not contain it.
    name = name.replace('-only', '')

    # '<corpus>-<split>' -> '<corpus>-rte_<split>' and '<corpus>-sts_<split>'.
    output_names.append(re.sub(r'(.+)-(.+)', r'\1-rte_\2', name))
    output_names.append(re.sub(r'(.+)-(.+)', r'\1-sts_\2', name))

    # Same sentence pairs twice: once labelled with the entailment attribute,
    # once with the similarity attribute.
    rte = [(item['@id'], item['@entailment'], item['t'], item['h']) for item in place]
    sts = [(item['@id'], item['@similarity'], item['t'], item['h']) for item in place]

    rte_df = pd.DataFrame(rte, columns=header)
    sts_df = pd.DataFrame(sts, columns=header)

    output_files.append(rte_df)
    output_files.append(sts_df)

# to_csv does not create directories; make sure the target exists.
os.makedirs(INPUT_PT, exist_ok=True)

for idx, output in enumerate(output_files):
    output_name = f'{INPUT_PT}/{output_names[idx]}.tsv'
    output.to_csv(output_name, sep='\t', index=False, header=False)

Get Portuguese inputs dictionary and translation dictionary

In [27]:
# Output name -> DataFrame, pairing the two parallel lists built earlier.
table_dict = dict(zip(output_names, output_files))
!wget https://raw.githubusercontent.com/ruanchaves/assin/master/sources/dictionary.json -O assin-dic.json

import json

# Decode explicitly as UTF-8 instead of relying on the locale's default
# encoding.
# NOTE(review): presumably maps Portuguese sentences to English ones - confirm.
with open('assin-dic.json', encoding='utf-8') as json_file:
    translation = json.load(json_file)

--2020-04-17 16:09:21--  https://raw.githubusercontent.com/ruanchaves/assin/master/sources/dictionary.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4322760 (4.1M) [text/plain]
Saving to: ‘assin-dic.json’


2020-04-17 16:09:21 (40.8 MB/s) - ‘assin-dic.json’ saved [4322760/4322760]



Map Portuguese to English and Save

In [0]:
# to_csv does not create directories; make sure the target exists.
os.makedirs(INPUT_EN, exist_ok=True)

for key, frame in table_dict.items():
    for col in ['premise', 'hipothesis']:
        # Series.map with a dict: values missing from `translation` become NaN.
        frame[col] = frame[col].map(translation)

    # Name each file after its own key. Indexing output_names with a leftover
    # loop index from another cell would send every frame to the same path.
    output_name = f'{INPUT_EN}/{key}.tsv'
    frame.to_csv(output_name, sep='\t', index=False, header=False)


Remove HTML marks from translation

In [36]:
# %pip installs into the environment of the running kernel.
%pip install ftfy
%cp ftfy_assin.sh {INPUT_EN}/ftfy_assin.sh
# %cd changes the kernel's working directory; it stays at INPUT_EN afterwards.
%cd $INPUT_EN
!bash ftfy_assin.sh
# Remove the copy made above (now in the current directory), not a file at
# the filesystem root.
%rm -f ftfy_assin.sh

/content/mt-dnn_port/mt-dnn_port/data/input/en
./assin2-rte_train.tsv
./assin-ptbr-rte_train.tsv
./assin-ptpt-sts_test.tsv
./assin-ptpt-rte_dev.tsv
./assin2-rte_dev.tsv
./assin-ptbr-rte_dev.tsv
./assin2-sts_dev.tsv
./assin2-sts_test.tsv
./assin-ptpt-rte_train.tsv
./assin-ptpt-sts_dev.tsv
./assin2-rte_test.tsv
./assin-ptbr-sts_test.tsv
./assin-ptbr-rte_test.tsv
./assin-ptbr-sts_dev.tsv
./assin-ptpt-sts_train.tsv
./assin-ptbr-sts_train.tsv
./assin-ptpt-rte_test.tsv
./assin2-sts_train.tsv


Remove the incorrect quote in *assin-ptbr-train*, line 1711

In [0]:
# File names are relative to the current working directory.
for filepath in ['assin-ptbr-rte_train.tsv', 'assin-ptbr-sts_train.tsv']:
    # Explicit UTF-8 on both read and write so the rewrite does not depend on
    # the locale's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        corpus = f.read()

    # Drop the stray opening double quote in front of "As long as".
    corpus = corpus.replace('"As long as', 'As long as')

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(corpus)