In [0]:
FOLDER = '../data/dataset'
INPUT_PT = '../data/input/pt'
INPUT_EN = '../data/input/en'
DICT_PATH = '../translate'

Extract dataset

In [65]:
!unzip tweetSentBR_extracted.zip

Archive:  tweetSentBR_extracted.zip
   creating: tweetSentBR_extracted/
  inflating: tweetSentBR_extracted/testTT.neg  
  inflating: tweetSentBR_extracted/testTT.neu  
  inflating: tweetSentBR_extracted/testTT.pos  
  inflating: tweetSentBR_extracted/trainTT.neg  
  inflating: tweetSentBR_extracted/trainTT.neu  
  inflating: tweetSentBR_extracted/trainTT.pos  
  inflating: tweetSentBR_extracted/tweets.none  
  inflating: tweetSentBR_extracted/tweets.neg  
  inflating: tweetSentBR_extracted/tweets.neu  
  inflating: tweetSentBR_extracted/tweets.pos  


Separate files in tab

In [66]:
import pandas as pd
import os
import re

folder = 'tweetSentBR_extracted'
corpus = [f for f in os.listdir(folder) if 'TT' in f and not 'tab' in f]

for split in corpus:
    print(split)
    with open(f'{folder}/{split}') as f:
        text = f.read()

    assert '\t' not in text 
    
    outtext = re.sub(r'(.+?) (.+)',r'\1\t\2', text)
    outfile = re.sub(r'(.+)\.(.+)', r'\1_tab.\2',split)
    with open(f'{folder}/{outfile}', 'w') as f:
        f.write(outtext)
        print(f'{folder}/{outfile}')

testTT.pos
tweetSentBR_extracted/testTT_tab.pos
trainTT.pos
tweetSentBR_extracted/trainTT_tab.pos
trainTT.neu
tweetSentBR_extracted/trainTT_tab.neu
trainTT.neg
tweetSentBR_extracted/trainTT_tab.neg
testTT.neg
tweetSentBR_extracted/testTT_tab.neg
testTT.neu
tweetSentBR_extracted/testTT_tab.neu


Generate tsv in Portuguese

In [67]:
import pandas as pd

splits = {
    'train' : [f for f in os.listdir(folder) if 'train' in f and 'tab' in f],
    'test'  : [f for f in os.listdir(folder) if 'test' in f and 'tab' in f],
}

header = ['id', 'premise']

abbr = {
    'neg': 'Negative', 
    'neu': 'Neutral',
    'pos': 'Positive',
} 

output_header = ['id', 'label', 'premise']

for name, files in splits.items():

    output = pd.DataFrame()
    for f in files:
        filepath = f'{folder}/{f}'
        table = pd.read_csv(filepath, index_col = None, sep = '\t', names = header)

        posfix = f.split('.')[1]
        label = abbr[posfix]
        table['label'] = label

        output = table if output.empty else output.append(table)

    output = output.reindex(columns = output_header)
    output = output.sample(frac = 1)
    
    outpath = f'{INPUT_PT}/tweetsent_{name}.tsv'
    output.to_csv(outpath, index = None, header = None, sep = '\t')

    print(outpath)
    print('{} tweets\n'.format(output.shape[0]))

../data/input/pt/tweetsent_train.tsv
10980 tweets

../data/input/pt/tweetsent_test.tsv
2010 tweets



Generate tsv in English

In [68]:
import json

files = [f for f in os.listdir(INPUT_PT) if 'tweetsent' in f]

with open(f'{DICT_PATH}/tweetsent-dic.json') as f:
    translation = json.load(f)

for file in files:
    filepath = f'{INPUT_PT}/{file}'
    table = pd.read_csv(filepath, names = output_header, header = None, sep = '\t')
    table['premise'] = table['premise'].map(translation)
    
    outpath = filepath.replace(INPUT_PT, INPUT_EN)
    table.to_csv(outpath, index = None, sep = '\t', header = None)
    
    print(outpath)

../data/input/en/tweetsent_test.tsv
../data/input/en/tweetsent_train.tsv


Remove HTML marks from translation

In [0]:
!pip install ftfy
%cp  fix_html.sh {INPUT_EN}/
%cd $INPUT_EN
!bash fix_html.sh
%rm -f fix_html.sh

/content/mt-dnn_port/data/input/en
./assin2-rte_train.tsv
./assin-ptbr-rte_train.tsv
./assin-ptpt-sts_test.tsv
./assin-ptpt-rte_dev.tsv
./assin2-rte_dev.tsv
./assin-ptbr-rte_dev.tsv
./assin2-sts_dev.tsv
./assin2-sts_test.tsv
./tweetsent_test.tsv
./assin-ptpt-rte_train.tsv
./assin-ptpt-sts_dev.tsv
./assin2-rte_test.tsv
./assin-ptbr-sts_test.tsv
./assin-ptbr-rte_test.tsv
./assin-ptbr-sts_dev.tsv
./assin-ptpt-sts_train.tsv
