Prepare input

In [1]:
# Install the notebook's dependencies into the runtime. No versions are pinned,
# and `--upgrade` moves transformers to the newest release available at run time.
!pip install transformers --upgrade
!pip install mosestokenizer
!pip install sentence-transformers

# Translate English sentences to Brazilian Portuguese and save the mapping dictionary as JSON.
# Based on https://github.com/ruanchaves/assin/blob/master/sources/translate.py

import json
import math
import os

from tqdm.notebook import tqdm
from transformers import MarianMTModel, MarianTokenizer

# One pretrained Marian checkpoint name is shared by the model and its tokenizer;
# both objects are module-level globals used by the helper functions below.
model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

def decode(text):
    """Decode one item of the model's generated output into a string.

    The value is handed to the module-level ``tokenizer``; special tokens
    are stripped from the result.
    """
    return tokenizer.decode(text, skip_special_tokens=True)

def needs_translation(sample, translations):
    """Return the entries of ``sample`` that are not yet keys of ``translations``.

    The original order of ``sample`` is kept, and repeated entries are
    returned as many times as they occur.
    """
    pending = []
    for text in sample:
        if text in translations:
            continue
        pending.append(text)

    return pending

def translation(sample):
    """Translate a batch of sentences with the module-level Marian model.

    Each sentence is prefixed with ``>>pt_BR<<`` before tokenization, the
    batch is run through ``model.generate`` and every generated sequence is
    decoded back to text with ``decode``.

    Args:
        sample: list of source sentences (strings).

    Returns:
        A list of translated strings, in the same order as ``sample``.
        An empty ``sample`` yields an empty list without touching the model.
    """
    # Nothing to do for an empty batch; avoids handing an empty batch to the model.
    if not sample:
        return []

    src_txts = [f'>>pt_BR<< {text}' for text in sample]

    if hasattr(tokenizer, 'prepare_translation_batch'):
        # Older transformers releases: keep the original code path untouched.
        translation_sample = tokenizer.prepare_translation_batch(src_txts)
    else:
        # Newer transformers releases no longer ship prepare_translation_batch;
        # calling the tokenizer directly builds the padded tensor batch instead.
        translation_sample = tokenizer(
            src_txts,
            return_tensors='pt',
            padding=True,
            truncation=True,
        )

    generated = model.generate(**translation_sample)

    return [decode(sequence) for sequence in generated]


def translate2dict(sentences, dictpath, batch_size):
    """Translate ``sentences`` in batches, checkpointing results to a JSON file.

    The JSON file at ``dictpath`` maps each source sentence to its
    translation. Sentences already present in that file are skipped, so an
    interrupted run can be resumed by calling the function again.

    Args:
        sentences: iterable of source sentences.
        dictpath: path of the JSON checkpoint; created as ``{}`` if missing.
        batch_size: number of sentences sent to ``translation`` per call.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # Validate first: 0 would divide by zero below, and a negative value
    # would silently translate nothing.
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    if not os.path.isfile(dictpath):
        with open(dictpath, 'w') as f:
            json.dump({}, f)

    with open(dictpath) as f:
        translations = json.load(f)

    # Drop repeated sentences (order preserved) so none is translated twice.
    remaining = list(dict.fromkeys(needs_translation(sentences, translations)))
    batch = math.ceil(len(remaining) / batch_size)

    # Checkpoints go to a sibling temp file that is then moved over the real
    # one, so an interruption mid-write cannot corrupt earlier progress.
    tmp_path = f'{dictpath}.tmp'

    for idx in tqdm(range(batch), "Translating"):
        keys = remaining[idx * batch_size:(idx + 1) * batch_size]
        values = translation(keys)

        translations.update(zip(keys, values))

        with open(tmp_path, 'w') as f:
            json.dump(translations, f)
        os.replace(tmp_path, dictpath)

from pprint import pprint
import pandas as pd
import os

# Collect every distinct sentence from both sentence columns of each split.
sentences = list()
label_idx = [
             'sentence1',
             'sentence2',
            ]
length = 0

TASK = 'MNLI'

splits = [
 f'{TASK}/train.tsv', 
 f'{TASK}/test.tsv', 
 f'{TASK}/dev.tsv'
]

for f in splits:
    # quoting=3 is csv.QUOTE_NONE: quote characters are treated as plain text.
    try:
        # pandas >= 1.3: malformed rows are skipped with a warning.
        table = pd.read_csv(f, sep='\t', quoting=3, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know `on_bad_lines`; use the legacy flag.
        table = pd.read_csv(f, sep='\t', quoting=3, error_bad_lines=False)
    print(f)

    for idx in label_idx:
        # Drop missing cells so NaN is never sent to the translator as text.
        label = table[idx].dropna()
        print(label)
        print('\n'*2)

        sentences.extend(list(label))
        length = length + label.size


# Sanity check: every collected value was counted exactly once.
assert length == len(sentences)
# De-duplicate; later cells consume `sentences` as a set.
sentences = set(sentences)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)
[K     |████████████████████████████████| 665kB 8.6MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-ma

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=779155.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=799001.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1457360.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=265.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1113.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=312087009.0, style=ProgressStyle(descri…




b'Skipping line 13: expected 11 fields, saw 15\nSkipping line 18: expected 11 fields, saw 15\nSkipping line 28: expected 11 fields, saw 15\nSkipping line 31: expected 11 fields, saw 15\nSkipping line 36: expected 11 fields, saw 15\nSkipping line 40: expected 11 fields, saw 15\nSkipping line 45: expected 11 fields, saw 15\nSkipping line 89: expected 11 fields, saw 15\nSkipping line 111: expected 11 fields, saw 15\nSkipping line 134: expected 11 fields, saw 15\nSkipping line 157: expected 11 fields, saw 14\nSkipping line 327: expected 11 fields, saw 15\nSkipping line 344: expected 11 fields, saw 15\nSkipping line 352: expected 11 fields, saw 15\nSkipping line 356: expected 11 fields, saw 15\nSkipping line 360: expected 11 fields, saw 15\nSkipping line 375: expected 11 fields, saw 15\nSkipping line 389: expected 11 fields, saw 15\nSkipping line 400: expected 11 fields, saw 14\nSkipping line 411: expected 11 fields, saw 15\nSkipping line 414: expected 11 fields, saw 15\nSkipping line 420: 

SNLI/train.tsv
0         A person on a horse jumps over a broken down a...
1         A person on a horse jumps over a broken down a...
2         A person on a horse jumps over a broken down a...
3                     Children smiling and waving at camera
4                     Children smiling and waving at camera
                                ...                        
510706                  Four dirty and barefooted children.
510707                  Four dirty and barefooted children.
510708    A man is surfing in a bodysuit in beautiful bl...
510709    A man is surfing in a bodysuit in beautiful bl...
510710    A man is surfing in a bodysuit in beautiful bl...
Name: sentence1, Length: 510711, dtype: object



0         A person is training his horse for a competition.
1             A person is at a diner, ordering an omelette.
2                         A person is outdoors, on a horse.
3                         They are smiling at their parents
4                                Th

Translate

In [2]:
# Translate all collected sentences in batches of 20, checkpointing to snli.json.
translate2dict(
    sentences=list(sentences),
    dictpath='snli.json',
    batch_size=20,
)

HBox(children=(FloatProgress(value=0.0, description='Translating', max=438.0, style=ProgressStyle(description_…




Replace with translation

In [8]:
from os import makedirs
import json

# Load the sentence -> translation mapping produced by the translation step.
with open('snli.json') as reader:
    dictionary = json.load(reader)

translation_folder = f'translation/{TASK}'
# exist_ok lets this cell be re-run without failing on the existing folder.
makedirs(translation_folder, exist_ok=True)
for f in splits:
    # quoting=3 is csv.QUOTE_NONE: quote characters are treated as plain text.
    try:
        # pandas >= 1.3: malformed rows are skipped with a warning.
        table = pd.read_csv(f, sep='\t', quoting=3, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know `on_bad_lines`; use the legacy flag.
        table = pd.read_csv(f, sep='\t', quoting=3, error_bad_lines=False)
    for idx in label_idx:
        # Series.map with a dict: sentences absent from the mapping become NaN.
        table[idx] = table[idx].map(dictionary)
    # Each split path already starts with TASK, so this lands in translation_folder.
    table.to_csv(f'translation/{f}', index = None, sep = '\t')

b'Skipping line 13: expected 11 fields, saw 15\nSkipping line 18: expected 11 fields, saw 15\nSkipping line 28: expected 11 fields, saw 15\nSkipping line 31: expected 11 fields, saw 15\nSkipping line 36: expected 11 fields, saw 15\nSkipping line 40: expected 11 fields, saw 15\nSkipping line 45: expected 11 fields, saw 15\nSkipping line 89: expected 11 fields, saw 15\nSkipping line 111: expected 11 fields, saw 15\nSkipping line 134: expected 11 fields, saw 15\nSkipping line 157: expected 11 fields, saw 14\nSkipping line 327: expected 11 fields, saw 15\nSkipping line 344: expected 11 fields, saw 15\nSkipping line 352: expected 11 fields, saw 15\nSkipping line 356: expected 11 fields, saw 15\nSkipping line 360: expected 11 fields, saw 15\nSkipping line 375: expected 11 fields, saw 15\nSkipping line 389: expected 11 fields, saw 15\nSkipping line 400: expected 11 fields, saw 14\nSkipping line 411: expected 11 fields, saw 15\nSkipping line 414: expected 11 fields, saw 15\nSkipping line 420: 