In [3]:
!pip install sentencepiece transformers==4.33 datasets sacremoses sacrebleu -q

In [4]:
import pandas as pd
# Load the parallel corpus; the preview below shows the `awa`, `spa` and
# `split` columns.
dataset = pd.read_csv('output_3.csv')
print(dataset.head())

                                                 awa  \
0  Ju ainawai Jisukristu muunji aajakajua nuna da...   
1  Abragka uchijiyai Isaac, Isaaka uchijiyai Jaco...   
2  Juda Tamarai akiauwai Faresan, Zarajai. Faresa...   
3  Arama uchijiyai Aminadab, Aminadapa uchijiyai ...   
4  Salmon Rahapa juki akiauwai Boozan, Booz Rutan...   

                                                 spa split  
0  El libro de la genealogia de Jesucristo, hijo ...   dev  
1  Abraham se convirtio en padre de Isaac. Isaac ...   dev  
2  Juda engendro a Perez y a Zera con Tamar. Pere...   dev  
3  Ram engendro a Aminadab. Aminadab engendro a N...   dev  
4  Salmon engendro a Booz con Rahab. Booz engendr...   dev  


In [5]:
# One independent copy per value of the `split` column, unpacked in
# train / dev / test order.
df_train, df_dev, df_test = (
    dataset[dataset.split == split_name].copy()
    for split_name in ('train', 'dev', 'test')
)

In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Base checkpoint that the rest of the notebook fine-tunes.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Smoke test: translate one Spanish sentence into English with the
# untouched base model before any fine-tuning happens.
tokenizer.src_lang = "spa_Latn"
inputs = tokenizer(text="Hola a todos mis amigos", return_tensors="pt")
translated_tokens = model.generate(
    **inputs,
    # Resolve the target-language tag the same way translate() does further
    # down, instead of going through the legacy `lang_code_to_id` mapping.
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
)
print(tokenizer.decode(translated_tokens[0], skip_special_tokens=True))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Hello to all my friends


In [7]:
import re

def word_tokenize(text):
    """Split `text` into word-like tokens and standalone punctuation marks.

    A token is either a maximal run of word characters (``\\w+``) or a single
    character that is neither a word character nor whitespace. Whitespace is
    discarded.

    Returns:
        list[str]: tokens in their order of appearance; empty for empty input.
    """
    # Raw string: `\w` / `\s` are regex escapes, not Python string escapes.
    return re.findall(r'(\w+|[^\w\s])', text)

# Fixed-seed sample of training rows, tokenized two ways per language:
# with the model tokenizer (`*_toks`) and with the regex splitter (`*_words`).
smpl = df_train.sample(10000, random_state=1).assign(
    spa_toks=lambda frame: frame.spa.apply(tokenizer.tokenize),
    awa_toks=lambda frame: frame.awa.apply(tokenizer.tokenize),
    spa_words=lambda frame: frame.spa.apply(word_tokenize),
    awa_words=lambda frame: frame.awa.apply(word_tokenize),
)

In [8]:
# Replace every token list by its length, then summarise the four length
# columns over the sampled rows.
stats = smpl[
    ['spa_toks', 'awa_toks', 'spa_words', 'awa_words']
].applymap(len).describe()
# Mean number of tokenizer tokens per regex "word" for each language; the
# values in the trailing comments are the ones printed in the output below.
print(stats.spa_toks['mean'] / stats.spa_words['mean'])  # 1.2261
print(stats.awa_toks['mean'] / stats.awa_words['mean'])  # 2.1671
stats

1.22606332069982
2.1671490692602093


Unnamed: 0,spa_toks,awa_toks,spa_words,awa_words
count,10000.0,10000.0,10000.0,10000.0
mean,28.8234,47.3949,23.5089,21.8697
std,14.73091,26.210331,12.162069,12.154435
min,1.0,1.0,1.0,1.0
25%,19.0,30.0,15.0,14.0
50%,27.0,45.0,22.0,21.0
75%,38.0,63.0,31.0,29.0
max,189.0,225.0,152.0,107.0


In [9]:
from tqdm.auto import tqdm, trange
import random
def _has_unk(text):
    """Tell whether tokenizing the raw `text` yields the unknown-token id."""
    return tokenizer.unk_token_id in tokenizer(text).input_ids

# Awajun sentences (from every split) that the tokenizer cannot fully encode.
texts_with_unk = list(filter(_has_unk, tqdm(dataset.awa)))
print(len(texts_with_unk))
# Print a handful of offenders for manual inspection.
s = random.sample(texts_with_unk, 5)
print(s)

  0%|          | 0/24215 [00:00<?, ?it/s]

8268
['“¡Amina uchijum jiikta, maami! ¡Wagki Baal dakumka najanamuwe nuna, tuja tikich nuna yantamen numi Ashera dakumka najanamu wajama nujai nii tsaike!” tuidau.', 'Tabaun Jisus antuk: —Puyatkaipa, ayatak kajintsam anentaimjata, nawanjumik aneantatui, —tau.', '“¡Fariseo aidautigminak wait anentajime! Atumek jega ijuntainum wayakjumesh, dekas apu ekeemtai shiig iwajamu aidau etegkajum ekeemin ainagme, nuigtushkam aents kuashat ayamunum igkuntuinakush, shiig ajantujus kumpamjuatnume tusajum wakejin ainajum nuadui.', 'Tuja umutai yaunchuk najanamun umin aidauk, yamagmanak wakegainatsui, ditak: ‘Yaunchuk najanamua nu ima pegkejai’ tuidau asag, —tiuwai.', 'Tusa tuinai aents aidau Jonatan niina kumpajijai wajattaman untsuinak: “¡Juwi wakatajum, nunikagmin wajuk maanitaimpaita nuna jintintuatjime!”, tuidau. Tusa tiagtai Jonatan niina kumpajin chichajak: “Wi emka weakui mina ukujui minita, auk Tuke Pujuu iina amase, depetuk amuktinme tusa”, tiuwai.']


In [10]:
import re
import sys
import unicodedata
from sacremoses import MosesPunctNormalizer

mpn = MosesPunctNormalizer(lang="en")
# Swap every pattern in the normalizer's substitution list for its compiled
# form, keeping each replacement string next to it.
mpn.substitutions = [
    (re.compile(pattern), replacement)
    for pattern, replacement in mpn.substitutions
]

def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that substitutes every "other" (C*) Unicode character.

    The translation table is computed once, when this factory is called, by
    scanning all code points up to ``sys.maxunicode``; the returned function
    only performs a ``str.translate`` lookup.

    Args:
        replace_by: string put in place of each matching character.

    Returns:
        A callable taking a string and returning the cleaned string.
    """
    unwanted_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    unwanted_codepoints = (
        codepoint
        for codepoint in range(sys.maxunicode + 1)
        if unicodedata.category(chr(codepoint)) in unwanted_categories
    )
    translation_table = dict.fromkeys(unwanted_codepoints, replace_by)

    def replace_non_printing_char(line) -> str:
        return line.translate(translation_table)

    return replace_non_printing_char

replace_nonprint = get_non_printing_char_replacer(replace_by=" ")

def preproc(text):
    """Normalize one sentence before it is handed to the tokenizer.

    Three passes, in this order: Moses punctuation normalization, replacement
    of non-printing characters by a space, then Unicode NFKC normalization.
    """
    punct_normalized = mpn.normalize(text)
    printable_only = replace_nonprint(punct_normalized)
    return unicodedata.normalize("NFKC", printable_only)

In [11]:
def _still_has_unk(text):
    """Tell whether `text` yields the unknown-token id even after preproc()."""
    return tokenizer.unk_token_id in tokenizer(preproc(text)).input_ids

# Re-check only the sentences that had unknown tokens in their raw form.
texts_with_unk_normed = list(filter(_still_has_unk, tqdm(texts_with_unk)))
print(len(texts_with_unk_normed))

  0%|          | 0/8268 [00:00<?, ?it/s]

0


In [12]:
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup
model.cuda();
# Only parameters that still require gradients are handed to the optimizer.
trainable_params = [param for param in model.parameters() if param.requires_grad]
# Fixed learning rate: Adafactor's relative-step and parameter-scaling modes
# are both switched off, so `lr` is used as given.
adafactor_config = dict(
    lr=1e-4,
    relative_step=False,
    scale_parameter=False,
    clip_threshold=1.0,
    weight_decay=1e-3,
)
optimizer = Adafactor(trainable_params, **adafactor_config)
# Constant schedule with a 1000-step warmup.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=1000)

In [13]:
import random
# (dataframe column, language tag given to the tokenizer) for each language.
# NOTE(review): nothing visible in this notebook adds 'awa_Latn' to the
# tokenizer vocabulary - confirm it does not silently resolve to the
# unknown-token id.
LANGS = [('spa', 'spa_Latn'), ('awa', 'awa_Latn')]

def get_batch_pairs(batch_size, data=df_train):
    """Draw one random training batch with a randomly chosen direction.

    The two languages are shuffled on every call, so either one may end up as
    the source. Rows are drawn with replacement. The default for `data` is
    whatever `df_train` is bound to when this definition is executed.

    Returns:
        (source_texts, target_texts, source_lang_tag, target_lang_tag), with
        both text lists already passed through preproc().
    """
    (l1, long1), (l2, long2) = random.sample(LANGS, 2)
    positions = [random.randint(0, len(data)-1) for _ in range(batch_size)]
    rows = [data.iloc[position] for position in positions]
    xx = [preproc(row[l1]) for row in rows]
    yy = [preproc(row[l2]) for row in rows]
    return xx, yy, long1, long2

print(get_batch_pairs(1))

(['Porque David, despues de haber servido a su generacion segun la voluntad de Dios, cayo dormido, fue acostado con sus padres y vio corrupcion.'], ['Dekaskenmag duka David pachisa tichamui, nigka ni wegantu aina nuwig Apajui dutikata tibaunak ashi imatiksag umik jakauwai, nunikmatai niina muunji ukumataijinig ukusam kaujui.'], 'spa_Latn', 'awa_Latn')


In [14]:
import gc
import torch

def cleanup():
    """Run the garbage collector, then empty PyTorch's CUDA cache.

    Called before training starts and after a failed training step.
    """
    gc.collect()
    torch.cuda.empty_cache()

In [23]:
batch_size = 16  # sentence pairs drawn per training step
max_length = 128  # tokenizer truncation limit applied to both sides of a pair
training_steps = 300  # upper bound of the step counter in the training loop
# One entry per successful step. The training loop resumes from len(losses),
# so re-running this cell resets that progress.
losses = []
MODEL_SAVE_PATH = '/content/outputs/nllb-spa-awa-v2'  # checkpoint directory

In [24]:
import numpy as np
model.train()
x, y, loss = None, None, None
cleanup()

# Start at len(losses) so that re-running this cell continues the step count
# instead of starting over.
tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        # Tokenize each side with its own language tag set on the tokenizer.
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # Mask padding positions in the labels with -100 so they are left out
        # of the loss.
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:
        # Best effort (presumably aimed at CUDA out-of-memory): drop the
        # partial gradients and tensors, free memory, report the longest text
        # of the batch and move on to the next step.
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 100 == 0:
        # Running average over the most recent (up to) 100 recorded losses.
        print(i, np.mean(losses[-100:]))

    if i % 100 == 0 and i > 0:
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

# The periodic checkpoint above only fires on multiples of 100, so with the
# step counter ending at training_steps - 1 the last updates would never reach
# disk. Save once more so MODEL_SAVE_PATH holds the final weights.
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

  0%|          | 0/300 [00:00<?, ?it/s]

0 3.1208384037017822
100 3.2623483514785767
200 3.1095565533638


In [25]:
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
# Reload the fine-tuned checkpoint from disk; from here on `model` and
# `tokenizer` refer to the saved artifacts, not the in-memory training state.
model_load_name = '/content/outputs/nllb-spa-awa-v2'
tokenizer = NllbTokenizer.from_pretrained(model_load_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_load_name)
model = model.cuda()



In [26]:
def translate(
    text, src_lang='spa_Latn', tgt_lang='awa_Latn',
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    """Translate `text` with the global `model` / `tokenizer` using beam search.

    Args:
        text: a string or a list of strings to translate.
        src_lang, tgt_lang: language tags set on the tokenizer; `tgt_lang` is
            also converted to the id forced as first generated token.
        a, b: the generation budget is ``int(a + b * n)`` new tokens, where
            ``n`` is the padded input length in tokens.
        max_input_length: truncation limit for the tokenized input.
        num_beams: beam width passed to ``generate``.
        **kwargs: forwarded unchanged to ``model.generate``.

    Returns:
        The decoded outputs from ``tokenizer.batch_decode``, with special
        tokens stripped.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )

    # Let the output grow linearly with the input length.
    source_length = encoded.input_ids.shape[1]
    generation_budget = int(a + b * source_length)
    target_bos_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    model.eval()
    generated = model.generate(
        **encoded.to(model.device),
        forced_bos_token_id=target_bos_id,
        max_new_tokens=generation_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

t = 'Hola a todos mis amigos'
print(translate(t, 'spa_Latn', 'awa_Latn'))

['Mina aents aidau ashi anentaimsajum']


In [39]:
import pandas as pd
import sacrebleu

dataset = pd.read_csv('output_3.csv')
# First ten dev rows; .copy() so the new column below is added to an
# independent frame rather than to a slice of `dataset`.
df_dev = dataset[dataset['split'] == 'dev'][:10].copy()

# Translate the Spanish source sentences. Feeding the `awa` column here would
# pass the reference text itself through the model and score that against
# the same reference.
df_dev['awa_translated'] = df_dev['spa'].apply(lambda x: translate(x, 'spa_Latn', 'awa_Latn')[0])
sampled_df = df_dev.sample(10, random_state=5)[["spa", "awa",  "awa_translated"]]
print(sampled_df)

bleu_calc = sacrebleu.BLEU()
chrf_calc = sacrebleu.CHRF(word_order=2)

# Corpus-level scores: model output as hypotheses, the `awa` column as the
# single reference stream.
print(bleu_calc.corpus_score(df_dev['awa_translated'].tolist(), [df_dev['awa'].tolist()]))
print(chrf_calc.corpus_score(df_dev['awa_translated'].tolist(), [df_dev['awa'].tolist()]))

                                                 spa  \
9  Ezequias fue el padre de Manases. Manases fue ...   
5  Jese se convirtio en el padre del rey David. D...   
2  Juda engendro a Perez y a Zera con Tamar. Pere...   
4  Salmon engendro a Booz con Rahab. Booz engendr...   
7  Asa engendro a Josafat. Josafat engendro a Jor...   
1  Abraham se convirtio en padre de Isaac. Isaac ...   
0  El libro de la genealogia de Jesucristo, hijo ...   
8  Uzias engendro a Jotam. Jotam fue el padre de ...   
6  Salomon fue el padre de Roboam. Roboam engendr...   
3  Ram engendro a Aminadab. Aminadab engendro a N...   

                                                 awa  \
9  Ezequiasa uchijiyai Manases, Manasesa uchijiya...   
5  Isai uchijiyai apu David, tuja David Uriasa nu...   
2  Juda Tamarai akiauwai Faresan, Zarajai. Faresa...   
4  Salmon Rahapa juki akiauwai Boozan, Booz Rutan...   
7  Asa uchijiyai Josafat, Josafata uchijiyai Jora...   
1  Abragka uchijiyai Isaac, Isaaka uchijiyai Ja

In [41]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

In [43]:
# Publish the tokenizer and the model currently in memory to the Hub.
# NOTE(review): the repo name ends in "v1" while the local checkpoint
# directory used above ends in "v2" - confirm this is the intended target.
upload_repo = "hectordiazgomez/nllb-spa-awa-v1"
tokenizer.push_to_hub(upload_repo)
model.push_to_hub(upload_repo)

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hectordiazgomez/nllb-spa-awa-v1/commit/f50df78509d77a41b2d709277c4b5487430f21a1', commit_message='Upload M2M100ForConditionalGeneration', commit_description='', oid='f50df78509d77a41b2d709277c4b5487430f21a1', pr_url=None, pr_revision=None, pr_num=None)