In [1]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, M2M100ForConditionalGeneration, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForSeq2SeqLM, NllbTokenizerFast
from tokenization_small100 import SMALL100Tokenizer
from peft import LoraModel

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

In [5]:
from peft import LoraConfig, get_peft_model

FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']  # fmt: skip

tokenizer = NllbTokenizerFast.from_pretrained("facebook/nllb-200-distilled-600M",
                                                src_lang="tgl_Latn",
                                                tgt_lang="eng_Latn",)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", quantization_config=bnb_config)

In [5]:
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "v_proj", "k_proj"],
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 616,843,264 || trainable%: 0.2869


In [6]:
tokens = tokenizer("How is you day today?", return_tensors="pt").to(model.device)
translated_tokens = model.generate(
    **tokens, forced_bos_token_id=tokenizer.convert_tokens_to_ids("tgl_Latn"), max_length=30,
)
text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

In [7]:
text

['Kumusta ka ngayon?']

In [6]:
import pandas as pd
import datasets

ceb_eng = pd.read_csv("cebuano-to-english-corpora.csv")
tgl_eng = pd.read_csv("tagalog-to-english-corpora.csv")

parallel_corpora = pd.concat([ceb_eng, tgl_eng], ignore_index=True)

def preprocess(batch):
    model_inputs = tokenizer(
        batch["language1_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    labels = tokenizer(
        batch["language2_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

corpora = parallel_corpora.sample(frac=1, random_state=42)


train_df = corpora.sample(frac=0.90, random_state=42)
eval_df = tgl_eng[~tgl_eng.index.isin(train_df.index)] # only use tagalog for eval, lower size bc of time constraint
train_df = train_df.reset_index(drop=True)

train_dataset = datasets.Dataset.from_pandas(train_df)
eval_dataset = datasets.Dataset.from_pandas(eval_df)

train_dataset_processed = train_dataset.map(preprocess, batched=True, remove_columns=['language1_text', 'language2_text'])
eval_dataset_processed = eval_dataset.map(preprocess, batched=True, remove_columns=['language1_text', 'language2_text'])

Map:   0%|          | 0/52642 [00:00<?, ? examples/s]

Map:   0%|          | 0/2932 [00:00<?, ? examples/s]

In [10]:
train_df

Unnamed: 0.1,Unnamed: 0,language1_text,language2_text,language1_text_words,language2_text_words
0,531967,Busa tumana ninyo ang iyang mga lagda ug ang i...,"Thou shalt keep therefore his statutes, and hi...",Busa tumana ninyo ang iyang mga lagda ug ang i...,Thou shalt keep therefore his statutes and his...
1,974592,Kung iyong diringgin ang lahat ng aking iniuut...,"And it shall be, if thou wilt hearken unto all...",Kung iyong diringgin ang lahat ng aking iniuut...,And it shall be if thou wilt hearken unto all ...
2,542218,Ayaw pagpakighigala sa usa ka tawo nga daling ...,Make no friendship with an angry man; And with...,Ayaw pagpakighigala sa usa ka tawo nga daling ...,Make no friendship with an angry man And with ...
3,531214,"""Kini mao ang lagda sa balaod nga gisugo sa Gi...",This is the ordinance of the law which the LOR...,Kini mao ang lagda sa balaod nga gisugo sa Gin...,This is the ordinance of the law which the LOR...
4,534381,"Apan si Saul miingon, ""Walay usa ka tawo nga p...","And Saul said, There shall not a man be put to...",Apan si Saul miingon Walay usa ka tawo nga pat...,And Saul said There shall not a man be put to ...
...,...,...,...,...,...
26316,984398,"""Gayunma'y sinasabi ninyo, 'Bakit hindi magdur...","Yet say ye, Why? doth not the son bear the ini...",Gayunma'y sinasabi ninyo 'Bakit hindi magdurus...,Yet say ye Why doth not the son bear the iniqu...
26317,528999,Ayaw kamo pagbuhat ug mga diosdios nga hinimo ...,"Ye shall not make with me gods of silver, neit...",Ayaw kamo pagbuhat ug mga diosdios nga hinimo ...,Ye shall not make with me gods of silver neith...
26318,538850,Ang iyang pag-ampo ug kon giunsa sa pagdawat s...,"His prayer also, and how God was intreated of ...",Ang iyang pag ampo ug kon giunsa sa pagdawat s...,His prayer also and how God was intreated of h...
26319,978122,ay naghanda para kay Tobias ng isang malaking ...,"and he had prepared for him a great chamber, w...",ay naghanda para kay Tobias ng isang malaking ...,and he had prepared for him a great chamber wh...


In [23]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="training-nllb-tgl-to-english-v2-working",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    logging_strategy="steps",
    logging_steps=1000,
    logging_first_step=True,
    report_to="none",
    predict_with_generate=True,
)

In [24]:
from transformers import Seq2SeqTrainer
import numpy as np
import evaluate

sacrebleu = evaluate.load("sacrebleu")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset_processed,
    eval_dataset=eval_dataset_processed,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [26]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Gen Len
1000,6.6093,6.503906,40.4945,36.9076
2000,6.6208,6.5,41.0185,36.7558
3000,6.6078,6.503906,41.2638,36.8469
4000,6.6072,6.507812,41.4168,36.8987
5000,6.6121,6.503906,41.5364,36.8632
6000,6.6153,6.507812,41.5901,36.9403


TrainOutput(global_step=6581, training_loss=6.6129518215316825, metrics={'train_runtime': 7975.0222, 'train_samples_per_second': 6.601, 'train_steps_per_second': 0.825, 'total_flos': 1.433162814062592e+16, 'train_loss': 6.6129518215316825, 'epoch': 1.0})

In [27]:
trainer.save_model("nllb-tgl-to-eng-seq2seq-model-v3")    