In [None]:
from transformers import AutoTokenizer, MT5ForConditionalGeneration
import torch
# Hub id of the checkpoint used for the zero-shot translation demo below.
TOUCAN_1_2B = "UBC-NLP/toucan-1.2B"

# Tokenizer and model are loaded from the same checkpoint. The weights are
# requested in half precision and device placement is delegated to
# `device_map="auto"`.
tokenizer = AutoTokenizer.from_pretrained(TOUCAN_1_2B)
model = MT5ForConditionalGeneration.from_pretrained(
    TOUCAN_1_2B,
    device_map="auto",
    torch_dtype=torch.float16,
)


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './drive/MyDrive/toucan-finetuned/final'. Use `repo_type` argument if needed.

In [None]:
# Switch the module to training mode. Because this is the last expression of
# the cell, the returned module is also displayed (its layer tree).
# NOTE(review): the translation cell further down calls `model.generate` —
# training mode keeps dropout active, which is usually not wanted there.
model.train()

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 1024)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 1024)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=Fals

In [None]:
# Translate a short French prompt into Basaa. The "bas: " prefix names the
# *target* language, the same convention the fine-tuning cells use
# ("eng: ", "fra: ", "bas: ").
text="bas: bonjour"

# An earlier cell calls `model.train()`; switch back to eval mode so dropout
# is disabled while generating.
model.eval()

# `input_ids` is the full tokenizer output (input ids + attention mask). It is
# moved to whatever device the model actually lives on instead of assuming
# "cuda:0", since the model was placed with `device_map="auto"`.
input_ids = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to(model.device)

with torch.no_grad():
    # `len(text)` is a character count used as a generous upper bound on the
    # number of generated tokens. Sampling (temperature / top_p) is combined
    # with a 2-beam search, so repeated runs can give different outputs.
    generated_ids = model.generate(**input_ids, num_beams=2, max_new_tokens=len(text), do_sample=True, temperature=0.6, top_p=0.9)

# `skip_prompt` is a streamer option, not a decode option, so it is not passed
# here; for this encoder-decoder model the generated ids are decoded as-is.
print("Toucan-1.2B - translation:", tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])

Toucan-1.2B - translation: ndék


# Fine-Tuning

In [None]:
import numpy as np
from datasets import Dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_linear_schedule_with_warmup,
)
import torch
import json

In [None]:
# Load the two aligned corpora from Google Drive. Later cells read the keys
# 'bas'/'fr' from `fr_align` and 'bas'/'en' from `en_align`.
# The encoding is given explicitly so the non-ASCII text decodes the same way
# regardless of the runtime's locale default.
with open("/content/drive/MyDrive/bas_fr.json", "r", encoding="utf-8") as bas_fr, \
     open("/content/drive/MyDrive/bas_en.json", "r", encoding="utf-8") as bas_en:
    fr_align = json.load(bas_fr)
    en_align = json.load(bas_en)

In [None]:
# 1. Load model & tokenizer
# Checkpoint that gets fine-tuned. Note that this is "toucan-base", not the
# "toucan-1.2B" checkpoint loaded in the first cell, and that the assignments
# below rebind the notebook-level `tokenizer` and `model` names.
MODEL_NAME = "UBC-NLP/toucan-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/4.80M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/18.0M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
# 2. Prepare Datasets
def make_dataset(raw, src_key, tgt_key, tgt_code):
    """Build a train/dev split from a dict of parallel sentence columns.

    Args:
        raw: Mapping of column name -> list of sentences. Must contain
            `src_key` and `tgt_key`. It is only read, never modified, so the
            cell can be re-run on the same dict.
        src_key: Key of the source-side column; kept under the same name.
        tgt_key: Key of the target-side column in `raw`.
        tgt_code: Column name (ISO-3 code) the target side gets in the dataset.

    Returns:
        The result of `Dataset.train_test_split(test_size=0.1, seed=42)`,
        indexed by the callers as `['train']` and `['test']`.

    Raises:
        KeyError: If `src_key` or `tgt_key` is missing from `raw`.
    """
    # Read (rather than pop) the target column: popping mutated the caller's
    # dict and made a second call on the same dict fail with KeyError.
    columns = {src_key: raw[src_key], tgt_code: raw[tgt_key]}
    ds = Dataset.from_dict(columns)
    # Split off a small dev set; the fixed seed keeps the split reproducible.
    return ds.train_test_split(test_size=0.1, seed=42)

# English corpus: expose the 'en' column under its ISO-3 code 'eng', working
# on a shallow copy so `en_align` itself keeps its original keys.
en_raw = {**en_align, 'eng': en_align['en']}
del en_raw['en']
ds_en = make_dataset(en_raw, 'bas', 'eng', 'eng')

# French corpus: same treatment, 'fr' becomes 'fra'.
fr_raw = {**fr_align, 'fra': fr_align['fr']}
del fr_raw['fr']
ds_fr = make_dataset(fr_raw, 'bas', 'fra', 'fra')

In [None]:
# 3. Build bi-directional pairs
def flip(ds, src, tgt):
    """Return `ds` with the values of columns `src` and `tgt` exchanged.

    The column *names* stay the same; only their contents trade places: the
    originals are dropped via `remove_columns` and the swapped values are
    returned under the same two names.

    Args:
        ds: Object exposing a `datasets`-style `map(function, remove_columns=...)`.
        src: Name of the first column.
        tgt: Name of the second column.

    Returns:
        Whatever `ds.map` returns.
    """
    def _swap(example):
        # Each name receives the other column's value.
        return {src: example[tgt], tgt: example[src]}

    return ds.map(_swap, remove_columns=[src, tgt])

# Both directions of a language pair are trained from the SAME rows. The
# direction is chosen later, at tokenization time, by which language-named
# column is read as the source (e.g. `preprocess_eng2bas` reads
# examples["eng"] as input and examples["bas"] as target).
#
# The contents must therefore NOT be swapped with `flip` here: doing so put
# Basaa text in the 'eng'/'fra' columns and made the "reverse" sets train
# Basaa -> English/French under the "bas: " prefix.

# Bas→Eng & Eng→Bas
bas_eng = ds_en['train']
eng_bas = ds_en['train']
# Bas→Fra & Fra→Bas
bas_fra = ds_fr['train']
fra_bas = ds_fr['train']

# Combine into one dataset (raw text). Note that the tokenization cell further
# down rebinds `train_ds` and `eval_ds` to tokenized versions.
train_ds = concatenate_datasets([bas_eng, eng_bas, bas_fra, fra_bas])
eval_ds  = concatenate_datasets([ds_en['test'], ds_fr['test']])

Map:   0%|          | 0/36775 [00:00<?, ? examples/s]

Map:   0%|          | 0/78105 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
# 1) Bas→Eng
def preprocess_bas2eng(examples):
    """Tokenize a batch for the "eng: "-prefixed direction.

    Reads examples["bas"] as source text (each prefixed with "eng: ") and
    examples["eng"] as target text. Both sides are truncated to 512 tokens.
    Returns the tokenizer output unchanged.
    """
    prompts = ["eng: " + sentence for sentence in examples["bas"]]
    return tokenizer(
        prompts,
        text_target=examples["eng"],
        truncation=True,
        max_length=512,
    )

# Batched map; the raw text columns are dropped so only tokenizer outputs remain.
tokenized_bas2eng = bas_eng.map(preprocess_bas2eng, batched=True, remove_columns=bas_eng.column_names)

# 2) Eng→Bas
def preprocess_eng2bas(examples):
    """Tokenize a batch for the "bas: "-prefixed direction (English side).

    Reads examples["eng"] as source text (each prefixed with "bas: ") and
    examples["bas"] as target text. Both sides are truncated to 512 tokens.
    Returns the tokenizer output unchanged.
    """
    prompts = ["bas: " + sentence for sentence in examples["eng"]]
    return tokenizer(
        prompts,
        text_target=examples["bas"],
        truncation=True,
        max_length=512,
    )

# Batched map; the raw text columns are dropped so only tokenizer outputs remain.
tokenized_eng2bas = eng_bas.map(preprocess_eng2bas, batched=True, remove_columns=eng_bas.column_names)

# 3) Bas→Fra
def preprocess_bas2fra(examples):
    """Tokenize a batch for the "fra: "-prefixed direction.

    Reads examples["bas"] as source text (each prefixed with "fra: ") and
    examples["fra"] as target text. Both sides are truncated to 512 tokens.
    Returns the tokenizer output unchanged.
    """
    prompts = ["fra: " + sentence for sentence in examples["bas"]]
    return tokenizer(
        prompts,
        text_target=examples["fra"],
        truncation=True,
        max_length=512,
    )

# Batched map; the raw text columns are dropped so only tokenizer outputs remain.
tokenized_bas2fra = bas_fra.map(preprocess_bas2fra, batched=True, remove_columns=bas_fra.column_names)

# 4) Fra→Bas
def preprocess_fra2bas(examples):
    """Tokenize a batch for the "bas: "-prefixed direction (French side).

    Reads examples["fra"] as source text (each prefixed with "bas: ") and
    examples["bas"] as target text. Both sides are truncated to 512 tokens.
    Returns the tokenizer output unchanged.
    """
    prompts = ["bas: " + sentence for sentence in examples["fra"]]
    return tokenizer(
        prompts,
        text_target=examples["bas"],
        truncation=True,
        max_length=512,
    )

# Batched map; the raw text columns are dropped so only tokenizer outputs remain.
tokenized_fra2bas = fra_bas.map(preprocess_fra2bas, batched=True, remove_columns=fra_bas.column_names)

# Check!
# Sanity check: the raw text columns were removed by the map call, so only the
# tokenizer outputs should be listed.
print(tokenized_bas2eng.column_names)
# → [ 'input_ids', 'attention_mask', 'labels' ]

# 5) Now concatenate the tokenized sets
from datasets import concatenate_datasets
train_ds = concatenate_datasets([
    tokenized_bas2eng,
    tokenized_eng2bas,
    tokenized_bas2fra,
    tokenized_fra2bas,
])

# 6) Build the eval set from the HELD-OUT 10% splits made by `make_dataset`.
# Slicing the first 10% of the tokenized *training* sets (as before) evaluates
# on examples the model is trained on, so the validation loss says nothing
# about generalization. The 'test' splits have the same 'bas'/'eng' and
# 'bas'/'fra' columns, so the existing preprocess functions apply directly.
tokenized_eval_bas2eng = ds_en['test'].map(
    preprocess_bas2eng,
    batched=True,
    remove_columns=ds_en['test'].column_names,
)
tokenized_eval_bas2fra = ds_fr['test'].map(
    preprocess_bas2fra,
    batched=True,
    remove_columns=ds_fr['test'].column_names,
)
eval_ds = concatenate_datasets([
    tokenized_eval_bas2eng,
    tokenized_eval_bas2fra,
])

Map:   0%|          | 0/36775 [00:00<?, ? examples/s]

Map:   0%|          | 0/36775 [00:00<?, ? examples/s]

Map:   0%|          | 0/78105 [00:00<?, ? examples/s]

Map:   0%|          | 0/78105 [00:00<?, ? examples/s]

['input_ids', 'attention_mask', 'labels']


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
# 6. Collator that pads each batch's inputs and labels dynamically; the model
#    is handed over alongside the tokenizer.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# 7. Training configuration (the original note says it follows Section 5.2 of
#    the paper).
training_args = Seq2SeqTrainingArguments(
    # --- where checkpoints and logs go ---
    output_dir="./toucan-finetuned",
    logging_dir="./logs",
    logging_steps=100,
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # NOTE(review): half-precision training of T5-family models is sometimes
    # reported as numerically unstable — worth confirming on this checkpoint.
    fp16=True,
    # --- evaluate and checkpoint once per epoch, keeping the 2 newest ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    # for very small GPUs you can add:
    #gradient_accumulation_steps = 2,
)

# 8. Initialize the Seq2SeqTrainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and triggers a warning at this
# call. Requires a transformers version that already knows `processing_class`.
trainer = Seq2SeqTrainer(
    model             = model,
    args              = training_args,
    train_dataset     = train_ds,
    eval_dataset      = eval_ds,
    processing_class  = tokenizer,
    data_collator     = data_collator,
)

  trainer = Seq2SeqTrainer(


In [None]:
# 10. After training, save your final model & tokenizer:
#trainer.save_model("/content/drive/MyDrive/JIANTS/basaa/toucan-finetuned/final")
#tokenizer.save_pretrained("/content/drive/MyDrive/JIANTS/basaa/toucan-finetuned/final")

In [None]:
# Destination for the final model + tokenizer. Written as an absolute path,
# like the other Drive paths in this notebook, so the save location does not
# depend on the kernel's current working directory, and defined once so model
# and tokenizer cannot end up in different folders.
FINAL_MODEL_DIR = "/content/drive/MyDrive/JIANTS/basaa/toucan-finetuned/final"

# 9. Launch training
trainer.train()

# 10. After training, save your final model & tokenizer:
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

[34m[1mwandb[0m: Currently logged in as: [33mandrekevin[0m ([33mjiants-research[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0024,0.000121


In [None]:
# Re-inspect the columns of the tokenized Bas→Eng set (same check as in the
# tokenization cell).
print(tokenized_bas2eng.column_names)

In [None]:
# Ask PyTorch to release cached GPU memory it is no longer using. Tensors that
# are still referenced (e.g. by `model` or `trainer`) are not freed by this.
torch.cuda.empty_cache()