In [1]:
import pandas as pd
from transformers import pipeline
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sacrebleu import corpus_bleu
import tqdm
import evaluate
import torch
from transformers import AutoModelForCausalLM
from llama import LlamaForCausalLM
from llama import MultiTaskLlamaForCausalLM
import lightning as L
from torch.utils.data import DataLoader

ðŸš¨ `num_layers` is part of LlamaModel.__init__'s signature, but not documented. Make sure to add it to the docstring of the function in /workspace/en-es/llama.py.


In [2]:
num_frozen_layers = 10
num_task_layers = 0
max_length = 128
effective_batch_size = 32
batch_size = 32
accumulate_grad_batches = effective_batch_size // batch_size
weight_decay = 0.01
epochs = 3
learning_rate = 5e-5
grad_clip_val = 1.0

In [3]:

checkpoint = "HuggingFaceTB/SmolLM-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
pretrained_model = LlamaForCausalLM.from_pretrained(checkpoint)
config = pretrained_model.config

config.num_frozen_layers = num_frozen_layers
config.num_task_layers = num_task_layers
model = MultiTaskLlamaForCausalLM(config)
model.load_state_dict(pretrained_model.state_dict(), strict=False)

for param in model.model.parameters():
    param.requires_grad = False

In [4]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

total_params = count_parameters(pretrained_model)
print(f"Total number of parameters in pretrained model: {total_params:,}")
total_params = count_parameters(model)
print(f"Total number of parameters in multi-task model: {total_params:,}")

Total number of parameters in pretrained model: 134,515,008
Total number of parameters in multi-task model: 63,713,088


In [5]:
data_df = pd.read_parquet("exp-data/en-es.parquet")

In [6]:
data_df = data_df[data_df["split"] == "train"].reset_index(drop=True)
print(f"Number of samples: {len(data_df)}")
# sample 100_000
data_df = data_df.sample(100_000, random_state=42).reset_index(drop=True)

Number of samples: 171412


In [7]:
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.template = "English: {} Spanish:"

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        en_text = self.df.iloc[idx]["EN"]
        es_text = self.df.iloc[idx]["ES"]
        prompt = self.template.format(en_text)
        full_text = f"{prompt} {es_text}"

        # Tokenize prompt and full text
        prompt_ids = self.tokenizer(prompt, truncation=True, max_length=self.max_length, add_special_tokens=False)["input_ids"]
        full_ids = self.tokenizer(full_text, truncation=True, max_length=self.max_length, add_special_tokens=False)["input_ids"]

        # Labels: -100 for prompt, actual tokens for output
        labels = [-100] * len(prompt_ids) + full_ids[len(prompt_ids):]
        input_ids = full_ids

        # Pad if necessary
        pad_len = self.max_length - len(input_ids)
        if pad_len > 0:
            input_ids += [self.tokenizer.pad_token_id] * pad_len
            labels += [-100] * pad_len
        else:
            input_ids = input_ids[:self.max_length]
            labels = labels[:self.max_length]

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
        }

print(data_df['EN'][0])
# Example usage
dataset = TranslationDataset(data_df, tokenizer, max_length=max_length)
sample = dataset[0]
print(sample)


Levin knew he was aiming behind the snipe, but fired, nevertheless.
{'input_ids': tensor([14901,    42, 48009,  5341,   384,   436, 18572,  2893,   260,  3334,
         6337,    28,   564, 15615,    28, 18698,    30,  5071,    42, 48009,
           28,   253,   419,  3806,  1590, 18155, 10168,  4606,  2189, 20534,
          253,  2618,  2204, 14884,   551,   273, 22027,   278, 25032,    28,
         8276,  7342,    30,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,    

In [8]:
# Inference example using the model and tokenizer
en_sentence = data_df['EN'][0]
prompt = f"English: {en_sentence} Spanish:"
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)["input_ids"]

# Move input_ids to the same device as the model if needed
device = next(model.parameters()).device
input_ids = input_ids.to(device)

# Generate output
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=max_length, pad_token_id=tokenizer.pad_token_id)

# Decode the generated Spanish translation
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Prompt:", prompt)
print("Generated:", generated_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt: English: Levin knew he was aiming behind the snipe, but fired, nevertheless. Spanish:
Generated: English: Levin knew he was aiming behind the snipe, but fired, nevertheless. Spanish: somewhere else, where even ariminals also loop their own.
 interspers between between between parsing and/ day or even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even even


In [9]:
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [10]:
class LitTranslationModel(L.LightningModule):
    def __init__(self, model, tokenizer, lr=2e-5):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.lr = lr

    def forward(self, input_ids, labels=None):
        return self.model(input_ids=input_ids, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self.model(input_ids=batch["input_ids"], labels=batch["labels"])
        loss = outputs.loss
        self.log("train_loss", loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        outputs = self.model(input_ids=batch["input_ids"], labels=batch["labels"])
        loss = outputs.loss
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr, weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs * len(train_loader))
        return [optimizer], [scheduler]



lit_model = LitTranslationModel(model, tokenizer, lr=learning_rate)

trainer = L.Trainer(
        max_epochs=epochs, 
        accelerator="auto", 
        devices="auto",
        accumulate_grad_batches=accumulate_grad_batches,
        gradient_clip_algorithm="norm",
        gradient_clip_val=grad_clip_val
    )

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
/workspace/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [11]:
trainer.fit(lit_model, train_loader)

/workspace/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
  return _C._get_float32_matmul_precision()
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                      | Params | Mode 
------------------------------------------------------------
0 | model | MultiTaskLlamaForCausalLM | 63.7 M | train
------------------------------------------------------------
0         Trainable params
63.7 M    Non-trainable params
63.7 M    Total params
254.852   Total estimated model params size (MB)
137

Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn