In [1]:
# !pip uninstall -y torch torchvision torchaudio
# !pip install --upgrade --no-cache-dir --no-deps unsloth
# !pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0
# !MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [2]:
# !pip install -U bitsandbytes transformers peft accelerate trl datasets sentencepiece wandb

In [None]:
# Print the version string of the PyTorch build imported by this kernel.
import torch
print(torch.__version__)

2.4.0+cu121


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig
import torch
from datasets import load_dataset
from trl import SFTTrainer

In [None]:
# Single switch for the target device: first CUDA device when enabled,
# otherwise the CPU.
USE_GPU = True
device = "cuda:0" if USE_GPU else "cpu"

In [None]:
# Boolean feature switches.
QUANTIZE_4BIT = True          # read when building the bitsandbytes config
USE_FLASH_ATTENTION = True    # read when choosing the attention backend
USE_GRAD_CHECKPOINTS = True

# Integer training hyperparameters.
TRAIN_MAX_SEQ_LENGTH = 512
TRAIN_BATCH_SIZE = 8
GRAD_CC_STEPS = 2

In [None]:
# Hugging Face Hub id of the checkpoint; used for both the model and the tokenizer.
MODEL_NAME = "CohereForAI/aya-101"

In [None]:
# bitsandbytes settings: 4-bit NF4 weights, double quantization, bfloat16
# compute. Stays None when QUANTIZE_4BIT is off.
quantization_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    if QUANTIZE_4BIT
    else None
)

In [None]:
# Attention backend requested for the model: "flash_attention_2" when
# USE_FLASH_ATTENTION is set, otherwise None. The name is assigned on both
# branches so it always exists, whichever way the switch is set.
# NOTE(review): the model-loading cell does not pass this value to
# from_pretrained, so it currently has no effect — confirm the architecture
# supports flash attention before wiring it in.
attn_implementation = "flash_attention_2" if USE_FLASH_ATTENTION else None

In [None]:
# Load the seq2seq checkpoint (4-bit quantized when quantization_config is
# set) together with its tokenizer.
# NOTE(review): the printed model is a stock T5ForConditionalGeneration, so
# trust_remote_code=True is probably unnecessary — confirm before removing.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# A bitsandbytes-quantized model is already placed on the accelerator by
# from_pretrained, and calling .to() on it is rejected for 8-bit models and
# for older bitsandbytes releases. Only move the model explicitly when it was
# loaded without quantization.
if quantization_config is None:
    model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from datasets import load_dataset

# Parallel training pairs are read from a CSV file with the "csv" builder;
# the result is indexed by split name further down.
TRAIN_CSV_PATH = "/content/train_parallel_for_llm_df.csv"
dataset_dict = load_dataset("csv", data_files=TRAIN_CSV_PATH)

In [None]:
# Take the "train" split out of the loaded dataset dictionary.
train_dataset = dataset_dict["train"]

In [None]:
# Show the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['inputs', 'targets'],
    num_rows: 2091
})

In [None]:
# Truncation limits (in tokens) for the source and target sides.
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 512
TRAIN_MAX_SEQ_LENGTH = 512

def tokenize_function(examples):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "inputs" entry (source texts) and a
            "targets" entry (reference texts).

    Returns:
        The encoding produced for the source texts, with the token ids of
        the target texts attached under the "labels" key.
    """
    # Both sides are truncated but left unpadded.
    shared_kwargs = {"padding": False, "truncation": True}

    encoded_sources = tokenizer(
        examples["inputs"], max_length=MAX_SOURCE_LENGTH, **shared_kwargs
    )
    encoded_targets = tokenizer(
        text_target=examples["targets"], max_length=MAX_TARGET_LENGTH, **shared_kwargs
    )

    encoded_sources["labels"] = encoded_targets["input_ids"]
    return encoded_sources

# Run tokenize_function over the training split in batched mode. The name is
# rebound, so re-running this cell maps over the already-processed dataset.
train_dataset = train_dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import DataCollatorForSeq2Seq, DataCollatorWithPadding



# Training hyperparameters. The constants below are what TrainingArguments
# actually receives; their values match the settings that were previously
# hard-coded in the call (accumulation 2, checkpointing on).
TRAIN_BATCH_SIZE = 4
GRAD_ACC_STEPS = 2
USE_GRAD_CHECKPOINTING = True
MAX_SEQ_LENGTH = 512
LEARNING_RATE = 5e-5

training_arguments = TrainingArguments(
    output_dir="results",
    num_train_epochs=15,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    gradient_checkpointing=USE_GRAD_CHECKPOINTING,
    optim="adamw_torch",
    save_steps=400,
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    # bfloat16 mixed precision, matching the dtype the model is loaded in.
    fp16=False,
    bf16=True,
    # NOTE(review): the "constant" schedule applies no warmup, so this ratio
    # is ignored; switch to "constant_with_warmup" if warmup is intended.
    warmup_ratio=0.05,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none",
    label_names=["labels"],
)

In [None]:
# LoRA adapter settings: rank 32 with alpha 32, no bias terms trained,
# applied to the modules named "q" and "v", declared as a seq2seq LM task.
peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=32,
    lora_alpha=32,
    bias="none",
)


In [None]:
# Set the tokenizer's maximum length to the training sequence length.
tokenizer.model_max_length = MAX_SEQ_LENGTH

In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator built from the tokenizer and the model; it is handed to the trainer
# below to assemble batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Build the trainer from the model, the tokenized training split, the LoRA
# config, the collator and the training arguments.
# NOTE(review): no tokenizer is passed to SFTTrainer here — confirm that the
# one it resolves on its own matches `tokenizer`.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    data_collator=data_collator,
    args=training_arguments,
)



Truncating train dataset:   0%|          | 0/2091 [00:00<?, ? examples/s]

In [None]:
# Resize the embedding matrix to the tokenizer vocabulary and report the
# embedding row count on both sides of the call. The tokenizer length itself
# is not changed by the resize, so it is printed once for reference.
print(f"Tokenizer length before resizing: {len(tokenizer)}")
print(f"Embedding rows before resizing: {model.get_input_embeddings().weight.shape[0]}")
model.resize_token_embeddings(len(tokenizer))
print(f"Embedding rows after resizing: {model.get_input_embeddings().weight.shape[0]}")

Tokenizer length before resizing: 250100
Tokenizer length after resizing: 250100


In [None]:
# Run the fine-tuning loop.
trainer.train()

# Persist the trained adapter weights locally.
trainer.model.save_pretrained(save_directory='/content/aya-101_training')

# Turn the generation cache back on for inference; training forces
# use_cache=False because it is incompatible with gradient checkpointing.
model.config.use_cache = True

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
50,0.5661
100,0.3412
150,0.3278
200,0.2528
250,0.279
300,0.2225
350,0.2513
400,0.2297
450,0.2266
500,0.2386


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

T5ForConditionalGeneration(
  (shared): Embedding(250100, 4096)
  (encoder): T5Stack(
    (embed_tokens): Embedding(250100, 4096)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k): Linear4bit(in_feat

In [None]:
# Target repository on the Hugging Face Hub for the trained adapter.
hf_repo_name = "linndfors/uk-sent-gender-swapper_aya-101"

# Save the trainer's model to local disk first, then report where it went.
local_save_path = '/content/aya-101_training'
trainer.model.save_pretrained(local_save_path)
print(f"Adapter saved locally to {local_save_path}")

# Upload the same model to the Hub repository (requires Hub credentials to be
# configured in the environment — presumably done outside this notebook).
trainer.model.push_to_hub(hf_repo_name)
print(f"Adapter pushed to Hugging Face Hub at {hf_repo_name}")

Adapter saved locally to /content/aya-101_training


adapter_model.safetensors:   0%|          | 0.00/6.30G [00:00<?, ?B/s]

Adapter pushed to Hugging Face Hub at linndfors/uk-sent-gender-swapper_aya-101
