<a href="https://colab.research.google.com/github/jonathanrbelanger-lang/Exorobourii.com/blob/main/PyachamamaTrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Setup and Installation (Final)
# The -U flag ensures we upgrade all packages to the latest versions.
!pip install -q -U transformers datasets accelerate peft bitsandbytes trl

import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTConfig, SFTTrainer

# Reached only after every import above succeeded; nothing here checks the
# exit status of the pip step itself.
print("All libraries installed and upgraded successfully.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m111.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you hav

In [2]:
# Cell 2: Configuration

# Input: plain-text training corpus, given as a path relative to the working directory.
dataset_path = "pyachamama_corpus_clean.txt"

# Hugging Face Hub id of the base checkpoint that gets fine-tuned.
base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Output: directory name the trained adapters and tokenizer are saved under.
new_model_name = "Pyachamama-v1-adapters"

In [3]:
# Cell 3: Load Dataset and Tokenizer

# The "text" builder produces one example per line of the file, stored under the
# "text" key (see the preview printed below).
# NOTE(review): for a source-code corpus this makes every line its own training
# sample; consider sample_by="paragraph" or "document" if longer context is wanted.
dataset = load_dataset("text", data_files={"train": dataset_path}, split="train")

# trust_remote_code is deliberately not enabled: this checkpoint uses the stock
# Llama tokenizer shipped with transformers, so there is no reason to allow
# repository-supplied Python to execute during loading.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Dataset and tokenizer loaded.")
print(f"Dataset preview: {dataset[0]}")

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Dataset and tokenizer loaded.
Dataset preview: {'text': '#!/usr/bin/env python'}


In [4]:
# Cell 4: Configure QLoRA and Load Model

# Quantization: weights are stored as 4-bit NF4 (single quantization pass),
# while matmuls are computed in float16.
compute_dtype = torch.float16
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base checkpoint already quantized; device placement is left to
# the library via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=quant_config,
)
model.config.pretraining_tp = 1
# The generation KV cache is switched off for training.
model.config.use_cache = False

# LoRA adapter hyperparameters; no target_modules are given, so the
# library's choice for this architecture applies.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

print("Base model loaded in 4-bit with LoRA configuration ready.")

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Base model loaded in 4-bit with LoRA configuration ready.


In [5]:
# Cell 5: Set Training Arguments and Start Training

# SFTConfig is TRL's TrainingArguments subclass and is where the SFT-specific
# options live. The sequence limit has to be given here: SFTTrainer tokenizes and
# truncates the dataset inside its constructor, so assigning a length attribute
# on the trainer afterwards is too late to have any effect.
# NOTE(review): `max_length` is the name used by current TRL releases (which the
# `-U` install in Cell 1 pulls in); older releases called it `max_seq_length`.
training_args = SFTConfig(
    output_dir="./pyachamama-results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: run length is governed by num_train_epochs
    # NOTE(review): the plain "constant" schedule does not apply warmup, so
    # warmup_ratio is inert here; use "constant_with_warmup" if warmup is intended.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none",
    # Stated explicitly so the run keeps the TrainingArguments default (off)
    # regardless of what SFTConfig defaults to in the installed TRL version.
    gradient_checkpointing=False,
    max_length=512,
)

# The tokenizer configured in Cell 3 is handed over at construction time via
# `processing_class` (the replacement for the deprecated `tokenizer` attribute),
# so it is the one actually used to tokenize the training data.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

# Start training!
print("Starting training...")
trainer.train()
print("Training complete.")

Adding EOS to train dataset:   0%|          | 0/119350 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/119350 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/119350 [00:00<?, ? examples/s]

Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Starting training...


Step,Training Loss
50,2.8425
100,2.5622
150,2.1712
200,2.2215
250,2.2535
300,2.1009
350,2.2276
400,2.2347
450,2.0704
500,2.1374


Training complete.


In [6]:
# Cell 6: Save the Fine-Tuned Model
print(f"Saving the fine-tuned LoRA adapters to ./{new_model_name}")

# Write the trained model first, then the tokenizer files, into the same directory.
for saveable in (trainer.model, tokenizer):
    saveable.save_pretrained(new_model_name)

print("Model saved successfully.")

Saving the fine-tuned LoRA adapters to ./Pyachamama-v1-adapters
Model saved successfully.
