In [None]:
# This is the code used on Google Colab to fine-tune TinyLlama.

In [None]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets
!pip install -q huggingface_hub

In [None]:
!pip install gcsfs==2025.3.0
!pip install fsspec==2025.3.0

In [None]:
from google.colab import files
import os

# Open the Colab upload dialog; the returned mapping is looked up by file name below.
uploaded = files.upload()

# Report on the upload: first whether the expected file name was part of it,
# then whether the file is present in the working directory.
if 'CameroonLaw.txt' not in uploaded:
    print("CameroonLaw.txt was not found in the uploaded files.")
else:
    print("CameroonLaw.txt uploaded successfully!")
    location_message = (
        "CameroonLaw.txt found in the current directory."
        if os.path.exists('CameroonLaw.txt')
        else "CameroonLaw.txt not found in the current directory after upload."
    )
    print(location_message)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub checkpoint that the rest of the notebook fine-tunes.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 4-bit NF4 quantization with double quantization.
# The compute dtype is float16 so that it agrees with the `fp16=True`
# mixed-precision setting passed to TrainingArguments in the training cell;
# the previous bfloat16 value did not match that setting.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Map the root module ("") to device 0 so every layer is placed on GPU 0.
device_map = {"": 0}

# `trust_remote_code` is intentionally not passed: this checkpoint uses the
# Llama architecture that ships with transformers, so no code from the model
# repository has to be executed to load it.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device_map,
)

# model_max_length is kept equal to the max_length=512 that the tokenization
# functions in this notebook request, so the two limits no longer disagree.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
import transformers

# Load the corpus with the "text" builder. The result is a DatasetDict whose
# only split is 'train'.
loaded_dataset = load_dataset('text', data_files='CameroonLaw.txt')

# Drop rows that are empty or whitespace-only: they carry no tokens to learn
# from and would otherwise become all-padding training examples.
corpus = loaded_dataset['train'].filter(
    lambda example: len(example["text"].strip()) > 0
)

# train_test_split is a method of Dataset (not of DatasetDict), which is why it
# is called on the 'train' split. It returns a new DatasetDict with 'train' and
# 'test' keys. The fixed seed makes the 90/10 split reproducible across kernel
# restarts.
dataset = corpus.train_test_split(test_size=0.1, seed=42)

# Show the container types: `dataset` is the DatasetDict, `dataset['train']`
# is a single Dataset.
print(f"Type of dataset after splitting: {type(dataset)}")
print(f"Type of dataset['train'] after splitting: {type(dataset['train'])}")


def tokenize_function(examples):
    """Tokenize a batch of raw text rows into fixed-length model inputs.

    Args:
        examples: Batch mapping supplied by ``map(..., batched=True)``; only
            its "text" entry is read.

    Returns:
        The output of the module-level ``tokenizer`` for that text, with each
        sequence truncated or padded to 512 tokens and special tokens added.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=512,
        padding="max_length",
        add_special_tokens=True,
    )
    return tokenizer(examples["text"], **tokenizer_options)


# `dataset` holds the 'train' and 'test' splits; mapping over it tokenizes
# both of them in batches.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
print("Dataset tokenization mapping applied successfully.")

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from peft import PeftModel

# LoRA hyper-parameters: rank-8 adapters on the attention projection layers.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Guard against re-running this cell: `model` is rebound to the wrapped model,
# so a second run would otherwise prepare and wrap an already wrapped model and
# stack another set of adapters on top of it.
if isinstance(model, PeftModel):
    print("Model already carries LoRA adapters; skipping re-wrapping.")
else:
    # Prepare the quantized base model for k-bit training, then attach the adapters.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

model.print_trainable_parameters()

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so that its contents appear under
# /content/drive/MyDrive, which is the prefix every path below uses (corpus,
# checkpoint directory and final adapter). The previous mount point
# '/content/drive/mount' put the files under /content/drive/mount/MyDrive, so
# those paths did not point into Drive.
# force_remount=True re-mounts even if a mount already exists.
drive.mount('/content/drive', force_remount=True)
# The rest of your code to load dataset, tokenize, setup trainer, and train
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# Tokenizer for the training run.
# It is loaded by its explicit Hub name, so this step does not rely on a
# `model_id` variable from an earlier cell. This rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
)

# Padding uses the end-of-sequence token; the tokenization below pads every
# sequence to a fixed length, so a padding token has to be set.
tokenizer.pad_token = tokenizer.eos_token

# Load the plain-text corpus from Google Drive.
# The path assumes Drive is mounted at /content/drive; adjust it if the file
# lives in a sub-folder, e.g.
# data_files="/content/drive/MyDrive/path/to/CameroonLaw.txt"
dataset = load_dataset("text", data_files="/content/drive/MyDrive/CameroonLaw.txt")["train"]

# Drop rows that are empty or whitespace-only: they carry no tokens to learn
# from and would otherwise become all-padding training examples.
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)

# 90/10 train/test split. The fixed seed keeps the split identical between
# sessions, so a run resumed from a checkpoint sees the same training rows and
# the held-out rows never move into the training set.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Tokenization step: only model inputs are produced here, no label column.
def tokenize_function(examples):
    """Encode the "text" entries of a batch as fixed-length token sequences.

    Args:
        examples: Batch mapping supplied by ``map(..., batched=True)``; only
            its "text" entry is read.

    Returns:
        The output of the module-level ``tokenizer``, with each sequence
        truncated or padded to 512 tokens.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    return encoded


# Tokenize the train and test splits in batches.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

# Training configuration, launch and export of the LoRA adapter.
import os

from transformers.trainer_utils import get_last_checkpoint

# Checkpoints are written under Google Drive (assumes Drive is mounted at
# /content/drive) so that a later session can find and resume from them.
checkpoint_dir = "/content/drive/MyDrive/tinyllama-checkpoint"

# `resume_from_checkpoint` is deliberately NOT set here: as a TrainingArguments
# field it is not used by Trainer itself, so setting it never resumed anything.
# Resuming is requested explicitly in the trainer.train() call below.
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    learning_rate=2e-5,
    fp16=True,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    report_to="none",
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
)

# Trainer for causal language modelling.
# `model` must be the LoRA-wrapped model created in an earlier cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForLanguageModeling(
        tokenizer, mlm=False  # mlm=False selects the causal-LM objective
    ),
)

# Resume from the newest checkpoint in checkpoint_dir when there is one;
# otherwise start from scratch. The directory check is needed because
# get_last_checkpoint lists the folder and fails if it does not exist yet.
last_checkpoint = (
    get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
)
if last_checkpoint is None:
    print("No checkpoint found; starting training from scratch.")
else:
    print(f"Resuming training from {last_checkpoint}")

# Start (or continue) training.
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save the final model; for the LoRA-wrapped model this stores the adapter only.
model.save_pretrained("/content/drive/MyDrive/tinyllama-final-lora")
print("Training completed! LoRA adapter saved.")