In [1]:
from datasets import load_dataset, DatasetDict
from config import HUGGING_FACE_TOKEN as token
from torch import cuda
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

cache_dir = "models/"
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load Yahoo Answers Topics dataset
full_dataset = load_dataset("yahoo_answers_topics")

In [2]:
import random

# Set the desired size
train_size = 100000  # Reduce to 100,000 items
test_size = 10000    # Reduce to 10,000 items

# Generate random indices for train and test sets
train_indices = random.sample(range(len(full_dataset["train"])), train_size)
test_indices = random.sample(range(len(full_dataset["test"])), test_size)


# Select subsets based on the sampled indices
reduced_train_dataset = full_dataset["train"].select(train_indices)
reduced_test_dataset = full_dataset["test"].select(test_indices)

# Create a new DatasetDict
reduced_dataset = DatasetDict({
    "train": reduced_train_dataset,
    "test": reduced_test_dataset
})


In [3]:
def format_data(item):
    system_prompt = """
    <s>[INST] <<SYS>>
    You are a helpful, respectful and honest assistant for labeling topics.
    <</SYS>>
    """

    user_prompt = item["best_answer"]
    model_answer = item["topic"]

    # Combine the parts into the structured format
    formatted_data = f"{system_prompt}\n{user_prompt}\n\n{model_answer}"
    return {"formatted_data": formatted_data}

In [4]:
# Apply the formatting function
formatted_dataset = reduced_dataset.map(lambda items: format_data(items))

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Use a suitable tokenizer from Hugging Face
# token = "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"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'



In [7]:
# import torch
# import transformers

# Quantization to load an LLM with less GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype="float16"  # Computation type
)

In [8]:
# from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model
# from trl import SFTTrainer

# device = torch.device("cuda" if torch.cuda.is_available() else "CPU")

device_map = {"": 0}

# Load the model for sequence classification
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    num_labels=20,
    quantization_config=bnb_config,
    use_auth_token=True,
    device_map=device_map,
    trust_remote_code=True,
    cache_dir=cache_dir
)

model.config.use_cache = False
# More info: https://github.com/huggingface/transformers/pull/24906
model.config.pretraining_tp = 1

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="Causal_LM"
)

lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.4955


In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    optim="paged_adamw_32bit",
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    save_steps=300,
    logging_steps=100,
    eval_steps=300,
    remove_unused_columns=False
)

# Create a Trainer instance
trainer = SFTTrainer(
    model=lora_model,
    args=training_args,
    peft_config=peft_config,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
    dataset_text_field="formatted_data",
    max_seq_length=512,
    tokenizer=tokenizer,
    packing=False
)

# Train the model
trainer.train()



Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mhassanrasheed[0m. Use [1m`wandb login --relogin`[0m to force relogin


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`question_title` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
import os

output_dir = "final_dir/"
model_path = "final_model/"
new_model = "llama-2-7b-finetuned"

output_dir = os.path.join(output_dir, "final_checkpoint")
trainer.model.save_pretrained(output_dir)

In [None]:
import torch

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
    use_auth_token=True
)
model = PeftModel.from_pretrained(base_model, "final_dir/final_checkpoint")
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model_path = "final_model/"

# Save the merged model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)