In [1]:
!pip install accelerate # ==0.21.0
!pip install peft==0.4.0
!pip install bitsandbytes # ==0.40.2
!pip install transformers # ==4.31.0
!pip install trl # ==0.4.7
!pip install datasets

Collecting peft==0.4.0
  Using cached peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Using cached peft-0.4.0-py3-none-any.whl (72 kB)
Installing collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.13.3.dev0
    Uninstalling peft-0.13.3.dev0:
      Successfully uninstalled peft-0.13.3.dev0
Successfully installed peft-0.3.0.dev0


In [3]:
import os
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd

In [46]:
dataset = load_dataset("csv", data_files="./dataset/dataset.csv")

In [47]:
def generate_prompt(data_point):
    full_prompt = f"""<s>
[INST]
<<SYS>>
{data_point["input_ids"]}
<</SYS>>
{data_point["input_ids"]}
[/INST]
{data_point["labels"]}
</s>
"""
    data_point["text"] = full_prompt
    return data_point

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
dataset = dataset.map(generate_prompt)
dataset

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'text'],
        num_rows: 5
    })
})

In [41]:
# Base model identifier from Hugging Face
model_name = "codellama/CodeLlama-7b-python-hf"

# LoRA settings for modifying attention mechanisms
lora_r = 64  # Dimension of LoRA attention
lora_alpha = 16  # Scaling factor for LoRA
lora_dropout = 0.1  # Dropout rate in LoRA layers

# 4-bit precision settings for model efficiency
use_4bit = True  # Enable 4-bit precision
bnb_4bit_compute_dtype = "float16"  # Data type for computations
bnb_4bit_quant_type = "nf4"  # Quantization method
use_nested_quant = False  # Enable double quantization for more compression

# Training settings
output_dir = "./peft-code-llama-training"  # Where to save model and results
num_train_epochs = 1  # Total number of training epochs
fp16 = False  # Use mixed precision training
bf16 = False  # Use bfloat16 precision with A100 GPUs
per_device_train_batch_size = 4  # Training batch size per GPU
per_device_eval_batch_size = 4  # Evaluation batch size per GPU
gradient_accumulation_steps = 1  # Number of steps for gradient accumulation
gradient_checkpointing = True  # Save memory during training
max_grad_norm = 0.3  # Max norm for gradient clipping
learning_rate = 2e-4  # Initial learning rate
weight_decay = 0.001  # Weight decay for regularization
optim = "paged_adamw_32bit"  # Optimizer choice
lr_scheduler_type = "cosine"  # Learning rate scheduler
max_steps = -1  # Set total number of training steps
warmup_ratio = 0.03  # Warmup ratio for learning rate
group_by_length = True  # Group sequences by length for efficiency
save_steps = 0  # Checkpoint save frequency
logging_steps = 25  # Logging frequency

# Sequence-to-sequence (SFT) training settings
max_seq_length = None  # Max sequence length
packing = False  # Pack short sequences together
device_map = {"": 0}  # Load model on specific GPU

In [7]:
# Loading the specified model with the above quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype = torch.float16,
    low_cpu_mem_usage = True,
)
model.config.use_cache = False  # Disable caching to save memory
model.config.pretraining_tp = 1  # Setting pre-training task parallelism

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Initializing the tokenizer for the model and setting padding configurations
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Setting the pad token
tokenizer.padding_side = "right"  # Adjusting padding to the right to avoid issues during training

In [9]:
# Configuring LoRA parameters for the model to fine-tune its attention mechanisms
lora_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",  # Setting the bias option for LoRA
    task_type="CAUSAL_LM",  # Defining the task type as causal language modeling
)

In [43]:
# Defining various training parameters such as directory, epochs, batch sizes, optimization settings, etc.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,  # Grouping by length for efficient batching
    lr_scheduler_type=lr_scheduler_type,
    # no_cuda=True,
    report_to="tensorboard"  # Reporting to TensorBoard for monitoring
)

In [48]:
# Setting up the fine-tuning trainer with the specified model, dataset, tokenizer, and training arguments
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    peft_config=lora_config,
    dataset_text_field="text",  # Specifying which dataset field to use for text
    max_seq_length=max_seq_length,  # Setting the maximum sequence length
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,  # Enabling packing for efficiency
)

In [49]:
# Starting the training process
trainer.train()

ValueError: type of input params to POST changed to food and ingredient unknown: <class 'str'>. Should be one of a python, numpy, pytorch or tensorflow object.

In [None]:
eval_prompt = f"""<s>
[INST]
<<SYS>>
inputs params to POST changed to city and area
<</SYS>>
full_prompt
[/INST]
</s>
"""

model_input = tokenizer(eval_prompt, return_tensors="pt")

peft_model.eval()
with torch.no_grad():
    print(tokenizer.decode(peft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=False))