In [None]:
pip install torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121

In [None]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
from google.colab import drive

# Make Google Drive available inside the Colab VM under /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
from unsloth import FastLanguageModel
import torch
import os

# Loading configuration shared with the trainer cell further down.
max_seq_length = 2048 # Choose any! Llama 3 is up to 8k
dtype = torch.float16
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Reference list of 4-bit model names; only the one passed to
# from_pretrained below is actually loaded.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Read the Hugging Face token from the environment instead of pasting it into
# the notebook. The result is None when HF_TOKEN is unset, so no placeholder
# string is ever sent as a credential. A token is only needed for gated models
# like meta-llama/Llama-2-7b-hf.
hf_token = os.environ.get("HF_TOKEN")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", # Llama-3 70b also works (just change the model name)
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

In [None]:
# Projection layers that receive LoRA adapters (attention Q/K/V/O and the MLP).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded base model with LoRA adapters; `model` is rebound to the
# adapter-wrapped model that the trainer cell consumes.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Display the id of the tokenizer's end-of-sequence token (sanity check).
tokenizer.eos_token_id

128001

In [None]:
# Token ids collected as terminators: the tokenizer's EOS id plus the id of
# the "<|eot_id|>" marker that closes every turn in the training prompts.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Raw training data; later cells read its 'instruction' and 'output' columns.
csv_path = "/content/data - Sheet1.csv"
df = pd.read_csv(csv_path)

In [None]:
# System message placed at the start of every training example.
system = 'Given the instruction below, provide a clear and concise answer that directly addresses the request, taking into account any additional context provided'


def _format_example(row):
    """Render one CSV row as a single system/user/assistant training string.

    Reads `row['instruction']` and `row['output']` and the module-level
    `system` message; each turn is closed with "<|eot_id|>".
    """
    # NOTE(review): a single space follows each <|end_header_id|> here; confirm
    # that prompts built at inference time use exactly the same layout.
    return (
        f"<|start_header_id|>system<|end_header_id|> {system}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|> This is the question: {row['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|> {row['output']}<|eot_id|>"
    )


# One formatted string per row of the loaded CSV, in row order.
prompts = [_format_example(row) for _, row in df.iterrows()]

In [None]:
# Replace the raw frame with a single-column frame of formatted prompts;
# the 'texts' column name is what the trainer's dataset_text_field expects.
df = pd.DataFrame({'texts': prompts})

In [None]:
# Hold out 20% of the prompts for validation; the fixed random_state keeps
# the split reproducible across runs.
split_frames = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_val = split_frames

In [None]:
from datasets import Dataset

# Convert both pandas splits into `datasets.Dataset` objects for the trainer.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val)
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# Pick exactly one mixed-precision mode. bf16 is used only when the GPU
# supports it AND the model was not explicitly loaded in float16 (`dtype` is
# set in the model-loading cell); in every other case fall back to fp16.
# The previous flags were inverted and requested bf16 on GPUs without it.
use_bf16 = torch.cuda.is_bf16_supported() and dtype != torch.float16

# Supervised fine-tuning on the 'texts' column of the prepared datasets.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    # NOTE: eval_dataset is only consumed when an evaluation strategy is
    # enabled in TrainingArguments; it is left at its default here, so no
    # evaluation runs during training.
    eval_dataset = val_dataset,
    dataset_text_field = "texts",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2, # Small batch size suitable for small dataset
        gradient_accumulation_steps = 4, # Helps to stabilize training with small batch sizes
        warmup_steps = 5,             # Few warmup steps due to small dataset
        # max_steps takes precedence over num_train_epochs, so the previously
        # passed num_train_epochs=4 was silently ignored and has been removed.
        # To train by epochs instead, drop max_steps and set num_train_epochs.
        max_steps = 100,
        learning_rate = 2e-4,         # Learning rate; adjust based on model size and dataset
        fp16 = not use_bf16,          # FP16 mixed precision when BF16 is not used
        bf16 = use_bf16,              # BF16 mixed precision when supported and compatible with `dtype`
        logging_steps = 1,            # Log after every step for close monitoring on small datasets
        optim = "adamw_8bit",         # Optimizer; consider "adamw" if "adamw_8bit" causes issues
        weight_decay = 0.01,          # Regularization; adjust if overfitting is observed
        lr_scheduler_type = "linear", # Learning rate scheduler type
        seed = 3407,                  # Seed for reproducibility
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/31 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning with the trainer configured above; the returned TrainOutput
# (global step, training loss, runtime metrics) is displayed as the cell output.
trainer.train()

Step,Training Loss
1,0.0163
2,0.0161
3,0.0143
4,0.0287
5,0.011
6,0.0303
7,0.0192
8,0.0291
9,0.0132
10,0.0223


TrainOutput(global_step=100, training_loss=0.018991066263988613, metrics={'train_runtime': 1188.4549, 'train_samples_per_second': 0.673, 'train_steps_per_second': 0.084, 'total_flos': 1.6645688394031104e+16, 'train_loss': 0.018991066263988613, 'epoch': 25.0})

In [None]:
import os

# Upload the fine-tuned model to the Hugging Face Hub repo named below.
# The token is read from the HF_TOKEN environment variable so that no
# credential is ever stored in the notebook; it is None when the variable is unset.
model.push_to_hub("your_name/lora_model", token = os.environ.get("HF_TOKEN")) # Online saving