In [None]:
# Install Unsloth and the required dependencies
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install Flash Attention 2 for faster inference, only if the GPU supports it
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

# Ensure the necessary Hugging Face libraries are installed
!pip install transformers datasets accelerate bitsandbytes

# Import necessary libraries and confirm installation
try:
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    import torch
    print("Libraries imported successfully.")
except ModuleNotFoundError as e:
    print(f"An error occurred: {e}")
    print("Ensure that Unsloth and other dependencies are correctly installed.")


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-ixuhdq_s/unsloth_b163315ddad544288e3f277ef1b31bb7
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-ixuhdq_s/unsloth_b163315ddad544288e3f277ef1b31bb7
  Resolved https://github.com/unslothai/unsloth.git to commit a2f4c9793ecf829ede2cb64f2ca7a909ce3b0884
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Libraries imported successfully.


In [None]:
# Install the triton package into the notebook environment via a shell escape.
!pip install triton



In [None]:
# Import necessary libraries and confirm installation.
# ImportError is caught (rather than only its ModuleNotFoundError subclass) so
# that a package which is present but fails to load also produces the hint.
try:
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    import torch
except ImportError as e:
    print(f"An error occurred: {e}")
    print("Ensure that Unsloth and other dependencies are correctly installed.")
else:
    # Only reached when every import above succeeded.
    print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
# Install the xformers package into the notebook environment via a shell escape.
!pip install xformers




In [None]:
# Install the triton package (same command as the earlier triton install cell).
!pip install triton



In [None]:
# Import necessary libraries and confirm installation.
# ImportError is caught (rather than only its ModuleNotFoundError subclass) so
# that a package which is present but fails to load also produces the hint.
try:
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    import torch
except ImportError as e:
    print(f"An error occurred: {e}")
    print("Ensure that Unsloth and other dependencies are correctly installed.")
else:
    # Only reached when every import above succeeded.
    print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
# Install the trl package into the notebook environment via a shell escape.
!pip install trl




In [None]:
# Import necessary libraries and confirm installation.
# ImportError is caught (rather than only its ModuleNotFoundError subclass) so
# that a package which is present but fails to load also produces the hint.
try:
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    import torch
except ImportError as e:
    print(f"An error occurred: {e}")
    print("Ensure that Unsloth and other dependencies are correctly installed.")
else:
    # Only reached when every import above succeeded.
    print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
# Install the peft package into the notebook environment via a shell escape.
!pip install peft



In [None]:
# Import necessary libraries and confirm installation.
# ImportError is caught (rather than only its ModuleNotFoundError subclass) so
# that a package which is present but fails to load also produces the hint.
try:
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    import torch
except ImportError as e:
    print(f"An error occurred: {e}")
    print("Ensure that Unsloth and other dependencies are correctly installed.")
else:
    # Only reached when every import above succeeded.
    print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
# Install the packages imported below for fine-tuning: trl (SFTTrainer) and
# peft (LoRA helpers), plus bitsandbytes.
# NOTE(review): bitsandbytes is presumably what backs the 4-bit model load and
# the "adamw_bnb_8bit" optimizer used in this cell -- confirm.
!pip install trl peft bitsandbytes

# Import necessary libraries
from unsloth import FastLanguageModel
from datasets import load_dataset
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from transformers import TrainingArguments

# Define model parameters
model_name = "unsloth/gemma-2-9b-bnb-4bit"
max_seq_length = 2048
load_in_4bit = True
dtype = torch.bfloat16  # Use bf16 for mixed precision on A100 GPU

# Load the base model and its tokenizer through Unsloth, requesting 4-bit
# weights and bf16 compute.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# This only echoes the flag that was requested above; it does not inspect the
# loaded weights.
print(f"Model loaded with 4-bit quantization: {load_in_4bit}")

# Attach LoRA adapters using Unsloth's own helper, which prepares the quantized
# base model and leaves the adapter weights trainable.
#
# Gotcha: do NOT call peft.prepare_model_for_kbit_training() on the model after
# adapters have been attached. It sets requires_grad=False on every parameter
# it finds -- the new LoRA weights included -- so the trainer then reports
# "Number of trainable parameters = 0" and the run learns nothing.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

# Fail fast if the adapters ended up frozen, instead of silently spending GPU
# time on a run that cannot update any weights.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
assert trainable_params > 0, "No trainable parameters: LoRA adapters are frozen."

# Alpaca-style prompt template. The three {} placeholders are filled
# positionally by formatting_prompts_func with the instruction, the input
# context, and the expected response, in that order.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string from the tokenizer loaded above; it is appended
# to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into full training prompts.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences (the form passed by a batched dataset map).

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        record, each terminated with EOS_TOKEN.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in records
        ]
    }

# Fetch the train split of the cleaned Alpaca dataset and add a "text" column
# containing the fully formatted prompt for each record.
dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(
    formatting_prompts_func,
    batched=True,
)

# Step 4: Define training arguments and fine-tune the model
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=60,  # Adjust the number of steps for training
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_bnb_8bit",  # Optimizer for quantized training
    bf16=True,  # Use bf16 precision for A100 GPUs
    logging_steps=1,
    seed=3407,
)

# Supervised fine-tuning on the "text" column produced during preprocessing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# Step 5: Start training
trainer.train()

# Step 6: Save the fine-tuned model and tokenizer side by side in one directory
lora_output_dir = "lora_model"
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)


==((====))==  Unsloth 2024.9.post4: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded with 4-bit quantization: True


Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 0
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,1.8137
2,2.3222
3,1.7623
4,2.1288
5,1.8497
6,2.0388
7,1.7525
8,1.8548
9,1.8778
10,2.2019


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [None]:
# Load the fine-tuned model and tokenizer
from unsloth import FastLanguageModel
import torch

# Load the saved model and tokenizer from the directory written after training
model_path = "lora_model"
model, tokenizer = FastLanguageModel.from_pretrained(model_path)

# Prepare the model for inference
FastLanguageModel.for_inference(model)

# Define the input for inference
input_instruction = "Explain the importance of machine learning in today's world."
input_text = ""  # If your instruction needs specific input context, add it here

# Prepare the input using the Alpaca prompt format (same template as training)
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Create the formatted input; the response slot is left empty for the model to fill
formatted_input = alpaca_prompt.format(input_instruction, input_text, "")

# Tokenize the input and prepare for model inference
inputs = tokenizer([formatted_input], return_tensors="pt").to("cuda")

# Perform inference (you can adjust max_new_tokens for longer outputs)
outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)

# generate() returns the prompt tokens followed by the continuation. Drop the
# prompt so that only the newly generated answer is decoded; otherwise the
# whole prompt template is echoed back under the "Model response:" label.
prompt_length = inputs["input_ids"].shape[1]
generated_tokens = outputs[:, prompt_length:]

# Decode the output tokens to human-readable text
decoded_output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

# Print the generated response
print("Model response:", decoded_output[0])


==((====))==  Unsloth 2024.9.post4: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.9.post4 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


Model response: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Explain the importance of machine learning in today's world.

### Input:


### Response:
Machine learning is a branch of artificial intelligence that enables computers to learn from data and make predictions or decisions without being explicitly programmed. It has become increasingly important in today's world due to the vast amount of data being generated and the need for efficient and accurate decision-making. Machine learning algorithms can analyze large datasets and identify patterns and trends that would be difficult or impossible for humans to detect. This allows for more accurate predictions and decisions in areas such as finance, healthcare, and marketing. Additionally, machine
