In [None]:
# Tip: put `%%capture` on the very first line of this cell to suppress the noisy pip output below
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Import necessary libraries
from unsloth import FastLanguageModel
from datasets import load_dataset
import torch

# Set parameters for the model, optimized for T4 Colab instance
# These module-level settings are reused further down: `max_seq_length` is
# passed both to the model loader and to the trainer.
max_seq_length = 2048  # RoPE scaling auto-handled by Unsloth
dtype = torch.float16  # Efficient precision for Tesla T4
load_in_4bit = True  # Memory-efficient 4-bit quantization

# Load the pre-quantized 4-bit Mistral NeMo 12B base checkpoint together with
# its tokenizer. No LoRA adapters exist yet at this point; they are attached
# by the `get_peft_model` call that follows.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# Add LoRA adapters for fine-tuning
# Only the four attention projections are listed in `target_modules`; no MLP
# projections are targeted, which matches Unsloth's log line reporting
# "0 MLP layers" patched. `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank, memory-efficient
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0,  # Optimized for faster training
    bias="none",
    use_gradient_checkpointing=True,  # Reduces memory usage
    random_state=3407
)

# Load the Alpaca dataset for chatbot fine-tuning
# Only the "train" split is requested.
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

# Define the prompt format
# The template has three positional `{}` slots, filled in order with the
# instruction, the input and the output by `formatting_prompts_func`.
# Because the triple-quoted literal opens and closes on its own line, the
# rendered text starts and ends with a newline; prompts built for inference
# should follow the same flush-left layout.
alpaca_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}
"""

# Appended to every formatted training example by `formatting_prompts_func`.
EOS_TOKEN = tokenizer.eos_token  # End of sequence token

# Function to format the dataset prompts
def formatting_prompts_func(examples):
    """Render a batch of Alpaca rows into full training strings.

    Args:
        examples: Batch mapping whose 'instruction', 'input' and 'output'
            entries are parallel sequences (one item per row).

    Returns:
        A dict with a single "text" key holding one formatted string per row:
        the module-level ``alpaca_prompt`` filled with the row's fields and
        followed by ``EOS_TOKEN``.
    """
    rows = zip(examples['instruction'], examples['input'], examples['output'])

    rendered = []
    for instruction, context, answer in rows:
        filled = alpaca_prompt.format(instruction, context, answer)
        rendered.append(filled + EOS_TOKEN)

    return {"text": rendered}

# Format the dataset for training
# `batched=True` means `formatting_prompts_func` receives whole column batches
# and the "text" key it returns is added to the dataset.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Training setup using Huggingface's SFTTrainer for fast training
from transformers import TrainingArguments
from trl import SFTTrainer

# The trainer reads the formatted strings from the "text" field produced above
# and uses the same `max_seq_length` that was given to the model loader.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        per_device_train_batch_size=2,  # Optimized for Tesla T4
        gradient_accumulation_steps=4,  # 2 per device x 4 accumulation steps = total batch size 8 (see training log)
        warmup_steps=5,
        max_steps=60,  # Fast fine-tuning for demonstration; overrides num_train_epochs (see training log)
        learning_rate=2e-4,
        fp16=True,  # Use FP16 for memory-efficient training
        logging_steps=1,  # Log the training loss at every step
        output_dir="outputs"
    ),
)

# Start fine-tuning
trainer.train()

# Inference with the fine-tuned chatbot model
FastLanguageModel.for_inference(model)  # Enable fast inference mode
# The prompt mirrors the training template (flush-left section headers) with
# the response section left empty for the model to complete.
input_text = """
### Instruction:
You are a friendly chatbot. Please introduce yourself and help users with questions about technology.

### Input:
Hello, who are you?

### Response:
"""
# Tokenize as a batch of one and move the tensors to the GPU.
inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64)
# The whole first sequence is decoded, so the printed text contains the
# prompt followed by the generated response.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Save the fine-tuned model (LoRA adapters)
# Both the model and the tokenizer files are written to the same
# "lora_model" directory.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")


Collecting unsloth
  Downloading unsloth-2024.9.post4-py3-none-any.whl.metadata (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.0/56.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.12-py3-none-any.whl.metadata (8.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting trl!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.11.1,>=0.7.9 (fro

model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.31G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/177k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 0 MLP layers.


README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 19,660,800
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,1.5782
2,1.8813
3,1.8904
4,2.1952
5,1.4658
6,1.599
7,1.3067
8,1.436
9,1.394
10,1.3882


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]



### Instruction:
You are a friendly chatbot. Please introduce yourself and help users with questions about technology.

### Input:
Hello, who are you?

### Response:
Hello! I am a friendly chatbot here to help you with any questions you may have about technology. How can I assist you today?



('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

# **Continuing the conversation**

In [None]:
# Inference function for conversation
def chat_with_bot(model, tokenizer, user_input, max_new_tokens=128):
    """Generate one chatbot reply for ``user_input``.

    The prompt is built flush-left with the same
    "### Instruction / ### Input / ### Response" layout used for fine-tuning.
    (Writing it as an indented triple-quoted literal inside the function
    would prefix every section header with spaces and no longer match the
    training format.)

    Args:
        model: Language model exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Tokenizer that is callable on a list of strings, returns a
            batch containing ``input_ids`` and provides ``decode``.
        user_input: The user's message; inserted verbatim into the
            "### Input:" section.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated reply only, with surrounding whitespace removed. The
        prompt is not echoed back.
    """
    # Assembled line by line so that no source-code indentation leaks into
    # the text the model sees.
    conversation_prompt = (
        "\n"
        "### Instruction:\n"
        "You are a friendly chatbot. Please introduce yourself and help users with questions about technology.\n"
        "\n"
        "### Input:\n"
        "{}\n"
        "\n"
        "### Response:\n"
    )
    input_text = conversation_prompt.format(user_input)

    # Tokenize as a batch of one and move the tensors to the GPU
    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; drop them so only
    # the newly generated reply is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    reply_ids = outputs[0][prompt_length:]

    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

# Continue chatting in a loop until the user types 'exit'
print("Chatbot is ready! Type 'exit' to end the conversation.")

while True:
    # Get user input; surrounding whitespace carries no meaning, so drop it
    # (otherwise "exit " would not be recognized as the exit command)
    user_input = input("You: ").strip()

    # Ignore blank lines instead of sending an empty prompt to the model
    if not user_input:
        continue

    # Exit the loop if the user types 'exit' (case-insensitive)
    if user_input.lower() == 'exit':
        print("Chatbot: Goodbye! Have a great day!")
        break

    # Generate and print the chatbot's response
    response = chat_with_bot(model, tokenizer, user_input)
    print(f"Chatbot: {response}")


Chatbot is ready! Type 'exit' to end the conversation.
You: who's the president of Togo?
Chatbot: 
    ### Instruction:
    You are a friendly chatbot. Please introduce yourself and help users with questions about technology.
    
    ### Input:
    who's the president of Togo?

    ### Response:
    
    Hello! I'm a friendly chatbot here to help you with any questions you may have about technology. I'm sorry, but I don't have any information about the president of Togo. However, I can assist you with any other technology-related questions you may have. How can I help you today?

You: when was AI discovered?
Chatbot: 
    ### Instruction:
    You are a friendly chatbot. Please introduce yourself and help users with questions about technology.
    
    ### Input:
    when was AI discovered?

    ### Response:
    
    Hello! I'm a friendly chatbot here to help you with any questions you may have about technology. As for your question, AI, or artificial intelligence, was first discovere