# Fine-Tuning Mistral 7B with RAG on Vertex AI Workbench
This notebook prepares, fine-tunes, and saves a Mistral 7B model using QLoRA on a T4 GPU.

In [1]:
!python --version


Python 3.10.16


In [2]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.5.0%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m145.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m149.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading https://download.pytorch.org/whl/cu121/

In [3]:
!pip install transformers datasets peft accelerate bitsandbytes scipy jupyter


Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Using cached peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Using cached accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting s

In [1]:
import torch
import transformers

print("Torch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("Transformers Version:", transformers.__version__)



Torch Version: 2.5.1+cu121
CUDA Available: True
Transformers Version: 4.49.0


In [6]:
!pip install fsspec==2025.2.0 --no-cache-dir
!pip install gcsfs==2025.2.0 --no-cache-dir


Collecting fsspec==2025.2.0
  Downloading fsspec-2025.2.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.2.0-py3-none-any.whl (184 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.12.0
    Uninstalling fsspec-2024.12.0:
      Successfully uninstalled fsspec-2024.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.3.2 requires fsspec[http]<=2024.12.0,>=2023.1.0, but you have fsspec 2025.2.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2025.2.0


In [1]:

# Install necessary dependencies
!pip install torch transformers datasets accelerate peft bitsandbytes sentencepiece google-cloud-aiplatform --quiet

import torch
import logging
import json
import random
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer,
                          BitsAndBytesConfig)
from peft import LoraConfig, get_peft_model, TaskType

# Enable logging for debugging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("All required packages installed and imported successfully.")


INFO:__main__:All required packages installed and imported successfully.


In [2]:
import os

# Print current working directory
print("Current Directory:", os.getcwd())

# List all files in the directory
print("Files in directory:", os.listdir())


Current Directory: /home/jupyter
Files in directory: ['.jupyter', 'mistral_finetune_vertexai_WEAI.ipynb', '.config', 'mistral-weai-finetune', '.bash_history', '.gsutil', '.ipython', 'transformed_dataset.jsonl', 'logs', '.nv', '.triton', 'RAG', 'mistral-weai-finetuned.zip', 'mistral-finetuned', 'mistral-weai-finetuned', '.ipynb_checkpoints', '.local', '.cache', '.docker', 'weai_finetune_data.jsonl', '.bashrc', '.npm']


In [3]:

# Load dataset
dataset_path = "transformed_dataset.jsonl"  # Adjust path if needed
logger.info(f"Loading dataset from {dataset_path}")

try:
    with open(dataset_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    logger.info(f"Loaded {len(data)} samples.")
except Exception as e:
    logger.error(f"Failed to load dataset: {e}")
    raise e

# Ensure dataset follows the required format for Mistral 7B with RAG
for idx, sample in enumerate(data[:5]):
    if "messages" not in sample or not isinstance(sample["messages"], list):
        logger.error(f"Malformed sample at index {idx}: {sample}")
        raise ValueError("Dataset does not follow the expected Mistral RAG format.")

# Shuffle and split dataset (80% train, 20% validation)
random.seed(42)
random.shuffle(data)
split_idx = int(len(data) * 0.8)
train_dataset = Dataset.from_list(data[:split_idx])
eval_dataset = Dataset.from_list(data[split_idx:])

logger.info(f"Training samples: {len(train_dataset)}, Validation samples: {len(eval_dataset)}")


INFO:__main__:Loading dataset from transformed_dataset.jsonl
INFO:__main__:Loaded 4068 samples.
INFO:__main__:Training samples: 3254, Validation samples: 814


In [4]:
from huggingface_hub import login

login("xxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

In [5]:
!pip install sentencepiece



In [6]:
import sentencepiece
print(sentencepiece.__version__)


0.2.0


In [7]:
from transformers import AutoTokenizer
# Load tokenizer
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
logger.info(f"Loading tokenizer for {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

print("Tokenizer loaded successfully!")

logger.info("Tokenizer loaded successfully.")


INFO:__main__:Loading tokenizer for mistralai/Mistral-7B-Instruct-v0.3
INFO:__main__:Tokenizer loaded successfully.


Tokenizer loaded successfully!


In [8]:

# Function to format messages into a string for tokenization
def formatting_func(example):
    if "messages" not in example or not isinstance(example["messages"], list):
        raise ValueError("Each example must have a 'messages' list.")
    
    for i, msg in enumerate(example["messages"]):
        if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
            raise ValueError(f"Invalid message structure at index {i}: {msg}")
    
    formatted_text = tokenizer.apply_chat_template(conversation=example["messages"])
    return {"text": formatted_text}

logger.info("Formatting function defined successfully.")


INFO:__main__:Formatting function defined successfully.


In [9]:
print(type(train_dataset[0]))  # Should be <class 'dict'>
print(train_dataset[0])  # Print first sample


<class 'dict'>
{'messages': [{'content': 'You are an AI assistant for Western University, providing accurate university-related information. Only respond to topics directly related to Western University or Western Engineering. If the question is unrelated, politely decline to answer.', 'role': 'system'}, {'content': 'Can you tell me about 2020 Fall Award Recipients at Western University?', 'role': 'user'}, {'content': 'Spencer Engineering Building', 'role': 'assistant'}]}


In [10]:
def generate_and_tokenize_prompt(examples):
    """
    Function to format and tokenize dataset examples for Mistral 7B.
    Ensures correct input formatting for tokenization.
    """
    formatted_texts = []

    for ex in examples["messages"]:
        if not isinstance(ex, list):
            raise TypeError(f"Expected a list of messages, but got {type(ex)}. Full data: {ex}")

        # Convert conversation into a single string using Hugging Face chat template
        formatted_text = tokenizer.apply_chat_template(conversation=ex, tokenize=False)  # Set `tokenize=False` to return a string
        formatted_texts.append(formatted_text)

    if not formatted_texts or not all(isinstance(text, str) for text in formatted_texts):
        raise ValueError("Formatted texts must be a list of strings.")

    # Tokenize formatted texts
    tokenized_outputs = tokenizer(
        formatted_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Set labels (ignore padding tokens)
    tokenized_outputs["labels"] = [
        [token_id if token_id != tokenizer.pad_token_id else -100 for token_id in output]
        for output in tokenized_outputs["input_ids"]
    ]

    return tokenized_outputs


In [11]:
print("Example before tokenization:", train_dataset[0])

Example before tokenization: {'messages': [{'content': 'You are an AI assistant for Western University, providing accurate university-related information. Only respond to topics directly related to Western University or Western Engineering. If the question is unrelated, politely decline to answer.', 'role': 'system'}, {'content': 'Can you tell me about 2020 Fall Award Recipients at Western University?', 'role': 'user'}, {'content': 'Spencer Engineering Building', 'role': 'assistant'}]}


In [12]:
logger.info("Tokenizing training and validation datasets...")

try:
    tokenized_train_dataset = train_dataset.map(
        generate_and_tokenize_prompt,
        batched=True,  
        remove_columns=train_dataset.column_names
    )

    tokenized_val_dataset = eval_dataset.map(
        generate_and_tokenize_prompt,
        batched=True,
        remove_columns=eval_dataset.column_names
    )

    logger.info("Tokenization completed successfully.")
except Exception as e:
    logger.error(f"Error in tokenization: {e}")
    raise e


INFO:__main__:Tokenizing training and validation datasets...


Map:   0%|          | 0/3254 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

INFO:__main__:Tokenization completed successfully.


In [13]:

# Load Mistral 7B model with 4-bit quantization for QLoRA
logger.info("Loading model with 4-bit quantization...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map='auto'
    )
    model.config.pad_token_id = tokenizer.pad_token_id
    logger.info("Model loaded successfully.")
except Exception as e:
    logger.error(f"Error loading model: {e}")
    raise e


INFO:__main__:Loading model with 4-bit quantization...
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:__main__:Model loaded successfully.


In [14]:

# Apply QLoRA with PEFT
logger.info("Applying QLoRA configuration...")

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
)

try:
    model = get_peft_model(model, peft_config)
    logger.info("QLoRA configuration applied successfully.")
except Exception as e:
    logger.error(f"Error applying QLoRA: {e}")
    raise e


INFO:__main__:Applying QLoRA configuration...
INFO:__main__:QLoRA configuration applied successfully.


In [15]:

# Define training arguments
logger.info("Setting up training arguments...")

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    optim='adamw_torch',
    num_train_epochs=3,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    output_dir='./RAG/mistral-finetuned',
    logging_steps=50,
    logging_dir='./RAG/logs'
)

logger.info("Training arguments set successfully.")


INFO:__main__:Setting up training arguments...
INFO:__main__:Training arguments set successfully.


In [None]:

# Initialize Trainer and start training
logger.info("Starting training...")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset
)

try:
    trainer.train()
    logger.info("Training completed successfully.")
except Exception as e:
    logger.error(f"Training failed: {e}")
    raise e


INFO:__main__:Starting training...
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.0746,1.347851
2,0.7812,1.303357


INFO:__main__:Training completed successfully.


In [None]:
import os
import shutil
import logging
from IPython.display import FileLink

# Setup logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define new paths inside the "RAG" directory
output_dir = "./RAG"
lora_model_path = os.path.join(output_dir, "mistral-finetuned")
base_model_path = os.path.join(output_dir, "mistral-7b-base")
zip_filename = "mistral_rag_model.zip"

# Create the RAG directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Save the LoRA adapter weights
logger.info(f"Saving LoRA adapter weights to {lora_model_path}...")
model.save_pretrained(lora_model_path)
tokenizer.save_pretrained(lora_model_path)
logger.info("LoRA adapters saved successfully.")

# Save the base Mistral 7B model (downloaded from Hugging Face)
logger.info(f"Saving the base Mistral 7B model to {base_model_path}...")
model.base_model.save_pretrained(base_model_path)  # Saves the full model

# Copy the fine-tuned LoRA model into the base model folder for completeness
shutil.copytree(lora_model_path, base_model_path, dirs_exist_ok=True)
logger.info("Base model and LoRA adapters saved successfully.")

# ✅ ZIP the entire "RAG" directory
zip_path = f"./{zip_filename}"
logger.info(f"Zipping the entire '{output_dir}' directory into {zip_filename}...")
shutil.make_archive(zip_filename.replace(".zip", ""), 'zip', output_dir)
logger.info(f"Model zipped successfully: {zip_path}")




INFO:__main__:Saving LoRA adapter weights to ./RAG/mistral-finetuned...
INFO:__main__:LoRA adapters saved successfully.
INFO:__main__:Saving the base Mistral 7B model to ./RAG/mistral-7b-base...
INFO:__main__:Base model and LoRA adapters saved successfully.
INFO:__main__:Zipping the entire './RAG' directory into mistral_rag_model.zip...


In [None]:
# ✅ Generate a Download Link
logger.info(f"Model archive is ready for download: {zip_path}")
FileLink(zip_path)

In [None]:
import os
import shutil
from google.cloud import storage
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define paths
output_dir = "./RAG"
zip_filename = "mistral_finetuned.zip"
zip_filepath = f"./{zip_filename}"

# Zip the entire "RAG" directory
logger.info(f"Zipping the model directory: {output_dir}...")
shutil.make_archive(zip_filepath.replace(".zip", ""), 'zip', output_dir)
logger.info(f"Model successfully zipped as {zip_filename}")

# Initialize Google Cloud Storage client
storage_client = storage.Client()

# Define GCS bucket names
gcs_buckets = ["we_ai_backup", "weai-finetune-1741723108"]

# Upload function
def upload_to_gcs(bucket_name, source_file, destination_blob_name):
    """Uploads a file to a GCS bucket."""
    try:
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(source_file)
        logger.info(f"Uploaded {source_file} to gs://{bucket_name}/{destination_blob_name}")
    except Exception as e:
        logger.error(f"Failed to upload to {bucket_name}: {e}")

# Upload zip file to both GCS buckets
for bucket in gcs_buckets:
    upload_to_gcs(bucket, zip_filepath, zip_filename)

logger.info("Backup complete! Model zip file uploaded to both GCS buckets.")
