<a href="https://colab.research.google.com/gist/sineeli/d187f815444bf191c6af45ea9aff3d5f/mistral_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Start by installing the necessary packages
!pip -q uninstall -y torch torchvision torchaudio
!pip -q install --no-cache-dir torch==2.3.0+cu118 torchvision==0.18.0+cu118 torchaudio==2.3.0+cu118 -f https://download.pytorch.org/whl/torch_stable.html
!pip -q install --no-cache-dir transformers datasets peft bitsandbytes accelerate tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m839.6/839.6 MB[0m [31m312.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m320.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m332.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m299.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m274.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m261.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m353.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m728.5/728.5 MB[0m [31m249.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
import csv, json, random, os, tqdm, pathlib
import torch
import re

# Configuration
# The Hugging Face token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook.
# SECURITY: an access token used to be hardcoded on this line in plain text. It must
# be treated as leaked and revoked in the Hugging Face account settings.
# When HF_TOKEN is unset the value is None; `from_pretrained(token=None)` is then
# expected to fall back to credentials cached by `huggingface-cli login`.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
CSV_PATH = "./bootstrapped_feedback_followup_combined.csv"  # Path to your CSV data
WORK_DIR = "./work/"
CTX_LEN = 2048  # Context length
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

os.makedirs(WORK_DIR, exist_ok=True)
# os.path.join avoids the doubled separator ("./work//train.jsonl") that plain
# string formatting produces because WORK_DIR already ends with a slash.
TRAIN_JL = os.path.join(WORK_DIR, "train.jsonl")
EVAL_JL = os.path.join(WORK_DIR, "eval.jsonl")

In [None]:
# Process data
# Read the CSV inside a context manager so the file handle is always closed.
# newline="" is the mode the csv module documents for files passed to its readers.
with open(CSV_PATH, encoding="utf-8", newline="") as csv_file:
    rows = list(csv.DictReader(csv_file))

# Shuffle with a fixed seed so the 90/10 train/eval split is identical on every
# run; a dedicated Random instance leaves the global random state untouched.
random.Random(42).shuffle(rows)
split = int(0.9 * len(rows))
train_rows, eval_rows = rows[:split], rows[split:]

# Create JSON files
def dump(subset, path):
    """Write `subset` to `path` as JSON Lines, one chat conversation per row.

    Each row is expected to provide the keys "Domain", "question",
    "user_answer", "feedback" and "follow_up_question". The emitted object has a
    single "messages" list: a system prompt naming the domain, the interviewer's
    question, the candidate's answer, and the feedback followed by the
    follow-up question.

    Args:
        subset: Iterable of row mappings (as read from the CSV).
        path: Destination file; it is overwritten.
    """
    with open(path, "w", encoding="utf-8") as out:
        for row in tqdm.tqdm(subset, desc=f"→ {path}"):
            topic = row["Domain"].strip()
            conversation = [
                {"role": "system", "content": f"You are a rigorous {topic} interviewer."},
                {"role": "assistant", "content": row["question"]},
                {"role": "user", "content": row["user_answer"]},
                {"role": "assistant", "content": f"{row['feedback']}\n\nFollow-up: {row['follow_up_question']}"},
            ]
            # One JSON document per line.
            out.write(json.dumps({"messages": conversation}))
            out.write("\n")

# Write the training split first, then the evaluation split.
for split_rows, split_path in ((train_rows, TRAIN_JL), (eval_rows, EVAL_JL)):
    dump(split_rows, split_path)

→ ./work//train.jsonl: 100%|██████████| 1395/1395 [00:00<00:00, 30695.18it/s]
→ ./work//eval.jsonl: 100%|██████████| 155/155 [00:00<00:00, 26955.68it/s]


In [None]:
# Load tokenizer and prepare dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# Initialize tokenizer for Mistral
tok = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN, use_fast=True)
# Pad with <unk> instead of EOS. This notebook trains with
# DataCollatorForLanguageModeling, which replaces every label equal to the pad
# token id with -100 (ignored by the loss). If the pad token were the EOS token,
# every real end-of-turn EOS would be masked too and the model would never be
# trained to stop generating. EOS is kept only as a fallback for tokenizers that
# define no unknown token.
tok.pad_token = tok.unk_token if tok.unk_token is not None else tok.eos_token
tok.padding_side = "right"
tok.model_max_length = CTX_LEN  # fix for OverflowError

# Prepare data format for Mistral
def to_ids(ex):
    """Render one chat example in Mistral's instruct format and tokenize it.

    Args:
        ex: Dataset row holding a "messages" list of {"role", "content"} dicts.

    Returns:
        The same row with an added "input_ids" list, truncated to CTX_LEN tokens.
    """
    # Mistral's instruct format has a single BOS at the very start of the
    # conversation. No literal "<s>" is written here: the tokenizer call below
    # uses its default special-token handling, which is expected to prepend BOS
    # once (TODO confirm via `tok.add_bos_token`). Writing "<s>" before every
    # instruction, as the previous version did, produced duplicated and
    # mid-sequence BOS tokens.
    pieces = []
    for m in ex["messages"]:
        role = m["role"]
        if role in ("system", "user"):
            # System prompts are wrapped as an instruction turn, exactly like
            # user messages.
            pieces.append(f"[INST] {m['content']} [/INST]\n")
        elif role == "assistant":
            # Each assistant turn is closed with the EOS marker.
            pieces.append(f"{m['content']}</s>\n")
        # Messages with any other role are skipped.
    formatted_chat = "".join(pieces)

    ex["input_ids"] = tok(formatted_chat, truncation=True, max_length=CTX_LEN).input_ids
    return ex

In [None]:
# Read both JSONL splits, tokenize every example with to_ids, and drop the raw
# "messages" column once "input_ids" has been produced.
ds = load_dataset(
    "json",
    data_files={"train": TRAIN_JL, "eval": EVAL_JL},
).map(
    to_ids,
    remove_columns=["messages"],
)

# Load Mistral model
# Imported here so this cell is self-contained; the other transformers imports in
# this notebook do not include BitsAndBytesConfig.
from transformers import BitsAndBytesConfig

# Passing `load_in_4bit` / `bnb_4bit_*` directly to from_pretrained is deprecated
# (transformers prints a warning asking for a BitsAndBytesConfig instead), so the
# same 4-bit NF4 + double-quantization settings are supplied via quantization_config.
quant_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, token=HF_TOKEN,
    quantization_config=quant_cfg,
    torch_dtype=torch.float16, device_map="auto")

# Configure model for training
# Disable the generation cache on the model config; it is only useful at
# inference time and is commonly switched off when gradient checkpointing is used.
base.config.use_cache = False
# Gradient checkpointing trades extra compute for lower activation memory.
base.gradient_checkpointing_enable()
# Make the input embeddings' outputs require gradients. NOTE(review): presumably
# needed so gradients can flow through checkpointed blocks once the base weights
# are frozen by the adapter setup below -- confirm against the PEFT docs.
base.enable_input_require_grads()

# LoRA adapter settings: rank-16 adapters with alpha 32 and 5% dropout, attached
# to the four attention projection layers; bias terms are left untouched.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters described by lora_cfg.
model = get_peft_model(base, lora_cfg)

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1395 [00:00<?, ? examples/s]

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Set up training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training hyper-parameters.
# The recorded run of this notebook shows validation loss at its minimum at the
# first evaluation (step 400) and rising at every later one while training loss
# keeps falling, i.e. the final weights are the most overfit ones evaluated.
# `load_best_model_at_end` therefore restores the checkpoint with the lowest
# eval loss once training finishes, so that is what gets saved afterwards.
# It requires the save and eval schedules to line up, which they do here
# (both "steps", both every 400 steps).
# TODO: consider lowering max_steps; 2000 steps is many passes over this dataset.
args = TrainingArguments(
    output_dir=f"{WORK_DIR}/lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch size of 16 per device
    max_steps=2000,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=400,
    save_strategy="steps",
    save_steps=400,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval loss is better
    report_to="none"
)

# Collator for causal language modelling (mlm=False); batches are padded to a
# length that is a multiple of 8.
lm_collator = DataCollatorForLanguageModeling(tok, mlm=False, pad_to_multiple_of=8)

# Bundle the PEFT model, the training arguments and both dataset splits.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=lm_collator,
    eval_dataset=ds["eval"],
    train_dataset=ds["train"],
)

# Run the fine-tuning loop.
trainer.train()

# Persist the adapter weights and the tokenizer side by side in one directory.
adapter_dir = f"{WORK_DIR}/adapter"
model.save_pretrained(adapter_dir)
tok.save_pretrained(adapter_dir)



Step,Training Loss,Validation Loss
400,2.8494,0.927963
800,1.1353,1.324228
1200,0.6142,1.571713
1600,0.505,1.849846
2000,0.4352,2.016637



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the loo

('./work//adapter/tokenizer_config.json',
 './work//adapter/special_tokens_map.json',
 './work//adapter/tokenizer.model',
 './work//adapter/added_tokens.json',
 './work//adapter/tokenizer.json')