## Fine-tuning Llama-3.2-1B for Tech Clarity Copilot

### Package Installation, Imports

In [42]:
%%capture
"""
Installs all required dependencies for Unsloth, Transformers, TRL,
LangChain, FAISS, and document loaders. It adjusts installation behavior
depending on whether the runtime is Google Colab or a standard environment.
"""

import os, re, pprint

# Detect Colab environment and install required packages
if "COLAB_" not in "".join(os.environ.keys()):
    # For non-Colab environments, a simple install is sufficient
    !pip install unsloth --no-deps
    !pip install unsloth_zoo --no-deps
    !pip install -q sentence-transformers

else:
    # Colab requires specific dependency versions due to CUDA/PyTorch compatibility
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")

    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

# Install fixed versions for model training compatibility
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2


In [43]:
# Install supporting libraries (LangChain, FAISS, PDF/Doc loaders)
%pip install -U langchain langchain-openai langchain-azure-ai langchain-community langchain-text-splitters faiss-cpu pypdf python-docx python-dotenv

print("Step 0.1: All necessary packages are assumed to be installed.")


Step 0.1: All necessary packages are assumed to be installed.


In [44]:
# from google.colab import drive
# drive.mount('/content/drive')


### Model & Tokenizer Loading

In [45]:
"""
Loads the pre-trained Unsloth LLaMA model.
It sets key parameters such as sequence length and 4-bit quantization for memory efficiency,
then loads the model and tokenizer. Finally, it prints device and model information.
"""

from unsloth import FastLanguageModel
import torch

# Model configuration
max_seq_length = 2048  # Maximum context window size
dtype = None           # Let the model auto-select the optimal data type for GPU
load_in_4bit = True    # Use 4-bit quantization to reduce memory usage

# Load the pre-trained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Print device and basic model info
device = next(model.parameters()).device
print(f"Using device: {device}")
print(f"Model loaded successfully with {model.config.num_hidden_layers} layers")
print(f"Model vocabulary size: {len(tokenizer)}")
print(f"Context length (sequence length): {max_seq_length} tokens")


==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Using device: cuda:0
Model loaded successfully with 16 layers
Model vocabulary size: 128256
Context length (sequence length): 2048 tokens


In [46]:
# # Install necessary packages
# !pip install -U unsloth modelscope

# import os
# from unsloth import FastLanguageModel
# import torch

# # --- Optional: force 4-bit quantization for memory efficiency ---
# max_seq_length = 2048
# load_in_4bit = True

# try:
#     # Try loading from Hugging Face first
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name="unsloth/Llama-3.2-1B",
#         max_seq_length=max_seq_length,
#         load_in_4bit=load_in_4bit
#     )
#     print("‚úÖ Model loaded from Hugging Face successfully!")
# except Exception as e:
#     print(f"‚ö† Hugging Face loading failed: {e}")
#     print("üîÑ Falling back to ModelScope...")

#     # Use ModelScope as a fallback
#     os.environ['UNSLOTH_USE_MODELSCOPE'] = '1'
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name="unsloth/Llama-3.2-1B",
#         max_seq_length=max_seq_length,
#         load_in_4bit=load_in_4bit
#     )
#     print("‚úÖ Model loaded from ModelScope successfully!")

# # --- Check device and basic info ---
# device = next(model.parameters()).device
# print(f"Using device: {device}")
# print(f"Model vocabulary size: {len(tokenizer)}")
# print(f"Context length (sequence length): {max_seq_length} tokens")


### LoRA Configuration

In [47]:
"""
Defines the training configuration for fine-tuning the model,
including number of epochs, LoRA (Low-Rank Adaptation) parameters, and
general training hyperparameters like learning rate, batch size, and gradient accumulation.
"""

# Training configuration
num_epochs = 0.3  # Small dataset: few epochs to avoid overfitting

# --- LoRA (Low-Rank Adaptation) parameters ---
r = 4
lora_alpha = 8
lora_dropout = 0.1
target_modules = ["q_proj", "k_proj", "v_proj"]  # Modules to apply LoRA


# --- General training hyperparameters ---
learning_rate = 0.00002
weight_decay = 0.01
warmup_ratio = 0.1
batch_size = 2
gradient_accumulation_steps = 6


In [48]:
"""
Attach LoRA adapters to the pre-trained model to efficiently fine-tune it
on downstream tasks. LoRA allows adding trainable low-rank updates to
specific layers while keeping most of the model frozen, saving memory
and speeding up training. After adding LoRA, print summary statistics
about trainable vs total parameters.
"""

# Add LoRA adapters using Unsloth's optimized function
model = FastLanguageModel.get_peft_model(
    model,
    r = r,  # Rank of LoRA adapters; higher values increase capacity and parameter count
    target_modules = target_modules,  # Specific layers to apply LoRA (q_proj, k_proj, v_proj, etc.)
    lora_alpha = lora_alpha,  # Scaling factor for LoRA updates
    lora_dropout = 0,  # Dropout for LoRA layers; 0 for speed, higher values can reduce overfitting
    bias = "none",  # Do not train bias terms (optimized for speed)
    use_gradient_checkpointing = "unsloth",  # Saves memory by recomputing activations during backprop
    random_state = 3407,  # Seed for reproducibility
    use_rslora = False,  # Rank-stabilized LoRA; optional, adds stability but more overhead
)

# --- Print model parameter statistics ---
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Percentage of parameters being trained: {100 * trainable_params / total_params:.4f}%")


Total parameters: 775,030,784
Trainable parameters: 589,824
Percentage of parameters being trained: 0.0761%


### Data Definition

In [49]:
"""
Define a set of RAG examples.
Each example contains:
- a user question
- retrieved context relevant to that question
- a detailed response
These examples are to be used for fine-tuning.
"""

rag_examples = [
    {
        "question": "What fields are returned when fetching a contact using the Get a Contact endpoint?",
        "retrieved_context": """When you fetch a contact, the system returns a contact object with fields like:
- contactId: unique ID
- contactPhone: phone number
- firstName and lastName
- email
- birthday
- lists the contact belongs to
- customFields
- comment
- subscriptionStatus
- created and updated timestamps
- updateSource
""",
        "response": """### Summary
When you call the Get a Contact endpoint, you receive a contact object that describes who the person is, how to reach them, which lists they are on, and when their record was created or updated.

### How it works
The endpoint looks up a specific contact in the system using an identifier such as `contactId` or `contactPhone`. It then returns a structured object with details about that person, their contact information, and their list memberships.

### Key parameters / fields
- `contactId`: unique identifier for the contact.
- `contactPhone`: the contact's phone number.
- `firstName` / `lastName`: the contact's name.
- `email`: the contact's email address, if one is stored.
- `birthday`: birthday in `yyyy-mm-dd` format.
- `lists`: which lists or groups the contact belongs to.
- `customFields`: extra key/value data stored for this contact.
- `comment`: notes or internal comments about the contact.
- `subscriptionStatus`: whether the contact is opted in or out.
- `created` / `updated`: when the record was created and last changed.
- `updateSource`: how the record was last updated (for example, import or API).

### Common mistakes or troubleshooting tips
- Looking up a contact with the wrong ID or phone format.
- Assuming every field is always present (some may be optional).
- Ignoring `subscriptionStatus` and accidentally messaging contacts who are opted out.
"""
    },

    {
        "question": "Which endpoint allows you to add a contact to a list?",
        "retrieved_context": """There is an endpoint that lets you attach an existing contact to a specific list.

Example shape:
- Method: POST
- Path: /api/contact-lists/{listIdOrName}/contacts
- You provide the list identifier and the contact phone or ID.
""",
        "response": """### Summary
To add an existing contact to a list, you call the endpoint that links a contact to a specific list.

### How it works
You send a POST request to the contact-lists endpoint. In the path, you specify which list you are targeting, and in the body you include which contact you want to add. The system finds both and creates a link between that contact and that list.

### Key parameters / fields
- `listIdOrName`: identifies which list the contact should be added to.
- `contactPhoneOrId`: the contact‚Äôs phone number or unique ID.

### Common mistakes or troubleshooting tips
- Mixing up list ID and list name.
- Sending a phone number in an invalid format.
- Trying to add a contact that has not been created yet.
"""
    },

    {
        "question": "How can I update a Google Sheets cell using an automation tool like Make?",
        "retrieved_context": """Automation tools usually provide an action to update a single cell in a spreadsheet.

Typical inputs:
- Spreadsheet ID
- Sheet name
- Cell reference (for example, B2)
- New value
""",
        "response": """### Summary
You can update a specific Google Sheets cell by using an ‚ÄúUpdate cell‚Äù type module in your automation scenario.

### How it works
In your scenario, you add a module that connects to Google Sheets and choose the action that updates a single cell. You then point it to the right spreadsheet, sheet, and cell, and provide the value you want to write. When the scenario runs, it sends that update to Google Sheets.

### Key parameters / fields
- Spreadsheet ID: which Google Sheet to work with.
- Sheet name: which tab inside that spreadsheet.
- Cell reference: the exact cell to update (for example, `B2`).
- Value: the new content that should be written to that cell.

### Common mistakes or troubleshooting tips
- Selecting the wrong spreadsheet or sheet name.
- Typing the cell reference incorrectly (for example, `2B` instead of `B2`).
- Not authorizing the automation tool to access the Google account.
"""
    },

    {
        "question": "What happens if I try to fetch a contact that does not exist?",
        "retrieved_context": """If the contact ID does not exist, the API returns a 404 Not Found error with a JSON body like:
{"error": "Contact not found"}""",
        "response": """### Summary
If you try to fetch a contact that does not exist, the API returns an error instead of a contact object.

### How it works
The API looks up the contact by ID. If it cannot find a record with that ID, it responds with an HTTP 404 status code and a short error message in the body.

### Key parameters / fields
- HTTP status: `404 Not Found` to indicate the contact does not exist.
- Error body: often includes a field like `error` or `message` explaining that the contact was not found.

### Common mistakes or troubleshooting tips
- Using the wrong or outdated contact ID.
- Testing with an ID that only exists in a different environment.
- Forgetting to handle 404 responses gracefully in client code or automations.
"""
    },

    {
        "question": "Which endpoint allows you to get a specific contact list?",
        "retrieved_context": """### Endpoint
```
GET /api/contacts/{contactIdOrNumber}
```

### Tags
`simpletexting` `contacts` `get` `path-parameter`

## Get All Contacts
Retrieve **all contacts** for a given account.
Results are **paginated**.

Example use case:
Fetch all contacts created or updated since April 28, 2021.

### Example Request
```http
GET https://api-app2.simpletexting.com/v2/api/contacts?page=100&size=2&since=2021-04-28T23:20:08.489Z&direction=ASC
```

### Authorization
* `api_key` required""",
        "response": """The retrieved context does not provide information about an endpoint that allows you to get a specific contact list directly. It only details how to retrieve all contact lists with the endpoint:\n\n```\nGET /api/contact-lists\n```\n\nwhich returns all lists in a paginated format.  \nThere is no mention of an endpoint for fetching a single contact list by its ID or name.
"""
    },

    {
        "question": "How do I fetch all contacts updated since a specific date?",
        "retrieved_context": """### Endpoint
```
GET /api/contacts/{contactIdOrNumber}
```

### Tags
`simpletexting` `contacts` `get` `path-parameter`

## Get All Contacts
Retrieve **all contacts** for a given account.
Results are **paginated**.

Example use case:
Fetch all contacts created or updated since April 28, 2021.

### Example Request
```http
GET https://api-app2.simpletexting.com/v2/api/contacts?page=100&size=2&since=2021-04-28T23:20:08.489Z&direction=ASC
```

### Authorization
* `api_key` required
        """,
        "response": """To fetch all contacts updated since a specific date, you should use the \"Get All Contacts\" endpoint with the `since` query parameter. \n\nHere‚Äôs how to do it:\n\n1. Make a GET request to:\n```\nhttps://api-app2.simpletexting.com/v2/api/contacts\n```\n\n2. Include the `since` parameter in ISO 8601 format, specifying your desired date. For example:\n```\n?since=2021-04-28T23:20:08.489Z\n```\n\n3. Optionally, you can set other parameters like `page`, `size`, and `direction` to control pagination and sorting.\n\n**Example full request:**\n```http\nGET https://api-app2.simpletexting.com/v2/api/contacts?since=2021-04-28T23:20:08.489Z&size=50&direction=ASC\n```\n\n4. Ensure you include your API token in the `Authorization` header:\n```\nAuthorization: Bearer YOUR_API_KEY_HERE\n```\n\nThis will return a paginated list of contacts updated since the specified date.
"""
    },

    {
        "question": "How do I retrieve all contact lists in my account?",
        "retrieved_context": """### Responses
**200 Success** Returns all lists in the account.

```json
{
  "content": [
    {}
  ],
  "totalPages": 0,
  "totalElements": 0
}
```

### Response Fields
| Field           | Type                      | Description                                                  |
| --------------- | ------------------------- | ------------------------------------------------------------ |
| `content`       | array of objects (`List`) | Lists returned in the current page. Count depends on `size`. |
| `totalPages`    | integer                   | Total number of pages based on element count and page size.  |
| `totalElements` | integer                   | Total number of lists in the account.                        |

### Endpoint
```
GET /api/contact-lists
```

### Tags
`simpletexting` `lists` `get` `pagination`

## Add Contact To List
Add a contact to a specified contact list in your SimpleTexting account.

### Authorization
- `api_key` required
        """,
        "response": """To retrieve all contact lists in your account, you should use the \"Get All Lists\" API endpoint.\n\nHere's how:\n\n1. Send a GET request to:\n```\nhttps://api-app2.simpletexting.com/v2/api/contact-lists\n```\n\n2. You can include optional query parameters `page` and `size` for pagination:\n- `page`: zero-based page number (default is 0).\n- `size`: number of lists per page (maximum 500).\n\n3. Make sure to include your Bearer token in the request header:\n```\nAuthorization: Bearer YOUR_API_KEY_HERE\n```\n\nExample request:\n```http\nGET https://api-app2.simpletexting.com/v2/api/contact-lists?page=0&size=50\nAuthorization: Bearer YOUR_API_KEY_HERE\n```\n\nThe response will contain a list of contact lists under the `content` field, along with pagination details:\n\n```json\n{\n  \"content\": [/* array of list objects */],\n  \"totalPages\": 1,\n  \"totalElements\": 10\n}\n```\n\nThis will give you all the contact lists in your account.
"""
    },

    {
        "question": "What response do I get if my API token is missing or invalid?",
        "retrieved_context": """Your API token can be found in your SimpleTexting **Account Settings**.

**Important:**
Keep your bearer token secure. Do **not** store it in public code repositories (e.g., GitHub) or expose it in client-side code.

## How Authentication Works
SimpleTexting uses **Bearer Authentication** (token-based authentication).

A *bearer token* is a secure, randomly generated string issued by the server.
The token must be included in every protected request:

- Authorization: Bearer <your_token_here>

"Bearer" simply means: **anyone who possesses this token is granted access**.

### Security Scheme
| Field | Value |
|-------|--------|
| **Scheme Type** | API Key |
| **Location** | Header |
| **Header Name** | `Authorization` |
...
### Tags
`authentication` `security` `bearer` `api_key`
        """,
        "response": """The context provided does not specify the exact response you would receive if your API token is missing or invalid. Therefore, I don't have that information.
"""
    },

    {
        "question": "What happens if the Google Sheet name I provide doesn‚Äôt exist?",
        "retrieved_context": """### Connection
Create a connection to your Google Sheets account.

### Input Parameters
| Field         | Type   | Required | Example / Notes | Description |
| ------------- | ------ | -------- | --------------- | ----------- |
| Search Method | string | yes      | `Search by path` / `Select from all` / `Enter manually` | Choose the spreadsheet and sheet to clear. |
| Spreadsheet ID| string | yes      | `abc1234567` (from URL: `https://docs.google.com/spreadsheets/d/abc1234567/edit#gid=0`) | Spreadsheet ID to clear values from. |
| Sheet Name    | string | yes      | `Sheet1` | Name of the sheet to clear. |
| Range         | string | yes      | `A1:D25` | Range to clear values from. |

### Notes
- Ensure the correct spreadsheet and sheet are selected.
- Range values will be permanently cleared from the sheet.

### Tags
`make` `google_sheets` `clear-range` `write`
        """,
        "response": """The context does not provide information about what happens if the Google Sheet name you provide doesn‚Äôt exist.
"""
    },

    {
        "question": "How do I clear values from a specific range in a Google Sheet?",
        "retrieved_context": """### Connection
Create a connection to your Google Sheets account.

### Input Parameters
| Field         | Type   | Required | Example / Notes | Description |
| ------------- | ------ | -------- | --------------- | ----------- |
| Search Method | string | yes      | `Search by path` / `Select from all` / `Enter manually` | Choose the spreadsheet and sheet to clear. |
| Spreadsheet ID| string | yes      | `abc1234567` (from URL: `https://docs.google.com/spreadsheets/d/abc1234567/edit#gid=0`) | Spreadsheet ID to clear values from. |
| Sheet Name    | string | yes      | `Sheet1` | Name of the sheet to clear. |
| Range         | string | yes      | `A1:D25` | Range to clear values from. |

### Notes
- Ensure the correct spreadsheet and sheet are selected.
- Range values will be permanently cleared from the sheet.

### Tags
`make` `google_sheets` `clear-range` `write`
        """,
        "response": """To clear values from a specific range in a Google Sheet, you need to use the \"Clear Values from a Range\" module in Make.com. Here are the steps:\n\n1. Establish a connection to your Google Sheets account.\n2. Choose the \"Clear Values from a Range\" module.\n3. Provide the required input parameters:\n   - **Search Method:** Select how you'll specify the spreadsheet (e.g., \"Search by path,\" \"Select from all,\" or \"Enter manually\").\n   - **Spreadsheet ID:** Enter the ID of your Google Spreadsheet (found in the URL).\n   - **Sheet Name:** Enter the name of the sheet within the spreadsheet.\n   - **Range:** Specify the range of cells you want to clear, e.g., `A1:D25`.\n4. Run the scenario, and the specified range will be cleared of all values.\n\nThis operation will permanently remove the data within the specified range.
"""
    },

    {
        "question": "What happens if I try to clear values from a read-only spreadsheet?",
        "retrieved_context": """### Value Render Option Details
- **Formatted value:** Returns calculated and formatted values according to spreadsheet locale.
- **Unformatted value:** Returns calculated values without formatting.
- **Formula:** Returns cell formulas instead of calculated values.

### Date and Render Option Details
- **Serial number:** Date/time/duration as numbers (days since Dec 30, 1899, fractional portion = time).
- **Formatted string:** Date/time/duration returned as string formatted according to spreadsheet locale.

### Tags
`make` `google_sheets` `get-range` `read`

## Module: Clear Values from a Range
Clears a specific range of values from a Google Sheet.

### Connection
Create a connection to your Google Sheets account.
        """,
        "response": """The provided context does not specify what happens if you try to clear values from a read-only spreadsheet.
"""
    },
]


In [50]:
# Convert the RAG examples into a HuggingFace Dataset to allow for easier integration with training and evaluation pipelines.
from datasets import Dataset
dataset = Dataset.from_list(rag_examples)


# Inspect the dataset to ensure it was loaded correctly.
# Prints the first few examples with truncated context and response for readability.
print("\nSample examples:")
for i in range(min(3, len(dataset))):
    print(f"\nExample {i+1}:")
    print("Q:", dataset[i]["question"])
    print("Retrieved Context:", dataset[i]["retrieved_context"][:200], "...")
    print("Response:", dataset[i]["response"][:200], "...")



Sample examples:

Example 1:
Q: What fields are returned when fetching a contact using the Get a Contact endpoint?
Retrieved Context: When you fetch a contact, the system returns a contact object with fields like:
- contactId: unique ID
- contactPhone: phone number
- firstName and lastName
- email
- birthday
- lists the contact belo ...
Response: ### Summary
When you call the Get a Contact endpoint, you receive a contact object that describes who the person is, how to reach them, which lists they are on, and when their record was created or up ...

Example 2:
Q: Which endpoint allows you to add a contact to a list?
Retrieved Context: There is an endpoint that lets you attach an existing contact to a specific list.

Example shape:
- Method: POST
- Path: /api/contact-lists/{listIdOrName}/contacts
- You provide the list identifier an ...
Response: ### Summary
To add an existing contact to a list, you call the endpoint that links a contact to a specific list.

### How it works
You send a 

### Data Formatting

In [51]:
"""
Load a chat template compatible with Llama-3 to structure conversations consistently
for training.
"""
from unsloth import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",
)


In [52]:
"""
Transforms RAG examples into a conversation format expected by Llama-3 SFT.
Includes a system prompt, user message with retrieved context, and assistant response.
"""
def convert_to_conversation_format(example):
    system_msg = (
        "You are Tech Clarity Copilot, an assistant that explains technical "
        "concepts clearly for non-technical users. Always structure your answer "
        "in four markdown sections:\n"
        "1. Summary\n"
        "2. How it works\n"
        "3. Key parameters / fields\n"
        "4. Common mistakes or troubleshooting tips"
    )

    user_msg = f"""Retrieved Documentation:
{example['retrieved_context']}

User Question:
{example['question']}
"""

    return {
        "conversations": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": example["response"]},
        ]
    }

print("\nConverting dataset to conversation format...")
formatted_dataset = dataset.map(convert_to_conversation_format)



Converting dataset to conversation format...


Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [53]:
"""
Applies the Llama-3 chat template to the conversation format to produce a single
text string for each sample suitable for supervised fine-tuning.
"""
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

print("Applying chat template...")
processed_dataset = formatted_dataset.map(formatting_prompts_func, batched=True)


Applying chat template...


Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [54]:
"""
Inspect one processed sample to verify the conversation has been correctly
converted to text suitable for training.
"""
import pprint

print("\nExample of processed SFT training sample:\n" + "-"*50)
pprint.pprint(processed_dataset[0])



Example of processed SFT training sample:
--------------------------------------------------
{'conversations': [{'content': 'You are Tech Clarity Copilot, an assistant '
                               'that explains technical concepts clearly for '
                               'non-technical users. Always structure your '
                               'answer in four markdown sections:\n'
                               '1. Summary\n'
                               '2. How it works\n'
                               '3. Key parameters / fields\n'
                               '4. Common mistakes or troubleshooting tips',
                    'role': 'system'},
                   {'content': 'Retrieved Documentation:\n'
                               'When you fetch a contact, the system returns a '
                               'contact object with fields like:\n'
                               '- contactId: unique ID\n'
                               '- contactPhone: phone number\n

### Base Model Evaluation

In [55]:
"""
Defines a function to generate structured technical explanations using the
fine-tuned Llama model.
"""

from unsloth import FastLanguageModel
from transformers import TextStreamer

# Ensure the model is in inference mode
FastLanguageModel.for_inference(model)

def generate_tech_response(question, retrieved_context, max_new_tokens=160):
    """
    Generate a Tech Clarity Copilot style explanation given a question and
    relevant retrieved documentation.
    """

    # SYSTEM message instructs the assistant on style and structure
    system_msg = (
        "You are Tech Clarity Copilot, an assistant that explains technical concepts "
        "clearly for non-technical users. Always structure your answer in four "
        "markdown sections with headings:\n"
        "1. Summary\n"
        "2. How it works\n"
        "3. Key parameters / fields\n"
        "4. Common mistakes or troubleshooting tips"
    )

    # USER message includes only the retrieved documentation and question
    user_msg = f"""Retrieved Documentation:
{retrieved_context}

User Question:
{question}
"""

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]

    # Convert conversation into model-ready input tensors
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)

    # Generate output tokens from the model deterministically
    output_ids = model.generate(
        input_ids=inputs,
        # streamer=text_streamer,  # uncomment if you want to see tokens stream out
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,           # deterministic, less derpy
        temperature=0.2,
        top_p=0.9,
        no_repeat_ngram_size=4,    # reduce ‚Äúsame sentence 20 times‚Äù behavior
        repetition_penalty=1.1,
        use_cache=True,
    )

    # Remove input tokens from the output to get only the generated response
    generated_tokens = output_ids[0, inputs.shape[1]:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text.strip()


In [56]:
"""
Generates responses from the base model for the first few RAG examples and
compares them against the reference responses to inspect quality.
"""

print("Base model performance on sample Tech Clarity tasks:")
for idx in [0, 1, 2]:
    question = dataset[idx]["question"]
    context = dataset[idx]["retrieved_context"]
    reference_response = dataset[idx]["response"]

    print("\n\n" + "="*80)
    print(f"Question: {question}")
    print("\n\nBase model response:")
    print(generate_tech_response(question, context))
    print("\n\nReference response:")
    print(reference_response)
    print("="*80)


Base model performance on sample Tech Clarity tasks:


Question: What fields are returned when fetching a contact using the Get a Contact endpoint?


Base model response:
Answer:
The following fields are returned by default:

contactId: unique identifier of the contact
contactPhone: phone numbers
firstName and lastName: first and last name
email: email address
birthday: date of birth
lists the contact belongs too
customFields: additional information about the contact
comment: any comments made about this contact
subscriptionStatus: status of the contact's subscriptions
created and updated timestamps


Reference response:
### Summary
When you call the Get a Contact endpoint, you receive a contact object that describes who the person is, how to reach them, which lists they are on, and when their record was created or updated.

### How it works
The endpoint looks up a specific contact in the system using an identifier such as `contactId` or `contactPhone`. It then returns a structured obj

### SFT Training and Setup

In [57]:
"""
This section initializes the SFTTrainer for supervised fine-tuning (SFT) of the model
on the processed dataset. It calculates effective training steps, sets hyperparameters,
and configures the trainer to train using only the response portion of the conversation.
"""

from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
import math

# Calculate training steps based on dataset size, epochs, and batch setup
effective_batch_size = batch_size * gradient_accumulation_steps
num_training_steps = math.ceil((len(processed_dataset) * num_epochs) / effective_batch_size)
warmup_steps = int(num_training_steps * warmup_ratio)

# Initialize the SFT trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=processed_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_epochs,
        warmup_steps=warmup_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        seed=3407,
        output_dir="outputs",
        report_to="none",
        logging_steps=10,
        save_steps=100,
        save_total_limit=3,
    ),
)

# Configure trainer to only train on the assistant's responses (ignoring system/user prompts)
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)


num_proc must be <= 11. Reducing num_proc to 11 for dataset of size 11.


Unsloth: Tokenizing ["text"] (num_proc=11):   0%|          | 0/11 [00:00<?, ? examples/s]

num_proc must be <= 11. Reducing num_proc to 11 for dataset of size 11.


Map (num_proc=11):   0%|          | 0/11 [00:00<?, ? examples/s]

### Training & Saving

In [58]:
"""
Runs the training process on the SFTTrainer and saves the final
model and tokenizer to the specified output directory for future use.
"""

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("outputs/final_model")
tokenizer.save_pretrained("outputs/final_model")


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 6
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 6 x 1) = 12
 "-____-"     Trainable parameters = 589,824 of 1,236,404,224 (0.05% trained)


Step,Training Loss


('outputs/final_model/tokenizer_config.json',
 'outputs/final_model/special_tokens_map.json',
 'outputs/final_model/chat_template.jinja',
 'outputs/final_model/tokenizer.json')

### Fine-Tuned Evaluation

In [59]:
"""
Enable inference mode for the fine-tuned model and compare its responses
to reference answers for a few sample tasks from the training set.
"""

# Enable inference mode for the fine-tuned model
FastLanguageModel.for_inference(model)

print("Fine-tuned Tech Clarity Copilot performance on the same sample tasks:")

sample_indices = [0, 1, 2]
for idx in sample_indices:
    question = dataset[idx]["question"]
    context = dataset[idx]["retrieved_context"]
    reference_response = dataset[idx]["response"]

    print("\n" + "="*80)
    print(f"Question: {question}")

    print("\nFine-tuned model's response:")
    fine_tuned_response = generate_tech_response(question, context)
    print(fine_tuned_response)

    print("\nReference response:")
    print(reference_response)

# Define new technical questions / documentation
new_tech_questions = [
    {
        "question": "How do I create a new contact list in SimpleTexting?",
        "retrieved_context": "### Create Contact List Endpoint\nPOST /api/contact-lists\nParameters:\n- name: string, required, name of the contact list\n- description: string, optional, description of the list\n- api_key: required\n..."
    },
    {
        "question": "How can I update a Google Sheets cell using Make modules?",
        "retrieved_context": "### Update Cell in Google Sheets\nModule: Update a Cell\nInput parameters:\n- Spreadsheet ID\n- Sheet Name\n- Cell Range (e.g., 'B2')\n- Value\n..."
    },
    {
        "question": "How do I remove a contact from multiple lists at once?",
        "retrieved_context": "### Remove Contact From Lists\nEndpoint: DELETE /api/contact-lists/{listIdOrName}/contacts\nPath parameters:\n- contactId\n- listIds\n..."
    },
    {
        "question": "What happens if I try to fetch a contact that doesn't exist?",
        "retrieved_context": "### Get a Contact Endpoint\nIf the contact ID does not exist, the API returns a 404 Not Found error with a JSON message: {\"error\": \"Contact not found\"}..."
    }
]

print("\nFine-tuned Tech Clarity Copilot performance on new technical questions:")
for i, q in enumerate(new_tech_questions):
    print(f"\n\n{'='*80}")
    print(f"New Question {i+1}: {q['question']}")

    print("\nGenerated explanation:")
    generated_response = generate_tech_response(q['question'], q['retrieved_context'], max_new_tokens=160)
    print(generated_response)


Fine-tuned Tech Clarity Copilot performance on the same sample tasks:

Question: What fields are returned when fetching a contact using the Get a Contact endpoint?

Fine-tuned model's response:
Answer:
The following fields are returned by default:

contactId: unique identifier of the contact
contactPhone: phone numbers
firstName and lastName: first and last name
email: email address
birthday: date of birth
lists the contact belongs too
customFields: additional information about the contact
comment: any comments made about this contact
subscriptionStatus: status of the contact's subscriptions
created and updated timestamps

Reference response:
### Summary
When you call the Get a Contact endpoint, you receive a contact object that describes who the person is, how to reach them, which lists they are on, and when their record was created or updated.

### How it works
The endpoint looks up a specific contact in the system using an identifier such as `contactId` or `contactPhone`. It then re

## RAG Section

In [60]:
"""
0. IMPORTS & SETUP
Load necessary libraries for document handling, LangChain, vector stores, and SFT model.
"""
import os
from dotenv import load_dotenv

load_dotenv()
from pathlib import Path
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from unsloth import FastLanguageModel # SFT model imports

"""
1. LOAD KNOWLEDGE BASE
Read documents (txt, md, pdf, docx) and split into chunks for retrieval.
"""
def load_text_from_file(path: str) -> str:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")
    suffix = p.suffix.lower()
    if suffix in [".txt", ".md"]:
        return p.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        from pypdf import PdfReader
        reader = PdfReader(str(p))
        return "\n".join([page.extract_text() or "" for page in reader.pages])
    if suffix in [".docx", ".doc"]:
        import docx
        doc = docx.Document(str(p))
        return "\n".join([para.text for para in doc.paragraphs])
    raise ValueError(f"Unsupported file type: {suffix}")

INPUT_FILE = "./tech_clarity_copilot_rag_doc.md"
raw_knowledge_base = load_text_from_file(INPUT_FILE)
docs = [Document(page_content=raw_knowledge_base)]


"""
2. DOCUMENT CHUNKING
"""
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 150
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
split_documents = text_splitter.split_documents(docs)


"""
3. EMBEDDINGS & VECTOR STORE
Create embeddings using Azure OpenAI and store document vectors in FAISS.
"""
from langchain_openai import AzureOpenAIEmbeddings

AZURE_OPENAI_BASE = f"https://aistudio-apim-ai-gateway02.azure-api.net/MIS372T/v1"
EMBED_DEPLOYMENT = "text-embedding-3-small"
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
APIM_HEADERS = {"Ocp-Apim-Subscription-Key": AZURE_OPENAI_API_KEY}

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=AZURE_OPENAI_BASE,
    azure_deployment=EMBED_DEPLOYMENT,
    api_version="2023-05-15",
    openai_api_key=AZURE_OPENAI_API_KEY,
    default_headers=APIM_HEADERS,
)

"""
4. RETRIEVER SETUP
"""
vector_store = FAISS.from_documents(split_documents, embeddings)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})


"""
5. LOAD FINE-TUNED SFT MODEL
Load the LoRA/SFT checkpoint and enable inference mode.
"""
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="outputs",                # Path to your fine-tuned SFT/LoRA checkpoint
#     base_model="unsloth/Llama-3.2-1B",  # Same base model used for training
#     max_seq_length=2048,
#     load_in_4bit=True
# )
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="outputs/final_model",
    max_seq_length=2048,
    load_in_4bit=False
)
FastLanguageModel.for_inference(model)

"""
6. PROMPT TEMPLATES
Define RAG-based and general (non-RAG) prompt templates.
"""
RAG_PROMPT_TEMPLATE = """
ROLE:
You are a helpful assistant for question-answering tasks.

INSTRUCTION:
Use the following retrieved context to answer the question.

RESPONSE PROTOCOL:
If the answer is not in the context, say you don't have that information.

CONTEXT:
{context}

QUESTION:
{question}
"""
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

NON_RAG_PROMPT_TEMPLATE = """
QUESTION:
{question}

Please answer the user's question based on your general knowledge.
"""
non_rag_prompt = ChatPromptTemplate.from_template(NON_RAG_PROMPT_TEMPLATE)

"""
7. HELPER FUNCTIONS
Functions to handle SFT inference, formatting retrieved docs, and asking questions with or without RAG.
"""
import torch

def llm_inference_wrapper(prompt_dict):
    """
    Generate text from your fine-tuned SFT model using standard Hugging Face generate().
    """
    question = prompt_dict.get("question", "")
    context = prompt_dict.get("context", "")
    full_prompt = f"Context:\n{context}\n\nQuestion:\n{question}\nAnswer:"

    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.2,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    answer = decoded[len(full_prompt):].strip()
    return answer


def format_docs(docs):
    """Concatenate document contents for context"""
    return "\n\n".join([doc.page_content for doc in docs])

def ask_question(question: str, use_rag=True):
    if use_rag:
        retrieved_docs = retriever.invoke(question)
        context = format_docs(retrieved_docs)
        return llm_inference_wrapper({"context": context, "question": question})
    else:
        return llm_inference_wrapper({"context": "", "question": question})

"""
6. DEMO
Run example questions through the system using RAG-based retrieval and SFT model.
"""
demo_questions = [
    "What fields are returned when fetching a contact using the Get a Contact endpoint?",
    "How do I clear values from a specific range in a Google Sheet?",
    "What happens if I try to clear values from a read-only spreadsheet?",
]

for q in demo_questions:
    print(f"\n\n=== Question ===\n{q}\n")
    print("RAG-based answer:")
    print(ask_question(q, use_rag=True))
    # print("\nNon-RAG answer:")
    # print(ask_question(q, use_rag=False))


==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


=== Question ===
What fields are returned when fetching a contact using the Get a Contact endpoint?

RAG-based answer:
The response contains the following fields:
* `contactId` (string)
* `contactPhone` (string)
* `firstName` (string)
* `lastName` (string)
* `email` (string)
* `birthday` (string)
* `lists` (array of objects (`ContactList`))
* `customFields` (object)
* `comment` (string)
* `subscriptionStatus` (enum)
* `created` (string)
* `updated` (string)
* `updateSource` (enum)

### Authorization
* `api_key` required

### Path P

In [61]:
# from google.colab import drive
# drive.mount('/content/drive')


In [62]:
# Check the model type
print(model.__class__)


<class 'peft.peft_model.PeftModelForCausalLM'>


## Extra: Evaluation of KPIs for SFT and RAG + SFT models

In [41]:
# ================================
# KPI EVALUATION FOR SFT + RAG
# ================================

import re
import numpy as np
from sklearn.metrics import accuracy_score

# -------------------------------
# 1. STRUCTURE COMPLIANCE KPI
# -------------------------------
def structure_compliance_score(text):
    sections = ["Summary", "How it works", "Key parameters", "Common mistakes"]
    score = sum([1 for s in sections if s.lower() in text.lower()])
    return score / 4.0


# -------------------------------
# 2. GROUNDEDNESS KPI
# -------------------------------
def groundedness_score(answer, context):
    context_tokens = set(context.lower().split())
    answer_tokens = set(answer.lower().split())
    overlap = context_tokens.intersection(answer_tokens)
    return len(overlap) / max(1, len(answer_tokens))


# -------------------------------
# 3. HALLUCINATION RATE (inverse groundedness)
# -------------------------------
def hallucination_score(answer, context):
    return 1 - groundedness_score(answer, context)


# -------------------------------
# 4. READABILITY KPI (sentence simplicity)
# -------------------------------
def readability_score(text):
    sentences = re.split(r'[.!?]', text)
    avg_len = np.mean([len(s.split()) for s in sentences if s.strip()])
    return max(0, 1 - avg_len / 40)  # normalized readability


# -------------------------------
# 5. ENDPOINT ACCURACY KPI
# -------------------------------
def endpoint_accuracy(answer, expected_endpoint):
    return int(expected_endpoint.lower() in answer.lower())


# -------------------------------
# 6. MASTER KPI EVALUATION
# -------------------------------
def evaluate_kpis(dataset, use_rag=True):
    results = []

    for item in dataset:
        question = item["question"]
        context = item["retrieved_context"]
        reference = item["response"]

        generated = ask_question(question, use_rag=use_rag)

        kpis = {
            "structure": structure_compliance_score(generated),
            "groundedness": groundedness_score(generated, context),
            "hallucination": hallucination_score(generated, context),
            "readability": readability_score(generated),
            "endpoint_accuracy": endpoint_accuracy(
                generated,
                expected_endpoint="/api/contact"  # adjust per task if needed
            ),
        }

        results.append(kpis)

    return {
        key: float(np.mean([r[key] for r in results]))
        for key in results[0]
    }


# ================================
# KPI RESULTS
# ================================
print("\n===== KPI RESULTS: RAG + SFT =====")
rag_kpis = evaluate_kpis(dataset, use_rag=True)
for k, v in rag_kpis.items():
    print(f"{k.upper():<20}: {v:.3f}")

print("\n===== KPI RESULTS: SFT ONLY (NO RAG) =====")
nonrag_kpis = evaluate_kpis(dataset, use_rag=False)
for k, v in nonrag_kpis.items():
    print(f"{k.upper():<20}: {v:.3f}")



===== KPI RESULTS: RAG + SFT =====
STRUCTURE           : 0.000
GROUNDEDNESS        : 0.212
HALLUCINATION       : 0.788
READABILITY         : 0.664
ENDPOINT_ACCURACY   : 0.455

===== KPI RESULTS: SFT ONLY (NO RAG) =====


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


STRUCTURE           : 0.000
GROUNDEDNESS        : 0.176
HALLUCINATION       : 0.824
READABILITY         : 0.434
ENDPOINT_ACCURACY   : 0.000
