# **1 — Install Dependencies**
Installs all libraries needed for LoRA finetuning, Chroma vectorstore, PDF loading, embeddings, and FastAPI deployment.

In [1]:
!pip install -qU transformers accelerate bitsandbytes datasets peft \
sentence-transformers langchain langchain-community chromadb pypdf fastapi uvicorn

print("Dependencies installed.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7

# **2 — Imports & Global Config**
Loads all required Python packages and defines global paths, model names, and API configuration.

In [2]:
# ---------------------------
# BLOCK 2 — Imports & Config
# ---------------------------
import os
import json
from pathlib import Path

import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from datasets import load_dataset

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

# LangChain + Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# FastAPI
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel


# === Kaggle Paths ===

# Read-only dataset provided in /kaggle/input
DATASET_JSON = "/kaggle/input/chatbot/combined_dataset.json"

# Files YOU generate must be in /kaggle/working
TRAIN_JSONL = "/kaggle/working/train_mixed.jsonl"
CHROMA_DIR = "/kaggle/working/chroma_db"
LORA_OUTPUT_DIR = "/kaggle/working/mistral_lora_adapter"

# Models
MISTRAL_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
CHROMA_EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# API Token (optional, but OK for testing)
API_TOKEN = "36FiFRZl0aab1ijm8FIMqyalGk8_67W4FjZhqVDiTxx3LBmHt"

print("Imports loaded, global config set.")


2025-12-12 20:55:11.572242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765572911.807315      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765572911.856793      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Imports loaded, global config set.


# **3 — Convert Dataset JSON → JSONL**
Reads your dataset, extracts Q/A pairs, and converts them into instruction-tuning format for LoRA training.

In [3]:
# ===============================================================
# BLOCK 2 — Convert dataset JSON -> JSONL for instruction tuning
# ===============================================================

def convert_dataset_to_jsonl(input_json, output_jsonl):
    raw_data = []
    with open(input_json, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    raw_data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"⚠ Warning: Skipping malformed JSON line: {line[:100]}... Error: {e}")

    records = []
    for i, r in enumerate(raw_data):
        question = r.get("Context", "")
        answer = r.get("Response", "")
        if question and answer:
            records.append({
                "instruction": question.strip(),
                "input": "",
                "output": answer.strip(),
                "metadata": {"id": r.get("id", str(i))}
            })

    with open(output_jsonl, "w", encoding="utf-8") as f:
        for item in records:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"Converted dataset → {len(records)} records.")
    return records

records = convert_dataset_to_jsonl(DATASET_JSON, TRAIN_JSONL)


Converted dataset → 3508 records.


# **4 — Add Dataset Answers into Chroma**
Adds your dataset answers into Chroma so the chatbot retrieves knowledge from both PDF and dataset.

In [4]:
# ---------------------------
# BLOCK 5 — Add dataset answers to Chroma
# ---------------------------
from langchain_core.documents import Document # Import the Document class

# Initialize embeddings and ChromaDB
chroma_embed_model = HuggingFaceEmbeddings(model_name=CHROMA_EMBED_MODEL)
chroma_db = Chroma(
    persist_directory=CHROMA_DIR,
    embedding_function=chroma_embed_model
)

def add_records_to_chroma(chroma, records):
    # Create Document objects instead of plain dictionaries
    docs = [Document(page_content=r["output"], metadata=r["metadata"]) for r in records]
    chroma.add_documents(docs)
    chroma.persist()
    print(f"Added {len(docs)} dataset answers to Chroma.")

add_records_to_chroma(chroma_db, records)


  chroma_embed_model = HuggingFaceEmbeddings(model_name=CHROMA_EMBED_MODEL)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  chroma_db = Chroma(


Added 3508 dataset answers to Chroma.


  chroma.persist()


# **5 — Prepare Dataset & Tokenize for LoRA Training**
Tokenizes examples and masks prompt tokens for supervised instruction fine-tuning.

In [5]:
# ---------------------------
# BLOCK 6 — Tokenize dataset
# ---------------------------
def prepare_training_dataset(jsonl_path):
    tok = AutoTokenizer.from_pretrained(MISTRAL_MODEL, use_fast=True)
    tok.pad_token = tok.eos_token

    ds = load_dataset("json", data_files={"train": jsonl_path}, split="train")

    def build_example(e):
        prompt = f"### Instruction:\n{e['instruction']}\n\n### Response:\n"
        prompt_ids = tok(prompt, add_special_tokens=False)["input_ids"]
        return {"full": prompt + e["output"], "prompt_len": len(prompt_ids)}

    ds = ds.map(build_example)

    def tokenize_fn(e):
        tok_out = tok(e["full"], max_length=1024, truncation=True, padding="max_length")
        labels = tok_out["input_ids"].copy()
        for i in range(e["prompt_len"]):
            labels[i] = -100
        tok_out["labels"] = labels
        return tok_out

    tokenized = ds.map(tokenize_fn, remove_columns=ds.column_names)
    tokenized = tokenized.train_test_split(test_size=0.05)

    print("Tokenization complete.")
    return tokenized, tok

tokenized_ds, tokenizer = prepare_training_dataset(TRAIN_JSONL)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3508 [00:00<?, ? examples/s]

Map:   0%|          | 0/3508 [00:00<?, ? examples/s]

Tokenization complete.


# **6 — Train LoRA Adapter (4-bit)**
Loads Mistral-7B in 4-bit mode, attaches LoRA adapters, and fine-tunes using your dataset.

In [6]:
# ---------------------------
# BLOCK 7 — Train LoRA (Fixed)
# ---------------------------
def train_lora(tokenized_ds, tokenizer):
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        MISTRAL_MODEL, quantization_config=bnb, device_map="auto"
    )
    model = prepare_model_for_kbit_training(model)

    lora_cfg = LoraConfig(
        r=16, lora_alpha=32,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        lora_dropout=0.05,
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_cfg)

    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    args = TrainingArguments(
        output_dir=LORA_OUTPUT_DIR,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=20,
        save_strategy="epoch",
        report_to="none" # Disable Weights & Biases logging
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["test"],
        data_collator=collator
    )

    trainer.train()

    trainer.model.save_pretrained(LORA_OUTPUT_DIR)

    print("LoRA training complete and saved.")

train_lora(tokenized_ds, tokenizer)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
20,2.2507
40,2.0815
60,2.0714
80,2.0284
100,2.0229
120,2.0342
140,1.9354
160,1.9714
180,1.9137
200,1.929


  return fn(*args, **kwargs)


LoRA training complete and saved.


# **7 — Load Base Model + LoRA Adapter for Inference**
Loads Mistral-7B in 4-bit and merges LoRA weights for efficient inference.

In [7]:
# ---------------------------
# BLOCK 8 — Load model for inference
# ---------------------------
def load_inference():
    bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

    base = AutoModelForCausalLM.from_pretrained(
        MISTRAL_MODEL, quantization_config=bnb, device_map="auto"
    )

    tok = AutoTokenizer.from_pretrained(MISTRAL_MODEL, use_fast=True)
    tok.pad_token = tok.eos_token

    model = PeftModel.from_pretrained(base, LORA_OUTPUT_DIR)
    model.eval()

    print("Model + LoRA loaded for inference.")
    return model, tok

model, inference_tokenizer = load_inference()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model + LoRA loaded for inference.


In [8]:
import torch

def load_inference_universal():
    # BitsAndBytes for 4bit
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    # Auto device placement (CPU, single GPU, or multi-GPU)
    base = AutoModelForCausalLM.from_pretrained(
        MISTRAL_MODEL,
        quantization_config=bnb,
        device_map="auto"           # ← Auto-select device
    )

    tok = AutoTokenizer.from_pretrained(MISTRAL_MODEL, use_fast=True)
    tok.pad_token = tok.eos_token

    # Load LoRA on the SAME device_map as base model
    model = PeftModel.from_pretrained(
        base,
        LORA_OUTPUT_DIR,
        device_map="auto"           # ← Also auto-load
    )

    model.eval()
    print("Model + LoRA loaded on:", model.device)
    return model, tok

model, inference_tokenizer = load_inference_universal()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model + LoRA loaded on: cuda:0


In [9]:
def generate_answer(prompt):
    # Auto-detect device from model weights
    device = next(model.parameters()).device

    # Tokenize & move inputs automatically
    inputs = inference_tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=inference_tokenizer.eos_token_id,
        )

    return inference_tokenizer.decode(output[0], skip_special_tokens=True)

In [10]:
test_prompt = "Instruction: I feel depressed. How can I feel better?\nInput:\nOutput:"
print(generate_answer(test_prompt))

Instruction: I feel depressed. How can I feel better?
Input:
Output:
Psychological distress is not uncommon.  Depression is a treatable condition.  In the first instance, it is highly recommended that you seek the assistance of a licensed psychologist or therapist.  The treatment for depression is multifaceted and requires the input of a licensed professional.  Treatment can include psychotherapy, medication, and lifestyle changes.  You can start by making an appointment with your primary care provider to discuss your symptoms and get a referral to a licensed therapist.  You can also call your health insurance provider for a list of in-network therapists.  You can also visit the American Psychological Association website to find a licensed professional in your area.  http://findtherapist.psychologytoday.com/Online%20Therapy%20and%20Counseling,%20Inc./Online%20Ther


In [14]:
import shutil
import os

# المسار اللي فيه LoRA Adapter
SOURCE_DIR = "/kaggle/working/mistral_lora_adapter"

# اسم ملف الـ zip
ZIP_PATH = "/kaggle/working/mistral_lora_adapter.zip"

# لو كان في zip قديم امسحيه
if os.path.exists(ZIP_PATH):
    os.remove(ZIP_PATH)1

# اعملي zip
shutil.make_archive(
    base_name=ZIP_PATH.replace(".zip", ""),
    format="zip",
    root_dir=SOURCE_DIR
)

print("✅ mistral_lora_adapter.zip created successfully!")


✅ mistral_lora_adapter.zip created successfully!
