In [None]:
pip install torch transformers peft accelerate datasets jsonlines

In [None]:
!pip install sentence-transformers
!pip install chromadb
!pip install transformers
!pip install torch
!pip install datasets
!pip install sentence-transformers chromadb transformers torch datasets

In [None]:
%%writefile finetune_tinylama_dop.py

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import jsonlines
import os

# --- Configuration ---
# Hugging Face model id of the base checkpoint that gets fine-tuned.
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Instruction/output pairs, one JSON object per line.
DATASET_PATH = "/kaggle/input/dop-dataset/Dataset/combined_dataset.jsonl"
# Trainer output directory (checkpoints are written below it).
OUTPUT_ADAPTERS_DIR = "./fine_tuned_tinylama_dop_adapters"

# --- ENVIRONMENT CONFIGURATIONS ---
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# NOTE: CUDA_VISIBLE_DEVICES is deliberately left untouched so that every GPU
# stays visible for multi-GPU runs.

# Pick the training device once and report what was found.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cuda":
    print(f"CUDA is available. Training will use GPU(s). Total GPUs detected: {torch.cuda.device_count()}")
else:
    print("WARNING: CUDA is not available. Training will be done on CPU, which will be very slow.")

# --- 1. Load and Prepare Dataset ---
def load_and_prepare_dataset(tokenizer_obj: AutoTokenizer, dataset_file: str):
    """Load a .jsonl instruction dataset, tokenize it and split it 90/10.

    Each record must provide ``"instruction"`` and ``"output"`` keys; they are
    rendered into a single ``<s>[INST] ... [/INST] ...</s>`` training string.

    Args:
        tokenizer_obj: Tokenizer used to encode the formatted strings.
        dataset_file: Path to the JSON-lines file to read.

    Returns:
        A ``(train_dataset, validation_dataset)`` tuple of tokenized datasets
        with ``input_ids``, ``attention_mask`` and ``labels`` columns.

    Raises:
        KeyError: If a record lacks ``"instruction"`` or ``"output"``.
    """
    # NOTE(review): the tokenizer may prepend its own BOS token in addition to
    # the literal "<s>" below — confirm whether a doubled BOS is intended.
    with jsonlines.open(dataset_file) as reader:
        formatted_texts = [
            f"<s>[INST] {item['instruction']} [/INST] {item['output']}</s>"
            for item in reader
        ]

    full_dataset = Dataset.from_dict({"text": formatted_texts})

    def tokenize_function(examples):
        tokenized_output = tokenizer_obj(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )
        # Causal-LM labels mirror the inputs, except on padding: every sequence
        # is padded to 512 tokens, and without masking the loss would be
        # dominated by pad positions. -100 is the index the loss ignores.
        # The attention mask (not the pad id) drives the masking so a genuine
        # EOS token is still learned when pad and EOS share an id.
        tokenized_output["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(
                tokenized_output["input_ids"], tokenized_output["attention_mask"]
            )
        ]
        return tokenized_output

    tokenized_dataset = full_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Fixed seed keeps the train/validation split reproducible across runs.
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

    print(f"Dataset loaded from {dataset_file}")
    print(f"Training set size: {len(split_dataset['train'])}")
    print(f"Validation set size: {len(split_dataset['test'])}")

    return split_dataset['train'], split_dataset['test']

# --- 2. Load Tokenizer and Base Model ---
print(f"Loading tokenizer for {BASE_MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Remember whether a pad token had to be invented: only a brand-new token
# needs its embedding row initialised below.
pad_token_added = tokenizer.pad_token is None
if pad_token_added:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Added [PAD] token to tokenizer.")

print(f"Loading base model {BASE_MODEL_ID}...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    # device_map not set here; accelerate handles it for multi-GPU
)

if tokenizer.pad_token_id is not None:
    model.resize_token_embeddings(len(tokenizer))
    if pad_token_added:
        # Seed the new [PAD] row from EOS. This is skipped when the tokenizer
        # already shipped a pad token, so a pretrained embedding row is never
        # overwritten.
        with torch.no_grad():
            embedding_matrix = model.model.embed_tokens.weight.data
            embedding_matrix[tokenizer.pad_token_id] = embedding_matrix[tokenizer.eos_token_id]

# --- 3. Configure LoRA ---
print("Configuring LoRA...")
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

if DEVICE == "cuda":
    model.enable_input_require_grads()
    model.gradient_checkpointing_enable()
    model.config.use_cache = False
    print("Gradient checkpointing enabled and use_cache set to False. Input gradients enabled.")

model.print_trainable_parameters()

# --- 4. Load and Prepare Training & Validation Datasets ---
train_dataset, eval_dataset = load_and_prepare_dataset(tokenizer, DATASET_PATH)

# --- 5. Define Training Arguments ---
print("Defining training arguments for optimal multi-GPU training...")
use_bf16 = False
use_fp16 = False
if DEVICE == "cuda":
    if torch.cuda.get_device_properties(0).major >= 8:  # Ampere or newer (RTX 30xx, A100, H100)
        use_bf16 = True
        print("Using bfloat16 (BF16) precision for training.")
    else:  # Older GPUs like T4, P100 (Volta, Pascal, etc.)
        use_fp16 = True
        print("Using float16 (FP16) precision for training.")

training_args = TrainingArguments(
    output_dir=OUTPUT_ADAPTERS_DIR,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=4,
    logging_dir="./logs_tinylama_dop",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    report_to="tensorboard",
    bf16=use_bf16,
    fp16=use_fp16,
    overwrite_output_dir=True,
    gradient_checkpointing=True if DEVICE == "cuda" else False,
    ddp_find_unused_parameters=False,

    # --- Arguments for Validation ---
    eval_strategy="steps",
    eval_steps=50,  # Evaluate every 50 steps
    per_device_eval_batch_size=4,
)

# --- 6. Create Trainer and Start Training ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Pass in the validation dataset
    tokenizer=tokenizer,
)

print("\nStarting fine-tuning process...")
trainer.train()
# Persist the final state explicitly. Otherwise only the periodic
# checkpoint-N subfolders exist and the message below would point at a
# directory that holds no adapter files of its own.
trainer.save_model(OUTPUT_ADAPTERS_DIR)
print("\nFine-tuning complete. LoRA adapters saved to:", OUTPUT_ADAPTERS_DIR)

# --- End of Part 1 ---
print("\n-----------------------------------------------------")
print("Fine-tuning (Part 1) complete. LoRA adapters saved.")
print("Proceed to Part 2 to merge adapters and save the full model.")
print("-----------------------------------------------------")

In [None]:
!accelerate launch finetune_tinylama_dop.py

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, set_peft_model_state_dict, PeftModel
import os
# --- ADDED IMPORT ---
import safetensors.torch # Import the safetensors library
# --- END ADDED IMPORT ---

# --- Configuration (Must match Part 1's configuration) ---
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Directory where LoRA adapters were saved by Part 1
OUTPUT_ADAPTERS_DIR = "./fine_tuned_tinylama_dop_adapters/checkpoint-80" # Confirmed correct path
# Directory where the final merged fine-tuned model will be saved
OUTPUT_MERGED_MODEL_DIR = "./merged_fine_tuned_tinylama_dop"

# --- Determine device for loading models ---
if torch.cuda.is_available():
    DEVICE = "cuda"
    print(f"CUDA is available. Merging will use GPU: {torch.cuda.get_device_name(0)}")
else:
    DEVICE = "cpu"
    print("WARNING: CUDA is not available. Merging will be done on CPU.")

# --- 1. Load Tokenizer and Base Model ---
print(f"Loading tokenizer for {BASE_MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

print(f"Loading base model {BASE_MODEL_ID} for merging...")
base_model_for_merge = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto" if DEVICE == "cuda" else None,
)

# --- 2. Load LoRA Adapters and Merge (Manual Load) ---
adapters_dir = os.path.abspath(OUTPUT_ADAPTERS_DIR)
print(f"Loading LoRA adapters weights from: {adapters_dir}")

lora_config = LoraConfig.from_pretrained(adapters_dir)

merged_model = get_peft_model(base_model_for_merge, lora_config)

# Look for the adapter weights in either serialization format; the pickle
# (.bin) file is preferred when both are present.
adapter_weights_path = os.path.join(adapters_dir, "adapter_model.bin")
if not os.path.exists(adapter_weights_path):
    adapter_weights_path = os.path.join(adapters_dir, "adapter_model.safetensors")
    if not os.path.exists(adapter_weights_path):
        raise FileNotFoundError(f"Neither adapter_model.bin nor adapter_model.safetensors found in {adapters_dir}")

print(f"Loading adapter weights from: {adapter_weights_path}")
# Each format needs its own loader: safetensors cannot parse a pickle-based
# .bin checkpoint, so dispatch on the file that was actually found.
if adapter_weights_path.endswith(".safetensors"):
    adapter_state_dict = safetensors.torch.load_file(adapter_weights_path, device="cpu")
else:
    # weights_only=True restricts unpickling to tensors/primitive containers.
    adapter_state_dict = torch.load(adapter_weights_path, map_location="cpu", weights_only=True)

set_peft_model_state_dict(merged_model, adapter_state_dict)

print("Merging LoRA adapters into the base model...")
merged_model = merged_model.merge_and_unload()

# --- 3. Save the Merged Fine-Tuned Model ---
print(f"Saving the full fine-tuned model to: {OUTPUT_MERGED_MODEL_DIR}")
os.makedirs(OUTPUT_MERGED_MODEL_DIR, exist_ok=True)
merged_model.save_pretrained(OUTPUT_MERGED_MODEL_DIR)
tokenizer.save_pretrained(OUTPUT_MERGED_MODEL_DIR)

print("\n--- Merging Complete ---")
print("Your full fine-tuned model (base model + LoRA adapters) has been saved.")
print(f"Model path: {os.path.abspath(OUTPUT_MERGED_MODEL_DIR)}")
print("\n--- Next Steps ---")
print("1. **Evaluate/Test the Model:** Load the merged model and evaluate its performance.")
print("2. **Update Chatbot:** Update your `NEEPCOPolicyChatbot` code to use this fine-tuned model.")
print("3. **(Optional) Quantize the Model:** If you need a lighter model for inference, proceed with quantization.")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, set_peft_model_state_dict, PeftModel
import os
import safetensors.torch  # Import safetensors

# === Configurations (must match Part 1) ===
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_ADAPTERS_DIR = "./fine_tuned_tinylama_dop_adapters/checkpoint-80"
OUTPUT_MERGED_MODEL_DIR = "/kaggle/working/output/merged_tinylama_dop"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Load base model (half precision on GPU, full precision on CPU)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16 if device=="cuda" else torch.float32,
    device_map="auto" if device=="cuda" else None,
)

# Wrap the base model with an (untrained) LoRA structure matching the saved config
lora_config = LoraConfig.from_pretrained(OUTPUT_ADAPTERS_DIR)
model = get_peft_model(base_model, lora_config)

# Locate adapter weights; the .bin file is preferred when both formats exist
adapter_path = os.path.join(OUTPUT_ADAPTERS_DIR, "adapter_model.bin")
if not os.path.exists(adapter_path):
    adapter_path = os.path.join(OUTPUT_ADAPTERS_DIR, "adapter_model.safetensors")
    if not os.path.exists(adapter_path):
        raise FileNotFoundError("adapter_model.bin or .safetensors not found")

# Use the loader that matches the file format: safetensors cannot read a
# pickle-based .bin checkpoint.
if adapter_path.endswith(".safetensors"):
    state = safetensors.torch.load_file(adapter_path, device="cpu")
else:
    # weights_only=True restricts unpickling to tensors/primitive containers.
    state = torch.load(adapter_path, map_location="cpu", weights_only=True)
set_peft_model_state_dict(model, state)

# Fold the LoRA deltas into the base weights and drop the adapter wrappers
model = model.merge_and_unload()

# Save merged model into notebook output path
print(f"Saving merged model to {OUTPUT_MERGED_MODEL_DIR}")
os.makedirs(OUTPUT_MERGED_MODEL_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_MERGED_MODEL_DIR)
tokenizer.save_pretrained(OUTPUT_MERGED_MODEL_DIR)
print("✅ Saved merged model and tokenizer to notebook output directory.")


In [None]:
import json
from typing import List, Dict, Any
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

def create_optimized_chunks(context_file_path: str) -> List[Dict]:
    """
    Converts a JSON-lines context file into context-rich text chunks for RAG.

    Every non-blank line is parsed as one JSON record. A record is chunked at
    the finest level it offers: per method when a subclause has ``methods``,
    per subclause when the record has ``subclauses``, otherwise as a whole.

    Args:
        context_file_path: Path to the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of chunk dicts with ``text``, ``metadata`` and ``id`` keys, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    chunks = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=100,
        length_function=len,
        add_start_index=False,
    )

    with open(context_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank separator/trailing lines are not records; feeding them to
            # json.loads would abort the whole run.
            if not line:
                continue
            data = json.loads(line)

            # Context shared by every chunk produced from this record. The
            # helper only reads it, so one dict can be reused.
            base_context = {
                'section': data.get('section', ''),
                'title': data.get('title', ''),
                # The source data spells this key both ways.
                'main_clause_num': data.get('clause') or data.get('Clause', ''),
            }

            if 'subclauses' not in data:
                _process_and_append_chunk(text_splitter, chunks, data, base_context)
                continue

            for subclause in data['subclauses']:
                if 'methods' not in subclause:
                    _process_and_append_chunk(text_splitter, chunks, subclause, base_context)
                    continue

                # Methods inherit the identifying fields of their subclause.
                method_context = {
                    **base_context,
                    'subclause_id': subclause.get('id', ''),
                    'subclause_desc': subclause.get('description', ''),
                    'subclause_remarks': subclause.get('remarks', []),
                }
                for method in subclause['methods']:
                    _process_and_append_chunk(text_splitter, chunks, method, parent_context=method_context)
    return chunks

def _process_and_append_chunk(text_splitter: RecursiveCharacterTextSplitter, chunks: List[Dict], content_dict: Dict[str, Any], parent_context: Dict[str, Any]):
    """Build the text for one policy item, split it, and append the pieces to ``chunks``.

    Args:
        text_splitter: Splitter whose ``split_text`` breaks the assembled text
            into pieces.
        chunks: Output list; mutated in place, one dict appended per piece.
        content_dict: The record, subclause or method being rendered.
        parent_context: Inherited fields (``section``, ``title``,
            ``main_clause_num`` and, for methods, ``subclause_id``,
            ``subclause_desc``, ``subclause_remarks``). Only read, never
            modified.

    Returns:
        None. Results are delivered through ``chunks``.
    """
    section = parent_context.get('section')
    title = parent_context.get('title')
    main_clause_num = parent_context.get('main_clause_num')
    sub_title = content_dict.get('title', '')

    # A 'method' key marks a method-level entry; its subclause id then comes
    # from the parent and its own id becomes the trailing method id.
    if 'method' in content_dict:
        sub_id = parent_context.get('subclause_id', '')
        method_id = content_dict.get('id', '')
        method_type = content_dict.get('method', '')
    else:
        sub_id = content_dict.get('id', '')
        method_id = ''
        method_type = ''

    full_clause_id = f"{main_clause_num or ''}{sub_id or ''}{method_id or ''}"

    # Values that mean "no power delegated" and are therefore left out.
    empty_markers = ("NIL", "---")

    parts = []
    if section == 'Annexure A':
        parts.append("This item is a matter requiring approval of the Board of Directors.\n")

    parts.append(f"Section {section}: {title}\nClause {full_clause_id}: ")
    if sub_title:
        parts.append(f"{sub_title} - ")
    if method_type:
        parts.append(f"{method_type} - ")

    # Methods without their own description fall back to the subclause's.
    description = content_dict.get('description', parent_context.get('subclause_desc', ''))
    if description:
        parts.append(f"{description}\n")

    items = content_dict.get('items', [])
    if items:
        parts.append(f"Items: {'; '.join(str(item) for item in items)}\n")

    delegation = content_dict.get('delegation')
    if delegation:
        delegation_parts = []
        if isinstance(delegation, dict):
            for grade, power in delegation.items():
                if power and str(power).strip() not in empty_markers:
                    delegation_parts.append(f"{grade}: {power}")
        elif isinstance(delegation, str) and delegation.strip() not in empty_markers:
            delegation_parts.append(delegation)
        if delegation_parts:
            parts.append("Delegation - " + '; '.join(delegation_parts) + "\n")

    remarks = content_dict.get('remarks', parent_context.get('subclause_remarks', []))
    if remarks:
        if isinstance(remarks, str) and 'remarks_reference' in content_dict:
            parts.append(f"Remarks Reference: {content_dict['remarks_reference']}\n")
        elif isinstance(remarks, list):
            parts.append(f"Remarks: {' '.join(str(r) for r in remarks)}\n")
        elif isinstance(remarks, str):
            # A plain string remark without a reference used to be dropped
            # silently; keep it so the information reaches the index.
            parts.append(f"Remarks: {remarks}\n")

    base_chunk_text = "".join(parts).strip()
    if not base_chunk_text:
        return

    for i, split_text in enumerate(text_splitter.split_text(base_chunk_text)):
        chunk_id = f"sec_{section}_cl_{full_clause_id}_part_{i}".replace(" ", "_").lower()
        chunks.append({
            'text': split_text,
            'metadata': {
                'section': section, 'title': title, 'clause': str(full_clause_id),
                'subclause_title': sub_title or 'N/A', 'method': method_type or 'N/A'
            },
            'id': chunk_id
        })

if __name__ == "__main__":
    # Source policy context (JSON lines) and destination for the chunk list.
    INPUT_CONTEXT_PATH = "/kaggle/input/dop-dataset/Context/combined_context.jsonl"
    OUTPUT_CHUNKS_PATH = "processed_chunks_final.json"

    if os.path.exists(INPUT_CONTEXT_PATH):
        # Any failure while chunking or writing is reported instead of raised.
        try:
            print(f"Starting chunk creation from '{INPUT_CONTEXT_PATH}'...")
            final_chunks = create_optimized_chunks(INPUT_CONTEXT_PATH)

            with open(OUTPUT_CHUNKS_PATH, "w", encoding='utf-8') as out_file:
                json.dump(final_chunks, out_file, indent=2, ensure_ascii=False)

            print(f"\n✅ Successfully created {len(final_chunks)} context-aware chunks.")
            print(f"   Output saved to '{OUTPUT_CHUNKS_PATH}'")

        except Exception as exc:
            print(f"An error occurred during chunk creation: {exc}")
    else:
        print(f"FATAL ERROR: The context file was not found at '{INPUT_CONTEXT_PATH}'")

In [None]:
%%writefile create_vector_database.py
import json
import os
import shutil
from typing import List, Dict

import chromadb
from sentence_transformers import SentenceTransformer

class PolicyVectorDB:
    """Manages the creation and searching of a persistent vector database.

    Wraps one ChromaDB collection plus the sentence-embedding model used to
    encode both stored chunks and queries.
    """
    def __init__(self, persist_directory: str = "chroma_db"):
        """Open (or create) the collection stored under ``persist_directory``."""
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection_name = "neepco_dop_policies"
        self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device = 'cpu')
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"description": "NEEPCO Delegation of Powers Policy"}
        )
        print(f"Loaded/Created persistent collection '{self.collection_name}' at '{persist_directory}'")

    def _flatten_metadata(self, metadata: Dict) -> Dict:
        """Ensures all metadata values are strings for ChromaDB compatibility."""
        return {key: str(value) for key, value in metadata.items()}

    def add_chunks(self, chunks: List[Dict]):
        """Encodes and adds a list of chunk dictionaries to the database.

        Each chunk needs ``id``, ``text`` and ``metadata`` keys. Chunks whose
        id is already stored are skipped. When the input repeats an id, later
        occurrences are stored under ``<id>_dup<n>`` so no text is lost and
        every id sent to the collection in one call is distinct. The input
        dicts themselves are not modified.

        Raises:
            KeyError: If a chunk lacks ``id``, ``text`` or ``metadata``.
        """
        if not chunks:
            print("No chunks provided to add.")
            return

        existing_ids = set(self.collection.get(include=[])['ids'])

        # Assign a unique id to every input chunk. The numbering depends only
        # on input order, so re-running with the same input maps each chunk to
        # the same id and the existing-id filter stays effective.
        occurrences = {}
        pending = []  # (unique_id, chunk) pairs still missing from the store
        renamed = 0
        for chunk in chunks:
            base_id = chunk['id']
            seen = occurrences.get(base_id, 0)
            occurrences[base_id] = seen + 1
            if seen:
                unique_id = f"{base_id}_dup{seen}"
                renamed += 1
            else:
                unique_id = base_id
            if unique_id not in existing_ids:
                pending.append((unique_id, chunk))

        if renamed:
            print(f"WARNING: {renamed} chunk(s) reused an id already present in the input; a '_dup<n>' suffix was appended.")

        if not pending:
            print("No new chunks to add. All provided chunks already exist in the database.")
            return

        print(f"Found {len(pending)} new chunks to add.")
        batch_size = 128
        total_batches = -(-len(pending) // batch_size)  # ceiling division

        for start in range(0, len(pending), batch_size):
            batch = pending[start:start + batch_size]
            print(f"  - Processing batch {start//batch_size + 1}/{ total_batches }...")

            ids = [unique_id for unique_id, _ in batch]
            texts = [chunk['text'] for _, chunk in batch]
            metadatas = [self._flatten_metadata(chunk['metadata']) for _, chunk in batch]

            embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
            self.collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)

        print(f"Successfully added {len(pending)} new chunks to the database!")

    def search(self, query_text: str, top_k: int = 3) -> List[Dict]:
        """Searches the collection for a given query text.

        Returns:
            Up to ``top_k`` dicts with ``text``, ``metadata`` and
            ``relevance_score`` (computed as ``1 - distance``), in the order
            the collection returned them.
        """
        # NOTE(review): the collection is created without an explicit distance
        # metric, so ``1 - distance`` is a true similarity only if the store's
        # metric is a cosine-style distance — confirm against the ChromaDB
        # default before relying on absolute score values.
        query_embedding = self.embedding_model.encode([query_text]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'metadatas', 'distances']
        )

        if not results.get('documents'):
            return []

        # Results are nested one level per query; only one query is sent.
        search_results = []
        for i, doc in enumerate(results['documents'][0]):
            search_results.append({
                'text': doc,
                'metadata': results['metadatas'][0][i],
                'relevance_score': 1 - results['distances'][0][i]
            })
        return search_results

def main():
    """Rebuild the persistent policy vector store from the chunk file and smoke-test it.

    Steps: verify the chunk file exists, wipe any previous store directory,
    load the chunks, index them, then run a few sample queries and print the
    top hits. Returns early (printing an error) when the chunk file is
    missing.
    """
    chunks_path = "processed_chunks_final.json"
    db_dir = "policy_vector_db"

    if not os.path.exists(chunks_path):
        print(f"FATAL ERROR: The input chunk file was not found at '{chunks_path}'")
        print("Please run 'create_chunks.py' first.")
        return

    # Always start from an empty directory so stale entries cannot survive.
    if os.path.exists(db_dir):
        print(f"Removing existing database at '{db_dir}' to ensure a clean build.")
        shutil.rmtree(db_dir)

    print(f"Creating database directory: '{db_dir}'")
    os.makedirs(db_dir, exist_ok=True)
    os.chmod(db_dir, 0o777)

    print("\nStep 1: Loading processed chunks...")
    with open(chunks_path, 'r', encoding='utf-8') as chunk_file:
        loaded_chunks = json.load(chunk_file)
    print(f"Loaded {len(loaded_chunks)} chunks.")

    print("\nStep 2: Setting up persistent vector database...")
    db = PolicyVectorDB(persist_directory=db_dir)

    print("\nStep 3: Adding chunks to the database...")
    db.add_chunks(loaded_chunks)

    print(f"\n✅ Vector database setup complete. Total chunks in DB: {db.collection.count()}")
    print(f"Database is saved in: {os.path.abspath(db_dir)}")

    print("\n--- Running Verification Tests ---")
    sample_queries = (
        "Who can approve changes to the pay structure?",
        "What is the financial limit for a DGM for works on a limited tender basis?",
        "What's the delegation power of an ED for single tender O&M contracts from an OEM?",
    )

    for sample in sample_queries:
        print("\n--- Testing Query ---")
        print(f"Query: {sample}")
        hits = db.search(sample, top_k=2)
        if not hits:
            print("  No results found.")
            continue
        for rank, hit in enumerate(hits, start=1):
            print(f"  Result {rank} (Relevance: {hit['relevance_score']:.4f}):")
            print(f"  Text: {hit['text'][:300]}...")
            print(f"  Metadata: {hit['metadata']}")


if __name__ == "__main__":
    main()

In [None]:
!python /kaggle/working/create_vector_database.py

In [None]:
!rm -rf /kaggle/working/chroma_db
print("Cleaned up old ChromaDB folder (if any).")
!rm /kaggle/working/create_vector_database.py

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import chromadb
from sentence_transformers import SentenceTransformer
import os
import json

# --- Configuration ---
# Directory of the merged model + tokenizer; passed to InteractiveChatbot as model_path.
MERGED_MODEL_PATH = "/kaggle/working/merged_fine_tuned_tinylama_dop"
# Directory of the persistent ChromaDB store; passed to InteractiveChatbot as vector_db_path.
VECTOR_DB_PATH = "/kaggle/working/policy_vector_db"

# --- Class Definition 1: The Correct Vector DB Class ---
class PolicyVectorDB:
    """Read-side wrapper around the persistent ChromaDB policy collection."""
    def __init__(self, persist_directory: str):
        """Open the collection under ``persist_directory`` and load the embedding model."""
        self.collection_name = "neepco_dop_policies"
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
        self.collection = self.client.get_or_create_collection(name=self.collection_name)
        print(f"Loaded/Created persistent collection '{self.collection_name}' at '{persist_directory}'")

    def search(self, query: str, top_k: int = 3) -> list:
        """Return up to ``top_k`` hits for ``query``.

        Each hit is a dict with ``text``, ``metadata`` and ``relevance_score``
        (``1 - distance`` as reported by the collection). An empty list is
        returned when the collection yields no documents.
        """
        encoded_query = self.embedding_model.encode([query]).tolist()
        results = self.collection.query(
            query_embeddings=encoded_query,
            n_results=top_k,
            include=['documents', 'metadatas', 'distances']
        )
        if not results:
            return []
        documents = results.get('documents')
        if not documents or not documents[0]:
            return []

        # Only one query is sent, so the first inner list holds all hits.
        return [
            {
                'text': doc,
                'metadata': results['metadatas'][0][pos],
                'relevance_score': 1 - results['distances'][0][pos],
            }
            for pos, doc in enumerate(documents[0])
        ]

# --- Class Definition 2: The Interactive Chatbot Class ---
class InteractiveChatbot:
    """Retrieval-augmented chat: fetch policy context, build a prompt, stream the answer."""
    def __init__(self, model_path: str, vector_db_path: str):
        """Load the model and tokenizer from ``model_path`` and open the vector store."""
        print(f"Loading fine-tuned model from: {model_path}...")
        tok = AutoTokenizer.from_pretrained(model_path)
        llm = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        # Fall back to EOS for padding when the tokenizer defines no pad token.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token
        self.tokenizer = tok
        self.model = llm

        print("Connecting to existing vector database...")
        self.vector_db = PolicyVectorDB(vector_db_path)

        print("\nInteractive Chatbot ready!")

    def retrieve_context(self, query: str, top_k: int = 3) -> list:
        """Search the vector store and keep hits scoring at least 0.1.

        Hits without a ``relevance_score`` are treated as scoring 0.
        """
        min_relevance = 0.1
        hits = self.vector_db.search(query, top_k=top_k)
        return [hit for hit in hits if hit.get('relevance_score', 0) >= min_relevance]

    def format_prompt(self, query: str, context_results: list) -> str:
        """Render the instruction prompt, embedding each hit as a numbered source."""
        if not context_results:
            context_text = "No specific policy information was found for this question."
        else:
            rendered_sources = []
            for idx, hit in enumerate(context_results, start=1):
                meta = hit.get("metadata", {})
                header = f"[Section: {meta.get('section', 'N/A')}, Clause: {meta.get('clause', 'N/A')}]"
                rendered_sources.append(f"Source {idx}: {header}\nDetails: {hit.get('text', '')}\n\n")
            context_text = "".join(rendered_sources)

        return f"""<s>[INST] You are a helpful assistant for NEEPCO's Delegation of Power (DOP) policies. Use only the provided policy information to answer questions accurately and completely. Ensure you state the full rule, including all required actions, conditions, and approvals mentioned in the policy. If the provided information is insufficient, state that you cannot answer based on the given policies.

Policy Information:
{context_text.strip()}

Question: {query} [/INST]

Answer: Based on the provided policy information,"""

    def chat(self, query: str):
        """Answer ``query``: retrieve context, stream the generation, then list the sources.

        Prints an apology and returns without generating when no context
        passes the relevance filter.
        """
        context_results = self.retrieve_context(query)
        if not context_results:
            print("\nAnswer: I apologize, but I cannot find relevant information in the provided NEEPCO policy documents to answer your question.")
            return

        prompt = self.format_prompt(query, context_results)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        # The streamer prints tokens as they are produced, omitting the prompt.
        streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_options = dict(
            streamer=streamer,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        print("\nAnswer: ", end="")
        self.model.generate(**inputs, **generation_options)

        print("\n\n---")
        print("Sources Used:")
        for position, hit in enumerate(context_results, start=1):
            meta = hit['metadata']
            print(f"{position}. [Relevance: {hit['relevance_score']:.2f}] Section {meta.get('section', 'N/A')}, Clause {meta.get('clause', 'N/A')}")
        print("-" * 50)


# --- Main execution block ---
def start_chat():
    """Run the interactive question loop until the user quits.

    Returns early with an error message when either the vector store or the
    merged model directory is missing. Typing 'quit' (any case), Ctrl-C or
    end-of-input ends the loop.
    """
    prerequisites_present = os.path.exists(VECTOR_DB_PATH) and os.path.exists(MERGED_MODEL_PATH)
    if not prerequisites_present:
        print("Error: Database or Model not found.")
        print(f"Please ensure '{VECTOR_DB_PATH}' folder exists by running 'create_vector_database.py' first.")
        print(f"And ensure the fine-tuned model exists at '{MERGED_MODEL_PATH}'.")
        return

    chatbot = InteractiveChatbot(model_path=MERGED_MODEL_PATH, vector_db_path=VECTOR_DB_PATH)

    print("\nType your questions about NEEPCO DOP policies. Type 'quit' to exit.")
    while True:
        try:
            question = input("\n\nYour question: ").strip()
            if question.lower() == 'quit':
                print("Exiting chat. Goodbye!")
                break
            # Empty input just re-prompts.
            if not question:
                continue
            chatbot.chat(question)
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat. Goodbye!")
            break

# Start the chatbot
start_chat()

In [None]:
%%writefile quantize_model.py

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os

# --- Configuration ---
# Path to the full-precision merged model from the previous step
# (read by quantize_and_save_model for both the model and the tokenizer).
MERGED_MODEL_PATH = "./merged_fine_tuned_tinylama_dop"
# Path where the new, quantized model will be saved
# (created if missing; receives the quantized model and the tokenizer).
QUANTIZED_MODEL_PATH = "./quantized_fine_tuned_tinylama_dop"

# --- Main Quantization Logic ---
def quantize_and_save_model():
    """Load the merged model in 4-bit NF4 precision and save it with its tokenizer.

    Reads from ``MERGED_MODEL_PATH`` and writes to ``QUANTIZED_MODEL_PATH``
    (created if missing). Takes no arguments and returns nothing; progress is
    reported via ``print``.
    """

    print(f"Loading model from: {MERGED_MODEL_PATH}")

    # --- 1. Define Quantization Configuration ---
    # bfloat16 is the default compute dtype. On a CUDA device older than
    # Ampere (compute capability < 8, e.g. T4/P100) float16 is used instead,
    # the same capability check the training script uses to choose between
    # BF16 and FP16.
    compute_dtype = torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major < 8:
        compute_dtype = torch.float16

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4", # Use NF4 (NormalFloat-4) quantization
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True, # Use a second quantization for more memory savings
    )

    # --- 2. Load the Model with Quantization ---
    # The `quantization_config` argument applies the 4-bit conversion on-the-fly.
    model = AutoModelForCausalLM.from_pretrained(
        MERGED_MODEL_PATH,
        quantization_config=quantization_config,
        device_map="auto", # Automatically map model layers to available devices (GPU/CPU)
    )

    # The tokenizer is unchanged by quantization; it is copied alongside the
    # model so the output directory is self-contained.
    tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)

    print("\nModel loaded and quantized successfully.")
    print("Model memory footprint:")
    print(model.get_memory_footprint())

    # --- 3. Save the Quantized Model ---
    print(f"\nSaving quantized model to: {QUANTIZED_MODEL_PATH}")
    os.makedirs(QUANTIZED_MODEL_PATH, exist_ok=True)

    model.save_pretrained(QUANTIZED_MODEL_PATH)
    tokenizer.save_pretrained(QUANTIZED_MODEL_PATH)

    print("\n--- Quantization Complete ---")
    print(f"The 4-bit quantized model is saved at: {os.path.abspath(QUANTIZED_MODEL_PATH)}")
    print("You can now use this path in your inference script.")

# --- Start the process ---
if __name__ == "__main__":
    # Quantize only when the merged model directory is actually present.
    if os.path.exists(MERGED_MODEL_PATH):
        quantize_and_save_model()
    else:
        print(f"Error: Merged model not found at '{MERGED_MODEL_PATH}'.")
        print("Please ensure you have run the merging script first.")

In [None]:
!python quantize_model.py

In [None]:
%%writefile rag_chatbot_quantized.py

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import chromadb
from sentence_transformers import SentenceTransformer
import os

# --- Configuration ---
# Relative path of the model directory this script loads
# (presumably the 4-bit output of quantize_model.py — verify the names match).
MODEL_PATH = "./quantized_fine_tuned_tinylama_dop"
# Absolute path of the persistent ChromaDB store queried for context.
VECTOR_DB_PATH = "/kaggle/working/policy_vector_db"

# --- Class Definition 1: The Vector DB Class (With Fix) ---
class PolicyVectorDB:
    """Manages the persistent vector database using ChromaDB.

    On construction this opens (or creates) the ``neepco_dop_policies``
    collection inside ``persist_directory`` and loads the
    ``BAAI/bge-large-en-v1.5`` sentence-embedding model on the CPU.
    """

    def __init__(self, persist_directory: str):
        """Open the persistent collection and load the embedding model.

        Args:
            persist_directory: Folder used by ``chromadb.PersistentClient``.
        """
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection_name = "neepco_dop_policies"

        # The embedding model is pinned to the CPU.
        # NOTE(review): presumably to leave GPU memory for the LLM — confirm.
        print("Loading embedding model onto CPU...")
        self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')

        self.collection = self.client.get_or_create_collection(name=self.collection_name)
        print(f"Loaded/Created persistent collection '{self.collection_name}' at '{persist_directory}'")

    def search(self, query: str, top_k: int = 3) -> list:
        """Searches the database for a given query.

        Args:
            query: Natural-language question to embed and look up.
            top_k: Maximum number of hits requested from the collection.

        Returns:
            A list of dicts with keys ``text``, ``metadata`` and
            ``relevance_score`` in the order returned by the collection.
            ``metadata`` is always a dict (empty when the stored document has
            no metadata). An empty list is returned when nothing matches.
        """
        query_embedding = self.embedding_model.encode([query]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'metadatas', 'distances']
        )
        if not results or not results.get('documents') or not results['documents'][0]:
            return []

        # Only one query was submitted, so only the first result row is relevant.
        documents = results['documents'][0]
        metadatas = results['metadatas'][0]
        distances = results['distances'][0]

        # relevance_score is simply 1 - distance.
        # NOTE(review): how meaningful that is depends on the distance metric
        # the collection was created with — confirm where it is built.
        return [
            {
                'text': text,
                # Chroma yields None for documents stored without metadata;
                # normalise so callers can always call .get() on it.
                'metadata': metadata or {},
                'relevance_score': 1 - distance,
            }
            for text, metadata, distance in zip(documents, metadatas, distances)
        ]

# --- Class Definition 2: The Interactive Chatbot Class (Unchanged) ---
class InteractiveChatbot:
    """Handles the full RAG pipeline for interactive chat.

    The pipeline is: retrieve policy chunks from the vector database, build an
    ``[INST]``-style prompt around them, stream the model's answer to stdout,
    then list the sources that were used.
    """

    # Retrieved chunks whose relevance_score is below this value are discarded.
    RELEVANCE_THRESHOLD = 0.1

    def __init__(self, model_path: str, vector_db_path: str):
        """Load the tokenizer and model and connect to the vector database.

        Args:
            model_path: Directory passed to ``from_pretrained`` for both the
                tokenizer and the causal-LM.
            vector_db_path: Directory handed to ``PolicyVectorDB``.
        """
        print(f"Loading QUANTIZED fine-tuned model from: {model_path}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto"
        )

        # Fall back to EOS as the padding token when the tokenizer has none.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("Connecting to existing vector database...")
        self.vector_db = PolicyVectorDB(vector_db_path)

        print("\nInteractive Quantized Chatbot ready!")

    def retrieve_context(self, query: str, top_k: int = 3) -> list:
        """Return the search hits for *query* that meet the relevance threshold.

        Hits without a ``relevance_score`` key are treated as scoring 0.
        """
        retrieved_results = self.vector_db.search(query, top_k=top_k)
        return [
            r for r in retrieved_results
            if r.get('relevance_score', 0) >= self.RELEVANCE_THRESHOLD
        ]

    def format_prompt(self, query: str, context_results: list) -> str:
        """Build the instruction prompt from *query* and the retrieved chunks.

        Args:
            query: The user's question.
            context_results: Hits as returned by ``retrieve_context``; each may
                carry ``text`` and ``metadata`` (``section`` / ``clause``).

        Returns:
            The full prompt string, ending with the start of the answer.
        """
        if context_results:
            source_blocks = []
            for i, result in enumerate(context_results, 1):
                # `or {}` also covers a metadata key that is present but None,
                # which a plain .get("metadata", {}) would let through.
                metadata = result.get("metadata") or {}
                source_info = f"[Section: {metadata.get('section', 'N/A')}, Clause: {metadata.get('clause', 'N/A')}]"
                source_blocks.append(f"Source {i}: {source_info}\nDetails: {result.get('text', '')}\n\n")
            context_text = "".join(source_blocks)
        else:
            context_text = "No specific policy information was found for this question."

        return f"""<s>[INST] You are a helpful assistant for NEEPCO's Delegation of Power (DOP) policies. Use only the provided policy information to answer questions accurately and completely. Ensure you state the full rule, including all required actions, conditions, and approvals mentioned in the policy. If the provided information is insufficient, state that you cannot answer based on the given policies.

Policy Information:
{context_text.strip()}

Question: {query} [/INST]

Answer: Based on the provided policy information,"""

    def chat(self, query: str):
        """Answer *query* on stdout, streaming tokens as they are generated.

        When no chunk passes the relevance threshold, an apology is printed and
        the model is not invoked. Otherwise the answer is streamed (sampling
        with temperature 0.1, top_p 0.9, up to 512 new tokens) and followed by
        the list of sources used. Returns ``None``.
        """
        context_results = self.retrieve_context(query)

        if not context_results:
            print("\nAnswer: I apologize, but I cannot find relevant information in the provided NEEPCO policy documents to answer your question.")
            return

        prompt = self.format_prompt(query, context_results)
        streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        print("\nAnswer: ", end="")
        # The streamer prints the text, so the returned token ids are not needed.
        _ = self.model.generate(
            **inputs,
            streamer=streamer,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id
        )
        print("\n\n---")
        print("Sources Used:")
        for i, r in enumerate(context_results, 1):
            # Tolerate hits whose metadata is missing or None.
            meta = r.get('metadata') or {}
            print(f"{i}. [Relevance: {r['relevance_score']:.2f}] Section {meta.get('section', 'N/A')}, Clause {meta.get('clause', 'N/A')}")
        print("-" * 50)

# --- Script entry point: answer a single hard-coded question, then exit ---
def start_chat():
    """Run one question through the chatbot and print the answer.

    Aborts with an explanatory message when either the vector database folder
    or the quantized model folder is missing. The question is the literal
    assigned to ``user_query`` below; edit it and re-run to ask something else.
    """
    # Both the vector DB and the model must be present before any loading starts.
    required_paths = (VECTOR_DB_PATH, MODEL_PATH)
    if not all(os.path.exists(path) for path in required_paths):
        print("Error: Database or Quantized Model not found.")
        print(f"Please ensure your vector DB folder exists at '{VECTOR_DB_PATH}'.")
        print(f"And ensure the quantized model exists at '{MODEL_PATH}'.")
        return

    # Model and database are loaded exactly once per run.
    chatbot = InteractiveChatbot(model_path=MODEL_PATH, vector_db_path=VECTOR_DB_PATH)

    # --- ASK YOUR QUESTION HERE ---
    # Replace the quoted text and re-run the cell to ask a different question.
    user_query = "What is the approval limit for a CGM for purchasing steel via open tender?"

    print(f"\n\nProcessing your question: '{user_query}'")
    if not user_query:
        print("Please write a question in the `user_query` variable.")
        return
    chatbot.chat(user_query)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    start_chat()

In [None]:
!python rag_chatbot_quantized.py

In [None]:
import os
import shutil
from IPython.display import FileLink, display

# Bundle the quantized model, the vector database and the source dataset into
# a single zip archive and show a download link for it.

# --- 1. Define the paths to your assets ---
# The directory of your final, quantized model
quantized_model_dir = "./quantized_fine_tuned_tinylama_dop"

# The directory containing your ChromaDB vector database
vector_db_dir = "./policy_vector_db"

# The original dataset file that contains the processed chunks
# Note: Adjust this path if it's different in your environment
source_dataset_file = "/kaggle/input/dop-dataset/Dataset/combined_dataset.jsonl"

# The name for the final zip file (without the .zip extension)
output_zip_name = "dop_chatbot_deployment_package"

# A temporary directory to gather all files before zipping
staging_dir = "./for_zipping"


# --- 2. Create a clean staging directory ---
print(f"Creating temporary staging directory: {staging_dir}")
if os.path.exists(staging_dir):
    shutil.rmtree(staging_dir)
os.makedirs(staging_dir)

# Assets that could not be found; reported instead of claiming full success.
missing_assets = []

try:
    # --- 3. Copy all assets to the staging directory ---
    print("Copying assets...")

    # Copy the quantized model
    if os.path.isdir(quantized_model_dir):
        print(f"- Copying model from {quantized_model_dir}")
        shutil.copytree(quantized_model_dir, os.path.join(staging_dir, "quantized_model"))
    else:
        print(f"WARNING: Model directory not found at {quantized_model_dir}")
        missing_assets.append(quantized_model_dir)

    # Copy the vector database
    if os.path.isdir(vector_db_dir):
        print(f"- Copying vector database from {vector_db_dir}")
        shutil.copytree(vector_db_dir, os.path.join(staging_dir, "vector_database"))
    else:
        print(f"WARNING: Vector DB directory not found at {vector_db_dir}")
        missing_assets.append(vector_db_dir)

    # Copy the source dataset
    if os.path.isfile(source_dataset_file):
        print(f"- Copying source dataset from {source_dataset_file}")
        shutil.copy(source_dataset_file, staging_dir)
    else:
        print(f"WARNING: Source dataset not found at {source_dataset_file}")
        missing_assets.append(source_dataset_file)

    if missing_assets:
        print(f"\nWARNING: {len(missing_assets)} asset(s) were not found and are NOT in the package: {', '.join(missing_assets)}")
    else:
        print("\nAssets copied successfully.")

    # --- 4. Create the zip archive ---
    print(f"\nCreating zip file: '{output_zip_name}.zip'...")
    shutil.make_archive(output_zip_name, 'zip', staging_dir)
    print("Zip file created successfully.")
finally:
    # --- 5. Clean up the temporary staging directory ---
    # Runs even when copying or zipping fails so no partial staging tree is left.
    print("Cleaning up temporary directory...")
    shutil.rmtree(staging_dir, ignore_errors=True)
    print("Cleanup complete.")


# --- 6. Generate and display the download link ---
print("\n-------------------------------------------")
if missing_assets:
    print("Your download is ready, but it is INCOMPLETE (see warnings above).")
else:
    print("Your download is ready!")
display(FileLink(f'{output_zip_name}.zip'))

In [None]:
import os
import shutil
import subprocess
import sys

# Convert the merged Hugging Face model to GGUF with llama.cpp and quantize it.
# Every external command goes through run_step(), which raises on a non-zero
# exit code. IPython `!cmd` lines never raise, so wrapping them in try/except
# (as this cell used to) silently ignored every failure.

print("--- Starting GGUF conversion using llama.cpp (Addressing requirements.txt error) ---")

# Define base paths and file names
KAGGLE_WORKING_DIR = "/kaggle/working"
LLAMA_CPP_REPO_NAME = "llama.cpp"
LLAMA_CPP_DIR = os.path.join(KAGGLE_WORKING_DIR, LLAMA_CPP_REPO_NAME)
LLAMA_CPP_BUILD_DIR = os.path.join(LLAMA_CPP_DIR, "build")
LLAMA_CPP_BIN_DIR = os.path.join(LLAMA_CPP_BUILD_DIR, "bin")

# Conversion script shipped in the llama.cpp repository root
ACTIVE_CONVERT_SCRIPT_NAME = "convert_hf_to_gguf.py"
ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP = os.path.join(LLAMA_CPP_DIR, ACTIVE_CONVERT_SCRIPT_NAME)

# The quantize tool is built as `llama-quantize` in llama.cpp trees that ship
# convert_hf_to_gguf.py; `quantize` is kept as a fallback for older checkouts.
QUANTIZE_EXECUTABLE_NAMES = ("llama-quantize", "quantize")

# Model and output paths
source_model_path = os.path.join(KAGGLE_WORKING_DIR, "merged_fine_tuned_tinylama_dop")
output_gguf_fp32_path = os.path.join(KAGGLE_WORKING_DIR, "tinyllama_dop_fp32.gguf")
final_quantized_output_gguf_path = os.path.join(KAGGLE_WORKING_DIR, "tinyllama_dop_q4_k_m.gguf")
quantization_method = "Q4_K_M"


def run_step(command, cwd=None, check=True):
    """Run *command*, echo its output line by line, and return the exit code.

    Args:
        command: Argument list (no shell is involved, so paths need no quoting).
        cwd: Working directory for the child process, or None for the current one.
        check: When true, a non-zero exit code raises CalledProcessError.
    """
    print(f"$ {' '.join(command)}")
    with subprocess.Popen(
        command,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors="replace",
    ) as process:
        for line in process.stdout:
            print(line, end="")
    if check and process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)
    return process.returncode


# Fail fast: without the merged model there is no point in cloning and building.
if not os.path.isdir(source_model_path):
    raise FileNotFoundError(
        f"Merged model not found at '{source_model_path}'. Run the merging step first."
    )

# --- Step 0: Clean previous attempts ---
print(f"\n0. Cleaning up previous '{LLAMA_CPP_REPO_NAME}' clone and old GGUF files (from {KAGGLE_WORKING_DIR})...")
os.chdir(KAGGLE_WORKING_DIR)  # Ensure we are in the base working directory before cleaning
if os.path.exists(LLAMA_CPP_DIR):
    shutil.rmtree(LLAMA_CPP_DIR)
for stale_file in (output_gguf_fp32_path, final_quantized_output_gguf_path):
    if os.path.exists(stale_file):
        os.remove(stale_file)
print("Cleanup complete.")

# --- Step 1: Clone llama.cpp ---
print(f"\n1. Cloning '{LLAMA_CPP_REPO_NAME}' repository...")
try:
    run_step(
        ["git", "clone", f"https://github.com/ggerganov/{LLAMA_CPP_REPO_NAME}.git", LLAMA_CPP_DIR],
        cwd=KAGGLE_WORKING_DIR,
    )
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError(
        "Git clone failed. Please check your internet connection or Kaggle environment."
    ) from e
print(f"'{LLAMA_CPP_REPO_NAME}' cloned into: {LLAMA_CPP_DIR}")

# --- DIAGNOSTIC: Verify clone contents ---
print(f"\nDIAGNOSTIC: Listing contents of '{LLAMA_CPP_DIR}':")
run_step(["ls", "-F", LLAMA_CPP_DIR], check=False)
print(f"\nDIAGNOSTIC: Checking for '{ACTIVE_CONVERT_SCRIPT_NAME}' at {ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP}: {os.path.exists(ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP)}")


# --- Step 2: Install Python requirements for llama.cpp (from its root requirements.txt and manually) ---
print(f"\n2. Installing Python requirements for '{LLAMA_CPP_REPO_NAME}'...")

# Best effort: a failure here is tolerated because the explicit install below
# provides the packages listed there.
llama_cpp_main_requirements = os.path.join(LLAMA_CPP_DIR, "requirements.txt")
if os.path.exists(llama_cpp_main_requirements):
    print(f"Installing from main llama.cpp requirements.txt: {llama_cpp_main_requirements}")
    if run_step([sys.executable, "-m", "pip", "install", "-r", llama_cpp_main_requirements], check=False) != 0:
        print("WARNING: Installing llama.cpp requirements.txt failed. Proceeding with manual installs.")
else:
    print(f"WARNING: Main llama.cpp requirements.txt not found at {llama_cpp_main_requirements}. Proceeding with manual installs.")

# Ensure core dependencies for conversion are installed explicitly (especially for conversion scripts)
print("Ensuring core conversion dependencies (transformers, torch, sentencepiece, accelerate, huggingface_hub) are installed explicitly...")
run_step([
    sys.executable, "-m", "pip", "install",
    "transformers==4.52.4", "torch", "sentencepiece", "accelerate", "huggingface_hub==0.30.0",
])

# --- Step 3: Build llama.cpp using CMake ---
print(f"\n3. Building '{LLAMA_CPP_REPO_NAME}' using CMake...")
os.makedirs(LLAMA_CPP_BUILD_DIR, exist_ok=True)
try:
    # Both commands run inside the build directory via cwd=, so the notebook's
    # own working directory never changes.
    run_step(["cmake", LLAMA_CPP_DIR], cwd=LLAMA_CPP_BUILD_DIR)  # Configure from llama.cpp root
    run_step(["cmake", "--build", ".", "--config", "Release"], cwd=LLAMA_CPP_BUILD_DIR)  # Build
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError("CMake build failed. Check build logs above for details.") from e
print("CMake build successful.")
print(f"'{LLAMA_CPP_REPO_NAME}' built. Binaries should be in: {LLAMA_CPP_BIN_DIR}")


# --- Step 4: Convert your Hugging Face model to GGUF (float32) ---
# The script is run from the llama.cpp root because it might expect helper
# files relative to its own location; source and output paths are absolute.
print(f"\n4. Converting Hugging Face model from '{source_model_path}' to GGUF (float32) using '{ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP}'...")
if not os.path.exists(ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP):
    raise FileNotFoundError(
        f"CRITICAL ERROR: Conversion script not found at {ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP}. Aborting."
    )

try:
    run_step(
        [
            sys.executable, ACTIVE_CONVERT_SCRIPT_NAME, source_model_path,
            "--outfile", output_gguf_fp32_path,
            "--outtype", "f32",
        ],
        cwd=LLAMA_CPP_DIR,
    )
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError("Conversion script execution failed. Review the output above.") from e
if not os.path.exists(output_gguf_fp32_path):
    raise RuntimeError(f"Conversion finished but no GGUF file was written to {output_gguf_fp32_path}.")
print("Conversion script executed.")


# --- Step 5: Quantize the GGUF model to a smaller, CPU-optimized format (e.g., Q4_K_M) ---
print(f"\n5. Quantizing GGUF model to {quantization_method}...")
quantize_candidates = [os.path.join(LLAMA_CPP_BIN_DIR, name) for name in QUANTIZE_EXECUTABLE_NAMES]
quantize_executable_path = next(
    (path for path in quantize_candidates if os.path.isfile(path) and os.access(path, os.X_OK)),
    None,
)
if quantize_executable_path is None:
    raise FileNotFoundError(
        f"CRITICAL ERROR: quantize executable not found (looked for {', '.join(quantize_candidates)}). "
        "CMake build might have failed."
    )

try:
    run_step([
        quantize_executable_path,
        output_gguf_fp32_path,
        final_quantized_output_gguf_path,
        quantization_method,
    ])
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError("Quantization failed. Review the output above.") from e

# --- Step 6: Final Verification and Instructions ---
if not os.path.exists(final_quantized_output_gguf_path):
    raise RuntimeError(f"Quantization finished but no file was written to {final_quantized_output_gguf_path}.")
print("\n--- GGUF Conversion (via llama.cpp) Process Complete ---")
print(f"Your final quantized GGUF model is located at: {os.path.abspath(final_quantized_output_gguf_path)}")
print(f"You can find the tokenizer files from your original model at: {os.path.abspath(source_model_path)}")
print("\n--- If the GGUF file was created, proceed to upload it and tokenizer files to Hugging Face Hub. ---")

In [None]:
import os
import shutil
import subprocess
import sys

# Diagnostic variant of the GGUF conversion: same pipeline, but with a verbose
# CMake build and explicit checks for the quantize executable.
# Every external command goes through run_step(), which raises on a non-zero
# exit code. IPython `!cmd` lines never raise, so wrapping them in try/except
# (as this cell used to) silently ignored every failure.

print("--- Starting GGUF conversion using llama.cpp (Debugging 'quantize' executable build) ---")

# Define base paths and file names
KAGGLE_WORKING_DIR = "/kaggle/working"
LLAMA_CPP_REPO_NAME = "llama.cpp"
LLAMA_CPP_DIR = os.path.join(KAGGLE_WORKING_DIR, LLAMA_CPP_REPO_NAME)
LLAMA_CPP_BUILD_DIR = os.path.join(LLAMA_CPP_DIR, "build")
LLAMA_CPP_BIN_DIR = os.path.join(LLAMA_CPP_BUILD_DIR, "bin")  # Where the built tools are expected

# Conversion script shipped in the llama.cpp repository root
ACTIVE_CONVERT_SCRIPT_NAME = "convert_hf_to_gguf.py"
ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP = os.path.join(LLAMA_CPP_DIR, ACTIVE_CONVERT_SCRIPT_NAME)

# The quantize tool is built as `llama-quantize` in llama.cpp trees that ship
# convert_hf_to_gguf.py; `quantize` is kept as a fallback for older checkouts.
QUANTIZE_EXECUTABLE_NAMES = ("llama-quantize", "quantize")

# Model and output paths
source_model_path = os.path.join(KAGGLE_WORKING_DIR, "merged_fine_tuned_tinylama_dop")
output_gguf_fp32_path = os.path.join(KAGGLE_WORKING_DIR, "tinyllama_dop_fp32.gguf")
final_quantized_output_gguf_path = os.path.join(KAGGLE_WORKING_DIR, "tinyllama_dop_q4_k_m.gguf")
quantization_method = "Q4_K_M"


def run_step(command, cwd=None, check=True):
    """Run *command*, echo its output line by line, and return the exit code.

    Args:
        command: Argument list (no shell is involved, so paths need no quoting).
        cwd: Working directory for the child process, or None for the current one.
        check: When true, a non-zero exit code raises CalledProcessError.
    """
    print(f"$ {' '.join(command)}")
    with subprocess.Popen(
        command,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors="replace",
    ) as process:
        for line in process.stdout:
            print(line, end="")
    if check and process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)
    return process.returncode


def find_quantize_executable():
    """Return the first existing, executable quantize binary in the bin dir, or None."""
    for name in QUANTIZE_EXECUTABLE_NAMES:
        candidate = os.path.join(LLAMA_CPP_BIN_DIR, name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None


# Fail fast: without the merged model there is no point in cloning and building.
if not os.path.isdir(source_model_path):
    raise FileNotFoundError(
        f"Merged model not found at '{source_model_path}'. Run the merging step first."
    )

# --- Step 0: Clean previous attempts ---
print(f"\n0. Cleaning up previous '{LLAMA_CPP_REPO_NAME}' clone and old GGUF files (from {KAGGLE_WORKING_DIR})...")
os.chdir(KAGGLE_WORKING_DIR)  # Ensure we are in the base working directory before cleaning
if os.path.exists(LLAMA_CPP_DIR):
    shutil.rmtree(LLAMA_CPP_DIR)
for stale_file in (output_gguf_fp32_path, final_quantized_output_gguf_path):
    if os.path.exists(stale_file):
        os.remove(stale_file)
print("Cleanup complete.")

# --- Step 1: Clone llama.cpp ---
print(f"\n1. Cloning '{LLAMA_CPP_REPO_NAME}' repository...")
try:
    run_step(
        ["git", "clone", f"https://github.com/ggerganov/{LLAMA_CPP_REPO_NAME}.git", LLAMA_CPP_DIR],
        cwd=KAGGLE_WORKING_DIR,
    )
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError(
        "Git clone failed. Please check your internet connection or Kaggle environment."
    ) from e
print(f"'{LLAMA_CPP_REPO_NAME}' cloned into: {LLAMA_CPP_DIR}")

# --- DIAGNOSTIC: Verify clone contents ---
print(f"\nDIAGNOSTIC: Listing contents of '{LLAMA_CPP_DIR}' after cloning:")
run_step(["ls", "-F", LLAMA_CPP_DIR], check=False)
print(f"\nDIAGNOSTIC: Checking for '{ACTIVE_CONVERT_SCRIPT_NAME}' at {ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP}: {os.path.exists(ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP)}")


# --- Step 2: Install Python requirements for llama.cpp (from its root requirements.txt and manually) ---
print(f"\n2. Installing Python requirements for '{LLAMA_CPP_REPO_NAME}'...")
# Best effort: a failure here is tolerated because the explicit install below
# provides the packages listed there.
llama_cpp_main_requirements = os.path.join(LLAMA_CPP_DIR, "requirements.txt")
if os.path.exists(llama_cpp_main_requirements):
    print(f"Installing from main llama.cpp requirements.txt: {llama_cpp_main_requirements}")
    if run_step([sys.executable, "-m", "pip", "install", "-r", llama_cpp_main_requirements], check=False) != 0:
        print("WARNING: Installing llama.cpp requirements.txt failed. Proceeding with manual installs.")
else:
    print(f"WARNING: Main llama.cpp requirements.txt not found at {llama_cpp_main_requirements}. Proceeding with manual installs.")

print("Ensuring core conversion dependencies (transformers, torch, sentencepiece, accelerate, huggingface_hub) are installed explicitly...")
run_step([
    sys.executable, "-m", "pip", "install",
    "transformers==4.52.4", "torch", "sentencepiece", "accelerate", "huggingface_hub==0.30.0",
])

# --- Step 3: Build llama.cpp using CMake (modern build process) ---
print(f"\n3. Building '{LLAMA_CPP_REPO_NAME}' using CMake (VERBOSE)...")
os.makedirs(LLAMA_CPP_BUILD_DIR, exist_ok=True)
try:
    # Both commands run inside the build directory via cwd=, so the notebook's
    # own working directory never changes.
    print("Running CMake configuration (verbose)...")
    run_step(["cmake", LLAMA_CPP_DIR], cwd=LLAMA_CPP_BUILD_DIR)
    print("Running CMake build (verbose)...")
    # VERBOSE=1 is forwarded to the underlying build tool to show every compile command.
    run_step(
        ["cmake", "--build", ".", "--config", "Release", "--", "VERBOSE=1"],
        cwd=LLAMA_CPP_BUILD_DIR,
    )
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError("CMake build failed. Review the detailed build logs above.") from e
print("CMake build process completed (check logs above for any specific warnings/errors for 'quantize').")

print(f"'{LLAMA_CPP_REPO_NAME}' built. Binaries should be in: {LLAMA_CPP_BIN_DIR}")

# --- DIAGNOSTIC: List contents of the build/bin directory AND check quantize explicitly ---
print(f"\nDIAGNOSTIC: Listing contents of '{LLAMA_CPP_BIN_DIR}':")
run_step(["ls", "-F", LLAMA_CPP_BIN_DIR], check=False)
print("-" * 50)
quantize_executable_path_check = find_quantize_executable()
if quantize_executable_path_check is not None:
    print(f"DIAGNOSTIC: quantize executable found and is executable at {quantize_executable_path_check}")
else:
    print(f"DIAGNOSTIC: WARNING: none of {', '.join(QUANTIZE_EXECUTABLE_NAMES)} found (or not executable) in {LLAMA_CPP_BIN_DIR}")
    print("This indicates a problem with the CMake build specifically for the quantize tool.")
print("-" * 50)


# --- Step 4: Convert your Hugging Face model to GGUF (float32) ---
print(f"\n4. Converting Hugging Face model from '{source_model_path}' to GGUF (float32) using '{ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP}'...")
if not os.path.exists(ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP):
    raise FileNotFoundError(
        f"CRITICAL ERROR: Conversion script not found at {ACTIVE_CONVERT_SCRIPT_PATH_IN_LLAMA_CPP}. Aborting."
    )

try:
    # Run from the llama.cpp root; source and output paths are absolute.
    run_step(
        [
            sys.executable, ACTIVE_CONVERT_SCRIPT_NAME, source_model_path,
            "--outfile", output_gguf_fp32_path,
            "--outtype", "f32",
        ],
        cwd=LLAMA_CPP_DIR,
    )
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError("Conversion script execution failed. Review the output above.") from e
if not os.path.exists(output_gguf_fp32_path):
    raise RuntimeError(f"Conversion finished but no GGUF file was written to {output_gguf_fp32_path}.")
print("Conversion script executed.")


# --- Step 5: Quantize the GGUF model to a smaller, CPU-optimized format (e.g., Q4_K_M) ---
print(f"\n5. Quantizing GGUF model to {quantization_method}...")
quantize_executable_path = find_quantize_executable()
if quantize_executable_path is None:
    raise FileNotFoundError(
        f"CRITICAL ERROR: quantize executable not found or not executable in {LLAMA_CPP_BIN_DIR} "
        f"(looked for {', '.join(QUANTIZE_EXECUTABLE_NAMES)}). CMake build might have had issues."
    )

print(f"Executing quantize: {quantize_executable_path}")
try:
    run_step([
        quantize_executable_path,
        output_gguf_fp32_path,
        final_quantized_output_gguf_path,
        quantization_method,
    ])
except (subprocess.CalledProcessError, OSError) as e:
    raise RuntimeError("Quantization failed. Review the output above.") from e

# --- Step 6: Final Verification and Instructions ---
if not os.path.exists(final_quantized_output_gguf_path):
    raise RuntimeError(f"Quantization finished but no file was written to {final_quantized_output_gguf_path}.")
print("\n--- GGUF Conversion (via llama.cpp) Process Complete ---")
print(f"Your final quantized GGUF model is located at: {os.path.abspath(final_quantized_output_gguf_path)}")
print(f"You can find the tokenizer files from your original model at: {os.path.abspath(source_model_path)}")
print("\n--- If the GGUF file was created, proceed to upload it and tokenizer files to Hugging Face Hub. ---")