In [None]:
import sys
import os
from pathlib import Path

# Set up cache directory
CACHE_DIR = Path("/kaggle/working/rag_cache")
CACHE_DIR.mkdir(exist_ok=True)

# --- FAISS Installation Block ---
print("Setting up FAISS...")

# First try GPU version (may fail in Kaggle)
!pip install --quiet faiss-gpu --no-deps || echo "GPU install failed, falling back to CPU"

# If GPU install failed, install CPU version
if "faiss-gpu" not in sys.modules:
    !pip install --quiet faiss-cpu

# Verify installation
try:
    import faiss
    print("✅ FAISS imported - GPU version" if "faiss-gpu" in sys.modules 
          else "✅ FAISS imported - CPU version")
    
    # Test basic functionality
    dimension = 128
    index = faiss.IndexFlatL2(dimension)
    
    # Try to use GPU if available
    USE_GPU = False
    if "faiss-gpu" in sys.modules and torch.cuda.is_available():
        try:
            res = faiss.StandardGpuResources()
            index = faiss.index_cpu_to_gpu(res, 0, index)
            USE_GPU = True
            print("FAISS GPU acceleration enabled")
        except Exception as e:
            print(f"GPU acceleration failed, using CPU: {str(e)}")
    
    print(f"Test index created (dim={dimension}):", index.is_trained)
    HAS_FAISS = True
    
except Exception as e:
    print("❌ FAISS installation failed completely:", str(e))
    HAS_FAISS = False

# --- Core Package Installation ---
print("\nInstalling other requirements...")

REQUIRED_PACKAGES = [
    'sentence-transformers',
    'scikit-learn',
    'transformers',
    'numpy',
    'torch'
]

!pip install --quiet {' '.join(REQUIRED_PACKAGES)}

# --- Verification Block ---
print("\nVerifying imports...")

try:
    from sentence_transformers import SentenceTransformer
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import sklearn
    import torch
    import numpy as np
    print("✅ All core packages imported!")
    
    if HAS_FAISS:
        if USE_GPU:
            print("FAISS GPU available for accelerated search")
        else:
            print("FAISS CPU available (no GPU acceleration)")
    else:
        print("Using cosine similarity fallback")
        
except ImportError as e:
    print("❌ Import failed:", e)
    raise

In [2]:
import gc
import torch

# Free GPU memory before the large models are loaded.
# First pass: hand cached-but-unused CUDA blocks back to the driver.
torch.cuda.empty_cache()
# Drop unreachable Python objects (which may still hold tensors).
gc.collect()

# Second pass: release whatever the garbage collection above just freed.
torch.cuda.empty_cache()  # Clears unused cached memory
# Collect GPU memory that was released through CUDA IPC.
torch.cuda.ipc_collect()

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from typing import List, Dict, Any





In [4]:
import pickle
from pathlib import Path
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import random
import faiss
import time
from typing import Dict

# Kaggle-specific paths
CACHE_DIR = Path("/kaggle/working/rag_cache")
# parents=True: without it mkdir raises FileNotFoundError whenever the parent
# directory does not exist yet; exist_ok=True keeps re-running the cell safe.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
class Retriever:
    """Embeds "question [SEP] chunk" pairs and retrieves the closest chunks.

    Two search backends are supported: a brute-force dot product over the
    L2-normalised embeddings (always available) and an optional FAISS
    inner-product index. State is cached under the module-level ``CACHE_DIR``.
    """

    # Minimum inner-product score a FAISS hit must reach to be returned.
    FAISS_MIN_SCORE = 0.65

    def __init__(self, model_name='all-MiniLM-L6-v2', device='cuda', use_faiss=False):
        """Load the embedding model.

        Args:
            model_name: SentenceTransformer model name or local path.
            device: Device string passed to SentenceTransformer.
            use_faiss: Opt in to the FAISS backend. Defaults to False, which
                keeps the previous behaviour (cosine search only).
        """
        self.model = SentenceTransformer(model_name, device=device)
        self.documents = []
        self.embeddings = None
        self.index = None
        self.faiss = None

        # FAISS stays opt-in; when requested, probe that it really works.
        self.has_faiss = self._initialize_faiss() if use_faiss else False

    def _initialize_faiss(self):
        """Return True (and remember the module on ``self.faiss``) if FAISS is usable."""
        try:
            import faiss
            # Smoke test: building a tiny index proves the native library loads.
            faiss.IndexFlatL2(128)
        except Exception:
            # `Exception`, not a bare except, so Ctrl-C / SystemExit still propagate.
            print("FAISS not available, falling back to cosine similarity")
            return False
        self.faiss = faiss
        return True

    @staticmethod
    def _normalize_rows(vectors):
        """L2-normalise each row; all-zero rows are left as zeros instead of NaN."""
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return vectors / norms

    @staticmethod
    def _format_hit(doc: dict, score) -> dict:
        """Build the result record shared by both search backends."""
        return {
            'text': doc['text'],
            'score': float(score),
            'question': doc['question'],
            'contains_answer': doc['contains_answer'],
            'source': doc['source'],
            'metadata': doc['metadata']
        }

    def save_state(self):
        """Save state to disk"""
        state = {
            'documents': self.documents,
            'embeddings': self.embeddings,
            'has_faiss': self.has_faiss  # Save FAISS availability state
        }

        with open(CACHE_DIR/'retriever_state.pkl', 'wb') as f:
            pickle.dump(state, f)

        # Explicit None check, consistent with query().
        if self.has_faiss and self.index is not None:
            self.faiss.write_index(self.index, str(CACHE_DIR/'faiss_index.bin'))

    def load_state(self):
        """Load state from disk.

        Returns:
            True when a cached state file was found and loaded, else False.
        """
        try:
            # NOTE: pickle can execute arbitrary code while loading. Only load
            # cache files that this notebook wrote itself.
            with open(CACHE_DIR/'retriever_state.pkl', 'rb') as f:
                state = pickle.load(f)
        except FileNotFoundError:
            return False

        self.documents = state['documents']
        self.embeddings = state['embeddings']
        self.has_faiss = state.get('has_faiss', False)

        if self.has_faiss:
            try:
                import faiss
            except ImportError:
                # The cache was written with FAISS, but this session lacks it.
                print("FAISS not available, falling back to cosine similarity")
                self.has_faiss = False
            else:
                self.faiss = faiss
                faiss_file = CACHE_DIR/'faiss_index.bin'
                if faiss_file.exists():
                    self.index = self.faiss.read_index(str(faiss_file))

        return True

    def add_documents(self, documents: list, batch_size=128, force_reindex=False):
        """Flatten dataset entries into chunks, embed them and cache the result.

        Args:
            documents: Dataset entries; each dict may carry 'question',
                'metadata' and a list of 'context_chunks'. Non-dict entries
                are skipped.
            batch_size: Batch size passed to the embedding model.
            force_reindex: When False, a cached state on disk is loaded
                instead of re-embedding.
        """
        if not force_reindex and self.load_state():
            print("Loaded cached document embeddings")
            return

        self.documents = []

        # Process documents
        for entry in documents:
            if not isinstance(entry, dict):
                continue

            question = entry.get('question', '')
            metadata = entry.get('metadata', {})

            for chunk in entry.get('context_chunks', []):
                self.documents.append({
                    'text': chunk.get('text', ''),
                    'question': question,
                    'contains_answer': chunk.get('contains_answer', False),
                    'chunk_score': chunk.get('score', 0),
                    'source': chunk.get('source', 'unknown'),
                    'metadata': metadata
                })

        if not self.documents:
            # Nothing to embed: normalising an empty result would fail, so
            # leave the retriever empty; queries then return no hits.
            self.embeddings = None
            self.index = None
            print(f"Indexed {len(self.documents)} chunks")
            return

        # Generate embeddings
        pairs = [f"{doc['question']} [SEP] {doc['text']}" for doc in self.documents]
        embeddings = np.asarray(self.model.encode(
            pairs,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_tensor=False
        ))

        # Normalize embeddings so a dot product equals cosine similarity.
        self.embeddings = self._normalize_rows(embeddings)

        # Build FAISS index if available
        if self.has_faiss:
            dimension = self.embeddings.shape[1]
            self.index = self.faiss.IndexFlatIP(dimension)
            self.index.add(self.embeddings.astype('float32'))

        self.save_state()
        print(f"Indexed {len(self.documents)} chunks")

    def query(self, question: str, k=5) -> List[dict]:
        """Unified query method that automatically uses the best available backend"""
        if not hasattr(self, 'has_faiss'):  # Backward compatibility
            self.has_faiss = self._initialize_faiss()

        if self.has_faiss and self.index is not None:
            print("Using FAISS for accelerated search")
            return self._query_faiss(question, k)
        else:
            print("Using cosine similarity (FAISS not available)")
            return self._query_cosine(question, k)

    def _query_faiss(self, question: str, k=5) -> List[dict]:
        """FAISS search with score thresholding and text de-duplication.

        Returns up to ``k`` hits scoring at least ``FAISS_MIN_SCORE``. When
        nothing passes the threshold, a single "NO_RELEVANT_CONTENT"
        placeholder with score 0 is returned.
        """
        if k <= 0:
            return []

        question_embedding = self._normalize_rows(
            np.asarray(self.model.encode([question], convert_to_tensor=False)))
        question_embedding = question_embedding.astype('float32').reshape(1, -1)

        # Over-fetch so thresholding and de-duplication can still fill k slots.
        distances, indices = self.index.search(question_embedding, k*5)

        results = []
        seen_texts = set()

        for idx, score in zip(indices[0], distances[0]):
            # Negative ids are skipped (padding for missing neighbours).
            if idx < 0 or score < self.FAISS_MIN_SCORE:
                continue

            doc = self.documents[int(idx)]
            if doc['text'] not in seen_texts:
                # Same record shape as the cosine backend.
                results.append(self._format_hit(doc, score))
                seen_texts.add(doc['text'])
                if len(results) >= k:
                    break

        return results if results else [{'text': "NO_RELEVANT_CONTENT", 'score': 0}]

    def _query_cosine(self, question: str, k=5) -> List[dict]:
        """Cosine similarity fallback: brute-force dot product over all chunks."""
        # k == 0 would make the `[-k*2:]` slice below select *everything*, and
        # an empty retriever has no embeddings to search.
        if k <= 0 or self.embeddings is None or len(self.documents) == 0:
            return []

        question_embedding = self._normalize_rows(
            np.asarray(self.model.encode([question], convert_to_tensor=False)))

        scores = np.dot(self.embeddings, question_embedding.T).flatten()

        results = []
        seen_texts = set()

        # Inspect the 2k best candidates so duplicates can be skipped.
        for idx in np.argsort(scores)[-k*2:][::-1]:
            doc = self.documents[idx]
            if doc['text'] not in seen_texts:
                results.append(self._format_hit(doc, scores[idx]))
                seen_texts.add(doc['text'])
                if len(results) >= k:
                    break

        return results

    def benchmark_retrieval(self, question: str, k: int = 5, runs: int = 10) -> Dict:
        """Benchmark both retrieval methods and return timing results.

        Args:
            question: Query used for every run.
            k: Number of hits requested per query.
            runs: Number of timed repetitions per backend (must be positive).

        Returns:
            Dict with 'question', 'runs' and, per backend, the list of
            'times' (seconds), their 'avg' and two sample 'results'.

        Raises:
            ValueError: If ``runs`` is not positive.
        """
        if runs <= 0:
            raise ValueError("runs must be a positive integer")

        results = {
            'question': question,
            'runs': runs,
            'faiss': {'times': [], 'avg': 0, 'results': None},
            'cosine': {'times': [], 'avg': 0, 'results': None}
        }

        # Only benchmark FAISS if it's available
        if self.has_faiss and self.index is not None:
            for _ in range(runs):
                start = time.perf_counter()
                faiss_results = self._query_faiss(question, k)
                elapsed = time.perf_counter() - start
                results['faiss']['times'].append(elapsed)
            results['faiss']['avg'] = sum(results['faiss']['times'])/runs
            results['faiss']['results'] = faiss_results[:2]  # Sample results

        # Always benchmark cosine
        for _ in range(runs):
            start = time.perf_counter()
            cosine_results = self._query_cosine(question, k)
            elapsed = time.perf_counter() - start
            results['cosine']['times'].append(elapsed)
        results['cosine']['avg'] = sum(results['cosine']['times'])/runs
        results['cosine']['results'] = cosine_results[:2]  # Sample results

        return results

In [6]:
class Generator:
    """Wraps a causal language model: builds the RAG prompt and samples an answer."""

    def __init__(self, model_name='deepseek-ai/deepseek-coder-6.7b-instruct'):
        """Load tokenizer and model (fp16, placement chosen by device_map="auto").

        Args:
            model_name: Hugging Face model id or local path.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto"
        )
    
    def build_prompt(self, question: str, context_chunks: list[dict]) -> str:
        """Build the prompt for ``question`` from the retrieved chunks.

        Only chunks with a 'score' of at least 0.65 are used, and at most the
        first three of those are inserted as context blocks. When no chunk
        qualifies, a short prompt that already ends with the "I don't know"
        answer is returned instead.

        Args:
            question: The user question.
            context_chunks: Dicts with at least 'text' and 'score'.

        Returns:
            The prompt string. The context variant ends with an open
            ``<answer>`` tag for the model to complete.
        """
        
        # Filter chunks by score threshold
        valid_chunks = [c for c in context_chunks if c['score'] >= 0.65]
        
        if not valid_chunks:
            return f"""Answer the question following these rules:
    1. If the question isn't related to Python (language, ecosystem, tools, or community), 
       respond EXACTLY with: "<answer>I don't know</answer>"
    2. If no relevant context exists, respond EXACTLY with: "<answer>I don't know</answer>"
    
    Question: {question}
    
    <answer>I don't know</answer>"""
    
        context_str = "\n".join(
            f"### Context Block {i+1} [Relevance: {c['score']:.2f}]:\n{c['text']}\n"
            f"----------------------------"
            for i, c in enumerate(valid_chunks[:3])
        )
    
        return f"""You are a Python specialist assistant. Follow these rules STRICTLY:
    
    {context_str}
    
    ### Python Scope Includes:
    - Python programming language (any version)
    - Standard library and built-in functions
    - Popular frameworks (Django, Flask, etc.)
    - Scientific Python (NumPy, Pandas, etc.)
    - Python packaging and distribution (pip, PyPI)
    - Official Python documentation
    - PEP standards
    - Python community conventions
    
    ### Response Protocol:
    1. FIRST determine if the question relates to ANY aspect of Python (see scope above)
    2. THEN verify if the answer exists in the provided context
    3. If both conditions are met:
       - Cite which context block(s) you're using
       - Provide a concise answer
    4. If either condition fails:
       - Respond EXACTLY with "<answer>I don't know</answer>"
    5. ALWAYS wrap your final response in <answer></answer> tags
    
    Question: {question}
    
    ### Step-by-Step Verification:
    1. Python relevance check:
    2. Context adequacy check: 
    3. Source context block(s):
    4. Final answer:
    
    <answer>"""
    
    def generate_answer(self, prompt: str) -> str:
        """Sample a completion for ``prompt``.

        Returns:
            The decoded full sequence (prompt followed by the generated
            text), with special tokens removed.
        """
        # The model is placed by device_map="auto", so follow the model's own
        # device instead of hard-coding 'cuda' (which fails on CPU-only hosts).
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=50,
            repetition_penalty=1.1,
            # Reuse EOS as the pad token id during generation.
            pad_token_id=self.tokenizer.eos_token_id
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)



In [7]:
class RAGPipeline:
    """Glues a Retriever and a Generator into a question -> answer pipeline."""

    def __init__(self, retriever_model='all-MiniLM-L6-v2', 
                 generator_model='deepseek-ai/deepseek-coder-6.7b-instruct'):
        """Instantiate both components; the models are loaded here."""
        self.retriever = Retriever(model_name=retriever_model)
        self.generator = Generator(model_name=generator_model)
    
    def load_dataset(self, dataset_path):
        """Load and parse the JSON dataset"""
        # Explicit UTF-8: the platform default encoding is not reliable for JSON.
        with open(dataset_path, 'r', encoding='utf-8') as f:
            return json.load(f)  # Returns list of QA objects
    
    def prepare_documents(self, dataset):
        """Pass through the data without transformation"""
        return dataset  # Already in correct format    
        
    def add_documents_from_dataset(self, dataset_path: str, force_reindex=False, batch_size: int = 1):
        """Kaggle-friendly document loading"""
        dataset = self.load_dataset(dataset_path)
        documents = self.prepare_documents(dataset)
        self.retriever.add_documents(documents, force_reindex=force_reindex, batch_size = batch_size)

    @staticmethod
    def _extract_answer(prompt: str, raw_answer: str) -> str:
        """Return the text of the answer that directly continues the prompt.

        ``raw_answer`` is the prompt followed by the generation, and the
        prompt itself mentions ``<answer>`` tags. Taking the *last* tag would
        pick up any extra tag the model emits after its real answer, so the
        tags already present in the prompt are counted and the segment that
        follows the prompt's final ``<answer>`` is used.
        """
        open_tag, close_tag = "<answer>", "</answer>"
        if open_tag not in raw_answer or close_tag not in raw_answer:
            return "I don't know"

        segments = raw_answer.split(open_tag)
        # Segment n is the text after the n-th open tag; with no tag in the
        # prompt, the first generated answer is segment 1.
        position = max(prompt.count(open_tag), 1)
        if position >= len(segments):
            # The prompt did not round-trip through decoding; fall back to
            # the last tagged segment.
            position = len(segments) - 1
        return segments[position].split(close_tag)[0].strip()

    @staticmethod
    def _format_result(question: str, chunks: list, answer: str) -> str:
        """Render question, top-3 context chunks and answer as a text report."""
        context_section = "\n".join(
            f"Source {i+1} [Relevance: {chunk['score']:.2f}]:\n{chunk['text']}\n{'-'*50}"
            for i, chunk in enumerate(chunks[:3])  # Show top 3 chunks
        )
        return f"""
{'='*50}
QUESTION: {question}

{'='*50}
CONTEXT:
{'-'*50}
{context_section}

{'='*50}
ANSWER:
{'-'*50}
{answer}
{'='*50}
"""
    
    def query(self, question: str, k=5) -> dict:
        """Run retrieval and generation for ``question``.

        Args:
            question: The user question.
            k: Number of chunks to request from the retriever.

        Returns:
            Dict with 'question', 'answer' ("I don't know" when no answer
            could be extracted), 'context' (texts of up to five chunks) and
            'formatted' (a printable report).
        """
        chunks = self.retriever.query(question, k=k)
        prompt = self.generator.build_prompt(question, chunks)
        raw_answer = self.generator.generate_answer(prompt)
        
        # An empty extraction is reported as "I don't know" as well.
        answer = self._extract_answer(prompt, raw_answer) or "I don't know"
        
        return {
            'question': question,
            'answer': answer,
            'context': [chunk['text'] for chunk in chunks[:5]],
            'formatted': self._format_result(question, chunks, answer)
        }

    def print_formatted_result(self, result: dict):
        """Prints the result in a human-readable format"""
        print(result.get('formatted', "No formatted output available"))

In [8]:
# Build the pipeline a single time; both models are loaded by this call.
RETRIEVER_MODEL = '/kaggle/input/fine-tuned-retrievers/triplet_loss_finetuned_intfloat_e5-base'
GENERATOR_MODEL = 'deepseek-ai/deepseek-coder-6.7b-instruct'

pipeline = RAGPipeline(retriever_model=RETRIEVER_MODEL, generator_model=GENERATOR_MODEL)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [9]:
import gc
import torch

# Free GPU memory again now that the models have been loaded.
# First pass: hand cached-but-unused CUDA blocks back to the driver.
torch.cuda.empty_cache()
# Drop unreachable Python objects (which may still hold tensors).
gc.collect()

# Second pass: release whatever the garbage collection above just freed.
torch.cuda.empty_cache()  # Clears unused cached memory
# Collect GPU memory that was released through CUDA IPC.
torch.cuda.ipc_collect()

In [None]:
# Location of the QA dataset whose context chunks get embedded.
DATASET_PATH = '/kaggle/input/simple-python-qa-rag-dataset-v2/simple_python_qa_rag_dataset.json'

# Always rebuild the embeddings (ignoring any cached state), 128 pairs per batch.
pipeline.add_documents_from_dataset(
    DATASET_PATH,
    batch_size=128,
    force_reindex=True,
)

In [None]:
# Questions to send through the full retrieve-then-generate pipeline.
questions = ["ModuleNotFoundError with pytest"]

# Answer each question and print its formatted report.
for question in questions:
    result = pipeline.query(question)
    pipeline.print_formatted_result(result)

In [None]:
# Time both retrieval backends on one fixed question (more runs = steadier averages).
benchmark_results = pipeline.retriever.benchmark_retrieval(
    question="How to format strings in Python", k=5, runs=200
)

# An average of 0 means the FAISS backend was not benchmarked.
faiss_avg = benchmark_results['faiss']['avg']
cosine_avg = benchmark_results['cosine']['avg']

print(f"\nBenchmark Results (avg of {benchmark_results['runs']} runs):")
if faiss_avg:
    print(f"FAISS: {faiss_avg*1000:.2f}ms")
print(f"Cosine: {cosine_avg*1000:.2f}ms")

if not faiss_avg:
    print("\nFAISS not available for comparison")
else:
    speedup = cosine_avg/faiss_avg
    print(f"\nFAISS is {speedup:.1f}x faster than cosine similarity")