In [13]:
# section1 Import required libraries
import logging
import os
import re
from typing import Any, Dict, List

import nltk
import numpy as np
import torch
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymilvus import connections, Collection, utility, CollectionSchema, FieldSchema, DataType
from tqdm.notebook import tqdm
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Setup logging
# Configure the root logger at INFO level and create this notebook's
# module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def verify_section(section_number: Any, verification_func) -> bool:
    """Run a section's verification callable and report the outcome.

    Args:
        section_number: Label shown in the status line. Callers in this
            notebook pass descriptive names (e.g. "CUDA Setup"), so any value
            that can be formatted into a string is accepted.
        verification_func: Zero-argument callable. It signals failure either
            by raising an exception or by returning ``False``.

    Returns:
        True when the callable completed without raising and did not return
        ``False``; False otherwise. Exceptions raised by the callable are
        reported through print_status and never propagated.
    """
    label = f"Section {section_number} Verification"
    try:
        outcome = verification_func()
    except Exception as e:
        print_status(label, False, f"Error: {str(e)}")
        return False

    # Some verifiers (e.g. ones that forward check_cuda()) report failure by
    # returning False instead of raising; honour that rather than printing a
    # misleading success banner. None and other values still count as success.
    if outcome is False:
        print_status(label, False, "Verification function returned False")
        return False

    print_status(label, True, "Successfully executed")
    return True



In [2]:
## section 2 
def print_status(section_name: str, status: bool, message: str = ""):
    """Emit a pass/fail banner for a step, followed by an optional detail line.

    Args:
        section_name: Name shown after the banner.
        status: Truthy selects the success banner, falsy the failure banner.
        message: Extra detail printed on its own indented line; skipped when empty.
    """
    banner = {True: "✅ SUCCESS", False: "❌ FAILED"}[bool(status)]
    output_lines = [f"\n{banner} | {section_name}"]
    if message:
        output_lines.append(f"  └─ {message}")
    for line in output_lines:
        print(line)
# Section 2 self-check: exercise print_status once
def verify_section2():
    """Smoke-test print_status by emitting one sample success line."""
    print_status("Test", status=True, message="Test message")
    return True

verify_section("Status Function", verify_section2)




✅ SUCCESS | Test
  └─ Test message

✅ SUCCESS | Section Status Function Verification
  └─ Successfully executed


True

In [14]:
def get_model_embedding_dim(model_name: str = "TurkuNLP/bert-base-finnish-cased-v1") -> int:
    """Return the hidden size (embedding dimension) declared in a model's config.

    Only the model configuration is loaded; the weights are not instantiated,
    because the dimension is a config attribute and nothing else is needed here.

    Args:
        model_name: Hugging Face model identifier or local path.

    Returns:
        The ``hidden_size`` value from the model configuration.
    """
    config = AutoConfig.from_pretrained(model_name)
    return config.hidden_size

# Get the embedding dimension from the model
# Module-level constant: MilvusManager uses it as the vector field dimension
# and EmbeddingGenerator.generate checks its output width against it.
EMBEDDING_DIM = get_model_embedding_dim()
print_status("Embedding Dimension", True, f"Using dimension: {EMBEDDING_DIM}")




✅ SUCCESS | Embedding Dimension
  └─ Using dimension: 768


In [4]:
## section or cell 3
def check_cuda():
    """Report which compute device torch will use.

    Returns:
        True after reporting either the GPU name or the CPU fallback;
        False only when probing the device raised an exception.
    """
    try:
        if not torch.cuda.is_available():
            print_status("CUDA Check", True, "Using CPU")
            return True

        gpu_name = torch.cuda.get_device_name(0)
        print_status("CUDA Check", True, f"Using GPU: {gpu_name}")
        return True
    except Exception as e:
        print_status("CUDA Check", False, str(e))
        return False
def verify_section3():
    """Section 3 check: run check_cuda and pass its boolean result through."""
    cuda_ok = check_cuda()
    return cuda_ok

verify_section("CUDA Setup", verify_section3)




✅ SUCCESS | CUDA Check
  └─ Using GPU: NVIDIA GeForce RTX 2070 SUPER

✅ SUCCESS | Section CUDA Setup Verification
  └─ Successfully executed


True

In [5]:
## section or cell 4
def ensure_stopwords_downloaded(language='finnish'):
    """Make sure the NLTK stopword list for ``language`` is usable.

    Attempts to download the ``stopwords`` corpus, then confirms that the
    requested language is actually present in it. A failed download is
    tolerated when the corpus is already available locally.

    Args:
        language: Stopword list to check for (a file id of the NLTK
            ``stopwords`` corpus, e.g. 'finnish').

    Returns:
        True when the stopword list for ``language`` can be used, False otherwise.
        Failures are reported through print_status, not raised.
    """
    try:
        # nltk.download() reports failure through its boolean return value
        # rather than by raising, so the result has to be inspected.
        downloaded = nltk.download('stopwords', quiet=True)

        # Function-scope import keeps this helper self-contained.
        from nltk.corpus import stopwords
        try:
            available = language in stopwords.fileids()
        except LookupError:
            # Raised by NLTK when the corpus is not installed at all.
            available = False

        if not available:
            reason = "language not in corpus" if downloaded else "download failed"
            print_status("NLTK Setup", False,
                         f"Stopwords for '{language}' are unavailable ({reason})")
            return False

        if downloaded:
            detail = f"Downloaded {language} stopwords"
        else:
            detail = f"Using locally cached {language} stopwords (download failed)"
        print_status("NLTK Setup", True, detail)
        return True
    except Exception as e:
        print_status("NLTK Setup", False, str(e))
        return False

# Milvus Connection Settings
# RAGPipeline.__init__ passes these three values to MilvusManager.
MILVUS_HOST = "milvus-standalone"
MILVUS_PORT = "19530"  # kept as a string, matching MilvusManager's `port: str` parameter
MILVUS_ALIAS = "default"
# Section 4 self-check: stopwords available and Milvus settings populated
def verify_section4():
    """Check the stopword download and that every Milvus-related setting is truthy.

    Raises:
        Exception: when the stopword download failed or any setting is empty/zero.
    """
    if not ensure_stopwords_downloaded():
        raise Exception("Failed to download stopwords")

    required_settings = (MILVUS_HOST, MILVUS_PORT, MILVUS_ALIAS, EMBEDDING_DIM)
    for setting in required_settings:
        if not setting:
            raise Exception("Milvus settings not properly defined")
    return True

verify_section("NLTK and Milvus Settings", verify_section4)




✅ SUCCESS | NLTK Setup
  └─ Downloaded finnish stopwords

✅ SUCCESS | Section NLTK and Milvus Settings Verification
  └─ Successfully executed


True

In [6]:
## section or cell 5
class DocumentProcessor:
    """Reads .docx files, normalizes their text and splits it into chunks with metadata."""

    # Expected title layout: "<name> <age>v <document id>", e.g. "Matti 85v ABC-123".
    # The name part accepts any Unicode letters (so Finnish names with ä/ö/å match)
    # and hyphenated double names; digits and underscores are excluded.
    _FILENAME_PATTERN = re.compile(
        r'([^\W\d_]+(?:-[^\W\d_]+)*)\s+(\d{1,3})v\s+([A-Za-z0-9\-]+)'
    )

    def __init__(self, chunk_size=400, chunk_overlap=50):
        """Create the text splitter.

        Args:
            chunk_size: Maximum chunk length, measured with ``len`` (characters).
            chunk_overlap: Number of characters shared between neighbouring chunks.

        Raises:
            Exception: re-raised after reporting if the splitter cannot be built.
        """
        try:
            self.text_splitter = RecursiveCharacterTextSplitter(
                separators=["\n\n", "\n", ". ", ", ", " "],
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
                keep_separator=True,
                add_start_index=True
            )
            print_status("Document Processor", True, "Initialized successfully")
        except Exception as e:
            print_status("Document Processor", False, str(e))
            raise

    def extract_metadata_from_filename(self, filename: str) -> tuple:
        """Extract (person name, age, document id) from a file name.

        Args:
            filename: File name with extension; only the part before the
                extension is parsed.

        Returns:
            ``(name, age, document_id)`` with ``age`` as int when the title
            starts with the expected layout, otherwise ``(None, None, None)``.
        """
        title = os.path.splitext(filename)[0]
        match = self._FILENAME_PATTERN.match(title)
        if match:
            return match.group(1), int(match.group(2)), match.group(3)
        return None, None, None

    def preprocess_text(self, text: str) -> str:
        """Clean and normalize Finnish text.

        Collapses every whitespace run into a single space, removes characters
        other than word characters, whitespace and basic punctuation, and
        strips the ends.

        NOTE: newlines are collapsed too, so the splitter's "\\n\\n" and "\\n"
        separators never match text produced by this method; chunks are
        effectively split on ". ", ", " and " ".
        """
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\.\,\?\!\-\:\;äöåÄÖÅ]', '', text)
        return text.strip()

    def process_document(self, file_path: str) -> List[Dict[str, Any]]:
        """Process a single document and return chunks with metadata.

        Args:
            file_path: Path of the .docx file to read.

        Returns:
            One dict per chunk with keys ``"text"`` and ``"metadata"``; the
            metadata holds ``source``, ``person_name``, ``person_age``,
            ``document_id`` (the last three are None when the file name does
            not follow the expected layout) and ``chunk_index``.

        Raises:
            Exception: re-raised after reporting when reading or splitting fails.
        """
        try:
            # Read document: paragraph texts joined with newlines
            doc = Document(file_path)
            text = "\n".join(para.text for para in doc.paragraphs)

            # Extract metadata from the file name
            filename = os.path.basename(file_path)
            name, age, doc_id = self.extract_metadata_from_filename(filename)

            # Preprocess and split text
            clean_text = self.preprocess_text(text)
            chunks = self.text_splitter.split_text(clean_text)

            # Attach the shared document metadata to every chunk
            processed_chunks = [
                {
                    "text": chunk,
                    "metadata": {
                        "source": filename,
                        "person_name": name,
                        "person_age": age,
                        "document_id": doc_id,
                        "chunk_index": i
                    }
                }
                for i, chunk in enumerate(chunks)
            ]

            print_status("Document Processing", True, f"Processed {filename}")
            return processed_chunks
        except Exception as e:
            print_status("Document Processing", False, f"Error processing {file_path}: {str(e)}")
            raise
# Section 5 self-check: build a processor and clean a sample string
def verify_section5():
    """Instantiate DocumentProcessor and confirm preprocess_text returns non-empty text.

    Raises:
        Exception: when the cleaned sample text is empty.
    """
    cleaned = DocumentProcessor().preprocess_text("Test document content")
    if cleaned:
        return True
    raise Exception("Document processor initialization failed")

verify_section("DocumentProcessor", verify_section5)




✅ SUCCESS | Document Processor
  └─ Initialized successfully

✅ SUCCESS | Section DocumentProcessor Verification
  └─ Successfully executed


True

In [7]:
## section or cell 6
class MilvusManager:
    """Helper around a pymilvus connection and the document-embedding collection.

    Every pymilvus call is routed through ``self.alias`` so the manager also
    works when it is bound to a connection alias other than "default".
    """

    def __init__(self, host: str = "milvus-standalone", port: str = "19530", alias: str = "default"):
        """Store the connection settings and connect immediately.

        Raises:
            Exception: whatever connect() raises when the server is unreachable.
        """
        self.host = host
        self.port = port
        self.alias = alias
        self.connected = False
        self.connect()

    def connect(self):
        """Establish connection to Milvus.

        Removes a connection already registered under the same alias, opens a
        new one and verifies it by asking for the server version. Sets
        ``self.connected`` accordingly.

        Raises:
            Exception: re-raised after reporting when connecting or verifying fails.
        """
        try:
            # Remove a previous connection only if one is actually registered,
            # so the "cleaned up" message is truthful.
            try:
                if connections.has_connection(self.alias):
                    connections.remove_connection(alias=self.alias)
                    print_status("Milvus Connection", True, "Cleaned up existing connection")
            except Exception as cleanup_error:
                # Best effort: a failed cleanup must not block the new connection.
                logger.debug("Ignoring Milvus connection cleanup error: %s", cleanup_error)

            # Attempt to establish new connection
            connections.connect(
                alias=self.alias,
                host=self.host,
                port=self.port,
                timeout=10.0
            )

            # Verify connection is working (on this manager's alias)
            try:
                utility.get_server_version(using=self.alias)
                self.connected = True
                print_status("Milvus Connection", True, f"Connected to {self.host}:{self.port}")
            except Exception as ve:
                raise Exception(f"Connection verification failed: {str(ve)}")

        except Exception as e:
            self.connected = False
            print_status("Milvus Connection", False, str(e))
            raise

    def create_collection(self, collection_name: str = "document_embeddings"):
        """Create Milvus collection with appropriate schema.

        WARNING: an existing collection with the same name is dropped first
        (together with its data) so the vector dimension always matches
        EMBEDDING_DIM.

        Returns:
            The newly created, indexed and loaded collection.

        Raises:
            Exception: re-raised after reporting when any step fails.
        """
        try:
            # Drop an existing collection to guarantee the correct dimension
            if utility.has_collection(collection_name, using=self.alias):
                Collection(name=collection_name, using=self.alias).drop()
                print_status("Milvus Collection", True, f"Dropped existing collection: {collection_name}")

            fields = [
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
                FieldSchema(name="person_name", dtype=DataType.VARCHAR, max_length=100),
                FieldSchema(name="person_age", dtype=DataType.INT64),
                FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=100),
                FieldSchema(name="chunk_index", dtype=DataType.INT64)
            ]

            schema = CollectionSchema(
                fields=fields,
                description="Document embeddings collection",
                enable_dynamic_field=False
            )
            collection = Collection(name=collection_name, schema=schema, using=self.alias)

            # Create index and load collection
            self.create_and_load_index(collection)

            print_status("Milvus Collection", True, f"Created new collection: {collection_name} with dim={EMBEDDING_DIM}")
            return collection
        except Exception as e:
            print_status("Milvus Collection", False, str(e))
            raise

    def create_and_load_index(self, collection):
        """Create index and load collection into memory.

        Builds an IVF_FLAT index with the inner-product metric on the
        ``embedding`` field, then loads the collection.

        Raises:
            Exception: re-raised after reporting when indexing or loading fails.
        """
        try:
            index_params = {
                "metric_type": "IP",
                "index_type": "IVF_FLAT",
                "params": {"nlist": 1024}
            }
            collection.create_index(field_name="embedding", index_params=index_params)
            print_status("Index Creation", True, "Created IVF_FLAT index")

            # Load collection into memory
            collection.load()
            print_status("Collection Load", True, "Loaded collection into memory")

        except Exception as e:
            print_status("Index Creation", False, str(e))
            raise

    def reload_collection(self, collection_name: str = "document_embeddings"):
        """Reload collection into memory if it exists.

        Returns:
            The loaded collection.

        Raises:
            Exception: when the collection does not exist or loading fails.
        """
        try:
            if not utility.has_collection(collection_name, using=self.alias):
                raise Exception(f"Collection {collection_name} does not exist")

            collection = Collection(name=collection_name, using=self.alias)
            collection.load()
            print_status("Collection Reload", True, f"Reloaded collection: {collection_name}")
            return collection
        except Exception as e:
            print_status("Collection Reload", False, str(e))
            raise

    def get_collection_info(self, collection_name: str = "document_embeddings"):
        """Get detailed collection information as a dictionary.

        Returns:
            Dict with keys ``name``, ``schema``, ``row_count``, ``index_info``
            and ``loaded`` (the last two are None when they cannot be
            determined), or None when the collection cannot be opened.
        """
        try:
            collection = Collection(collection_name, using=self.alias)

            stats = {
                "name": collection_name,
                "schema": collection.schema,
                "row_count": collection.num_entities,
            }

            # Index information is optional: report and continue on failure
            try:
                index_info = collection.index().params if collection.has_index() else None
                stats["index_info"] = index_info
            except Exception as e:
                stats["index_info"] = None
                print(f"Error getting index info: {e}")

            # Load state comes from the utility API
            try:
                load_status = utility.load_state(collection_name, using=self.alias)
                stats["loaded"] = str(load_status) == "Loaded"
            except Exception as e:
                stats["loaded"] = None
                print(f"Error getting load state: {e}")

            return stats
        except Exception as e:
            print_status("Get Collection Info", False, str(e))
            return None

    def inspect_collection(self, collection_name: str = "document_embeddings"):
        """Inspect collection contents and statistics.

        Prints the schema, up to five sample records and basic statistics.

        Returns:
            The collection, or None when it does not exist or cannot be opened.
        """
        try:
            if not utility.has_collection(collection_name, using=self.alias):
                raise Exception(f"Collection {collection_name} does not exist")

            collection = Collection(collection_name, using=self.alias)

            print("\nCollection Schema:")
            print(collection.schema)

            # Sample records are fetched with a vector search rather than a query
            try:
                # Make sure collection is loaded
                collection.load()

                print("\nSample Records:")
                # All-zero float query vector: used only to pull some rows back
                results = collection.search(
                    data=[[0.0] * EMBEDDING_DIM],
                    anns_field="embedding",
                    param={"metric_type": "IP", "params": {"nprobe": 10}},
                    limit=5,
                    output_fields=["text", "person_name", "document_id", "chunk_index"]
                )

                if results and results[0]:
                    for i, hit in enumerate(results[0]):
                        print(f"\nRecord {i+1}:")
                        print(f"Text: {hit.entity.get('text')[:200]}...")
                        print(f"Person: {hit.entity.get('person_name')}")
                        print(f"Document: {hit.entity.get('document_id')}")
                        print(f"Chunk Index: {hit.entity.get('chunk_index')}")
            except Exception as e:
                print(f"Error getting sample records: {e}")

            # Collection statistics
            try:
                num_entities = collection.num_entities
                print("\nCollection Statistics:")
                print(f"Total entities: {num_entities}")

                if collection.has_index():
                    index_info = collection.index().params
                    print("\nIndex Information:")
                    print(f"Index Type: {index_info}")
            except Exception as e:
                print(f"Error getting statistics: {e}")

            return collection

        except Exception as e:
            print_status("Collection Inspection", False, str(e))
            return None

# Section 6 self-check: connect, create, inspect and drop a scratch collection
def verify_section6():
    """Run MilvusManager end to end against a throwaway collection.

    Raises:
        Exception: wrapping the first failure, after it has been reported.
    """
    scratch_name = "test_collection"  # one name used for every step below
    try:
        # Connection
        milvus = MilvusManager()
        if not milvus.connected:
            raise Exception("Failed to connect to Milvus")

        # Collection creation
        if milvus.create_collection(scratch_name) is None:
            raise Exception("Failed to create test collection")

        # Summary information
        info = milvus.get_collection_info(scratch_name)
        if info:
            print_status("Collection Inspection", True,
                         f"Successfully inspected collection with {info['row_count']} records")

        # Detailed printout
        milvus.inspect_collection(scratch_name)

        # Remove the scratch collection again
        if utility.has_collection(scratch_name):
            Collection(name=scratch_name).drop()
            print_status("Test Cleanup", True, f"Dropped test collection: {scratch_name}")

        print_status("MilvusManager Verification", True, "All tests passed")
        return True
    except Exception as e:
        print_status("MilvusManager Verification", False, str(e))
        raise Exception(f"MilvusManager verification failed: {str(e)}")

# Run verification
verify_section("MilvusManager", verify_section6)




✅ SUCCESS | Milvus Connection
  └─ Cleaned up existing connection

✅ SUCCESS | Milvus Connection
  └─ Connected to milvus-standalone:19530

✅ SUCCESS | Milvus Collection
  └─ Dropped existing collection: test_collection

✅ SUCCESS | Index Creation
  └─ Created IVF_FLAT index

✅ SUCCESS | Collection Load
  └─ Loaded collection into memory

✅ SUCCESS | Milvus Collection
  └─ Created new collection: test_collection with dim=768

✅ SUCCESS | Collection Inspection
  └─ Successfully inspected collection with 0 records

Collection Schema:
{'auto_id': True, 'description': 'Document embeddings collection', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}, {'name': 'person_name', 'description': '', 'type': <DataType.VARCHAR: 21>, '

True

In [8]:
class EmbeddingGenerator:
    """Sentence-embedding wrapper around a Hugging Face encoder model.

    Texts are tokenized, run through the encoder, mean-pooled over the
    non-padding tokens and L2-normalized.
    """

    def __init__(self, model_name: str = "TurkuNLP/bert-base-finnish-cased-v1"):
        """Load tokenizer and model, placing the model on the GPU when available.

        Raises:
            Exception: re-raised after reporting when loading fails.
        """
        try:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModel.from_pretrained(model_name).to(self.device)
            # Embedding dimension comes from the model config
            self.embedding_dim = self.model.config.hidden_size
            print_status("Embedding Model", True, f"Loaded {model_name} (dim={self.embedding_dim})")
        except Exception as e:
            print_status("Embedding Model", False, str(e))
            raise

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, counting only positions where the mask is 1.

        Args:
            model_output: Model output whose first element holds the token embeddings.
            attention_mask: Mask with 1 for real tokens and 0 for padding.

        Returns:
            One pooled vector per input sequence.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp avoids division by zero for a fully masked sequence
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def generate(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: Texts to embed. An empty list yields an empty result.
            batch_size: Number of texts encoded per forward pass.

        Returns:
            Array of shape ``(len(texts), embedding_dim)`` with L2-normalized rows.

        Raises:
            ValueError: when the produced dimension differs from EMBEDDING_DIM.
            Exception: any tokenizer/model error, re-raised after reporting.
        """
        try:
            # np.concatenate() raises on an empty list, so answer the
            # empty-input case directly with a correctly shaped empty array.
            if len(texts) == 0:
                print_status("Embedding Generation", True, "Generated 0 embeddings (empty input)")
                return np.empty((0, self.embedding_dim), dtype=np.float32)

            all_embeddings = []

            # Process in batches
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]

                # Tokenize texts (inputs longer than 512 tokens are truncated)
                encoded_input = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors='pt'
                ).to(self.device)

                # Compute token embeddings without tracking gradients
                with torch.no_grad():
                    model_output = self.model(**encoded_input)

                # Pool, then normalize each vector to unit length
                sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
                sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

                all_embeddings.append(sentence_embeddings.cpu().numpy())

            result = np.concatenate(all_embeddings)

            # Guard against a model whose width differs from the global dimension
            if result.shape[1] != EMBEDDING_DIM:
                raise ValueError(f"Embedding dimension mismatch. Expected {EMBEDDING_DIM}, got {result.shape[1]}")

            print_status("Embedding Generation", True,
                    f"Generated {len(texts)} embeddings with dimension {result.shape[1]}")
            return result
        except Exception as e:
            print_status("Embedding Generation", False, str(e))
            raise

    def get_embedding_dim(self) -> int:
        """Return the embedding dimension."""
        return self.embedding_dim

# Section 7 self-check: embed one sentence and validate the result
def verify_section7():
    """Generate one embedding and check its shape, finiteness and unit norm.

    Raises:
        Exception: wrapping the first failed check, after it has been reported.
    """
    try:
        generator = EmbeddingGenerator()
        expected_dim = generator.get_embedding_dim()

        sample_texts = ["Test sentence for embedding generation"]
        vectors = generator.generate(sample_texts)

        # Shape checks
        if vectors is None:
            raise Exception("Embeddings are None")

        if vectors.ndim != 2:
            raise Exception(f"Expected 2D array, got shape {vectors.shape}")

        row_count, col_count = vectors.shape
        if row_count != len(sample_texts):
            raise Exception(f"Expected {len(sample_texts)} embeddings, got {row_count}")

        if col_count != expected_dim:
            raise Exception(f"Expected dimension {expected_dim}, got {col_count}")

        # Value checks
        if not np.isfinite(vectors).all():
            raise Exception("Embeddings contain invalid values")

        norms = np.linalg.norm(vectors, axis=1)
        if not np.allclose(norms, 1.0, atol=1e-6):
            raise Exception("Embeddings are not properly normalized")

        print_status("Embedding Verification", True,
                    f"Generated embeddings with shape {vectors.shape}, dimension={expected_dim}")
        return True

    except Exception as e:
        print_status("Embedding Verification", False, str(e))
        raise Exception(f"EmbeddingGenerator verification failed: {str(e)}")

# Run verification
verify_section("EmbeddingGenerator", verify_section7)

# Optional manual check using Finnish sample sentences
def test_embedding_generation():
    """Embed two Finnish sentences and print the resulting shape and dimension.

    Returns:
        True when embedding succeeded, False when any step raised.
    """
    finnish_samples = [
        "Tämä on testilause.",
        "Toinen testilause suomeksi."
    ]
    try:
        generator = EmbeddingGenerator()
        vectors = generator.generate(finnish_samples)
        print(f"Generated embeddings shape: {vectors.shape}")
        print(f"Embedding dimension: {generator.get_embedding_dim()}")
    except Exception as e:
        print(f"Test failed: {str(e)}")
        return False
    return True

# Run test if needed
# test_embedding_generation()



tokenizer_config.json:   0%|          | 0.00/56.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/424k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/816k [00:00<?, ?B/s]


✅ SUCCESS | Embedding Model
  └─ Loaded TurkuNLP/bert-base-finnish-cased-v1 (dim=768)

✅ SUCCESS | Embedding Generation
  └─ Generated 1 embeddings with dimension 768

✅ SUCCESS | Embedding Verification
  └─ Generated embeddings with shape (1, 768), dimension=768

✅ SUCCESS | Section EmbeddingGenerator Verification
  └─ Successfully executed


In [9]:
## section or cell 8
class RAGPipeline:
    """Retrieval-augmented generation pipeline: documents -> Milvus -> LLM answer."""

    # Stored in place of missing filename metadata. The collection schema does
    # not mark these fields nullable, so None must not be inserted.
    _MISSING_TEXT = ""
    _MISSING_AGE = -1

    def __init__(self, model_id: str = "Finnish-NLP/llama-7b-finnish-instruct-v0.2"):
        """Load the LLM and initialize the processing, embedding and Milvus components.

        NOTE: this calls MilvusManager.create_collection(), which drops an
        existing "document_embeddings" collection before recreating it.

        Raises:
            ValueError: when the embedding model's dimension differs from EMBEDDING_DIM.
            Exception: any component initialization error, re-raised after reporting.
        """
        try:
            self.setup_llm(model_id)
            print_status("LLM Setup", True, f"Loaded {model_id}")

            self.doc_processor = DocumentProcessor()
            self.embedding_generator = EmbeddingGenerator()

            # Verify embedding dimension matches the one used for the Milvus schema
            if self.embedding_generator.get_embedding_dim() != EMBEDDING_DIM:
                raise ValueError(f"Embedding dimension mismatch. Global: {EMBEDDING_DIM}, "
                               f"Generator: {self.embedding_generator.get_embedding_dim()}")

            self.milvus_manager = MilvusManager(
                host=MILVUS_HOST,
                port=MILVUS_PORT,
                alias=MILVUS_ALIAS
            )
            self.collection = self.milvus_manager.create_collection()
            print_status("RAG Pipeline", True, "All components initialized")
        except Exception as e:
            print_status("RAG Pipeline", False, str(e))
            raise

    def setup_llm(self, model_id: str):
        """Initialize the LLM with optimized settings.

        Loads the model with 4-bit NF4 quantization and wraps it in a
        text-generation pipeline stored as ``self.pipeline``.

        Raises:
            Exception: re-raised after reporting when loading fails.
        """
        try:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )

            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=bnb_config,
                torch_dtype=torch.float16,
                device_map="auto",
                max_memory={0: "6GiB"},
                offload_folder="offload"
            )

            tokenizer = AutoTokenizer.from_pretrained(model_id)
            self.pipeline = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.3,
                top_p=0.95,
                repetition_penalty=1.15
            )
            print_status("LLM Pipeline", True, "Pipeline configured successfully")
        except Exception as e:
            print_status("LLM Pipeline", False, str(e))
            raise

    def _build_entity(self, chunk, embedding):
        """Turn one processed chunk plus its embedding into a Milvus row dict."""
        meta = chunk["metadata"]
        name, age, doc_id = meta["person_name"], meta["person_age"], meta["document_id"]
        return {
            "text": chunk["text"],
            "embedding": embedding.tolist(),
            # Files whose names do not follow the expected layout carry None here
            "person_name": self._MISSING_TEXT if name is None else name,
            "person_age": self._MISSING_AGE if age is None else age,
            "document_id": self._MISSING_TEXT if doc_id is None else doc_id,
            "chunk_index": meta["chunk_index"]
        }

    def process_documents(self, folder_path: str):
        """Process all documents in the specified folder.

        Every ``.docx`` file (Word lock files starting with ``~$`` excluded) is
        chunked, embedded and inserted into the collection in batches; the
        collection is then flushed, indexed and loaded. Files are handled in
        sorted order so runs are reproducible. When no chunks are produced,
        nothing is inserted.

        Raises:
            Exception: re-raised after reporting when any step fails.
        """
        try:
            file_names = sorted(
                f for f in os.listdir(folder_path)
                if f.endswith('.docx') and not f.startswith('~$')
            )

            all_chunks = []
            for file in tqdm(file_names, desc="Processing documents"):
                file_path = os.path.join(folder_path, file)
                all_chunks.extend(self.doc_processor.process_document(file_path))

            # Nothing to embed: stop before the embedding/insert steps
            if not all_chunks:
                print_status("Document Processing", True,
                             f"No .docx chunks found in {folder_path}; nothing inserted")
                return

            # Generate embeddings
            texts = [chunk["text"] for chunk in all_chunks]
            embeddings = self.embedding_generator.generate(texts)

            # One row dict per chunk
            entities = [
                self._build_entity(chunk, embedding)
                for chunk, embedding in zip(all_chunks, embeddings)
            ]

            # Insert in small batches
            batch_size = 100
            for start in range(0, len(entities), batch_size):
                self.collection.insert(entities[start:start + batch_size])

            self.collection.flush()
            self.milvus_manager.create_and_load_index(self.collection)
            print_status("Document Processing", True, f"Inserted {len(texts)} chunks into Milvus")
        except Exception as e:
            print_status("Document Processing", False, str(e))
            raise

    def query(self, question: str, top_k: int = 3):
        """Query the system with a question.

        Args:
            question: Question text (the prompt is written in Finnish).
            top_k: Number of chunks retrieved from Milvus as context.

        Returns:
            Dict with ``"answer"`` (generated text) and ``"sources"`` (one dict
            per retrieved chunk with ``text``, ``person_name``, ``document_id``).

        Raises:
            Exception: re-raised after reporting when retrieval or generation fails.
        """
        try:
            # Ensure collection is loaded
            self.collection = self.milvus_manager.reload_collection()

            # Generate question embedding
            question_embedding = self.embedding_generator.generate([question])[0]

            # Search in Milvus
            search_params = {"metric_type": "IP", "params": {"nprobe": 10}}
            results = self.collection.search(
                data=[question_embedding.tolist()],
                anns_field="embedding",
                param=search_params,
                limit=top_k,
                output_fields=["text", "person_name", "document_id"]
            )

            # Collect the hits once; they feed both the prompt and the result
            sources = [
                {
                    "text": hit.entity.get('text'),
                    "person_name": hit.entity.get('person_name'),
                    "document_id": hit.entity.get('document_id')
                }
                for hit in results[0]
            ]

            # Format context with clear document separation
            context = "\n".join(
                f"[Dokumentti {i+1}]\nID: {src['document_id']}\n"
                f"Henkilö: {src['person_name']}\nTeksti: {src['text']}\n"
                for i, src in enumerate(sources)
            )

            # Built line by line so source-code indentation does not leak into
            # the text sent to the model.
            prompt = (
                f"Kysymys: {question}\n"
                "\n"
                "Konteksti:\n"
                f"{context}\n"
                "\n"
                "Ohje: Etsi suora vastaus annetusta kontekstista. Käytä tarkkoja lainauksia.\n"
                "\n"
                "Jos löydät suoran vastauksen:\n"
                "1. Kerro ensin mistä dokumentista löysit vastauksen\n"
                "2. Lainaa tekstiä sanatarkasti\n"
                "\n"
                "Jos et löydä vastausta:\n"
                '- Vastaa vain: "En löydä suoraa vastausta annetusta kontekstista"\n'
                "\n"
                "Vastaus:"
            )

            # Keep only what follows the last "Vastaus:" marker
            response = self.pipeline(prompt)[0]["generated_text"].split("Vastaus:")[-1].strip()

            # Clean up any extra content after the answer
            if "*END*" in response:
                response = response.split("*END*")[0].strip()

            if "Ohje:" in response:
                response = response.split("Ohje:")[0].strip()

            print_status("Query", True, "Generated response successfully")
            return {
                "answer": response,
                "sources": sources
            }
        except Exception as e:
            print_status("Query", False, str(e))
            raise
# Section 8 check: a RAGPipeline can be constructed and reports its components
def verify_section8():
    """Build a fresh RAGPipeline and confirm that it initialized.

    Returns:
        True when the new instance has a ``pipeline`` attribute and its
        ``collection`` attribute is truthy.

    Raises:
        Exception: if either of those conditions does not hold. Anything
            raised while constructing ``RAGPipeline`` propagates unchanged.
    """
    rag = RAGPipeline()
    # ``and`` short-circuits, so ``collection`` is only read once the
    # ``pipeline`` attribute is known to exist.
    initialized = hasattr(rag, 'pipeline') and rag.collection
    if not initialized:
        raise Exception("RAG Pipeline initialization failed")
    return True

# Run the section 8 check through verify_section, which prints a status line
# and returns a bool. The first argument is only interpolated into that status
# label (a string is passed here although the parameter is annotated as int).
verify_section("RAGPipeline", verify_section8)



config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.04G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.99M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.



✅ SUCCESS | LLM Pipeline
  └─ Pipeline configured successfully

✅ SUCCESS | LLM Setup
  └─ Loaded Finnish-NLP/llama-7b-finnish-instruct-v0.2

✅ SUCCESS | Document Processor
  └─ Initialized successfully

✅ SUCCESS | Embedding Model
  └─ Loaded TurkuNLP/bert-base-finnish-cased-v1 (dim=768)

✅ SUCCESS | Milvus Connection
  └─ Cleaned up existing connection

✅ SUCCESS | Milvus Connection
  └─ Connected to milvus-standalone:19530

✅ SUCCESS | Milvus Collection
  └─ Dropped existing collection: document_embeddings

✅ SUCCESS | Index Creation
  └─ Created IVF_FLAT index

✅ SUCCESS | Collection Load
  └─ Loaded collection into memory

✅ SUCCESS | Milvus Collection
  └─ Created new collection: document_embeddings with dim=768

✅ SUCCESS | RAG Pipeline
  └─ All components initialized

✅ SUCCESS | Section RAGPipeline Verification
  └─ Successfully executed


True

In [10]:
# Data inspection helper: report what Milvus currently holds
def test_data_inspection():
    """Print a report for every collection returned by ``utility.list_collections``.

    A new ``MilvusManager`` is created, the collection names are listed, and
    for each name the manager's ``get_collection_info`` summary (when it is
    truthy) and ``inspect_collection`` output are printed.

    Returns:
        None. All results are written to stdout.

    Note:
        Any exception is caught and printed as ``Error during inspection: ...``
        instead of being raised.
    """
    try:
        milvus = MilvusManager()

        print("Available collections:")
        names = utility.list_collections()
        print(names)

        # Nothing to inspect: say so and stop early.
        if not names:
            print("No collections found")
            return

        for name in names:
            # Banner separating one collection's report from the next
            print(f"\n{'='*50}")
            print(f"Inspecting collection: {name}")
            print(f"{'='*50}")

            # High-level summary; skipped when the manager returns nothing
            summary = milvus.get_collection_info(name)
            if summary:
                print("\nCollection Info:")
                print(f"Name: {summary['name']}")
                print(f"Row Count: {summary['row_count']}")
                print(f"Loaded: {summary['loaded']}")
                if summary['index_info']:
                    print(f"Index Info: {summary['index_info']}")

            # The detailed dump runs whether or not a summary was available
            print("\nDetailed Inspection:")
            milvus.inspect_collection(name)

    except Exception as e:
        print(f"Error during inspection: {e}")

# Run the inspection right away so this cell's output shows the collections
# that currently exist; the function prints its report and catches its own errors.
test_data_inspection()




✅ SUCCESS | Milvus Connection
  └─ Cleaned up existing connection

✅ SUCCESS | Milvus Connection
  └─ Connected to milvus-standalone:19530
Available collections:
['document_embeddings']

Inspecting collection: document_embeddings

Collection Info:
Name: document_embeddings
Row Count: 0
Loaded: True
Index Info: {'metric_type': 'IP', 'index_type': 'IVF_FLAT', 'params': {'nlist': 1024}}

Detailed Inspection:

Collection Schema:
{'auto_id': True, 'description': 'Document embeddings collection', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}, {'name': 'person_name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}, {'name': 'person_age', 'description': '', 'type': <DataType.INT64: 5>}, {'name

In [11]:
def main(folder_path: str = '/home/jovyan/work/notebooks/data/', questions=None):
    """Run the end-to-end demo: ingest documents, inspect Milvus, answer questions.

    Steps, in order: check CUDA, ensure the stopwords are downloaded, build a
    ``RAGPipeline``, process every document in ``folder_path``, print the data
    inspection report, then run each question through ``rag.query`` and print
    the answer with the first 100 characters of each source text.

    Args:
        folder_path: Folder passed to ``RAGPipeline.process_documents``.
            Defaults to the path that used to be hard-coded here.
        questions: Iterable of question strings to ask. When ``None`` the
            three built-in Finnish example questions are used.

    Returns:
        None. Progress and results are printed.

    Raises:
        Exception: anything raised during setup, ingestion or inspection is
            reported through ``print_status`` and re-raised. A failure of an
            individual query is reported and counted but does not abort the
            remaining queries.
    """
    if questions is None:
        questions = [
            "Onko Marjatta Eilan ystävä?",
            "Miten Sulo kokee sosiaalisen kanssakäymisen merkityksen?",
            "Montako sisarusta Sulolla on?"
        ]
    else:
        # Accept any iterable; a list is needed for len() in the progress line.
        questions = list(questions)

    try:
        # Check CUDA
        check_cuda()

        # Download stopwords
        ensure_stopwords_downloaded()

        # Initialize the RAG pipeline
        rag = RAGPipeline()

        # Process documents
        print_status("Document Path", True, f"Using folder: {folder_path}")
        rag.process_documents(folder_path)

        # Run data inspection after processing documents
        print("\nInspecting processed data:")
        test_data_inspection()

        failed_queries = 0
        for i, question in enumerate(questions, 1):
            print(f"\nProcessing Query {i}/{len(questions)}")
            try:
                result = rag.query(question)
                print_status(f"Query {i}", True, f"Question: {question}")
                print(f"Answer: {result['answer']}")
                print("\nSources:")
                for source in result['sources']:
                    print(f"- {source['document_id']}: {source['text'][:100]}...")
            except Exception as e:
                # Keep going with the next question, but remember the failure
                # so the final status does not claim everything succeeded.
                failed_queries += 1
                print_status(f"Query {i}", False, f"Failed to process question: {str(e)}")

        if failed_queries:
            print_status(
                "Main Execution",
                False,
                f"{failed_queries} of {len(questions)} queries failed",
            )
        else:
            print_status("Main Execution", True, "All operations completed successfully")

    except Exception as e:
        print_status("Main Execution", False, str(e))
        raise

# Section 9 check: the notebook's entry point exists and can be called
def verify_section9():
    """Confirm that a callable named ``main`` is defined at module level.

    Returns:
        True when ``main`` is present in this module's globals and is callable.

    Raises:
        Exception: if ``main`` is missing from the globals or is not callable.
    """
    module_scope = globals()
    if 'main' in module_scope and callable(module_scope['main']):
        return True
    raise Exception("Main function not properly defined")

# Run the section 9 check through verify_section, which prints a status line
# and returns a bool; the first argument is only used as the label in that line.
verify_section("Main Function", verify_section9)
# Entry-point guard: main() runs only when this code executes as the top-level
# module. NOTE(review): presumably always true inside a notebook kernel, so
# executing this cell starts the full pipeline — confirm.
if __name__ == "__main__":
    main()


✅ SUCCESS | Section Main Function Verification
  └─ Successfully executed

✅ SUCCESS | CUDA Check
  └─ Using GPU: NVIDIA GeForce RTX 2070 SUPER

✅ SUCCESS | NLTK Setup
  └─ Downloaded finnish stopwords


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


✅ SUCCESS | LLM Pipeline
  └─ Pipeline configured successfully

✅ SUCCESS | LLM Setup
  └─ Loaded Finnish-NLP/llama-7b-finnish-instruct-v0.2

✅ SUCCESS | Document Processor
  └─ Initialized successfully

✅ SUCCESS | Embedding Model
  └─ Loaded TurkuNLP/bert-base-finnish-cased-v1 (dim=768)

✅ SUCCESS | Milvus Connection
  └─ Cleaned up existing connection

✅ SUCCESS | Milvus Connection
  └─ Connected to milvus-standalone:19530

✅ SUCCESS | Milvus Collection
  └─ Dropped existing collection: document_embeddings

✅ SUCCESS | Index Creation
  └─ Created IVF_FLAT index

✅ SUCCESS | Collection Load
  └─ Loaded collection into memory

✅ SUCCESS | Milvus Collection
  └─ Created new collection: document_embeddings with dim=768

✅ SUCCESS | RAG Pipeline
  └─ All components initialized

✅ SUCCESS | Document Path
  └─ Using folder: /home/jovyan/work/notebooks/data/


Processing documents:   0%|          | 0/2 [00:00<?, ?it/s]


✅ SUCCESS | Document Processing
  └─ Processed Eila 81v SH-4.docx

✅ SUCCESS | Document Processing
  └─ Processed Sulo 75v C5-50.docx

✅ SUCCESS | Embedding Generation
  └─ Generated 8 embeddings with dimension 768


INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp5ctf405t
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp5ctf405t/_remote_module_non_scriptable.py



✅ SUCCESS | Index Creation
  └─ Created IVF_FLAT index

✅ SUCCESS | Collection Load
  └─ Loaded collection into memory

✅ SUCCESS | Document Processing
  └─ Inserted 8 chunks into Milvus

Inspecting processed data:

✅ SUCCESS | Milvus Connection
  └─ Cleaned up existing connection

✅ SUCCESS | Milvus Connection
  └─ Connected to milvus-standalone:19530
Available collections:
['document_embeddings']

Inspecting collection: document_embeddings

Collection Info:
Name: document_embeddings
Row Count: 8
Loaded: True
Index Info: {'index_type': 'IVF_FLAT', 'params': {'nlist': 1024}, 'metric_type': 'IP'}

Detailed Inspection:

Collection Schema:
{'auto_id': True, 'description': 'Document embeddings collection', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 