In [22]:
## setup and initialization
import os
import re
import nltk
from nltk.corpus import stopwords
from pymilvus import connections, Collection, utility, CollectionSchema, FieldSchema, DataType
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import itertools
from docx import Document
import logging
import requests
import json

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)



In [23]:
# Download the Finnish STOP words
# Ensure stopwords are downloaded
def ensure_stopwords_downloaded(language='finnish'):
    """Make sure the NLTK stopword list for ``language`` can be read.

    The list is looked up first; when NLTK raises ``LookupError`` the
    ``stopwords`` corpus is downloaded.

    Args:
        language: Name of the stopword list to check (default ``'finnish'``).

    Returns:
        bool: True when the list was already present or the download
        succeeded, False when the download reported a failure.
    """
    label = language.capitalize()
    try:
        stopwords.words(language)
        print(f"{label} stopwords are already downloaded.")
    except LookupError:
        print(f"{label} stopwords not found. Downloading...")
        # nltk.download signals failure through its return value rather than
        # an exception, so the result has to be checked before claiming success.
        if not nltk.download('stopwords'):
            print(f"Failed to download {label} stopwords.")
            return False
        print(f"{label} stopwords downloaded successfully.")
    print("Stopwords are ready to use.")
    return True

ensure_stopwords_downloaded('finnish')
finnish_stopwords = stopwords.words('finnish')
print("Finnish stopwords setup complete.")



Finnish stopwords are already downloaded.
Stopwords are ready to use.
Finnish stopwords setup complete.


In [24]:
# Connect to Milvus
# Connection settings read by connect_milvus() in this cell.
MILVUS_HOST = "milvus-standalone"  # host passed to connections.connect
MILVUS_PORT = "19530"  # port, kept as a string
MILVUS_ALIAS = "default"  # alias under which the connection is registered

def connect_milvus():
    """Open the Milvus connection described by the MILVUS_* settings.

    Returns:
        bool: True when ``connections.connect`` completed, False when it
        raised (the error is printed, not re-raised).
    """
    try:
        connections.connect(alias=MILVUS_ALIAS, host=MILVUS_HOST, port=MILVUS_PORT)
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")
        return False
    print(f"Connected to Milvus at {MILVUS_HOST}:{MILVUS_PORT}")
    return True

# Announce success only when connect_milvus() actually reports it, so a failed
# connection is not followed by a misleading "established" message.
if connect_milvus():
    print("Milvus connection established.")



Connected to Milvus at milvus-standalone:19530
Milvus connection established.


In [25]:
# Create document schema
def create_document_schema():
    """Build the Milvus collection schema for document chunks.

    Field order is doc_id, chunk_id (auto-generated INT64 primary key),
    embedding (384-dim float vector), text, person_name, person_age and
    document_id.

    Returns:
        CollectionSchema: Schema holding the seven fields above.
    """
    varchar = DataType.VARCHAR
    int64 = DataType.INT64

    doc_id_field = FieldSchema(name="doc_id", dtype=varchar, max_length=100)
    chunk_id_field = FieldSchema(name="chunk_id", dtype=int64, is_primary=True, auto_id=True)
    embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384)
    text_field = FieldSchema(name="text", dtype=varchar, max_length=65535)
    person_name_field = FieldSchema(name="person_name", dtype=varchar, max_length=100)
    person_age_field = FieldSchema(name="person_age", dtype=int64)
    document_id_field = FieldSchema(name="document_id", dtype=varchar, max_length=20)

    return CollectionSchema(
        fields=[
            doc_id_field,
            chunk_id_field,
            embedding_field,
            text_field,
            person_name_field,
            person_age_field,
            document_id_field,
        ],
        description="Document embeddings with person metadata",
    )

print("Document schema created.")



Document schema created.


In [26]:
# Create document collection
def create_document_collection():
    """Return the "document_embeddings" collection, creating it if missing.

    The schema is always built first; it is only passed to ``Collection``
    when ``utility.has_collection`` reports that the collection does not exist.

    Returns:
        Collection: Handle to the new or existing collection.
    """
    schema = create_document_schema()
    collection_name = "document_embeddings"

    if utility.has_collection(collection_name):
        existing = Collection(name=collection_name)
        print(f"Collection '{collection_name}' already exists.")
        return existing

    created = Collection(name=collection_name, schema=schema)
    print(f"Collection '{collection_name}' created!")
    return created

collection = create_document_collection()
print("Document collection created and ready for use.")



Collection 'document_embeddings' created!
Document collection created and ready for use.


In [27]:
# Function to extract metadata from the filename
def extract_metadata_from_filename(filename):
    """Parse "<name> <age>v <document id>" metadata out of a file name.

    The extension is dropped before matching. The name part accepts any
    Unicode letters (so Finnish names containing ä/ö/å match) and optional
    hyphenated parts such as "Anna-Liisa".

    Args:
        filename: File name such as ``"Eila 81v SH-4.docx"``.

    Returns:
        tuple: ``(name, age, doc_id)`` with ``age`` as int, or
        ``(None, None, None)`` when the name does not match or an error occurs.
    """
    try:
        title = os.path.splitext(filename)[0]
        # [^\W\d_] is "a word character that is neither a digit nor underscore",
        # i.e. a letter in any script; plain [A-Za-z] rejected names like "Päivi".
        match = re.match(
            r'([^\W\d_]+(?:-[^\W\d_]+)*)\s+(\d{1,3})v\s+([A-Za-z0-9\-]+)',
            title,
        )
        if match:
            name = match.group(1)
            age = int(match.group(2))
            doc_id = match.group(3)
            print(f"Metadata extracted: Name = {name}, Age = {age}, Document ID = {doc_id}")
            return name, age, doc_id
        else:
            print(f"Failed to extract metadata from: {filename}")
            return None, None, None
    except Exception as e:
        print(f"Error while extracting metadata: {e}")
        return None, None, None

# Function to extract text from a .docx file
def extract_text_from_docx(file_path):
    """Read a .docx file and return its paragraph texts joined by newlines.

    Args:
        file_path: Path handed to ``Document``.

    Returns:
        str: The joined paragraph text, or "" when any step raises
        (the error is printed).
    """
    try:
        paragraphs = Document(file_path).paragraphs
        joined = "\n".join(paragraph.text for paragraph in paragraphs)
        print(f"Text extracted from {file_path}")
    except Exception as err:
        print(f"Failed to extract text: {err}")
        return ""
    return joined

print("Text processing and metadata extraction functions set up.")



Text processing and metadata extraction functions set up.


In [28]:
# Text preprocessing (lowercasing, collapsing non-word characters) and word-based chunking
def preprocess_text(text):
    """Lowercase ``text`` and replace each run of non-word characters with a space.

    Args:
        text: String to normalise.

    Returns:
        The normalised string. If a step raises, the error is printed and
        ``text`` is returned in whatever state it had reached.
    """
    non_word_run = r'\W+'
    try:
        text = text.lower()
        return re.sub(non_word_run, ' ', text)
    except Exception as err:
        print(f"Failed to preprocess text: {err}")
        return text

def chunk_text(text, chunk_size=512):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whitespace-separated tokens from ``str.split``; each chunk is
    re-joined with single spaces.

    Args:
        text: String to split.
        chunk_size: Maximum number of words per chunk (default 512).

    Returns:
        list[str]: The chunks in order, or [] when an error occurs
        (the error is printed).
    """
    try:
        words = text.split()
        chunks = []
        for start in range(0, len(words), chunk_size):
            window = words[start:start + chunk_size]
            chunks.append(' '.join(window))
        print(f"Text split into {len(chunks)} chunks.")
        print("Text chunking successful.")
        return chunks
    except Exception as err:
        print(f"Failed to chunk text: {err}")
        return []




In [29]:
# Load embedding model and tokenizer
# Both are fetched by name through the transformers Auto* classes and kept as
# module-level globals; the embedding functions defined in later cells read them.
EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
# Put the model in evaluation mode; this notebook only runs it under torch.no_grad().
embedding_model.eval()
# NOTE(review): the Milvus schema declares dim=384 for the embedding field;
# presumably that matches this model's output size — confirm when changing models.
logger.info("Embedding model and tokenizer loaded successfully.")
print("Embedding model and tokenizer loaded.")



INFO:__main__:Embedding model and tokenizer loaded successfully.


Embedding model and tokenizer loaded.


In [30]:
# Generate embeddings
def batched(iterable, n):
    """Yield successive lists of up to ``n`` items taken from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        group = list(itertools.islice(source, n))
        if not group:
            return
        yield group

def generate_embeddings_local(texts, max_batch_size=32):
    """Embed ``texts`` with the module-level tokenizer and model.

    Each batch is tokenised with padding and truncation to 512 tokens, run
    through the model without gradients, mean-pooled over the tokens that
    the attention mask marks as real, and L2-normalised.

    Args:
        texts: Iterable of strings to embed.
        max_batch_size: Maximum number of texts per forward pass (default 32).

    Returns:
        np.ndarray | None: float32 array with one row per input text, an
        empty ``(0, hidden_size)`` array for empty input, or None when
        embedding fails (the error is logged).
    """
    try:
        texts = list(texts)
        if not texts:
            # Nothing to embed; avoid torch.cat([]) raising and being reported as a failure.
            return np.empty((0, embedding_model.config.hidden_size), dtype=np.float32)

        embeddings = []
        for batch in batched(texts, max_batch_size):
            inputs = embedding_tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            with torch.no_grad():
                outputs = embedding_model(**inputs)
            token_states = outputs.last_hidden_state
            # Weight padding positions by zero so that a text's embedding does not
            # depend on how much padding its batch neighbours forced onto it.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            batch_embeddings = (token_states * mask).sum(dim=1) / token_counts
            batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
            embeddings.append(batch_embeddings.cpu())
        embeddings = torch.cat(embeddings)
        return embeddings.numpy().astype(np.float32)
    except Exception as e:
        logger.error(f"Failed to generate embeddings: {e}")
        return None
print("Embedding generation function ready.")



Embedding generation function ready.


In [31]:
# Insert data into Milvus
def insert_data_with_metadata(collection, doc_ids, embeddings, texts, person_name, person_age, document_id):
    """Insert one document's chunks into ``collection`` and flush.

    Columns are sent in the order doc_ids, embeddings, texts, person_name,
    person_age, document_id; the three scalar metadata values are repeated
    once per chunk.

    Args:
        collection: Object exposing ``insert(columns)`` and ``flush()``.
        doc_ids: Per-chunk identifiers.
        embeddings: Array with one row per chunk (``.tolist()`` is called on it),
            or None when embedding failed.
        texts: Per-chunk texts.
        person_name: Name stored with every chunk.
        person_age: Age stored with every chunk.
        document_id: Document identifier stored with every chunk.

    Raises:
        ValueError: If ``doc_ids``, ``embeddings`` and ``texts`` differ in length.
    """
    row_count = len(doc_ids)
    if embeddings is None or row_count == 0:
        # Nothing usable to store; skip instead of failing on None.tolist()
        # or sending empty columns.
        print("No rows to insert; skipping Milvus insert.")
        return
    if len(embeddings) != row_count or len(texts) != row_count:
        raise ValueError(
            f"Column length mismatch: {row_count} doc_ids, "
            f"{len(embeddings)} embeddings, {len(texts)} texts"
        )

    collection.insert([
        doc_ids,
        embeddings.tolist(),
        texts,
        [person_name] * row_count,
        [person_age] * row_count,
        [document_id] * row_count,
    ])
    collection.flush()

# Create and load index after data insertion
def create_and_load_index(collection):
    """Create an IVF_FLAT inner-product index on "embedding", then load the collection.

    Args:
        collection: Object exposing ``create_index`` and ``load``.
    """
    ivf_flat_ip = dict(
        index_type="IVF_FLAT",
        metric_type="IP",
        params=dict(nlist=1024),
    )
    collection.create_index(field_name="embedding", index_params=ivf_flat_ip)
    collection.load()

print("Data insertion and indexing functions are ready.")



Data insertion and indexing functions are ready.


In [32]:
# Process and insert documents
folder_path = '/home/jovyan/work/notebooks/data/'
# sorted() gives a reproducible processing order; os.listdir returns entries in arbitrary order.
file_paths = sorted(f for f in os.listdir(folder_path) if f.endswith('.docx'))

# NOTE(review): chunk_id is auto-generated, so re-running this cell inserts the
# same chunks again rather than replacing them.
for file in file_paths:
    name, age, doc_id = extract_metadata_from_filename(file)
    if doc_id is None:
        # Without metadata the row would carry None values and a "None_chunk_N" id.
        print(f"Skipping {file}: metadata could not be extracted from the filename.")
        continue

    file_path = os.path.join(folder_path, file)
    extracted_text = extract_text_from_docx(file_path)
    cleaned_text = preprocess_text(extracted_text)
    chunks = chunk_text(cleaned_text)
    if not chunks:
        print(f"Skipping {file}: no text to embed.")
        continue

    embeddings = generate_embeddings_local(chunks)
    if embeddings is None or len(embeddings) != len(chunks):
        print(f"Skipping {file}: embedding generation failed.")
        continue

    doc_ids = [f"{doc_id}_chunk_{i+1}" for i in range(len(chunks))]
    insert_data_with_metadata(collection, doc_ids, embeddings, chunks, name, age, doc_id)

create_and_load_index(collection)
print("Data processing and insertion completed.")



Metadata extracted: Name = Eila, Age = 81, Document ID = SH-4
Text extracted from /home/jovyan/work/notebooks/data/Eila 81v SH-4.docx
Text split into 1 chunks.
Text chunking successful.
Metadata extracted: Name = Sulo, Age = 75, Document ID = C5-50
Text extracted from /home/jovyan/work/notebooks/data/Sulo 75v C5-50.docx
Text split into 1 chunks.
Text chunking successful.
Data processing and insertion completed.


In [34]:
# Check collection entities
def check_collection_entities(collection_name="document_embeddings"):
    """Print and return the ``num_entities`` value of the named collection.

    Args:
        collection_name: Collection to inspect (default "document_embeddings").

    Returns:
        The entity count reported by the collection.
    """
    entity_count = Collection(name=collection_name).num_entities
    print(f"Collection '{collection_name}' contains {entity_count} entities.")
    return entity_count

check_collection_entities()
print("Collection entities checked.")



Collection 'document_embeddings' contains 2 entities.
Collection entities checked.


In [35]:
## Querying and search functions
OLLAMA_URL = "http://ollama:11434/api/generate"

## llama3.2 was tested and works reasonably well, but its output needs further evaluation.
## Example output for llama3.2: TODO (none recorded yet)
##
## Currently testing mistral:7b.

def query_ollama(prompt, model="mistral:7b", timeout=(10, 600)):
    """Send ``prompt`` to the Ollama generate endpoint and collect the streamed answer.

    The response is read line by line; each non-empty line is parsed as JSON
    and its "response" field is appended to the result.

    Args:
        prompt: Prompt text to send.
        model: Ollama model name (default "mistral:7b").
        timeout: ``(connect, read)`` timeout in seconds passed to ``requests.post``,
            so a dead or stalled server cannot block the notebook indefinitely.

    Returns:
        str | None: The concatenated response text, or None when the request
        fails or the server answers with a non-200 status.
    """
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "prompt": prompt
    }

    try:
        with requests.post(OLLAMA_URL, headers=headers, json=data, stream=True, timeout=timeout) as response:
            # Decode as UTF-8 explicitly so non-ASCII (Finnish) text does not
            # depend on the charset the server happens to declare.
            response.encoding = "utf-8"
            if response.status_code != 200:
                print(f"Error querying Ollama: {response.status_code}, {response.text}")
                return None

            pieces = []
            for chunk in response.iter_lines(decode_unicode=True):
                if not chunk:  # Filter out keep-alive new chunks
                    continue
                try:
                    chunk_data = json.loads(chunk)
                except json.JSONDecodeError:
                    print(f"Failed to parse chunk: {chunk}")
                    continue
                if chunk_data.get("error"):
                    # The server can report an error in the middle of a stream.
                    print(f"Error querying Ollama: {chunk_data['error']}")
                    break
                pieces.append(chunk_data.get("response", ""))
            return "".join(pieces)
    except requests.RequestException as e:
        # Connection failures and timeouts follow the same None contract as HTTP errors.
        print(f"Error querying Ollama: {e}")
        return None

def generate_query_embedding(query):
    """Embed a query with the module-level tokenizer and model.

    The query is tokenised (padding and truncation to 512 tokens), run
    through the model without gradients, mean-pooled over the tokens that
    the attention mask marks as real, and L2-normalised. For a single string
    there is no padding, so the result equals a plain mean over all tokens;
    the mask only matters when a list of differently sized queries is passed.

    Args:
        query: Query string (or list of strings) accepted by the tokenizer.

    Returns:
        np.ndarray: float32 array of normalised embeddings, one row per query.
    """
    inputs = embedding_tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    token_states = outputs.last_hidden_state
    # Zero out padding positions before averaging.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    query_embedding = (token_states * mask).sum(dim=1) / token_counts
    query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
    return query_embedding.cpu().numpy().astype(np.float32)

def search_similar_documents(query_embedding, collection_name="document_embeddings", top_k=5):
    """Search the collection for the chunks closest to ``query_embedding``.

    Runs an inner-product search on the "embedding" field, prints score and
    metadata plus the first 300 characters of each hit for the first query
    vector, and collects the full texts.

    Args:
        query_embedding: One query vector (1-D) or a 2-D array of query vectors;
            only the hits of the first vector are returned.
        collection_name: Collection to search (default "document_embeddings").
        top_k: Maximum number of hits to request (default 5).

    Returns:
        list[str] | None: Texts of the hits, or None when there are no hits
        or the search raises (the error is printed).
    """
    query_vectors = np.asarray(query_embedding, dtype=np.float32)
    if query_vectors.ndim == 1:
        # The search call takes a list of vectors; wrap a single vector in a batch of one.
        query_vectors = query_vectors.reshape(1, -1)
    collection = Collection(name=collection_name)

    search_params = {"metric_type": "IP", "params": {"nprobe": 10}}

    try:
        results = collection.search(
            data=query_vectors.tolist(),
            anns_field="embedding",
            param=search_params,
            limit=top_k,
            output_fields=["text", "person_name", "person_age", "document_id"]
        )

        if not results or len(results[0]) == 0:
            print("No relevant documents found in Milvus.")
            return None

        relevant_texts = []
        for result in results[0]:
            entity = result.entity
            document_id = entity.get("document_id")
            # A missing text must not abort the remaining hits.
            text = entity.get("text") or ""
            text_snippet = text[:300]
            person_name = entity.get("person_name")
            person_age = entity.get("person_age")
            print(f"Score: {result.score}, Document ID: {document_id}, Name: {person_name}, Age: {person_age}")
            print(f"Text: {text_snippet}...\n")
            relevant_texts.append(text)

        return relevant_texts
    except Exception as e:
        print(f"Error during search: {e}")
        return None
print("Search functions are ready.")



Search functions are ready.


In [39]:
# Function to generate query embedding
def get_query_embedding(query):
    """Generate the embedding for ``query`` and report whether it is usable.

    Args:
        query: Query text passed to ``generate_query_embedding``.

    Returns:
        Whatever ``generate_query_embedding`` returned, unchanged.
    """
    vector = generate_query_embedding(query)
    usable = vector is not None and vector.size > 0  # non-empty array
    message = (
        f"Query embedding for '{query}' generated successfully."
        if usable
        else f"Failed to generate query embedding for '{query}'."
    )
    print(message)
    return vector

# Function to search Milvus
def search_milvus(query_embedding, collection_name="document_embeddings", top_k=5):
    """Retrieve matching texts and merge them into one context string.

    Args:
        query_embedding: Query vector(s) forwarded to ``search_similar_documents``.
        collection_name: Collection to search (default "document_embeddings").
        top_k: Maximum number of hits (default 5).

    Returns:
        str | None: The hit texts joined by single spaces and stripped (may be
        empty), or None when the search produced no results.
    """
    hits = search_similar_documents(query_embedding, collection_name, top_k)
    if not hits:
        print("No top results found in Milvus for the query.")
        return None

    merged_context = " ".join(hits).strip()
    if not merged_context:
        print("No relevant context found in Milvus.")
    return merged_context

# Function to query Ollama with retrieved context
def query_ollama_with_context(query, context):
    """Ask the LLM to answer ``query`` using only ``context``.

    Fills the Finnish prompt template, prints it, and sends it through
    ``query_ollama``.

    Args:
        query: The user's question.
        context: Retrieved text placed in the prompt.

    Returns:
        str: The model's answer, or a fixed "I don't know" sentence when
        ``query_ollama`` returns nothing.
    """
    prompt_template = """
    Sinulle on annettu seuraava teksti:

    {context}

    Vastaa seuraavaan kysymykseen perustuen yllä olevaan tekstiin:

    {question}
    """
    prompt = prompt_template.format(question=query, context=context)
    print(f"Prompt sent to LLM:\n{prompt}")

    answer = query_ollama(prompt)
    if not answer:
        print(f"Failed to get response from Ollama for '{query}', returning fallback.")
        return "I don't know, the information is not provided in the documents."

    print(f"Ollama response for '{query}' received successfully.")
    return answer


## Main function to process the entire query pipeline
def process_query(question):
    """Run the full pipeline for ``question``: embed, retrieve, ask the LLM.

    All results are printed; nothing is returned.

    Args:
        question: The user's question text.
    """
    # Step 1: embed the question; stop early when no usable vector came back.
    query_embedding = get_query_embedding(question)
    if query_embedding is None or query_embedding.size <= 0:
        print("Failed to generate a valid query embedding. Returning fallback response.")
        return

    # Step 2: fetch context from Milvus; without context the fallback is printed.
    relevant_texts = search_milvus(query_embedding)
    if not relevant_texts:
        print("No relevant context found for the query. Returning fallback response.")
        print("Ollama Response: I don't know, the information is not provided in the documents.")
        return

    # Step 3: let the LLM answer from the retrieved context.
    ollama_response = query_ollama_with_context(question, relevant_texts)
    print(f"Ollama Response: {ollama_response}")


        

In [40]:
# Example 3: where does the 81-year-old woman live?
question_3 = "Missä 81-vuotias nainen asuu?"
process_query(question_3)


Query embedding for 'Missä 81-vuotias nainen asuu?' generated successfully.
Score: 0.20528644323349, Document ID: C5-50, Name: Sulo, Age: 75
Text: tunnistetieto c5 50 haastattelun päivämäärä 22 6 23 syntymävuosi 1948 75 sukupuoli mies kansalaisuus suomi maakunta keski pohjanmaa talotyyppi muu tuettu palveluasuminen asiasanat trauma sosiaalisuus arjen kuvaus oikeanpuoleinen heikkouteni ja jäykkyyteni hankaloittaa toimintaani mutta pystyn itse k...

Score: 0.15264736115932465, Document ID: SH-4, Name: Eila, Age: 81
Text: syntymävuosi 1942 81v sukupuoli nainen kansalaisuus suomi maakunta uusimaa talotyyppi ryhmäkoti omannäköinen arki arjessa teen milloin mitäkin arjen kohokohtia ovat tapaamiset omaisten kanssa toivon että omaiset pitävät hyvänä minua saan nimenomaan elää omannäköistä elämää ja päättää omista asioista...

Prompt sent to LLM:

    Sinulle on annettu seuraava teksti:

    tunnistetieto c5 50 haastattelun päivämäärä 22 6 23 syntymävuosi 1948 75 sukupuoli mies kansalaisuus suo

In [41]:
# Example 1: what is the nationality of the 81-year-old woman?
question_1 = "Mikä on 81-vuotiaan naisen kansalaisuus?"
process_query(question_1)

Query embedding for 'Mikä on 81-vuotiaan naisen kansalaisuus?' generated successfully.
Score: 0.2212529480457306, Document ID: C5-50, Name: Sulo, Age: 75
Text: tunnistetieto c5 50 haastattelun päivämäärä 22 6 23 syntymävuosi 1948 75 sukupuoli mies kansalaisuus suomi maakunta keski pohjanmaa talotyyppi muu tuettu palveluasuminen asiasanat trauma sosiaalisuus arjen kuvaus oikeanpuoleinen heikkouteni ja jäykkyyteni hankaloittaa toimintaani mutta pystyn itse k...

Score: 0.11063401401042938, Document ID: SH-4, Name: Eila, Age: 81
Text: syntymävuosi 1942 81v sukupuoli nainen kansalaisuus suomi maakunta uusimaa talotyyppi ryhmäkoti omannäköinen arki arjessa teen milloin mitäkin arjen kohokohtia ovat tapaamiset omaisten kanssa toivon että omaiset pitävät hyvänä minua saan nimenomaan elää omannäköistä elämää ja päättää omista asioista...

Prompt sent to LLM:

    Sinulle on annettu seuraava teksti:

    tunnistetieto c5 50 haastattelun päivämäärä 22 6 23 syntymävuosi 1948 75 sukupuoli mies kan

In [42]:
# Example 2: which languages does the 75-year-old man speak?
question_2 = "Mitä kieliä 75-vuotias mies puhuu?"
process_query(question_2)

Query embedding for 'Mitä kieliä 75-vuotias mies puhuu?' generated successfully.
Score: 0.3213534951210022, Document ID: C5-50, Name: Sulo, Age: 75
Text: tunnistetieto c5 50 haastattelun päivämäärä 22 6 23 syntymävuosi 1948 75 sukupuoli mies kansalaisuus suomi maakunta keski pohjanmaa talotyyppi muu tuettu palveluasuminen asiasanat trauma sosiaalisuus arjen kuvaus oikeanpuoleinen heikkouteni ja jäykkyyteni hankaloittaa toimintaani mutta pystyn itse k...

Score: 0.24913422763347626, Document ID: SH-4, Name: Eila, Age: 81
Text: syntymävuosi 1942 81v sukupuoli nainen kansalaisuus suomi maakunta uusimaa talotyyppi ryhmäkoti omannäköinen arki arjessa teen milloin mitäkin arjen kohokohtia ovat tapaamiset omaisten kanssa toivon että omaiset pitävät hyvänä minua saan nimenomaan elää omannäköistä elämää ja päättää omista asioista...

Prompt sent to LLM:

    Sinulle on annettu seuraava teksti:

    tunnistetieto c5 50 haastattelun päivämäärä 22 6 23 syntymävuosi 1948 75 sukupuoli mies kansalais