Collecting neo4j (from -r requirements.txt (line 3))
  Downloading neo4j-5.25.0-py3-none-any.whl.metadata (5.7 kB)
Collecting transformers==4.31.0 (from -r requirements.txt (line 4))
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting librosa==0.10.0 (from -r requirements.txt (line 5))
  Downloading librosa-0.10.0-py3-none-any.whl.metadata (8.3 kB)
Collecting torch==2.0.1 (from -r requirements.txt (line 6))
  Downloading torch-2.0.1-cp311-cp311-manylinux2014_aarch64.whl.metadata (23 kB)
Collecting fastapi==0.100.0 (from -r requirements.txt (line 7))
  Downloading fastapi-0.100.0-py3-none-any.whl.metadata (23 kB)
Collecting uvicorn==0.22.0 (from -r requirements.txt (line 8))
  Downloading uvicorn-0.22.0-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic>=2.3.0 (from -r requirements.txt (line 9))
  Downloading pyd

In [2]:
# Import necessary libraries
import os
from pymilvus import connections, Collection, utility, CollectionSchema, FieldSchema, DataType
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import itertools
from docx import Document
import re
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus can be read before anything depends on it
def ensure_stopwords_downloaded(language='finnish'):
    """Ensure the NLTK stopword list for ``language`` can be loaded.

    Probes ``stopwords.words(language)``; when that raises ``LookupError``
    the ``stopwords`` package is fetched with ``nltk.download``. Progress is
    reported with ``print`` and nothing is returned.

    Args:
        language: Name passed to ``stopwords.words`` (default ``'finnish'``).
    """
    label = language.capitalize()
    try:
        # Reading the list is the availability check
        stopwords.words(language)
        print(f"{label} stopwords are already downloaded.")
    except LookupError:
        print(f"{label} stopwords not found. Downloading...")
        # nltk.download signals failure through its return value instead of
        # raising, so only report success when it returned a truthy result.
        if nltk.download('stopwords'):
            print(f"{label} stopwords downloaded successfully.")
        else:
            print(f"Failed to download {label} stopwords.")

# Make sure the Finnish stopword corpus is available before reading it
ensure_stopwords_downloaded('finnish')

# Keep the Finnish stopword list in a module-level name; preprocess_text() reads it
finnish_stopwords = stopwords.words('finnish')

print("Libraries imported and stopwords downloaded successfully.")



ImportError: cannot import name 'tarfile' from 'backports' (/opt/conda/lib/python3.11/site-packages/backports/__init__.py)

In [None]:
# Milvus connection settings.
# Host and port can be overridden through the MILVUS_HOST / MILVUS_PORT
# environment variables; the defaults are the previously hard-coded values.
MILVUS_HOST = os.environ.get("MILVUS_HOST", "milvus-standalone")
MILVUS_PORT = os.environ.get("MILVUS_PORT", "19530")
MILVUS_ALIAS = "default"

def connect_milvus():
    """Open the Milvus connection described by the ``MILVUS_*`` settings.

    The connection is registered under ``MILVUS_ALIAS``. Any exception is
    caught and reported with ``print`` instead of being propagated; the
    function always returns ``None``.
    """
    try:
        target = {"alias": MILVUS_ALIAS, "host": MILVUS_HOST, "port": MILVUS_PORT}
        connections.connect(**target)
    except Exception as err:
        print(f"Failed to connect to Milvus: {err}")
    else:
        print(f"Connected to Milvus at {MILVUS_HOST}:{MILVUS_PORT}")

def disconnect_milvus():
    """Close the Milvus connection registered under ``MILVUS_ALIAS``.

    Failures are printed rather than raised; the function returns ``None``.
    """
    try:
        connections.disconnect(MILVUS_ALIAS)
    except Exception as err:
        print(f"Failed to disconnect: {err}")
    else:
        print("Disconnected from Milvus")

# Open the connection now; connect_milvus() prints the outcome and does not raise on failure
connect_milvus()


In [None]:
def create_document_schema(dim=768):
    """Build the Milvus schema used to store document chunk embeddings.

    Fields, in declaration order:
      * ``doc_id``    - VARCHAR, max length 100
      * ``chunk_id``  - INT64 primary key with ``auto_id=True``
      * ``embedding`` - FLOAT_VECTOR with ``dim`` components
      * ``text``      - VARCHAR, max length 65535

    Args:
        dim: Dimension of the embedding vectors. It has to match the width of
            the vectors produced by the embedding model; the default of 768
            is the value that used to be hard-coded here.

    Returns:
        A ``CollectionSchema`` with the description "Document embeddings".
    """
    fields = [
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=100),
        FieldSchema(name="chunk_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
    ]
    return CollectionSchema(fields=fields, description="Document embeddings")

def create_document_collection(collection_name="document_embeddings"):
    """Return a loaded Milvus collection for document embeddings.

    Creates the collection (using ``create_document_schema``) when it does not
    exist yet, makes sure the ``embedding`` field has an index, and loads the
    collection.

    Args:
        collection_name: Name of the collection to create or open. Defaults to
            "document_embeddings", the value that used to be hard-coded.

    Returns:
        The loaded ``Collection`` object.
    """
    index_params = {
        "index_type": "IVF_FLAT",
        "metric_type": "IP",
        "params": {"nlist": 1024}
    }

    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=create_document_schema())
        collection.create_index(field_name="embedding", index_params=index_params)
        print(f"Collection '{collection_name}' and index created!")
    else:
        collection = Collection(name=collection_name)
        print(f"Collection '{collection_name}' already exists.")
        # A collection can exist without an index (for example when an earlier
        # run stopped between creating it and indexing it). Build the index
        # here so the load() below does not run against an unindexed field.
        if not collection.has_index():
            collection.create_index(field_name="embedding", index_params=index_params)
            print(f"Index created on existing collection '{collection_name}'.")

    collection.load()
    return collection

# Create (or open) the collection and keep the loaded handle for the insert step
collection = create_document_collection()


In [None]:
def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        file_path: Path handed to ``Document``.

    Returns:
        The ``text`` of every paragraph joined with newlines, or an empty
        string when anything goes wrong (the error is printed, not raised).
    """
    try:
        lines = []
        for paragraph in Document(file_path).paragraphs:
            lines.append(paragraph.text)
        content = "\n".join(lines)
        print(f"Text extracted from document: {file_path}")
        return content
    except Exception as err:
        print(f"Failed to extract text: {err}")
        return ""

# Example usage:
file_paths = [
    '/home/jovyan/work/notebooks/Eila 81v SH-4.docx',
    '/home/jovyan/work/notebooks/Sulo 75v C5-50.docx'
]

# Extract text from every document and keep each result. Rebinding a single
# name inside the loop would silently discard all but the last file.
extracted_texts = {}
for file_path in file_paths:
    extracted_texts[file_path] = extract_text_from_docx(file_path)

# The preprocessing step consumes one string named `extracted_text`, so hand it
# the text of all documents that were read; failed reads come back as "" and
# are left out.
extracted_text = "\n".join(text for text in extracted_texts.values() if text)


In [None]:
def preprocess_text(text):
    """Lowercase ``text``, strip non-letters and drop Finnish stopwords.

    Steps:
      1. lowercase the input;
      2. replace every run of characters other than ``a-z``, ``å``, ``ä``,
         ``ö`` and whitespace with a single space;
      3. split on whitespace and drop words found in the module-level
         ``finnish_stopwords``;
      4. join the remaining words with single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text. If any step raises, the error is printed and the
        text as it stood at the point of failure is returned.
    """
    try:
        # Lowercasing
        text = text.lower()
        # Replace (rather than delete) unwanted characters so that words
        # separated only by punctuation, e.g. "kissa,koira", stay two words.
        # The text is already lowercase, so only lowercase letters are listed.
        text = re.sub(r'[^a-zåäö\s]+', ' ', text)
        # Set membership keeps the filter linear in the number of words
        stopword_set = set(finnish_stopwords)
        words = [word for word in text.split() if word not in stopword_set]
        cleaned_text = ' '.join(words)
        print("Text preprocessed successfully.")
        return cleaned_text
    except Exception as e:
        print(f"Failed to preprocess text: {e}")
        return text

# Example usage: clean the text held in `extracted_text` from the extraction step
cleaned_text = preprocess_text(extracted_text)


In [None]:
def chunk_text(text, chunk_size=512):
    """Split ``text`` into pieces of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields; each chunk is its words joined
    by single spaces.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk (default 512).

    Returns:
        List of chunk strings, or an empty list when chunking raises (the
        error is printed instead of propagated).
    """
    try:
        tokens = text.split()
        pieces = []
        for start in range(0, len(tokens), chunk_size):
            window = tokens[start:start + chunk_size]
            pieces.append(' '.join(window))
        print(f"Text split into {len(pieces)} chunks.")
        return pieces
    except Exception as err:
        print(f"Failed to chunk text: {err}")
        return []

# Example usage: split the cleaned text into chunks of the default size (512 words)
chunks = chunk_text(cleaned_text)


In [None]:
# Load the tokenizer and model for the checkpoint named in `model_name`.
# Both end up in module-level names that generate_embeddings_local() reads.
model_name = "Finnish-NLP/convbert-base-finnish"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=False)
model = AutoModel.from_pretrained(model_name)

print("Model and tokenizer loaded successfully.")

def batched(iterable, n):
    """Yield successive lists of up to ``n`` items taken from ``iterable``.

    The final list may be shorter than ``n``; iteration ends as soon as an
    empty slice is produced.
    """
    source = iter(iterable)
    while True:
        group = list(itertools.islice(source, n))
        if not group:
            return
        yield group

def generate_embeddings_local(texts, max_batch_size=32):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    Texts are processed in batches of at most ``max_batch_size``. Each batch
    is tokenized with padding and truncation, run through the model without
    gradient tracking, and reduced to one vector per text by averaging the
    last hidden state over the positions marked in the attention mask. The
    stacked vectors are L2-normalised.

    Args:
        texts: Iterable of strings to embed.
        max_batch_size: Maximum number of texts per forward pass (default 32).

    Returns:
        A numpy array with one normalised row per input text, or ``None`` when
        anything raises (the error is printed instead of propagated).
    """
    try:
        pooled_batches = []
        for batch in batched(texts, max_batch_size):
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            with torch.no_grad():
                outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            # padding=True pads shorter texts up to the longest one in the
            # batch. Weight by the attention mask so padded positions do not
            # leak into the mean and an embedding does not depend on which
            # other texts share its batch.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

        # Normalize embeddings; F.normalize guards against a zero norm
        embeddings = torch.cat(pooled_batches)
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        print("Embeddings generated successfully.")
        return embeddings.numpy()
    except Exception as e:
        print(f"Failed to generate embeddings: {e}")
        return None

# Example usage: embed every chunk; the result is None if embedding generation failed
embeddings = generate_embeddings_local(chunks)


In [None]:
def insert_data_into_collection(collection, doc_ids, embeddings, texts):
    """Insert one batch of rows into ``collection``.

    The three arguments are passed to ``collection.insert`` as a single list
    in the order doc ids, embeddings, texts.

    Args:
        collection: Object exposing ``insert`` and ``name``.
        doc_ids: Document identifiers, one per row.
        embeddings: Embedding vectors, one per row.
        texts: Chunk texts, one per row.

    Errors are printed rather than raised; nothing is returned.
    """
    try:
        columns = [doc_ids, embeddings, texts]
        collection.insert(columns)
        print(f"Data inserted into collection: {collection.name}")
    except Exception as err:
        print(f"Failed to insert data: {err}")

# Example usage: give every chunk a sequential "doc_N" id (one id per chunk,
# numbered from 1), then insert ids, embeddings and chunk texts together
doc_ids = [f"doc_{i+1}" for i in range(len(chunks))]
insert_data_into_collection(collection, doc_ids, embeddings, chunks)


In [None]:
# Disconnect from Milvus; disconnect_milvus() prints the outcome and does not raise on failure
disconnect_milvus()
