In [None]:
import json
# Location of the JSON Lines dataset that is passed to load_jsonl() further down.
# NOTE(review): this is an absolute, drive-specific path — adjust it per machine.
jsonl_path = "h:/ML_Models/_gemma/customdocs/converted/dogs_dataset_english.jsonl"

def load_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file. Every non-blank line must hold
            one JSON value.

    Returns:
        list: The parsed records in file order. An empty list is returned when
        the file is missing or cannot be read/parsed; a message describing the
        problem is printed in that case.
    """
    try:
        # Read from the ``path`` argument, not from a module-level variable.
        with open(path, 'r', encoding='utf-8') as file:
            print("Loading JSONL file...")
            # Skip blank lines (e.g. a trailing newline) so they do not abort the load.
            return [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        print(f"File not found: {path}")
    except Exception as e:
        # Best-effort loader: report the problem and fall through to the empty result.
        print(f"Error loading JSONL file: {e}")
    # Always hand back a list so callers can iterate without a None check.
    return []

def extract_documents_and_ids(data, source="dogs_dataset.jsonl"):
    """Split raw records into document texts, ids and metadata dictionaries.

    Each record's "input" text is kept whole as the document. The id is the
    text that precedes the " Nationality: " marker, and the nationality is the
    text between that marker and the next ". Origin" marker.
    (Assumes inputs shaped like "<breed> Nationality: <country>. Origin ..." —
    confirm against the dataset.)

    Args:
        data: Iterable of dict records with an "input" key. ``None`` is treated
            as an empty collection.
        source: Value stored under the "source" metadata key of every record.

    Returns:
        tuple[list, list, list]: ``(breed_documents, breed_ids, metadatas)``,
        three lists of equal length in record order. When the nationality
        cannot be located, the "Nationality" entry is "unknown: " followed by
        the record id.
    """
    id_marker = " Nationality: "
    origin_marker = ". Origin"

    breed_documents = []
    breed_ids = []
    metadatas = []
    for record in data or []:
        # A missing or null "input" becomes an empty document instead of crashing.
        input_text = record.get("input") or ""
        breed_documents.append(input_text)

        marker_pos = input_text.find(id_marker)
        # Without the marker the whole text doubles as the id.
        breed_id = input_text[:marker_pos] if marker_pos != -1 else input_text
        breed_ids.append(breed_id)

        nationality = None
        if marker_pos != -1:
            value_start = marker_pos + len(id_marker)
            # Look for the closing marker only after the nationality value starts,
            # so an earlier ". Origin" cannot produce an empty or bogus slice.
            origin_pos = input_text.find(origin_marker, value_start)
            if origin_pos != -1:
                nationality = input_text[value_start:origin_pos].strip()
        if nationality is None:
            nationality = "unknown: " + breed_id
        metadatas.append({"source": source, "Nationality": nationality})
    return breed_documents, breed_ids, metadatas


In [None]:
from langchain_core.documents import Document

# Read the raw records, then split them into page texts, ids and metadata.
data = load_jsonl(jsonl_path)
texts, ids, metadatas = extract_documents_and_ids(data)
print(f"Loaded {len(texts)} documents.")

# Pair every text with its metadata entry to build the LangChain documents.
documents = [
    Document(page_content=page_text, metadata=page_meta)
    for page_text, page_meta in zip(texts, metadatas)
]
print("documents created.")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_milvus import Milvus

# Model that turns each document into an embedding vector.
embedding_model = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

# Vector store bound to the named collection on the local Milvus server.
vector_store = Milvus(
    embedding_model,
    connection_args=dict(host="localhost", port="19530"),
    collection_name="Dogs_Breeds_milvus_EN_1",
)

# Insert the documents, using the extracted ids as their ids in the store.
vector_store.add_documents(ids=ids, documents=documents)
print("Vector store loaded.")

In [None]:
# Fetch the five documents closest to the query text.
# Optional keyword arguments that can be added to the call below:
#   expr='source == "dogs_dataset.jsonl"'                       -> filter on metadata
#   param={"metric_type": "COSINE", "params": {"nprobe": 16}}   -> search params for the index type
results = vector_store.similarity_search("rescue dogs", k=5)

print(f"Search results: {len(results)}")
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")