# Milvus

Milvus is a simple and efficient similarity search engine. It is designed to search for similar vectors in large collections of high-dimensional vectors.

More details on using it from Python can be found in the [documentation](https://milvus.io/docs/manage_databases.md).

In [None]:
# Dependencies

%pip install openai
%pip install polars
%pip install pymilvus


%pip install PyMuPDF

In [None]:
import os
import json
import polars as pl
from pymilvus import MilvusClient
from openai import OpenAI


class MilvusManager:
    """Store and search OpenAI text embeddings in a single Milvus collection.

    The manager owns an OpenAI client (API key read from the
    ``OPENAI_API_KEY`` environment variable) and a Milvus client connected to
    ``http://localhost:19530``.
    """

    # Default output size of the "text-embedding-3-small" model; the
    # collection's vector field is created with the same dimension.
    EMBEDDING_DIMENSION = 1536

    def __init__(self, collection_name):
        """Create the API clients and remember the target collection name.

        :param collection_name: Name of the Milvus collection to operate on.
        """
        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.model_name = "text-embedding-3-small"
        self.milvus_client = MilvusClient(uri="http://localhost:19530")
        self.collection_name = collection_name

    def create_embeddings(self, text):
        """Return the embedding vector of ``text`` as a list of floats.

        :param text: Text to embed with ``self.model_name``.
        :return: The embedding of the first (and only) input.
        """
        response = self.openai_client.embeddings.create(input=text, model=self.model_name)
        # Read the vector straight from the response object instead of
        # serialising the whole response to JSON and parsing it back.
        return response.data[0].embedding

    def create_collection(self):
        """(Re)create the collection, dropping any existing one with the same name.

        Errors are reported with ``print`` and not re-raised, so the caller is
        not interrupted when the server is unreachable.
        """
        try:
            if self.milvus_client.has_collection(collection_name=self.collection_name):
                self.milvus_client.drop_collection(collection_name=self.collection_name)
            self.milvus_client.create_collection(
                collection_name=self.collection_name,
                dimension=self.EMBEDDING_DIMENSION,
                auto_id=False,  # primary keys are supplied by the caller
            )
            print(f"Collection created: {self.collection_name}")
        except Exception as e:
            print(f"Error creating collection: {e}")

    def insert_points(self, df: "pl.DataFrame"):
        """Embed the ``text`` column of ``df`` and upsert every row.

        A ``vector`` column holding the embedding of each ``text`` value is
        added, the rows are converted to dictionaries and upserted into the
        collection. The upsert result is printed.

        :param df: Polars DataFrame with at least a ``text`` column.
            NOTE(review): the collection is created with ``auto_id=False``, so
            the rows presumably also need a primary-key column (``id``) -
            confirm against the caller.
        """
        # One OpenAI request is issued per row.
        dtf = df.with_columns(
            pl.col("text")
            .map_elements(self.create_embeddings, return_dtype=pl.List(pl.Float64))
            .alias("vector")
        )
        res = self.milvus_client.upsert(
            collection_name=self.collection_name,
            data=dtf.to_dicts(),
        )
        print(res)

    def search(self, input_text, limit=3) -> str:
        """Search the collection for entries similar to ``input_text``.

        :param input_text: Query text; it is embedded before searching.
        :param limit: Maximum number of hits to return.
        :return: The search result serialised as a JSON string.
        """
        search_params = {
            "metric_type": "COSINE",
            "params": {
                # Range search. With COSINE a larger score means "more
                # similar", so per the Milvus range-search docs the hits kept
                # satisfy radius < score <= range_filter.
                "radius": 0.4,  # exclusive lower bound of the similarity score
                "range_filter": 0.5,  # inclusive upper bound of the similarity score
            }
        }
        query_embedding = self.create_embeddings(input_text)
        res = self.milvus_client.search(
            collection_name=self.collection_name,
            data=[query_embedding],
            limit=limit,
            search_params=search_params,
            output_fields=["text", "metadata"],  # fields returned with each hit
        )
        return json.dumps(res)

#### Usage example

We need the **TextChunk** class to build the DataFrame that is inserted into Milvus.

In [3]:
import json
import sqlite3
import copy
from typing import List, Dict, Optional, Union
import polars as pl
from sqlalchemy import create_engine


class TextChunk():
    """Accumulate extracted document text in a Polars DataFrame.

    Rows carry an ``id``, a JSON-encoded ``metadata`` string and the ``text``.
    The input ``json_data`` lists are read as dictionaries with a ``text`` key
    and a ``metadata`` dictionary providing ``filetype`` and ``filename``
    (plus ``page_number`` for PDFs).
    """

    def __init__(self, current_df: Optional[pl.DataFrame] = None):
        """Start from ``current_df`` or, when omitted, from an empty frame.

        :param current_df: Existing rows to extend.
        """
        # Build the empty frame per instance: a DataFrame written as the
        # default value is created once and shared by every instance.
        self.current_df = pl.DataFrame() if current_df is None else current_df

    def __next_id(self) -> int:
        """Return the first unused id (ids start at 1)."""
        if self.current_df.is_empty() or 'id' not in self.current_df.columns:
            return len(self.current_df) + 1
        # Use max + 1 instead of len + 1: ids are assigned before duplicates
        # are dropped, so the stored ids can have gaps and len + 1 could
        # hand out an id that is already taken.
        return int(self.current_df['id'].max()) + 1

    def __pdf_chunk(self, json_data: List[Dict]) -> pl.DataFrame:
        """Build one row per PDF page, joining the page's texts with spaces."""
        filename = json_data[0]['metadata']['filename']

        # Group the element texts by page number (insertion order is kept).
        pages: Dict = {}
        for item in json_data:
            pages.setdefault(item['metadata']['page_number'], []).append(item['text'])

        rows = [
            {
                'metadata': json.dumps({'page_number': page_number, 'filename': filename}),
                'text': ' '.join(texts),
            }
            for page_number, texts in pages.items()
        ]
        return pl.DataFrame(rows).with_row_index('id', offset=self.__next_id())

    def __rtf_chunk(self, json_data: List[Dict]) -> pl.DataFrame:
        """Build a single row holding the whole RTF document text."""
        metadata = {
            "filetype": json_data[0]['metadata']['filetype'],
            "filename": json_data[0]['metadata']['filename']
        }
        row = {
            'metadata': json.dumps(metadata),
            'text': ' '.join(item['text'] for item in json_data)
        }
        # A one-element list makes the single-row shape explicit.
        return pl.DataFrame([row]).with_row_index('id', offset=self.__next_id())

    def __add_if_not_exists(self, new_data: Union[pl.DataFrame, Dict], key_columns: Optional[List]=None) -> pl.DataFrame:
        """
        Append the rows of ``new_data`` that are not already stored.

        :param new_data: Polars DataFrame or dictionary with the new rows
        :param key_columns: Columns compared to decide whether a row already
            exists (defaults to ``['metadata', 'text']``)
        :return: The updated DataFrame
        :raises TypeError: If ``new_data`` is neither a DataFrame nor a dict
        """
        if key_columns is None:
            key_columns = ['metadata', 'text']
        # A dictionary is treated as a single new row.
        if isinstance(new_data, dict):
            new_data = pl.DataFrame([new_data])
        if not isinstance(new_data, pl.DataFrame):
            raise TypeError("nuevos_datos debe ser un DataFrame de Polars o un diccionario")

        if self.current_df.is_empty():
            # Nothing stored yet: adopt the new rows (and their schema) as is.
            self.current_df = new_data
            return self.current_df

        # Keep only the rows whose key columns match no stored row.
        new = new_data.join(
            self.current_df.select(key_columns),
            on=key_columns,
            how="anti"
        )

        if not new.is_empty():
            print("Se han encontrado datos nuevos para agregar")
            self.current_df = pl.concat([self.current_df, new], how="vertical")
        else:
            print("No hay datos nuevos para agregar")
        return self.current_df

    def text_chunks_to_dataframe(self, json_data: List[Dict]) -> pl.DataFrame:
        """Convert extracted elements to rows and add the ones not yet stored.

        The first element's ``metadata['filetype']`` selects the conversion:
        one row per page for ``application/pdf``, a single row for
        ``text/rtf`` and one row per element for any other ``text...`` type.

        :param json_data: Extracted elements of one document.
        :return: The accumulated DataFrame; unchanged when ``json_data`` is empty.
        :raises ValueError: If the filetype is not supported.
        """
        if not json_data:
            return self.current_df

        filetype = json_data[0]['metadata']['filetype']
        if filetype == "application/pdf":
            df = self.__pdf_chunk(json_data)
        elif filetype == "text/rtf":
            df = self.__rtf_chunk(json_data)
        elif filetype.startswith('text'):
            # Serialise the metadata of every element (not only the first one)
            # so the column holds JSON strings throughout; the shallow copies
            # leave the caller's dictionaries untouched.
            rows = [{**item, 'metadata': json.dumps(item['metadata'])} for item in json_data]
            df = pl.DataFrame(rows).with_row_index('id', offset=self.__next_id())
        else:
            raise ValueError(f"Unsupported filetype: {filetype!r}")

        self.__add_if_not_exists(new_data=df)
        return self.current_df

    def save_checkpoint(self, checkpoint_path: str, table_name: Optional[str] = 'ocr_data') -> None:
        """
        Save the current DataFrame to a table of a SQLite database file.

        The ``id`` column is not stored; it is regenerated by
        ``load_checkpoint``. If the table already exists it is replaced;
        other tables in the file are left alone.

        Parameters:
            checkpoint_path (str): The path to the SQLite database checkpoint file.
            table_name (str): Name of the table that receives the rows.
        """
        frame = self.current_df
        if 'id' in frame.columns:
            frame = frame.drop('id')
        # write_database manages its own connection from the URI, so no
        # separate sqlite3 connection is opened here.
        frame.write_database(table_name=table_name, connection=f"sqlite:///{checkpoint_path}", if_table_exists="replace")

    def load_checkpoint(self, checkpoint_path: str, table_name: Optional[str] = 'ocr_data') -> pl.DataFrame:
        """
        Load a table saved by ``save_checkpoint`` and make it the current DataFrame.

        Parameters:
            checkpoint_path (str): The path to the SQLite database checkpoint file.
            table_name (str): Name of the table to read.

        Returns:
            The loaded DataFrame with a regenerated ``id`` column starting at 1,
            the same numbering used for freshly converted chunks.
        """
        engine = create_engine(f"sqlite:///{checkpoint_path}")
        # Quote the identifier so table names with spaces or quotes load as well.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        query = f"SELECT * FROM {quoted_table}"
        try:
            # Close the connection (and the engine's pool) once the read is done.
            with engine.connect() as conn:
                frame = pl.read_database(query=query, connection=conn)
        finally:
            engine.dispose()
        self.current_df = frame.with_row_index('id', offset=1)
        return self.current_df

In [24]:
# Manager bound to the Milvus collection named "collection".
manager = MilvusManager("collection")
# The following cell calls text.load_checkpoint(...), so the TextChunk
# instance has to exist before it runs.
text = TextChunk()

In [None]:
# Load the chunked text stored in the SQLite checkpoint into a DataFrame.
ddf = text.load_checkpoint("checkpoint.db")
# (Re)create the collection; an existing one with the same name is dropped first.
manager.create_collection()
# Stand-alone call to the embedding endpoint; the returned vector is discarded.
manager.create_embeddings("Texto para ser convertido en embedding")
# Embed the "text" column of every row and upsert the rows into the collection.
manager.insert_points(ddf)
# Two sample searches; each prints the JSON string returned by search().
print(manager.search("No existe informacion de esto."))
print(manager.search("archivo de texto"))

In [41]:
from pprint import pprint

# search() returns a JSON string.
data = manager.search("Inteligencia artificial")

# Decode the JSON string so the nested hits are pretty-printed as Python objects.
pprint(json.loads(data))



[[{'distance': 0.4898538887500763,
   'entity': {'metadata': '{"filetype": "application/pdf", "filename": '
                          '"presentation_1.48.16_p.m..pdf", "page_number": 17}',
              'text': '—7 ) ESTADO DEL ARTE\n'
                      '49\n'
                      'Microsoft N \\\\ Reclaim.ai\n'
                      'Copilot ANY\n'},
   'id': 17},
  {'distance': 0.4183509051799774,
   'entity': {'metadata': '{"filetype": "application/pdf", "filename": '
                          '"presentation_1.48.16_p.m..pdf", "page_number": 3}',
              'text': 'INTRODUCCI\n'
                      'actividades laborales y personales.\n'
                      'afin\n'
                      '1© Google Workspace facilita la organización de\n'
                      'Los Modelos de Lenguaje Grande (LLM)\n'
                      'C2 permiten entender\n'
                      ' \n'
                      'y generar lenguaje natural\n'
                      'QSzg) de manera simil

In [48]:
import json

# Run the search and decode the JSON string it returns.
data = manager.search("Inteligencia artificial")
points = json.loads(data)
# points[0] holds the hits of the single query; keep each hit's entity
# (its output fields) re-encoded as a JSON string.
context = [json.dumps(hit['entity']) for hit in points[0]]

pprint(context)



['{"metadata": "{\\"filetype\\": \\"application/pdf\\", \\"filename\\": '
 '\\"presentation_1.48.16_p.m..pdf\\", \\"page_number\\": 17}", "text": '
 '"\\u20147 ) ESTADO DEL ARTE\\n49\\nMicrosoft N \\\\\\\\ Reclaim.ai\\nCopilot '
 'ANY\\n"}',
 '{"metadata": "{\\"filetype\\": \\"application/pdf\\", \\"filename\\": '
 '\\"presentation_1.48.16_p.m..pdf\\", \\"page_number\\": 3}", "text": '
 '"INTRODUCCI\\nactividades laborales y personales.\\nafin\\n1\\u00a9 Google '
 'Workspace facilita la organizaci\\u00f3n de\\nLos Modelos de Lenguaje Grande '
 '(LLM)\\nC2 permiten entender\\n \\ny generar lenguaje natural\\nQSzg) de '
 'manera similar a un humano.\\n\\u2014 . e ah\\nLa generaci\\u00f3n mejorada '
 'por recuperaci\\u00f3n ere RN Da\\n(RAG) es el proceso de optimizaci\\u00f3n '
 "de la salida ema , s | me - and psy\\nde un LLM We ee \\u00f3 h'xstof_*_ "
 'aki0 of en\\nN 4 nine as a ox {0}\\n, \\u2014 EEl\\ne IS o Nd p A\\n"}']


In [2]:
from pymilvus import MilvusClient

# Stand-alone client for the Milvus server at localhost:19530, used by the cells below.
milvus_client = MilvusClient(uri="http://localhost:19530")


In [9]:
from pprint import pprint

# Fetch entities of the "original_name" collection by primary key, returning
# only the "source" and "page" fields.
# NOTE(review): the recorded output lists a single entity for the two ids and
# shows 'pk' as a string - confirm the ids are passed in the type the
# collection's primary key expects.
res = milvus_client.get(collection_name="original_name", ids=[-2014741237850642074, 8981368537327681524], output_fields=["source", "page"])
# Bare expression so the notebook displays the result.
res

data: ["{'page': 0, 'pk': '-2014741237850642074', 'source': 'example.pdf'}"] 

In [12]:
# Scalar query (no vector search): return up to 10 entities of "original_name"
# whose "source" field equals 'example.pdf', with their "source" and "page" fields.
res = milvus_client.query(
    collection_name="original_name",
    filter="source == 'example.pdf'",
    output_fields=["source", "page"],
    limit=10
)
# Bare expression so the notebook displays the result.
res

data: ["{'source': 'example.pdf', 'page': 0, 'pk': '-2014741237850642074'}", "{'source': 'example.pdf', 'page': 1, 'pk': '8981368537327681524'}"] 