In [28]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Example data
data = [
    "Juan Pérez 12345678",
    "Juana Peréz 12345679",
    "John Doe 87654321",
]

# Load the model and generate the embeddings for the example data.
# The checkpoint is not a sentence-transformers model, so the library builds one
# with mean pooling on top of it (see the warning printed when this cell runs).
model = SentenceTransformer("dccuchile/bert-base-spanish-wwm-cased")
embeddings = np.array(model.encode(data), dtype=np.float32)



No sentence-transformers model found with name dccuchile/bert-base-spanish-wwm-cased. Creating a new one with mean pooling.
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Create the FAISS index: a flat L2 index whose dimensionality is the
# width of the embedding matrix, then load all embeddings into it.
dimension = embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [30]:
# Generate the embedding for the query (wrapped in a list so the result is a
# one-row matrix, matching the layout of the indexed embeddings)
query = "Juan Perez 12345678"
query_embedding = np.array(model.encode([query]), dtype=np.float32)

# Run the search, asking for as many neighbours as there are records
distances, indices = index.search(query_embedding, len(data))



In [35]:
# Display the raw distances returned by the search (one row for the single query)
distances

array([[10.613998, 23.246489, 37.040253]], dtype=float32)

In [34]:
# Similarity threshold (70%)
threshold = 0.7

# FAISS reports missing neighbours with index -1; drop them so that data[-1]
# is never picked up by accident.
hits = [(distance, idx) for distance, idx in zip(distances[0], indices[0]) if idx >= 0]

# Largest distance among the hits, computed once (0 when there are no hits).
max_distance = max((distance for distance, idx in hits), default=0)

# Compute the similarity and filter by threshold.
# NOTE: the score is normalised by the largest distance in this result set, so
# it is relative to the other hits (the farthest hit always scores 0) rather
# than an absolute similarity measure.
results = []
for distance, idx in hits:
    if max_distance > 0:
        similarity = 1 - distance / max_distance
    else:
        # Every hit is at distance 0 (identical to the query); avoid dividing by zero.
        similarity = 1.0
    if similarity >= threshold:
        results.append({"texto": data[idx], "similaridad": similarity})


# Show results
for result in results:
    print(f"Resultado: {result['texto']}, Similaridad: {result['similaridad']:.2%}")


Resultado: Juan Pérez 12345678, Similaridad: 71.34%
