In [2]:
import pandas as pd
import numpy as np

In [3]:
# Catalogue items and the search queries to run against them.
data = pd.read_csv('cleaned_items_with_metadata.csv')
queries_df = pd.read_csv('./data/queries.csv')

# Keep the text columns as plain Python lists; later cells look items up
# by their position in `item_texts`.
item_texts = data["full_text"].to_list()
query_texts = queries_df["search_term_pt"].to_list()


In [4]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for both the items and the queries.
model = SentenceTransformer('PORTULAN/serafim-100m-portuguese-pt-sentence-encoder')

# Both text collections are encoded with the same settings.
encode_options = {"show_progress_bar": True, "batch_size": 64}
item_embeddings = model.encode(item_texts, **encode_options)
query_embeddings = model.encode(query_texts, **encode_options)


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
import faiss
import numpy as np

# FAISS expects float32 input, so materialise both embedding sets in that dtype.
item_embeddings = np.array(item_embeddings, dtype="float32")
query_embeddings = np.array(query_embeddings, dtype="float32")


In [7]:
from sklearn.preprocessing import normalize
import numpy as np

# Scale every row to unit L2 length so that an inner-product search ranks by
# cosine similarity, then make sure the result is float32 for FAISS.
item_embeddings = normalize(item_embeddings, norm="l2", axis=1).astype("float32")
query_embeddings = normalize(query_embeddings, norm="l2", axis=1).astype("float32")


In [8]:
import faiss

# Rows are items, columns are embedding components.
num_items, dimension = item_embeddings.shape

# Flat (exhaustive) inner-product index holding every item vector.
index = faiss.IndexFlatIP(dimension)
index.add(item_embeddings)


In [9]:
# Number of items to retrieve for each query.
top_k = 3
# Search the index with every query vector at once. `indices` holds the
# positions of the retrieved items and `distances` their scores; because the
# index is an IndexFlatIP these scores are inner products, not metric distances.
distances, indices = index.search(query_embeddings, top_k)


In [10]:
# Pair every query with the items retrieved for it.
# `indices[i]` and `distances[i]` hold the item positions and scores for query i.
results = []

for i, query in enumerate(query_texts):
    top_matches = []
    for rank, (idx, score) in enumerate(zip(indices[i], distances[i]), 1):
        # FAISS fills result slots it could not populate (fewer than top_k
        # items available) with the label -1. Indexing a Python list with -1
        # would silently return the LAST item as a bogus match, so skip it.
        if idx < 0:
            continue
        top_matches.append({
            "rank": rank,
            "item_text": item_texts[int(idx)],
            "score": float(score)  # numpy scalar -> plain float
        })
    results.append({
        "query": query,
        "matches": top_matches
    })


In [11]:
# Spot-check: display the query and retrieved matches for the first query.
results[0]

{'query': 'Batatas fritas de rua carregadas',
 'matches': [{'rank': 1,
   'item_text': "Balde de Batata Frita. As nossas Batatas crocantes e irresistíveis, sempre levemente salgadas, servidas em nosso Balde especial! São o acompanhamento perfeito para dividir!. Categoria: Acompanhamentos. Taxonomia: {'l0': 'ALIMENTOS_PREPARADOS', 'l1': 'PRATOS', 'l2': 'BATATAS_PREPARADAS'}. Tags: VEGETARIAN, SERVES_2",
   'score': 0.7015063166618347},
  {'rank': 2,
   'item_text': "Batata Doce Branca. Compra por peso. Categoria: Feira. Taxonomia: {'l0': 'FLV', 'l1': 'LEGUMES', 'l2': 'BATATA_DOCE'}",
   'score': 0.697460949420929},
  {'rank': 3,
   'item_text': "190 - porção de fritas. Categoria: Porções. Taxonomia: {'l0': 'ALIMENTOS_PREPARADOS', 'l1': 'PRATOS', 'l2': 'BATATAS_PREPARADAS'}. Tags: SERVES_3",
   'score': 0.6922088265419006}]}