# Embeddings and Index

In [1]:
import sys
import os

# Project root that is put on sys.path so that `from src.config import ...`
# resolves. A machine-specific absolute path only works on one laptop, so the
# YELP_RAG_ROOT environment variable may override it; the original path remains
# the fallback, so existing setups behave exactly as before.
current_dir = os.environ.get("YELP_RAG_ROOT", "/Users/josephtolsma/Documents/dev/yelp_rag")

# Keep the root at the front of sys.path; the guard stops re-runs of this cell
# from stacking duplicate entries.
if sys.path[:1] != [current_dir]:
    sys.path.insert(0,current_dir)

In [2]:
import os
from sentence_transformers import SentenceTransformer
from typing import List
import pandas as pd
import numpy as np
import faiss
from src.config import DATA_DIR_PROC, EMBEDDING_MODEL_NAME,EMBED_BATCH_SIZE,\
                        COL_RESTAURANT_ID,INDEX_METRIC,INDEX_DIR

In [3]:
# Read the review chunks table (review_chunks.parquet under DATA_DIR_PROC).
chunks_df = pd.read_parquet(
    os.path.join(DATA_DIR_PROC,"review_chunks.parquet")
)

In [4]:
def load_embedding_model(model_name,device = "mps") -> SentenceTransformer:
    """
    Load a local embedding model using the sentence_transformers library.

    Parameters
    ----------
    model_name : str
        Model name or local path understood by ``SentenceTransformer``.
    device : str, default "mps"
        Device the model should be placed on (e.g. "mps", "cuda", "cpu").

    Returns
    -------
    SentenceTransformer
        The loaded model.
    """
    # `device` has to be passed by keyword: the second positional parameter of
    # SentenceTransformer.__init__ is `modules`, so a positional "mps" was not
    # forwarded as the device at all.
    return SentenceTransformer(model_name,device = device)

In [5]:
def embed_texts(
        model:SentenceTransformer,
        texts:List[str],
        batch_size:int = 64,
        normalize_flag:bool = True
) -> np.ndarray:
    """
    Encode `texts` with `model` and return the embeddings as float32.

    Parameters
    ----------
    model : SentenceTransformer
        Encoder whose ``encode`` method is called.
    texts : List[str]
        Strings to embed.
    batch_size : int, default 64
        Forwarded to ``model.encode`` as ``batch_size``.
    normalize_flag : bool, default True
        Forwarded to ``model.encode`` as ``normalize_embeddings``.

    Returns
    -------
    np.ndarray
        The encoder output cast to ``np.float32``.
    """
    encode_options = dict(
        sentences = texts,
        batch_size = batch_size,
        normalize_embeddings = normalize_flag,
        show_progress_bar = True,
        convert_to_numpy = True,
    )
    vectors = model.encode(**encode_options)

    # copy=False skips the copy when the encoder already returned float32.
    return vectors.astype(np.float32,copy = False)

In [6]:
# Plain Python list of the "chunk" column, in chunks_df row order.
texts = chunks_df.loc[:, "chunk"].tolist()

In [7]:
# Load the embedding model, requesting the "mps" device.
model = load_embedding_model(EMBEDDING_MODEL_NAME,device = "mps")

# Embed every chunk; normalize_flag=True asks the encoder for normalized
# embeddings (it is forwarded as normalize_embeddings by embed_texts).
X = embed_texts(
    model = model,
    texts = texts,
    batch_size = EMBED_BATCH_SIZE,
    normalize_flag = True,
)

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [8]:
def build_faiss_index(X,metric = "cosine"):
    """
    Build a flat FAISS index containing the rows of `X`.

    Parameters
    ----------
    X : array-like of shape (n, d)
        One vector per row. It is converted to a C-contiguous float32 array
        before being added (no copy is made when it already is one).
    metric : {"cosine", "l2"}, default "cosine"
        "cosine" builds an inner-product index (``faiss.IndexFlatIP``). The
        rows are NOT normalized here, so the scores are cosine similarities
        only when the caller passes L2-normalized vectors. "l2" builds a
        ``faiss.IndexFlatL2``.

    Returns
    -------
    The FAISS index with all rows of `X` added.

    Raises
    ------
    ValueError
        If `X` is not 2-D, or `metric` is neither "cosine" nor "l2".
    """
    # Explicit validation instead of `assert`: asserts are stripped under
    # `python -O` and give no hint about what was wrong with the input.
    vectors = np.ascontiguousarray(X,dtype = np.float32)
    if vectors.ndim != 2:
        raise ValueError(f"X must be a 2-D array of shape (n, d), got shape {vectors.shape}")
    dim = vectors.shape[1]

    if metric == "cosine":
        index = faiss.IndexFlatIP(dim)
    elif metric == "l2":
        index = faiss.IndexFlatL2(dim)
    else:
        raise ValueError("metric must be 'cosine' or 'l2'")

    index.add(vectors)
    return index

In [None]:
def create_faiss_by_restaurant(df,embeddings):
    """
    Build and persist one FAISS index, plus its metadata, per restaurant.

    For every distinct value of COL_RESTAURANT_ID in `df`, the matching rows of
    `embeddings` are indexed with build_faiss_index(metric=INDEX_METRIC) and
    written to INDEX_DIR as "<restaurant_id>.faiss". The matching rows of `df`
    are written next to it as "<restaurant_id>_meta.parquet" with a fresh
    0..n-1 index, so metadata row i describes vector i of that index.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk table; row i (by position) must correspond to embeddings[i].
    embeddings : numpy.ndarray
        Embedding matrix with one row per row of `df`.

    Raises
    ------
    ValueError
        If `df` and `embeddings` do not have the same number of rows.
    """
    if len(df) != len(embeddings):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)}; "
            "they must be aligned row-for-row"
        )

    # Make sure the output directory exists before the first write.
    os.makedirs(INDEX_DIR,exist_ok = True)

    # `.indices` maps each group to integer row *positions*, while `.groups`
    # maps to index *labels*. `embeddings` can only be addressed by position,
    # so labels would select the wrong vectors whenever `df` does not carry a
    # default 0..n-1 index (e.g. after filtering).
    for restaurant_id, positions in df.groupby(COL_RESTAURANT_ID).indices.items():
        X_r = embeddings[positions]
        meta_r = df.iloc[positions].reset_index(drop = True)

        index = build_faiss_index(X_r,metric = INDEX_METRIC)
        faiss.write_index(index,os.path.join(INDEX_DIR,f"{restaurant_id}.faiss"))

        meta_r.to_parquet(os.path.join(INDEX_DIR,f"{restaurant_id}_meta.parquet"),engine = "pyarrow",index = False)

In [10]:
# Write one index file and one metadata file per restaurant.
create_faiss_by_restaurant(df = chunks_df,embeddings = X)

IS0B5nLJPluVT8NwGgospA Index([   7,   15,   17,   34,   37,   64,   69,   76,  108,  146,
       ...
       2164, 2179, 2185, 2187, 2188, 2221, 2265, 2343, 2407, 2448],
      dtype='int64', length=132)
MG_wIwRBwyNnCAEMXe9Jqw Index([1051, 1062, 1231, 1244, 1260, 1316, 1317, 1322, 1341, 1344,
       ...
       2424, 2425, 2426, 2432, 2438, 2445, 2447, 2452, 2453, 2462],
      dtype='int64', length=284)
QkyEr9j7il9lJqseTbPe5w Index([   1,    2,    4,   10,   26,   31,   42,   57,   84,   86,
       ...
       2324, 2334, 2339, 2347, 2358, 2388, 2389, 2409, 2431, 2455],
      dtype='int64', length=156)
ROcx9FXq206zfI77MGo1Yw Index([   8,   70,   77,   80,   82,  130,  159,  187,  215,  239,
       ...
       2268, 2283, 2300, 2320, 2330, 2342, 2378, 2434, 2458, 2461],
      dtype='int64', length=126)
a0_wSrpAqg_eHnNej273Sw Index([  43,   46,   49,   74,   93,   96,  119,  141,  164,  213,
       ...
       2199, 2280, 2291, 2297, 2305, 2329, 2333, 2361, 2398, 2429],
      dtype='int64', le

In [11]:
# Round-trip check: reload the files written for the first row's restaurant
# and confirm the index holds exactly one vector per metadata row.
rid = chunks_df[COL_RESTAURANT_ID].iloc[0]

index_path = os.path.join(INDEX_DIR,f"{rid}.faiss")
meta_path = os.path.join(INDEX_DIR,f"{rid}_meta.parquet")

index = faiss.read_index(index_path)
meta = pd.read_parquet(meta_path)

assert index.ntotal == len(meta)
print(f"OK: {index.ntotal} vectors")

OK: 364 vectors


In [15]:
from sentence_transformers import SentenceTransformer
query = "service was slow and rude"

# Encode the query with the same settings used for the corpus (normalized,
# float32) so it can be searched against the stored vectors.
q = model.encode([query],convert_to_numpy=True,normalize_embeddings=True).astype("float32")

D,I = index.search(q,k=5)
print(I[0])

# FAISS returns row positions within the index and pads with -1 when the index
# holds fewer than k vectors. Drop the padding and look the rows up by position
# (iloc) rather than by label, so a short index cannot raise a KeyError.
hits = I[0][I[0] >= 0]
print(meta["chunk"].iloc[hits].tolist())

[  4 322 360  14 152]
['Well, good food, reasonable price. But as they themselves advertised, the service is indeed slow. I waited 20 mins to get my water, and another 10 mins to place the order, and yet another 20 mins to get the food. I went there twice. Both time, I experience good food slow services. Will I come again? Well, maybe. I will make sure I sit inside or near the door at porch so that servers could see me.', "The rudest customer service I've ever experienced. We were ignored at the door, and when we finally approached a staff member to ask about a table we were treated like a total inconvenience. The restaurant was only half full. Too bad, I'm sure the food was good but we will never go back.", 'Slowest service ever when there were only 5-6 other tables occupied. Food was mediocre and even that is a complement. My daughters French toast was flavorless, all the toast was burnt and the hash brown casserole tasted like it was a week old and had the hardest, driest, unmelted 