# Embeddings and Index

In [2]:
import os
import sys

# Project root that contains the `src` package imported below. Set the
# YELP_RAG_ROOT environment variable to run the notebook from another
# checkout; the default is the original location, so existing setups are
# unaffected.
current_dir = os.environ.get(
    "YELP_RAG_ROOT", "/Users/josephtolsma/Documents/dev/yelp_rag"
)

# Keep the project root first on sys.path without stacking a duplicate entry
# every time this cell is re-run.
if current_dir in sys.path:
    sys.path.remove(current_dir)
sys.path.insert(0, current_dir)

In [3]:
import os
from sentence_transformers import SentenceTransformer
from typing import List
import pandas as pd
import numpy as np
import faiss
from src.config import DATA_DIR_PROC, EMBEDDING_MODEL_NAME,EMBED_BATCH_SIZE,\
                        COL_RESTAURANT_ID,INDEX_METRIC,INDEX_DIR

In [4]:
# Load the review-chunk table from <DATA_DIR_PROC>/review_chunks.parquet.
chunks_df = pd.read_parquet(os.path.join(DATA_DIR_PROC,"review_chunks.parquet"))

In [5]:
def load_embedding_model(model_name,device = "mps") -> SentenceTransformer:
    """
    Load a local embedding model using the sentence-transformers library.

    Parameters
    ----------
    model_name :
        Model name or path handed to ``SentenceTransformer``.
    device :
        Device string the model should be placed on (default ``"mps"``).

    Returns
    -------
    SentenceTransformer
        The loaded model.
    """
    # `device` has to be passed by keyword: the second positional parameter
    # of SentenceTransformer is `modules`, so a positional device string
    # never reaches the device setting.
    return SentenceTransformer(model_name,device = device)

In [6]:
def embed_texts(
        model:SentenceTransformer,
        texts:List[str],
        batch_size:int = 64,
        normalize_flag:bool = True
) -> np.ndarray:
    """
    Encode ``texts`` with ``model`` and return the embeddings as float32.

    ``batch_size`` and ``normalize_flag`` are forwarded to ``model.encode``
    as ``batch_size`` and ``normalize_embeddings``. A progress bar is shown
    and NumPy output is requested.
    """
    encode_options = dict(
        batch_size = batch_size,
        normalize_embeddings = normalize_flag,
        show_progress_bar = True,
        convert_to_numpy = True,
    )
    vectors = model.encode(sentences = texts, **encode_options)

    # copy = False skips a second allocation when the dtype already matches.
    return vectors.astype(np.float32,copy = False)

In [7]:
# One string per row of chunks_df, in row order, taken from the "chunk" column.
texts = chunks_df["chunk"].tolist()

In [8]:
# Load the embedding model, passing "mps" as the device argument.
model = load_embedding_model(EMBEDDING_MODEL_NAME,"mps")

# Embed every chunk text; embed_texts forwards normalize_flag to the encoder
# as normalize_embeddings and returns a float32 array.
X = embed_texts(model,texts,batch_size=EMBED_BATCH_SIZE, normalize_flag=True)

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [9]:
def build_faiss_index(X,metric = "cosine"):
    """
    Build an exact (flat) FAISS index over the rows of ``X``.

    Parameters
    ----------
    X :
        2-D array-like of shape (n, d), one vector per row. It is converted
        to a C-contiguous float32 array; the caller's array is not modified.
    metric :
        ``"cosine"`` builds an inner-product index over L2-normalised rows;
        ``"l2"`` builds a Euclidean-distance index over the rows as given.

    Returns
    -------
    The populated FAISS index.

    Raises
    ------
    ValueError
        If ``metric`` is not ``"cosine"`` or ``"l2"``, or ``X`` is not 2-D.
    """
    if metric not in ("cosine", "l2"):
        raise ValueError("metric must be 'cosine' or 'l2'")

    # Explicit conversion instead of an assert: asserts vanish under
    # `python -O`, and this also accepts float64 or non-contiguous input.
    X = np.ascontiguousarray(X, dtype = np.float32)
    if X.ndim != 2:
        raise ValueError(f"X must be a 2-D array of shape (n, d), got shape {X.shape}")
    d = X.shape[1]

    if metric == "cosine":
        # Inner product equals cosine similarity only for unit-length rows,
        # so normalise here rather than trusting the caller. All-zero rows
        # are left as zeros. The division creates a new array.
        norms = np.linalg.norm(X, axis = 1, keepdims = True)
        norms[norms == 0] = 1.0
        X = X / norms
        index = faiss.IndexFlatIP(d)
    else:
        index = faiss.IndexFlatL2(d)

    index.add(X)
    return index

In [10]:
def create_faiss_by_restaurant(df,embeddings):
    """
    Build and save one FAISS index and one metadata parquet per restaurant.

    Rows of ``df`` are grouped by ``COL_RESTAURANT_ID``. For each group the
    matching rows of ``embeddings`` are indexed with ``build_faiss_index``
    (metric ``INDEX_METRIC``) and written to ``INDEX_DIR`` as
    ``<restaurant_id>.faiss``; the group's rows of ``df`` are written next to
    it as ``<restaurant_id>_meta.parquet`` with a fresh 0..n-1 index, so row
    ``i`` of the metadata describes vector ``i`` of the index.

    Parameters
    ----------
    df :
        DataFrame with one row per embedded chunk.
    embeddings :
        Array whose row ``i`` is the embedding of row ``i`` of ``df``
        (by position, not by index label).

    Raises
    ------
    ValueError
        If ``df`` and ``embeddings`` do not have the same number of rows.
    """
    if len(df) != len(embeddings):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)}; "
            "they must align row for row"
        )

    os.makedirs(INDEX_DIR, exist_ok = True)

    # `.indices` yields positional row numbers. `.groups` yields index
    # labels, which only coincide with positions when df carries a default
    # RangeIndex, and would otherwise pick the wrong embedding rows.
    for restaurant_id, positions in df.groupby(COL_RESTAURANT_ID).indices.items():
        X_r = embeddings[positions]
        meta_r = df.iloc[positions].reset_index(drop = True)

        index = build_faiss_index(X_r,metric = INDEX_METRIC)
        faiss.write_index(index,os.path.join(INDEX_DIR,f"{restaurant_id}.faiss"))

        meta_r.to_parquet(os.path.join(INDEX_DIR,f"{restaurant_id}_meta.parquet"),engine = "pyarrow",index = False)

In [11]:
# Write one index file and one metadata parquet per restaurant under INDEX_DIR.
create_faiss_by_restaurant(chunks_df,X)

In [None]:
# Sanity check on the files written above: the index saved for the first
# restaurant must hold exactly one vector per row of its metadata file.
rid = chunks_df[COL_RESTAURANT_ID].iloc[0]
index = faiss.read_index(os.path.join(INDEX_DIR,f"{rid}.faiss"))
meta = pd.read_parquet(os.path.join(INDEX_DIR,f"{rid}_meta.parquet"))
assert index.ntotal == len(meta), f"{index.ntotal} vectors vs {len(meta)} metadata rows"
print(f"OK: {index.ntotal} vectors")

OK: 364 vectors


In [39]:
# Pick one restaurant id by sampling a single row with a fixed seed.
r_id = chunks_df[COL_RESTAURANT_ID].sample(random_state=42).values[0]
# Load the index and the metadata that were written for that restaurant.
index = faiss.read_index(os.path.join(INDEX_DIR,f"{r_id}.faiss"))
meta = pd.read_parquet(os.path.join(INDEX_DIR,f"{r_id}_meta.parquet"))
query = "I have dietary restrictions and my restrictions make my life difficult."
# Encode the query as a one-row array with normalize_embeddings=True, the same
# normalisation setting requested when the chunk texts were embedded.
query_encoded = model.encode([query],convert_to_numpy=True,normalize_embeddings=True)

In [40]:
# Show full chunk text instead of pandas' truncated column display.
pd.set_option("display.max_colwidth",None)
D,I = index.search(query_encoded,k=5)
# FAISS ids are row positions within the index, and the result is padded with
# -1 when the index holds fewer than k vectors. Drop the padding and select by
# position: label lookup on -1 raises, and positional -1 would silently return
# the last row.
hit_positions = I[0][I[0] >= 0]
chunks = meta.iloc[hit_positions]["chunk"].tolist()
pd.Series(chunks)


0                                                                                                I am vegan so finding something in New Orleans has been hard. These guys made me a special order. They even have soy milk! So happy!!
1                                                                         Sat at the counter in front of the kitchen and now I want to have everything. Good portions. Friendly service. Looks like our breakfast place for this trip.
2    This place is amazing! It is rare to go to a place with a small menu and want to try everything. The bread is to die for and everything we tried (we came here everyday of our trip) was amazing. I cant wait to come back again.
3               After waiting in line for 20 mins, my wife and I were denied service because I have I service dog that I need with me at all times. Very disappointing to be denied when service dogs are protected under federal law.
4                                                                           

In [25]:
# Display the metadata frame loaded for r_id.
meta

Unnamed: 0,chunk_index,chunk,business_id,review_id,restaurant_name,chunk_id,n_chars,stars,date
0,0,Came to French toast on our trip to New Orlean...,MG_wIwRBwyNnCAEMXe9Jqw,KMh--tSMtHrrFjw1ruaGIw,French Toast,KMh--tSMtHrrFjw1ruaGIw_0,527,4.5,2018-11-16 18:45:13
1,0,A very accessible place for breakfast on Decat...,MG_wIwRBwyNnCAEMXe9Jqw,FDCy6i2guGYQObxTAI0p-Q,French Toast,FDCy6i2guGYQObxTAI0p-Q_0,318,4.5,2018-10-20 15:10:08
2,0,Fantastic breakfast spot. Always packed. The M...,MG_wIwRBwyNnCAEMXe9Jqw,jcXZ_vCjvogsMCHVNLggcA,French Toast,jcXZ_vCjvogsMCHVNLggcA_0,138,4.5,2018-12-18 02:39:36
3,0,Made reservations through OpenTable. Arrived a...,MG_wIwRBwyNnCAEMXe9Jqw,ejZ3oHmlmgtjd1CYyd95fQ,French Toast,ejZ3oHmlmgtjd1CYyd95fQ_0,255,4.5,2019-02-18 04:05:16
4,0,Great place to stop in when in town. Servers w...,MG_wIwRBwyNnCAEMXe9Jqw,rHyZ8n-3T4m9L8-5RVGH-Q,French Toast,rHyZ8n-3T4m9L8-5RVGH-Q_0,179,4.5,2018-12-30 17:15:07
...,...,...,...,...,...,...,...,...,...
279,0,We were looking for a nice breakfast spot off ...,MG_wIwRBwyNnCAEMXe9Jqw,H42ANOt0vI4OHdjszMk5zA,French Toast,H42ANOt0vI4OHdjszMk5zA_0,633,4.5,2021-08-22 19:07:55
280,0,Good food for sure! I ordered chicken and waff...,MG_wIwRBwyNnCAEMXe9Jqw,4D0JJh7iVJxVLa7_uAymKg,French Toast,4D0JJh7iVJxVLa7_uAymKg_0,585,4.5,2019-08-01 15:15:03
281,0,First time going to New Orleans and in search ...,MG_wIwRBwyNnCAEMXe9Jqw,fPJOUCA-eJIPtHo0Vr0xFw,French Toast,fPJOUCA-eJIPtHo0Vr0xFw_0,221,4.5,2020-03-22 19:03:57
282,0,"wait took forever, they lied about our table b...",MG_wIwRBwyNnCAEMXe9Jqw,_eI6aUNdAE8QFuf8KnZXcA,French Toast,_eI6aUNdAE8QFuf8KnZXcA_0,183,4.5,2021-06-09 17:09:02
