# Relevant Text Retrieval

In [1]:
import os
import sys

# Project root that contains the `src` package. It can be overridden with the
# YELP_RAG_ROOT environment variable so the notebook is not tied to a single
# machine; the original local checkout path is kept as the fallback.
current_dir = os.environ.get("YELP_RAG_ROOT", "/Users/josephtolsma/Documents/dev/yelp_rag")

# Only prepend when it is not already first, so re-running this cell does not
# keep stacking duplicate entries onto sys.path.
if sys.path[:1] != [current_dir]:
    sys.path.insert(0,current_dir)

In [2]:
import pandas as pd
import os
import faiss
import src.config as config
from sentence_transformers import SentenceTransformer
from pathlib import Path

In [3]:
# Show full cell contents (no truncation) so whole review chunks are readable in outputs.
pd.options.display.max_colwidth = None

**Pick a sample business (restaurant) id to use for the initial function build**

In [4]:
# Load the processed review chunks and draw one business id (fixed seed, so the
# same restaurant is picked on every run) to develop the retrieval code against.
chunks_path = Path(config.DATA_DIR_PROC) / "review_chunks.parquet"
chunk_df = pd.read_parquet(chunks_path)
sampled_business = chunk_df["business_id"].sample(n = 1,random_state = 42)
rid = sampled_business.iloc[0]

**Build retrieval functionality**

In [5]:
def load_embedding_model(model_name,device = "mps") -> SentenceTransformer:
    """
    Load a local embedding model using the sentence_transformers library.
    @param model_name: model name or path handed to SentenceTransformer
    @param device: device string the model should be placed on (defaults to "mps")
    @returns the loaded SentenceTransformer instance
    """
    # `device` has to be passed by keyword: the second positional parameter of
    # SentenceTransformer is `modules`, so a positional device string never
    # reaches the device setting.
    return SentenceTransformer(model_name,device = device)

In [6]:
def convert_similarity_arrays_to_df(score_array,index_array,metadata,metadata_cols):
    """
    Converts raw similarity arrays from FAISS to a dataframe of relevant text chunks.
    Limits returned values to one chunk per review (the highest scoring one) to avoid duplicated statements.
    @param score_array: 2D similarity scores returned by the FAISS search; only the first row (first query) is used
    @param index_array: 2D row positions returned by the FAISS search, aligned with score_array; only the first row is used
    @param metadata: metadata dataframe whose row positions match the FAISS positions, such as business id and (non-encoded) text chunk
    @param metadata_cols: dict naming the metadata columns to read, with keys
        "chunk_id_col", "business_id_col", "restaurant_name_col", "review_id_col" and "chunk_col"
    @returns dataframe with columns scores, indicies, chunk_id, business_id, restaurant_name,
        review_id and chunk, sorted by descending score with at most one row per review
    @raises KeyError: if a required key is missing from metadata_cols or a named column is missing from metadata
    """

    # Output column name -> metadata column that feeds it (insertion order = output column order).
    column_sources = {
        "chunk_id": metadata_cols["chunk_id_col"],
        "business_id": metadata_cols["business_id_col"],
        "restaurant_name": metadata_cols["restaurant_name_col"],
        "review_id": metadata_cols["review_id_col"],
        "chunk": metadata_cols["chunk_col"],
    }

    hits = pd.DataFrame({
        "scores":score_array[0],
        "indicies":index_array[0]
    })

    # FAISS pads the result with -1 when an index holds fewer than k vectors.
    # Drop those slots: `.iloc[-1]` would otherwise silently return the LAST
    # metadata row as if it were a genuine match.
    hits = hits[hits["indicies"] >= 0]

    # Single positional lookup, reused for every output column.
    matched = metadata.iloc[hits["indicies"].to_numpy()]
    hits = hits.assign(**{
        out_col: matched[meta_col].to_numpy()
        for out_col,meta_col in column_sources.items()
    })

    hits = hits.sort_values(by = "scores",ascending=False)
    # De-duplicate on the OUTPUT column name; the metadata column may be named
    # differently, in which case it does not exist in this frame.
    hits = hits.drop_duplicates(subset = "review_id",keep = "first")
    return hits


In [7]:
# Load the FAISS index of the sampled restaurant together with its row-aligned metadata.
index_path = config.INDEX_DIR / f"{rid}.faiss"
meta_path = config.INDEX_DIR / f"{rid}_meta.parquet"
index = faiss.read_index(str(index_path))
meta = pd.read_parquet(str(meta_path))

In [8]:
# Load the embedding model, then embed a throwaway query (normalized, as a numpy array)
# to smoke-test the search.
embedding_device = "mps"
model = load_embedding_model(config.EMBEDDING_MODEL_NAME,embedding_device)

sample_prompt = "I want chicken, I want liver, meow mix meow mix please deliver"
query = model.encode(
    [sample_prompt],
    convert_to_numpy=True,
    normalize_embeddings=True,
)

In [9]:
# Retrieve the 3 nearest chunks for the sample query and show them as a table.
metadata_cols = config.METADATA_COLS
top_scores,top_positions = index.search(query,k = 3)
convert_similarity_arrays_to_df(top_scores,top_positions,meta,metadata_cols)

Unnamed: 0,scores,indicies,chunk_id,business_id,restaurant_name,review_id,chunk
0,0.424237,262,eRajiChYeFpgXQ1zeZVAWg_0,MG_wIwRBwyNnCAEMXe9Jqw,French Toast,eRajiChYeFpgXQ1zeZVAWg,HIGHLY recommend the chicken and waffles. Definitely ask for extra siracha butter... absolutely made the dish
1,0.405039,159,AWJId9LQ7sUzqtwI3WbjDg_0,MG_wIwRBwyNnCAEMXe9Jqw,French Toast,AWJId9LQ7sUzqtwI3WbjDg,"What a fantastic breakfast! I had the special-Eggs Benedict on sourdough and it was very good. Husband had chicken and waffles-perfection. Good coffee, good service, cute and clean. Called ahead and didn't have to wait. We will definitely be back."
2,0.394571,161,j372EuMmOZ_3uODKhq4JVA_0,MG_wIwRBwyNnCAEMXe9Jqw,French Toast,j372EuMmOZ_3uODKhq4JVA,Food 10/10 Service 6/10 Crepes 10/10 Coffee 10/10 Chicken 9/10 Potatoes 10/10 Ha. Hope this makes sense.


**Loop Retrieval Functionality Through All Topics (One Restaurant)**

In [10]:
# Gather one result frame per topic and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on every
# iteration and starts from an empty frame, which pandas warns about.
topic_frames = []

for topic,keywords in config.TOPICS.items():
    query = model.encode([keywords],convert_to_numpy=True,normalize_embeddings=True)
    D,I = index.search(query,k = 12)
    df = convert_similarity_arrays_to_df(D,I,meta,metadata_cols)
    df.insert(0,"topic",topic)
    topic_frames.append(df)

# Row labels are deliberately NOT reset: each topic keeps its own labels, so
# label 0 still identifies the first hit of every topic in the combined frame.
result_df = pd.concat(topic_frames,axis = 0) if topic_frames else pd.DataFrame()

In [11]:
# Top result for each topic at the chosen restaurant. The per-topic frames were
# concatenated without resetting their row labels, so label 0 appears once per
# topic and selects the first hit returned by the search for that topic
# (presumably the best-scoring one).
result_df.loc[0]

Unnamed: 0,topic,scores,indicies,chunk_id,business_id,restaurant_name,review_id,chunk
0,food,0.509729,114,PrAs9_Xh3SyeGMS3CJ3yFw_0,MG_wIwRBwyNnCAEMXe9Jqw,French Toast,PrAs9_Xh3SyeGMS3CJ3yFw,"Visited this place on a Thursday morning and had to wait 20 mins! It was packed with people! We went in a party of four, and ordered 5 dishes and got to tried all of fhem:) Prosciutto, ricotta & honey toast Savory crepe Banana nutella crepe Coconut stuffed pineapple french toast Omelette They are all pretty good I like the toast best -- mainly because the serving size is reasonable and it's not too buttery or sweet like the other one The interior is bright and clean with a high ceiling! Service is great, our server was very attentive!"
0,service,0.658517,226,mn-nhx60sbtnKjRrGjVKQw_0,MG_wIwRBwyNnCAEMXe9Jqw,French Toast,mn-nhx60sbtnKjRrGjVKQw,Hostess was really rude. I get you're doing your job but you're working in an area that caters to people coming from out of town. If this is how you act towards everyone then it's on you. You have an app system that tells people that tables are ready and we come and you give attitude
0,ambiance,0.271022,129,kE414G1lzKDhUncUHcyaxA_0,MG_wIwRBwyNnCAEMXe9Jqw,French Toast,kE414G1lzKDhUncUHcyaxA,These crepes are the best I've ever had! We loved the atmosphere and how it was located next tot he French Market. I definitely recommend this place and we will be coming back.


**Loop Retrieval Functionality Through All Topics (All Restaurants)**

In [12]:
# Every per-restaurant FAISS index file in the index directory, in sorted (deterministic) order.
faiss_files = list(config.INDEX_DIR.glob("*.faiss"))
faiss_files.sort()

In [13]:
# Pair every FAISS index with its metadata parquet (same stem plus "_meta").
# Fail fast if a metadata file is missing, since search hits could not be
# mapped back to review text without it.
index_pairs = []
for faiss_path in faiss_files:
    restaurant_id = faiss_path.stem
    meta_path = config.INDEX_DIR / f"{restaurant_id}_meta.parquet"

    if not meta_path.exists():
        # A missing file is a FileNotFoundError, not a KeyError; include the
        # expected path so the gap is easy to locate.
        raise FileNotFoundError(
            f"Missing metadata for restaurant id {restaurant_id}: expected {meta_path}"
        )

    index_pairs.append({
        "restaurant_id":restaurant_id,
        "index_file":faiss_path,
        "metadata_file":meta_path
    })

In [17]:
# Embed each topic's keyword string once, so the same query vectors can be
# reused for every restaurant index.
query_embed = {
    topic_name: model.encode([topic_keywords],convert_to_numpy=True,normalize_embeddings=True)
    for topic_name,topic_keywords in config.TOPICS.items()
}

In [48]:
# Run every topic query against every restaurant index. Frames are collected in
# a list and concatenated once at the end: calling pd.concat inside the nested
# loop re-copies all accumulated rows on each iteration (quadratic in the
# number of restaurant/topic pairs).
topic_frames = []

for file in index_pairs:
    # faiss.read_index is given a plain string path rather than a Path object.
    index = faiss.read_index(str(file["index_file"]))
    meta = pd.read_parquet(file["metadata_file"])
    for topic in config.TOPICS.keys():
        D,I = index.search(query_embed[topic],k = config.TOP_K_PER_TOPIC)
        df = convert_similarity_arrays_to_df(D,I,meta,config.METADATA_COLS)
        df.insert(0,"topic",topic)
        # Cap the rows kept per topic after de-duplication.
        df = df.iloc[:config.MAX_CHUNKS_PER_TOPIC]
        topic_frames.append(df)

result_df = (
    pd.concat(topic_frames,axis = 0,ignore_index=True)
    if topic_frames
    else pd.DataFrame()
)


In [58]:
# Show one reproducible example row per topic; each topic has its own fixed sampling seed.
sample_seeds = {"food": 1, "service": 2, "ambiance": 4}
pd.concat(
    [
        result_df.loc[result_df["topic"] == topic_name].sample(n = 1,random_state = seed)
        for topic_name,seed in sample_seeds.items()
    ],
    axis = 0,
    ignore_index = True,
)

Unnamed: 0,topic,scores,indicies,chunk_id,business_id,restaurant_name,review_id,chunk
0,food,0.430436,80,8sv2E19V1ivElTQopQh8Fg_0,rOsC1rDpIxvNoHxEvNFjeA,Backspace Bar & Kitchen,8sv2E19V1ivElTQopQh8Fg,"We searched yelp and found this jewel. Was kinda over the creole thing and needed a good burger. Best place ever. I actually opted for the Korean BBQ skewers and grilled veggies. Best damn decision I made in three days. Even though they were small they were delicious. Hubby had regular burger with chips. He loved it. The chips were so good, yep ate a couple of his. I must mention the waitress, Chelsea is super nice and very knowledgeable on the menu. My advice is go with her suggestion."
1,service,0.324812,37,7QefDR34zVqwuwybTTvuOg_0,ROcx9FXq206zfI77MGo1Yw,Nook A Paleo Influenced Diner,7QefDR34zVqwuwybTTvuOg,"As soon as we walked into the restaurant, the hostess was very welcoming. Our server provided us with a shot of olive oil before our meal. Something unique that they started at the restaurant. Then we went ahead and ordered the brussels sprouts and the bacon guacamole with paleo chips. They were amazing! For dinner we ordered the nook paleo burger.... it was Maeve especially paired with a glass of Pinot noir. The Pinot noir, meomi, was the perfect match! The service and food is amazing! A lot of healthy options! Our server is Jacinda... ask for her! Her recommendations are amazing and unique! The-front of the house manager was welcoming and generous! You must eat here!"
2,ambiance,0.265995,3,ejZ3oHmlmgtjd1CYyd95fQ_0,MG_wIwRBwyNnCAEMXe9Jqw,French Toast,ejZ3oHmlmgtjd1CYyd95fQ,Made reservations through OpenTable. Arrived and it was busy weekend so we were seated at bar (party of two). Mimosas for drinks and cool part is we could see through the receiving window and watch our food cooked. Energy was upbeat and food was delicious
