# Relevant Text Retrieval

In [1]:
import sys
current_dir = "/Users/josephtolsma/Documents/dev/yelp_rag"
sys.path.insert(0,current_dir)

In [2]:
import pandas as pd
import os
import faiss
from src.config import DATA_DIR_PROC,INDEX_DIR,TOPICS,EMBEDDING_MODEL_NAME
from sentence_transformers import SentenceTransformer

In [23]:
pd.set_option("display.max_colwidth",None)

**Pick a sample review id to use for the initial function build**

In [3]:
chunk_df = pd.read_parquet(os.path.join(DATA_DIR_PROC,"review_chunks.parquet"))
rid = chunk_df["business_id"].sample(random_state=42).values[0]

**Build retrieval functionality**

In [4]:
def load_embedding_model(model_name,device = "mps") -> SentenceTransformer:
    """
    Load a local embedding model using sentence_transformer library
    """
    return SentenceTransformer(model_name,device)

In [18]:
def convert_similarity_arrays_to_df(score_array,index_array,metadata,metadata_cols):
    """
    Converts raw similarity arrays from FAISS to a dataframe of relevant text chunks.
    Limits returned values to one chunk per business to avoid duplicated statements.
    @param score_array: similarity scores returned by searching query verbatim in the corpus of reviews
    @param index_array: indicies associated with each chunk returned by the FAISS query search
    @param metadata: metadata dataframe associated with each index, such as business id and (non-encoded) text chunk
    @param metadata_cols: columns in the metadata dataframe relevant to the final result
    @returns dataframe of most similar text chunks and identifiers for each chunk
    """

    chunk_id_col = metadata_cols["chunk_id_col"]
    business_id_col = metadata_cols["business_id_col"]
    chunk_col = metadata_cols["chunk_col"]

    df = pd.DataFrame({
    "scores":score_array[0],
    "indicies":index_array[0]
    }).assign(
        chunk_id = lambda df: metadata.iloc[df["indicies"]][chunk_id_col].values,
        business_id = lambda df: metadata.loc[df["indicies"]][business_id_col].values,
        chunk = lambda df: metadata.loc[df["indicies"]][chunk_col].values
    )

    df = df.sort_values(by = "scores",ascending=False)
    # note to future self: right now, the notebook is pulling in only one restaurant file so dropping duplicates will shrink this dataframe to one row always
    df = df.drop_duplicates(subset = business_id_col,keep = "first")
    return df


In [19]:
index = faiss.read_index(os.path.join(INDEX_DIR,f"{rid}.faiss"))
meta = pd.read_parquet(os.path.join(INDEX_DIR,f"{rid}_meta.parquet"))

In [20]:
model = load_embedding_model(EMBEDDING_MODEL_NAME,"mps")
query = model.encode(["I like chicken, I like liver, meow mix please deliver"],convert_to_numpy=True,normalize_embeddings=True)

In [24]:
D,I = index.search(query,k = 10)
metadata_cols = {
    "chunk_id_col":"chunk_id",
    "business_id_col":"business_id",
    "chunk_col":"chunk"
}
convert_similarity_arrays_to_df(D,I,meta,metadata_cols)

Unnamed: 0,scores,indicies,chunk_id,business_id,chunk
0,0.480943,116,Y9VMWfb3lmIh_s80uJ2P1A_0,MG_wIwRBwyNnCAEMXe9Jqw,Went here from all the reviews! Best chicken I've had. I like the corn meal batter. The butter and maple syrup was deliciously amazing. The waffle now that could've been cooked longer it was still gooey however the chicken takes the cake here!


**Loop Retrieval Functionality Through All Topics & Restaurants**

In [7]:
for topic,keywords in TOPICS.items():
    query = model.encode([keywords],convert_to_numpy=True,normalize_embeddings=True)
    D,I = index.search(query,k = 25)
    meta = pd.concat(
        [meta,
        pd.Series(D)],
        axis = 1
        )

ValueError: Data must be 1-dimensional, got ndarray of shape (1, 25) instead