In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [16]:
import pandas as pd
import numpy as np
from ast import literal_eval

datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"

df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(literal_eval)


In [17]:
df["embedding"]

0      [0.03599238395690918, -0.02116263099014759, -0...
1      [-0.07042013108730316, -0.03175969794392586, -...
2      [0.05692615360021591, -0.005402443464845419, 0...
3      [-0.011223138310015202, -0.049720242619514465,...
4      [0.05692615360021591, -0.005402443464845419, 0...
                             ...                        
995    [-0.04803164303302765, 0.04621649533510208, 0....
996    [0.02654704451560974, -0.027484629303216934, -...
997    [-0.011052397079765797, -0.029021456837654114,...
998    [-0.0058358414098620415, 0.021213747560977936,...
999    [0.019206926226615906, -0.019285108894109726, ...
Name: embedding, Length: 1000, dtype: object

In [18]:
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

def get_embedding(text: str, model: str = "text-embedding-ada-002") -> list[float]:
    client = OpenAI()
    response = client.embeddings.create(input=text, model=model)

    return response.data[0].embedding

def get_cosine_similarity(x: list[float], y: list[float]):
    return cosine_similarity(
        np.array(x).reshape(1, -1),
        np.array(y).reshape(1, -1)
    )[0][0]

In [25]:
def search_reviews(df, product_description, n=3, pprint=True):
    product_embedding = get_embedding(
        product_description,
        model="text-embedding-ada-002"
    )
    df["similarity"] = df.embedding.apply(lambda x: get_cosine_similarity(x, product_embedding))
    results = df.sort_values("similarity", ascending=False)
    return results


results = search_reviews(df, "delicious beans", n=3)


In [26]:
results

Unnamed: 0.1,Unnamed: 0,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding,similarity
9,289,B0048GRNZM,AXG287OY16WWL,1,Cute,"For some reason I thought that you got three ""...",Title: Cute; Content: For some reason I though...,77,"[0.029848122969269753, -0.03594868630170822, -...",0.050866
717,499,B0024NUM7W,A1SFA2BYES0OH,3,"eh,,ok, but...","I love popcorn, and this product is good excep...","Title: eh,,ok, but...; Content: I love popcorn...",72,"[0.006537809036672115, 0.01702781766653061, -0...",0.049285
403,114,B006W6YHV4,A27DIIBWR2ASZY,5,Crack for dogs.,These thing are like crack for dogs. I am not ...,Title: Crack for dogs.; Content: These thing a...,32,"[0.01634955033659935, 0.013635904528200626, -0...",0.048756
426,147,B006W6YHHI,A27DIIBWR2ASZY,5,Crack for dogs.,These thing are like crack for dogs. I am not ...,Title: Crack for dogs.; Content: These thing a...,32,"[0.01634955033659935, 0.013635904528200626, -0...",0.048756
548,778,B000RZSPTG,A3GF0GES2U2V19,1,Made in China,I didn't realize this product is made in China...,Title: Made in China; Content: I didn't realiz...,44,"[0.048240773379802704, -0.005016401410102844, ...",0.047771
...,...,...,...,...,...,...,...,...,...,...
941,597,B001ELJJRY,A3UCQK5288MQ98,5,Great wine,I bought this wine kit because I wanted to fin...,Title: Great wine; Content: I bought this wine...,116,"[-0.0252769123762846, -0.013530069030821323, -...",-0.067334
540,753,B001E5E3S0,A27NSDZXTFJXVZ,5,Great product,McCann's Irish Quick cooking is the best tasti...,Title: Great product; Content: McCann's Irish ...,61,"[0.01610550843179226, -0.006157613359391689, -...",-0.068587
465,803,B003XKF6CQ,A3IYSIAKYOMKTO,5,Mellow,This honey made from blueberry blossoms has a ...,Title: Mellow; Content: This honey made from b...,50,"[0.03164437785744667, -0.013095518574118614, -...",-0.073465
26,299,B005C3IVN8,A3HECKQ9P79EY9,5,Andersons grade B Pure Maple Syrup,"Being a hard core ""all things from Vermont are...",Title: Andersons grade B Pure Maple Syrup; Con...,135,"[-0.023515749722719193, -0.012200198136270046,...",-0.074166
