In [1]:
import textwrap as tr
from typing import List, Optional

import matplotlib.pyplot as plt
import plotly.express as px
from scipy import spatial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score, precision_recall_curve

import requests
import json
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
from typing import List
import tiktoken


# Load environment variables from .env file
load_dotenv()


OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
RELAY_EMBEDDINGS_URL = os.getenv("RELAY_EMBEDDINGS_URL")

embedding_model = "text-embedding-3-small"  # "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [2]:
# def num_tokens_from_string(string: str, encoding_name: str) -> int:
#     """Returns the number of tokens in a text string."""
#     encoding = tiktoken.get_encoding(encoding_name)
#     num_tokens = len(encoding.encode(string))
#     return num_tokens

# num_tokens_from_string("tiktoken is great!", "cl100k_base")

In [2]:
# load & inspect dataset
input_datapath = (
    "data/fine_food_reviews_1k.csv"  # to save space, we provide a pre-filtered dataset
)
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [3]:
# def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:
#     # replace newlines, which can negatively affect performance.
#     text = text.replace("\n", " ")

#     response = client.embeddings.create(
#         input=[text],
#         model=model,
#         **kwargs,
#     )

#     return response.data[0].embedding

In [4]:
def get_embedding(text: str, model, **kwargs) -> List[float]:
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}",
    }

    data = {
        "input": text,
        "model": model,
        **kwargs,
    }

    response = requests.post(
        RELAY_EMBEDDINGS_URL,
        json=data,
        headers=headers,
    )
    response_data = response.json()

    return response_data["data"][0]["embedding"]

    # return response.data[0].embedding

In [5]:
#  Testing function
text = "I love the taste of this product."
embedding = get_embedding(text, model=embedding_model, encoding=embedding_encoding)
embedding

[0.021390485,
 -0.029843178,
 -0.0718602,
 0.004060003,
 -0.025234858,
 -0.049065042,
 0.022622656,
 0.05086401,
 0.0013915829,
 -0.029621387,
 0.015204988,
 -0.03260324,
 -0.03196251,
 -0.022314614,
 0.032332163,
 0.0015017082,
 -0.027847061,
 -0.008298671,
 -0.021957284,
 -0.042386677,
 0.020712791,
 0.028019564,
 -0.047216784,
 0.068755135,
 -0.02019528,
 -0.037605852,
 -0.033860054,
 -0.0084834965,
 0.00093953026,
 -0.049779702,
 -0.037901573,
 -0.026122022,
 -0.019209543,
 -0.063580014,
 0.029818535,
 -0.033638265,
 0.036891196,
 0.016930027,
 0.0126420725,
 0.012223135,
 -0.025777014,
 0.0119767,
 0.010405683,
 0.015882682,
 -0.012728324,
 -0.011446867,
 -0.048227165,
 0.03792622,
 0.024939137,
 -0.005215163,
 -0.022376223,
 0.016338585,
 0.004851673,
 0.00844037,
 0.04578747,
 -0.022524083,
 0.04058771,
 0.07048017,
 0.020454036,
 -0.005230565,
 0.025308788,
 0.003354585,
 0.0064935405,
 -0.017693972,
 -0.030779628,
 -0.055792697,
 0.0053044953,
 -0.044727802,
 -0.023411246,
 0.

In [6]:
#  Testing function
text = "I love the taste of this product."
embedding = get_embedding(
    text, model="text-embedding-ada-002", encoding=embedding_encoding
)
embedding

[0.009474204,
 -0.017164724,
 -0.010566387,
 -0.02560491,
 -0.0063430634,
 0.010385434,
 0.008931343,
 -0.032338962,
 -0.0079361005,
 -0.02035726,
 0.0038711107,
 0.0077228337,
 -0.042989362,
 -0.02198584,
 -0.011128635,
 0.011690884,
 0.03869818,
 -0.0011963929,
 0.005528773,
 -0.004666013,
 -0.005680645,
 0.00875039,
 -0.0035996805,
 -0.019982427,
 0.0071347347,
 -0.0014040046,
 0.011671496,
 -0.027375668,
 -0.0025866646,
 0.0264709,
 0.01220143,
 0.002580202,
 -0.011955851,
 -0.0005057002,
 -0.013313002,
 0.0014314709,
 -0.017914388,
 -0.017087173,
 0.014204844,
 0.0026302873,
 -0.018586501,
 0.003958356,
 0.030762082,
 -0.0076388195,
 -0.029934866,
 0.005793741,
 -0.0039228117,
 -0.0002978865,
 0.009002432,
 -0.0026092837,
 -0.0011358059,
 0.006155648,
 -0.019607596,
 -0.0045852303,
 0.010385434,
 -0.008155829,
 -0.00031121564,
 -0.011703809,
 0.034148496,
 -0.022128018,
 0.0056741824,
 0.02035726,
 -0.021158624,
 -0.0012497095,
 0.008944269,
 -0.011652107,
 -0.012686127,
 -0.01052

In [7]:
# subsample to 1k most recent reviews and remove samples that are too long
top_n = 2
df = df.sort_values("Time").tail(
    top_n * 2
)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

2

In [8]:
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage

# This may take a few minutes
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, model=embedding_model))
df.to_csv("data/fine_food_reviews_with_embeddings_1k.csv")

In [9]:
a = get_embedding("hi", model=embedding_model)
a

[-0.003753675,
 -0.0191169,
 0.012146363,
 0.03280159,
 0.016224528,
 -0.03720825,
 -0.028202638,
 0.06499426,
 0.00096946565,
 -0.057462875,
 0.0014351697,
 -0.032641347,
 -0.030734465,
 0.0047832313,
 0.03291376,
 0.019229071,
 -0.041919373,
 -0.0029524635,
 0.024741404,
 0.046630498,
 0.037817173,
 0.03416365,
 -0.002475743,
 0.03621475,
 0.016360734,
 0.0018347738,
 0.0021532553,
 -0.008837361,
 0.032144595,
 -0.026343826,
 0.0013941076,
 -0.037785124,
 0.025799002,
 -0.03698391,
 -0.016537001,
 -0.012643114,
 -0.025638761,
 0.033009905,
 0.015527475,
 -0.037785124,
 0.024997791,
 -0.0049354616,
 0.046438206,
 0.015823923,
 -0.01983799,
 0.015679704,
 -0.026920699,
 0.016392782,
 0.003833796,
 0.028875655,
 -0.020735348,
 0.004667056,
 0.015904045,
 0.11159271,
 0.042496245,
 -0.008356634,
 0.0548349,
 0.031952307,
 -0.008500852,
 0.012402751,
 -0.019485459,
 0.010824365,
 0.011249007,
 0.016184468,
 0.0032288814,
 0.00066901144,
 -0.031183142,
 0.016713267,
 -0.018475933,
 -0.0190