# https://github.com/openai/openai-cookbook/blob/main/examples/utils/embeddings_utils.py


# Get embeddings from dataset

This notebook gives an example of how to get embeddings from a large dataset.

## 1. Load the dataset

The dataset used in this example is [fine-food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews) from Amazon. The dataset contains a total of 568,454 food reviews Amazon users left up to October 2012. We will use a subset of this dataset, consisting of the 1,000 most recent reviews, for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary) and review body (Text).

We will combine the review summary and review text into a single combined text. The model will encode this combined text and output a single vector embedding.


To run this notebook, you will need to install: pandas, openai, transformers, plotly, matplotlib, scikit-learn, torch (a dependency of transformers), torchvision, and scipy.


In [1]:
# import pandas as pd

# # reading only 1k rows of the data
# data = pd.read_csv('data/Reviews.csv', nrows=1000)

# # Saving it as fine_food_reviews_1k.csv
# data.to_csv('data/fine_food_reviews_1k.csv', index=False)

In [2]:
import pandas as pd
import tiktoken

from utils.embeddings_utils import get_embedding

In [3]:
# Embedding configuration used by the cells below.
embedding_model = "text-embedding-3-small"  # alternative: "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # name of the tiktoken encoding used to count tokens
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [4]:
# Read the review data, keep the columns of interest, and build the text to embed.
input_datapath = "data/fine_food_reviews_1k.csv"  # pre-filtered dataset, provided to save space
df = pd.read_csv(input_datapath, index_col=0)
# Rows with any missing value in the selected columns are discarded.
df = df.loc[:, ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]].dropna()
# Review title and body are merged into one string per row.
df["combined"] = (
    "Title: " + df["Summary"].str.strip() + "; Content: " + df["Text"].str.strip()
)
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [5]:
# Subsample to the `top_n` most recent reviews and remove samples that are too
# long to embed.
top_n = 50
# First cut to the `top_n * 2` most recent entries, assuming fewer than half of
# them will be filtered out by the token-length check below.
df = df.sort_values("Time").tail(top_n * 2)
# "Time" is not used after the ordering step; reassign the result of drop()
# rather than mutating the frame in place.
df = df.drop(columns="Time")

encoding = tiktoken.get_encoding(embedding_encoding)

# Omit reviews that are too long to embed: count tokens per combined text and
# keep the last `top_n` rows whose count does not exceed `max_tokens`.
df["n_tokens"] = df["combined"].apply(lambda text: len(encoding.encode(text)))
df = df[df["n_tokens"] <= max_tokens].tail(top_n)
len(df)

50

## 2. Get embeddings and save them for future reuse


In [6]:
# An API key must be available in your environment, as described in the README:
# https://github.com/openai/openai-python#usage

# One get_embedding call is made per row, so this may take a few minutes.
# Series.apply forwards the `model` keyword to get_embedding for every value.
df["embedding"] = df["combined"].apply(get_embedding, model=embedding_model)
# Persist the frame, including the new column, for later reuse.
df.to_csv("data/fine_food_reviews_with_embeddings_1k.csv")

In [7]:
# Embed a short sample string and echo the returned vector as the cell output.
a = get_embedding("hi", model=embedding_model)
a

[-0.003753675,
 -0.0191169,
 0.012146363,
 0.03280159,
 0.016224528,
 -0.03720825,
 -0.028202638,
 0.06499426,
 0.00096946565,
 -0.057462875,
 0.0014351697,
 -0.032641347,
 -0.030734465,
 0.0047832313,
 0.03291376,
 0.019229071,
 -0.041919373,
 -0.0029524635,
 0.024741404,
 0.046630498,
 0.037817173,
 0.03416365,
 -0.002475743,
 0.03621475,
 0.016360734,
 0.0018347738,
 0.0021532553,
 -0.008837361,
 0.032144595,
 -0.026343826,
 0.0013941076,
 -0.037785124,
 0.025799002,
 -0.03698391,
 -0.016537001,
 -0.012643114,
 -0.025638761,
 0.033009905,
 0.015527475,
 -0.037785124,
 0.024997791,
 -0.0049354616,
 0.046438206,
 0.015823923,
 -0.01983799,
 0.015679704,
 -0.026920699,
 0.016392782,
 0.003833796,
 0.028875655,
 -0.020735348,
 0.004667056,
 0.015904045,
 0.11159271,
 0.042496245,
 -0.008356634,
 0.0548349,
 0.031952307,
 -0.008500852,
 0.012402751,
 -0.019485459,
 0.010824365,
 0.011249007,
 0.016184468,
 0.0032288814,
 0.00066901144,
 -0.031183142,
 0.016713267,
 -0.018475933,
 -0.0190