In [None]:
## Activate virtual env
# $ !python3 -m virtualenv .venv
# $ !source .venv/bin/activate

## Install the packages this notebook imports
# $ !pip install openai tiktoken python-dotenv pandas

## Export Gilas.io API key (or put GILAS_API_KEY=... in a .env file)
# $ export GILAS_API_KEY='...'

In [None]:
import os
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file (if present) into os.environ.
load_dotenv()

# Fail fast when the key is missing or empty. With api_key=None the OpenAI
# client falls back to the OPENAI_API_KEY environment variable, which would
# then be sent to the Gilas endpoint instead of the intended key.
gilas_api_key = os.environ.get("GILAS_API_KEY")
if not gilas_api_key:
    raise RuntimeError(
        "GILAS_API_KEY is not set; export it or add it to a .env file."
    )

# OpenAI-compatible client pointed at the Gilas.io API.
client = OpenAI(
    api_key=gilas_api_key,
    base_url="https://api.gilas.io/v1/",
)

In [None]:
def get_embedding(text, dimentions=1531, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are replaced by spaces, then one request is sent
    through the module-level ``client``.

    Args:
        text: The string to embed.
        dimentions: Forwarded to the API as its ``dimensions`` argument
            (spelling kept as-is because callers pass it by keyword).
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # NOTE(review): the 1531 default looks like a typo for 1536 - confirm
    # before changing, since it determines the size of returned vectors.
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(
        input=[single_line],
        dimensions=dimentions,
        model=model,
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# We use a dataset of Amazon reviews
# from Kaggle (https://www.kaggle.com/)

import pandas as pd

# Load the reviews, keep only the columns used below, and drop incomplete rows.
# To save space, we provide a pre-filtered dataset.
input_datapath = "../data/reviews.csv"
df = (
    pd.read_csv(input_datapath, index_col=0, sep=";")
    .loc[:, ["ProductId", "Score", "Text"]]
    .dropna()
)

# Quick look at the first two rows.
df.head(2)

In [None]:
embedding_encoding = "cl100k_base"
max_tokens = 8000  # kept below the 8191 maximum for text-embedding-3-small
encoding = tiktoken.get_encoding(embedding_encoding)


def _count_tokens(review_text):
    """Return how many tokens `encoding` produces for `review_text`."""
    return len(encoding.encode(review_text))


# Count tokens per review, omit reviews that are too long to embed,
# and keep only the last 30 of the remaining rows.
df["n_tokens"] = df["Text"].apply(_count_tokens)
fits_limit = df["n_tokens"] <= max_tokens
df = df.loc[fits_limit].tail(30)
len(df)

In [None]:
# Compute a 100-dimensional embedding for every review text.
# get_embedding is called once per row, so this may take a few minutes.

df["embedding"] = df["Text"].apply(get_embedding, dimentions=100)

In [None]:
# Write the reviews and their embeddings to disk, using ';' as the separator
# (the same separator the input file is read with).
output_datapath = "../data/reviews_with_embeddings_30.csv"
df.to_csv(output_datapath, sep=";")