In [88]:
import os
import openai
import requests
import json
import sseclient
import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

import dotenv

In [56]:
# Load environment variables from the local .env file (override=True is passed
# to python-dotenv), then hand the API key to the OpenAI client.
env_file = '.env'
dotenv.load_dotenv(dotenv_path=env_file, override=True)
# The key is None when OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [57]:
# Read the pre-computed documentation chunks from their JSON file.
docs_chunks_path = "create_chunks/docs_chunks.json"
with Path(docs_chunks_path).open(mode="r", encoding="utf8") as f:
    docs_chunks = json.loads(f.read())

# Display how many chunks were loaded together with the first two as a sample.
len(docs_chunks), docs_chunks[:2]

(338,
 [{'header': ['Alert'],
   'link': '/docs/alert',
   'content': "A small box to quickly grab the user's attention and communicate a brief message. An alert is an element that notifies the user of something important that is not user initiated. If you need to alert the user because of something they did, consider using a toast notification or an inline message. Instead, an alert is used to notify the user of something unrelated to what they just did. Examples of this might be announcing a new feature within the product, or alerting the user of scheduled server maintenance and downtime."},
  {'header': ['Alert', 'Appearance'],
   'link': '/docs/alert#appearance',
   'content': 'The alert message must be concise, we recommend no more than 256 characters. If you feel the need to explain in further detail, you may use an optional hyperlink at the end of the alert message to explain more on a new page. You may also include an optional close icon (close icon is required for sticky alert

In [58]:
def get_embeddings(input_content, model="text-embedding-ada-002"):
    """Request an embedding for ``input_content`` via ``openai.Embedding.create``.

    Args:
        input_content: Object to embed. It is rendered to text with an
            f-string before being sent, so non-string values (e.g. a dict)
            are submitted as their formatted string form.
        model: Name of the embedding model passed to the API.

    Returns:
        The response object returned by ``openai.Embedding.create``, unmodified.
    """
    text_to_embed = f"{input_content}"
    return openai.Embedding.create(model=model, input=text_to_embed)

In [63]:
# Inspect the column layout of an embeddings table saved two directories up.
pd.read_parquet(path="../../data/embeddings_df.parquet", engine="fastparquet").columns

Index(['chunk', 'object', 'embedding_object', 'index', 'embedding',
       'prompt_tokens', 'total_tokens'],
      dtype='object')

In [70]:
# Resume from a previously saved embeddings table when one exists on disk.
try:
    embeddings_df = pd.read_parquet("data/embeddings_df.parquet", engine="fastparquet")
except FileNotFoundError:
    # No saved table yet: make sure the output directory exists, then start
    # from an empty table with the expected columns and persist it right away.
    os.makedirs("data", exist_ok=True)

    embeddings_df = pd.DataFrame(
        columns=[
            "chunk",
            "object",
            "embedding_object",
            "index",
            "embedding",
            "prompt_tokens",
            "total_tokens",
        ]
    )
    embeddings_df.to_parquet("data/embeddings_df.parquet", engine="fastparquet")

embeddings_df.head()

Unnamed: 0,chunk,object,embedding_object,index,embedding,prompt_tokens,total_tokens


In [71]:
def check_if_chunk_in_embeddings_df(chunk):
    """Return True when ``chunk`` equals at least one value in ``embeddings_df["chunk"]``.

    Reads the module-level ``embeddings_df`` at call time, so rows added after
    this function was defined are taken into account.
    """
    matches = embeddings_df["chunk"] == chunk
    return bool(matches.any())

In [73]:
# Spot-check the lookup helper against the first two documentation chunks.
for sample_position in (0, 1):
    print(check_if_chunk_in_embeddings_df(docs_chunks[sample_position]))
# print(check_if_chunk_in_embeddings_df(embeddings_df.loc[0, "chunk"]))
# print(check_if_chunk_in_embeddings_df(embeddings_df.loc[1, "chunk"]))

False
False


In [89]:
# Keep only the chunks that do not yet have a row in `embeddings_df`.
remaining_chunks = list(
    filter(
        lambda candidate: not check_if_chunk_in_embeddings_df(candidate),
        docs_chunks,
    )
)

# # Safety check: Confirm code works before running on all chunks
# remaining_chunks = remaining_chunks[:1]

# Progress summary shown as the cell output.
"{}/{} chunks remaining to be embedded".format(len(remaining_chunks), len(docs_chunks))

'197/338 chunks remaining to be embedded'

In [90]:
# Keeps the most recent raw API response around for inspection after the loop.
response_cache = None

for chunk in tqdm(remaining_chunks):
    # Re-check against the live table: it grows while this loop runs, so a
    # chunk that already has a row is not sent to the API again.
    if check_if_chunk_in_embeddings_df(chunk):
        continue

    response = get_embeddings(chunk)
    response_cache = response

    # Only the first entry of the response's "data" list is recorded.
    first_item = response["data"][0]
    usage = response["usage"]

    # Append one row, using the current row count as the new label.
    next_label = len(embeddings_df)
    embeddings_df.loc[next_label] = {
        "chunk": chunk,
        "object": response["object"],
        "embedding_object": first_item["object"],
        "index": first_item["index"],
        "embedding": first_item["embedding"],
        "prompt_tokens": usage["prompt_tokens"],
        "total_tokens": usage["total_tokens"],
    }

    # # Safety check: Confirm code works before running on all chunks
    # break

100%|██████████| 197/197 [00:55<00:00,  3.53it/s]


In [83]:
# Display the last raw response stored by the embedding loop (None if no request was made).
response_cache

<OpenAIObject list at 0x216cb041b30> JSON: {
  "data": [
    {
      "embedding": [
        -0.002292848890647292,
        0.04747709631919861,
        0.008545598946511745,
        -0.002765672979876399,
        0.013482438400387764,
        0.02117973566055298,
        -0.006337928120046854,
        -0.00494379224255681,
        -0.019010307267308235,
        -0.007697296794503927,
        -0.009352181106805801,
        0.02099894918501377,
        -0.03142889216542244,
        0.004627417307347059,
        -0.009713752195239067,
        0.02280680648982525,
        0.025838442146778107,
        -0.018189817667007446,
        1.6921494534472004e-05,
        -0.018593110144138336,
        -0.0007140164379961789,
        0.00734963221475482,
        -0.017119010910391808,
        0.006403984036296606,
        -0.01405260805040598,
        0.010916672646999359,
        0.014643638394773006,
        -0.026728464290499687,
        -0.013044380582869053,
        -0.021443959325551987,
    

In [100]:
# Sanity check: length of the embedding stored in the rows labelled 0 through 10.
embeddings_df["embedding"].loc[0:10].map(len)

0     1536
1     1536
2     1536
3     1536
4     1536
5     1536
6     1536
7     1536
8     1536
9     1536
10    1536
Name: embedding, dtype: int64

In [101]:
# Persist the embeddings table. The engine is pinned to fastparquet so the file
# is written by the same engine that the notebook's read_parquet calls use;
# leaving it at the default lets pandas pick another engine when one is installed.
embeddings_df.to_parquet("data/embeddings_df.parquet", engine="fastparquet")