In [1]:
import os
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import requests
import json
import sseclient
import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

import dotenv

In [2]:
env_file = '.env'
dotenv.load_dotenv(env_file, override=True)
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
embeddings_df = pd.read_parquet("data/embeddings_df.parquet", engine='fastparquet')
embeddings_df.head()

Unnamed: 0_level_0,chunk,object,embedding_object,index,embedding,prompt_tokens,total_tokens
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"{'header': ['Alert'], 'link': '/docs/alert', '...",list,embedding,0,"[-0.022364800795912743, 0.031609658151865005, ...",119,119
1,"{'header': ['Alert', 'Appearance'], 'link': '/...",list,embedding,0,"[-0.002292848890647292, 0.04747709631919861, 0...",88,88
2,"{'header': ['Alert', 'Appearance', 'Informatio...",list,embedding,0,"[-0.0034569110721349716, 0.047789670526981354,...",60,60
3,"{'header': ['Alert', 'Appearance', 'Positive']...",list,embedding,0,"[0.004593235906213522, 0.036937415599823, 0.00...",40,40
4,"{'header': ['Alert', 'Appearance', 'Warning'],...",list,embedding,0,"[-0.002690419787541032, 0.03875597566366196, 0...",53,53


In [4]:
try:
    # Read `embeddings_cache` from parquet file
    embeddings_cache = pd.read_parquet("_data/embeddings_cache.parquet", engine='fastparquet')
except:
    # If file does not exist, create an empty dataframe
    embeddings_cache = pd.DataFrame(columns=['embedding'])
    # Write to parquet file
    embeddings_cache.to_parquet("_data/embeddings_cache.parquet", engine='fastparquet')

embeddings_cache
    

Unnamed: 0,embedding


In [17]:
def get_embeddings_same_as_df(input_content, model="text-embedding-ada-002"):
    response = openai.Embedding.create(
        model=model,
        input=f"{input_content}"
    )
    return response

In [46]:
def get_cached_embedding(query, model='text-embedding-ada-002'):
    if query in embeddings_cache.index:
        return embeddings_cache.loc[query, "embedding"]
    else:
        print("CALLING OPENAI API")
        embeddings_cache.loc[query, "embedding"] = get_embeddings_same_as_df(query)["data"][0]["embedding"]
        # print("SAVING TO PARQUET", embeddings_cache.loc[query, "embedding"])
        embeddings_cache.to_parquet("_data/embeddings_cache.parquet", engine='fastparquet')
        return embeddings_cache.loc[query, "embedding"]

In [47]:
def get_similarities(df, query, pprint=True):
  res = df.copy()

  embedding = get_cached_embedding(query)
  # print(embedding)

  # print(type(embedding), type(res["embedding"]), len(embedding), len(res["embedding"][0]))
  res['similarities'] = res["embedding"].apply(lambda x: cosine_similarity(x, embedding))
  res = res.sort_values('similarities', ascending=False)
  return res

res = get_similarities(embeddings_df, 'What are the different variants of Button?')
res

Unnamed: 0_level_0,chunk,object,embedding_object,index,embedding,prompt_tokens,total_tokens,similarities
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
55,"{'header': ['Button', 'Variants', 'Size'], 'li...",list,embedding,0,"[0.007360226474702358, 0.028180686756968498, -...",70,70,0.833012
50,"{'header': ['Button', 'Variants'], 'link': '/d...",list,embedding,0,"[-0.013625074177980423, 0.012804698199033737, ...",24,24,0.830656
52,"{'header': ['Button', 'Variants', 'High visibi...",list,embedding,0,"[-0.012566307559609413, 0.015580866485834122, ...",86,86,0.819525
54,"{'header': ['Button', 'Variants', 'Borderless'...",list,embedding,0,"[-0.03419613465666771, 0.02507716417312622, 0....",80,80,0.817050
66,"{'header': ['Button', 'Usage', 'Style types'],...",list,embedding,0,"[-0.01531047560274601, 0.02948579005897045, 0....",120,120,0.815964
...,...,...,...,...,...,...,...,...
184,"{'header': ['Getting started'], 'link': '/docs...",list,embedding,0,"[-0.003294056747108698, 0.023934420198202133, ...",19,19,0.694815
220,"{'header': ['Non ideal state', 'Usage', 'Error...",list,embedding,0,"[-0.005601025652140379, 0.005997162312269211, ...",83,83,0.694734
218,"{'header': ['Non ideal state', 'Usage', 'Error...",list,embedding,0,"[-0.0007478818297386169, 0.010538834147155285,...",79,79,0.693467
219,"{'header': ['Non ideal state', 'Usage', 'Error...",list,embedding,0,"[-0.010261344723403454, 0.016437670215964317, ...",82,82,0.691517


In [50]:
top_chunks = res.head(10)
top_chunks

Unnamed: 0_level_0,chunk,object,embedding_object,index,embedding,prompt_tokens,total_tokens,similarities
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
55,"{'header': ['Button', 'Variants', 'Size'], 'li...",list,embedding,0,"[0.007360226474702358, 0.028180686756968498, -...",70,70,0.833012
50,"{'header': ['Button', 'Variants'], 'link': '/d...",list,embedding,0,"[-0.013625074177980423, 0.012804698199033737, ...",24,24,0.830656
52,"{'header': ['Button', 'Variants', 'High visibi...",list,embedding,0,"[-0.012566307559609413, 0.015580866485834122, ...",86,86,0.819525
54,"{'header': ['Button', 'Variants', 'Borderless'...",list,embedding,0,"[-0.03419613465666771, 0.02507716417312622, 0....",80,80,0.81705
66,"{'header': ['Button', 'Usage', 'Style types'],...",list,embedding,0,"[-0.01531047560274601, 0.02948579005897045, 0....",120,120,0.815964
51,"{'header': ['Button', 'Variants', 'Default'], ...",list,embedding,0,"[-0.0070247650146484375, 0.027673523873090744,...",52,52,0.814043
53,"{'header': ['Button', 'Variants', 'Call-to-act...",list,embedding,0,"[-0.028330324217677116, 0.017614779993891716, ...",109,109,0.810828
49,"{'header': ['Button'], 'link': '/docs/button',...",list,embedding,0,"[-0.026493458077311516, 0.013911099173128605, ...",84,84,0.808077
247,"{'header': ['Side navigation', 'Variants', 'Ba...",list,embedding,0,"[-0.0031764411833137274, 0.02409714087843895, ...",130,130,0.803335
264,"{'header': ['Stepper', 'Variants'], 'link': '/...",list,embedding,0,"[0.010442211292684078, 0.02130099944770336, -0...",47,47,0.801529


In [56]:
top_chunks = res.loc[:10, ["chunk", "similarities"]]
top_chunks

Unnamed: 0_level_0,chunk,similarities
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1
55,"{'header': ['Button', 'Variants', 'Size'], 'li...",0.833012
50,"{'header': ['Button', 'Variants'], 'link': '/d...",0.830656
52,"{'header': ['Button', 'Variants', 'High visibi...",0.819525
54,"{'header': ['Button', 'Variants', 'Borderless'...",0.817050
66,"{'header': ['Button', 'Usage', 'Style types'],...",0.815964
...,...,...
290,"{'header': ['Tag', 'Tag container', 'Scrollabl...",0.713201
82,"{'header': ['Carousel', 'Subcomponents', 'Slid...",0.713143
221,"{'header': ['Non ideal state', 'Usage', 'Timed...",0.712930
277,"{'header': ['Surface', 'Props'], 'link': '/doc...",0.712642


In [62]:
# Add similarity score to the top chunks's "chunk" json
# top_chunks["chunk"] = top_chunks["chunk"].apply(lambda x: json.loads(x))
top_chunks_1 = top_chunks.copy()
top_chunks_1["chunk"] = top_chunks["chunk"].apply(lambda x: {**x, "similarity": top_chunks.loc[top_chunks["chunk"] == x, "similarities"].values[0]})
top_chunks_1.loc[55, :].chunk

{'header': ['Button', 'Variants', 'Size'],
 'link': '/docs/button#size',
 'content': 'There are 3 different sizes available, which can be applied to any button. The medium size is a default and should always be the first choice, unless there are good reasons to use the small or large version.',
 'similarity': 0.833011876068822}

In [66]:
# Return an json array of top chunks
json.loads(top_chunks_1["chunk"].to_json(orient="records"))

[{'header': ['Button', 'Variants', 'Size'],
  'link': '/docs/button#size',
  'content': 'There are 3 different sizes available, which can be applied to any button. The medium size is a default and should always be the first choice, unless there are good reasons to use the small or large version.',
  'similarity': 0.8330118761},
 {'header': ['Button', 'Variants'],
  'link': '/docs/button#variants',
  'content': '',
  'similarity': 0.8306555135},
 {'header': ['Button', 'Variants', 'High visibility'],
  'link': '/docs/button#high-visibility',
  'content': 'This is the button to emphasize an action applicable to an area of the page or a [dialog](dialog). High visibility buttons are used to draw the user’s attention to the main action of the page. There should only be one high visibility button per area of the user interface.',
  'similarity': 0.819525496},
 {'header': ['Button', 'Variants', 'Borderless'],
  'link': '/docs/button#borderless',
  'content': 'The borderless button is useful in

In [67]:
# Return an json array of top chunks
def get_top_chunks(query):
    res = get_similarities(embeddings_df, query)
    top_chunks = res.loc[:10, ["chunk", "similarities"]]
    top_chunks["chunk"] = top_chunks["chunk"].apply(lambda x: {**x, "similarity": top_chunks.loc[top_chunks["chunk"] == x, "similarities"].values[0]})
    return json.loads(top_chunks["chunk"].to_json(orient="records"))

get_top_chunks("What are the different variants of Button?")
    

[{'header': ['Button', 'Variants', 'Size'],
  'link': '/docs/button#size',
  'content': 'There are 3 different sizes available, which can be applied to any button. The medium size is a default and should always be the first choice, unless there are good reasons to use the small or large version.',
  'similarity': 0.8330118761},
 {'header': ['Button', 'Variants'],
  'link': '/docs/button#variants',
  'content': '',
  'similarity': 0.8306555135},
 {'header': ['Button', 'Variants', 'High visibility'],
  'link': '/docs/button#high-visibility',
  'content': 'This is the button to emphasize an action applicable to an area of the page or a [dialog](dialog). High visibility buttons are used to draw the user’s attention to the main action of the page. There should only be one high visibility button per area of the user interface.',
  'similarity': 0.819525496},
 {'header': ['Button', 'Variants', 'Borderless'],
  'link': '/docs/button#borderless',
  'content': 'The borderless button is useful in