In [1]:
import os, openai
from dotenv import load_dotenv, dotenv_values, find_dotenv

load_dotenv()
config = dotenv_values(".env")
openai.api_key = config["OPENAI_API_KEY"]

In [2]:
from langchain.embeddings import OpenAIEmbeddings

<font color='green'>
Initialize the OpenAIEmbeddings object
<font>

In [3]:
embeddings = OpenAIEmbeddings()

<font color='green'>
Let's read our input data and get its embedding representation, so that we use it up for our future tasks
<font>

In [8]:
import pandas as pd
df = pd.read_csv('Data.csv')
print(df)

         Words
0     Elephant
1         Lion
2        Tiger
3          Dog
4      Cricket
5      Footbal
6       Tennis
7   Basketball
8        Apple
9       Orange
10      Banana


<font color='green'>
    We can use "apply" to apply the get_embedding function to each row in the dataframe because our words are stored in a pandas dataframe. In order to save time and to save the calculated word embeddings in a new csv file called "word_embeddings.csv" rather than calling OpenAI once more to carry out these computations.
    <font>

In [9]:
df['embedding'] = df['Words'].apply(lambda x: embeddings.embed_query(x))
df.to_csv('word_embeddings.csv')

<font color='green'>
    Let's load the existing file, which contains the embeddings, so that we can save chargers by not hitting the API repeatedly
    <font>

In [10]:
new_df = pd.read_csv('word_embeddings.csv')
print(new_df)

    Unnamed: 0       Words                                          embedding
0            0    Elephant  [-0.017855134320424067, -0.008739002273680945,...
1            1        Lion  [-0.001514446088710819, -0.010011047775235734,...
2            2       Tiger  [-0.013353539911357312, -0.0096088277332399, -...
3            3         Dog  [-0.0009933243881749651, -0.015114395874422861...
4            4     Cricket  [0.003939178751371585, -0.0071971946945413035,...
5            5     Footbal  [-0.011457636465443599, -0.008117706169887448,...
6            6      Tennis  [-0.022966790023950585, 0.0016101307056064378,...
7            7  Basketball  [-0.012779986743709601, -0.013293189227440112,...
8            8       Apple  [0.014476476473958088, -0.00390673109200909, -...
9            9      Orange  [0.02067122263312988, -0.029222075283705507, 9...
10          10      Banana  [-0.012999765903696864, -0.01998321619391984, ...


<font color='green'>
Let's get the embeddings for our text
<font>

In [11]:
our_Text = "Mango"

In [12]:
text_embedding = embeddings.embed_query(our_Text)

In [13]:
print (f"Our embedding is {text_embedding}")

Our embedding is [-0.0033961603494176745, -0.019859389833244062, 0.010447266248561465, -0.016127768497880515, 0.0063833755696348915, 0.008479218716502984, -0.02520123433196036, -0.014223618074734925, 0.0011437680517106163, -0.03299674681191176, -0.0011805091920153584, 0.0036868945698438384, 0.0019632554118714615, 0.016741185958850564, -0.021674084862705133, 0.004725231670073206, 0.03266448072797766, -0.008146950769923783, 0.009335446867722615, -0.015769941335207135, -0.015872176336938743, 0.0028562251690571095, 0.0014672499013421258, -0.012396144368549316, 0.004511174166607023, 0.009578258489294746, 0.02444724304511744, -0.013622979290489412, 0.011603814257565128, 0.003110218249178531, 0.04789768038785218, -0.014939271468759131, -0.004351429997340966, -0.009916916239897492, -0.01754629567788184, 0.004680503041908396, -0.010351420119530852, -0.0063290627010960405, -0.006297113680978319, -0.018223611179087332, 0.01444086954889033, -0.008402541068220451, -0.015782720943254225, -0.01801913

<font color='green'>
    We can determine how similar a word is to other words in our dataframe after we have a vector representing that word.
    <br>
By computing the cosine similarity of the word vector for our search term to each word embedding in our dataframe.
    <font>

In [17]:
from openai.embeddings_utils import cosine_similarity

df["similarity score"] = df['embedding'].apply(lambda x: cosine_similarity(x, text_embedding))

df

ModuleNotFoundError: No module named 'sklearn'

<font color='green'>
    Sorting by similarity values in dataframe reveals Banana, Orange, and Apple are closest to searched term, such as Mango.
    <font>

In [14]:
df.sort_values("similarity score", ascending=False).head(10)

Unnamed: 0,Words,embedding,similarity score
6,Banana,"[-0.013021613471210003, -0.019990751519799232,...",0.898687
5,Orange,"[0.020670153200626373, -0.029327111318707466, ...",0.843855
4,Apple,"[0.01444941945374012, -0.0039136698469519615, ...",0.813925
3,Bike,"[0.0054834443144500256, -0.013623781502246857,...",0.797835
1,College,"[0.008637277409434319, -0.009738443419337273, ...",0.782224
0,School,"[0.00558300968259573, 0.009224693290889263, -0...",0.781657
2,Car,"[-0.004948626272380352, -0.012337295338511467,...",0.780015
