In [1]:
# I will start with a collection of documents and a clasification criteria pre labeled
# reference https://github.com/openai/openai-cookbook/blob/main/examples/Classification_using_embeddings.ipynb

doc_list = [
    # Food-related
    {"doc": "I love trying out new recipes; it's a way to travel without leaving my kitchen.", "intention": "food"},
    {"doc": "The aroma of freshly baked bread fills the air, making the whole house feel cozy.", "intention": "food"},
    {"doc": "I went to a sushi bar last night, and the experience was simply amazing.", "intention": "food"},
    {"doc": "Farm-to-table restaurants provide not just a meal, but a story about the food's origins.", "intention": "food"},
    {"doc": "Nothing beats the joy of cooking pasta to perfection, al dente as the Italians say.", "intention": "food"},
    
    # Travel-related
    {"doc": "The cobblestone streets of Rome give you a sense of history at every turn.", "intention": "travel"},
    {"doc": "Hiking in the Swiss Alps, you can't help but feel amazed at the grandeur of nature.", "intention": "travel"},
    {"doc": "A tropical getaway is the best way to escape the winter blues.", "intention": "travel"},
    {"doc": "Exploring the local markets is my favorite way to immerse myself in a new culture.", "intention": "travel"},
    {"doc": "The Northern Lights are a natural spectacle that everyone should try to see at least once.", "intention": "travel"},
    
    # Mathematics-related
    {"doc": "Calculus is the mathematical study of change, just like geometry is the study of shape.", "intention": "mathematics"},
    {"doc": "The Fibonacci sequence appears in many aspects of nature, from flower petals to pinecones.", "intention": "mathematics"},
    {"doc": "Statistics help us make sense of real-world data and make informed decisions.", "intention": "mathematics"},
    {"doc": "Algebra provides the foundation for solving equations and understanding mathematical structures.", "intention": "mathematics"},
    {"doc": "Mathematics is a universal language that transcends cultural and geographic boundaries.", "intention": "mathematics"},
    
    # Gardening-related
    {"doc": "Pruning roses not only helps their appearance but also encourages healthier growth.", "intention": "gardening"},
    {"doc": "A well-tended vegetable garden can provide fresh produce all summer long.", "intention": "gardening"},
    {"doc": "Composting is an eco-friendly way to recycle kitchen waste while enriching your garden soil.", "intention": "gardening"},
    {"doc": "Growing herbs indoors allows you to have fresh flavors at your fingertips year-round.", "intention": "gardening"},
    {"doc": "Mulching protects your garden beds, conserving moisture and improving soil quality.", "intention": "gardening"}
]


In [4]:
# let's put it in a df

import pandas as pd

df = pd.DataFrame(doc_list)
df.head()

Unnamed: 0,doc,intention
0,I love trying out new recipes; it's a way to t...,food
1,The aroma of freshly baked bread fills the air...,food
2,"I went to a sushi bar last night, and the expe...",food
3,Farm-to-table restaurants provide not just a m...,food
4,Nothing beats the joy of cooking pasta to perf...,food


In [5]:
# let's calculate the embeddings for each one of this docs, and add it to the df
from openai.embeddings_utils import get_embedding

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

df["embedding"] = df.doc.apply(lambda x: get_embedding(x, engine=embedding_model))
df.head()

Unnamed: 0,doc,intention,embedding
0,I love trying out new recipes; it's a way to t...,food,"[0.009610418230295181, -0.009120222181081772, ..."
1,The aroma of freshly baked bread fills the air...,food,"[0.026805980131030083, -0.008292862214148045, ..."
2,"I went to a sushi bar last night, and the expe...",food,"[0.0073952148668468, -0.010177254676818848, 0...."
3,Farm-to-table restaurants provide not just a m...,food,"[0.004827000200748444, -0.03561137244105339, 0..."
4,Nothing beats the joy of cooking pasta to perf...,food,"[0.011868170462548733, -0.009151360020041466, ..."


In [23]:
# let's get a sample doc to classify
query = "I love my grandma's pizza, it is so good..."

# get the embedding for the new doc

q_emb = get_embedding(query, engine=embedding_model)
q_emb

[0.017950478941202164,
 -0.010772758163511753,
 0.007863372564315796,
 -0.03587625175714493,
 -0.017876354977488518,
 -0.009037010371685028,
 0.0011558786500245333,
 -0.01976653002202511,
 -0.004777940921485424,
 -0.02353452332317829,
 0.01341653149574995,
 0.026388315483927727,
 -0.01802460290491581,
 0.0027611369732767344,
 -0.006899754051119089,
 0.007931319996714592,
 0.03187352791428566,
 0.017604565247893333,
 0.002653038827702403,
 -0.024325184524059296,
 0.00571993924677372,
 -0.01229848712682724,
 0.013614197261631489,
 -0.029896875843405724,
 -0.00787572655826807,
 0.01803695783019066,
 -0.00839459802955389,
 -0.008864052593708038,
 0.02491817995905876,
 0.00787572655826807,
 0.011112495325505733,
 0.005942313000559807,
 -0.019148824736475945,
 -0.00943234097212553,
 -0.012724703177809715,
 -0.011427524499595165,
 -0.01733277551829815,
 0.0018098728032782674,
 -0.007968381978571415,
 0.012959430925548077,
 -0.001453148783184588,
 -0.017641626298427582,
 0.0009528084774501622,

In [24]:
# now we will calculate similarity with each one of the statements in the db

from openai.embeddings_utils import cosine_similarity

df["distance"] = df.embedding.apply(lambda x: cosine_similarity(x, q_emb))
df.head()

Unnamed: 0,doc,intention,embedding,distance
0,I love trying out new recipes; it's a way to t...,food,"[0.009610418230295181, -0.009120222181081772, ...",0.809775
1,The aroma of freshly baked bread fills the air...,food,"[0.026805980131030083, -0.008292862214148045, ...",0.809049
2,"I went to a sushi bar last night, and the expe...",food,"[0.0073952148668468, -0.010177254676818848, 0....",0.787737
3,Farm-to-table restaurants provide not just a m...,food,"[0.004827000200748444, -0.03561137244105339, 0...",0.765304
4,Nothing beats the joy of cooking pasta to perf...,food,"[0.011868170462548733, -0.009151360020041466, ...",0.817684


In [25]:
# now we need to get creative on how to identify a winner intention... 
# a very basic way would be this:

# Calculate average distance for each intention
average_distance_by_intention = df.groupby('intention')['distance'].mean().reset_index()

# Sort by distance, largest first
average_distance_by_intention_sorted = average_distance_by_intention.sort_values(by='distance', ascending=False).reset_index(drop=True)

print(average_distance_by_intention_sorted)
    

     intention  distance
0         food  0.797910
1       travel  0.752592
2    gardening  0.740693
3  mathematics  0.713223


In [28]:
# let't set up a  loop to have some fun

while True:
    query = input("enter your text:")
    print ("")
    q_emb = get_embedding(query, engine=embedding_model)
    df["distance"] = df.embedding.apply(lambda x: cosine_similarity(x, q_emb))
    average_distance_by_intention = df.groupby('intention')['distance'].mean().reset_index()
    average_distance_by_intention_sorted = average_distance_by_intention.sort_values(by='distance', ascending=False).reset_index(drop=True)
    print(average_distance_by_intention_sorted,"\n")
    
    

enter your text:I need to repair my car

     intention  distance
0         food  0.729083
1    gardening  0.725857
2       travel  0.723247
3  mathematics  0.706884 

enter your text:I love cigaretes

     intention  distance
0         food  0.769958
1       travel  0.765711
2    gardening  0.736094
3  mathematics  0.725244 

enter your text:tomorrow is monday

     intention  distance
0  mathematics  0.715681
1       travel  0.712783
2         food  0.711942
3    gardening  0.696588 



KeyboardInterrupt: Interrupted by user