# Uso de modelos de embeddings de OpenAI

## Instalación y carga de librerías

In [None]:
!pip install openai
!pip install tiktoken

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0


In [None]:
import openai
from getpass import getpass
import os
import pandas as pd

In [None]:
# Use the OPENAI_API_KEY environment variable when it is set, so the notebook
# can be re-run unattended; otherwise fall back to an interactive prompt that
# does not echo the key into the notebook output.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass('Enter the secret value: ')
openai.api_key = OPENAI_API_KEY

Enter the secret value: ··········


## Cargar dataset

In [None]:
# Load the food dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv('generic-food.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables


## Evaluar cantidad de tokens a procesar

In [None]:
import tiktoken

def num_tokens_from_string(text, encodig_name):
    """Count how many tokens ``text`` occupies under a tiktoken encoding.

    Args:
        text: String to tokenize.
        encodig_name: Name of the encoding, passed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence produced by encoding ``text``.
    """
    tokenizer = tiktoken.get_encoding(encodig_name)
    return len(tokenizer.encode(text))


In [None]:
# Count cl100k_base tokens for every food name and store the count per row.
df['total_tokens'] = df['FOOD NAME'].map(
    lambda food_name: num_tokens_from_string(food_name, 'cl100k_base')
)

In [None]:
# Total number of tokens across all food names.
df['total_tokens'].sum()

2947

## Generando embeddings

In [None]:
def get_embedding(text, model='text-embedding-ada-002'):
    """Return the embedding vector for ``text`` from the OpenAI Embeddings API.

    Args:
        text: String to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``
        list.
    """
    # Replace newlines with a space instead of deleting them, so the words on
    # either side of a line break are not fused together
    # ("foo\nbar" must become "foo bar", not "foobar").
    text = text.replace('\n', ' ')
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']

In [None]:
# Call get_embedding once per food name and keep each vector alongside its row.
df['ada_embeddings'] = df['FOOD NAME'].map(
    lambda food_name: get_embedding(food_name, model='text-embedding-ada-002')
)

In [None]:
# Preview the first rows of the frame again.
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens,ada_embeddings
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2,"[0.0061722793616354465, -0.010086163878440857,..."
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4,"[0.005525844171643257, -0.004996671807020903, ..."
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3,"[-0.004518164321780205, 0.020059844478964806, ..."
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2,"[-0.004589142743498087, -0.010032080113887787,..."
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6,"[0.013222076930105686, -0.019921524450182915, ..."


In [None]:
# Embed a single free-text sentence using get_embedding's default model.
embedding_prueba = get_embedding('esto es una prueba de embeddings para openAI')

In [None]:
# Number of components in the returned embedding.
len(embedding_prueba)

1536

## Creando datasets para visualizar

In [None]:
# Expand the per-row embedding lists into a frame with one column per
# embedding component.
df_embeddings = pd.DataFrame(df['ada_embeddings'].tolist())

In [None]:
# Preview the first rows of the expanded embedding frame.
df_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.006172,-0.010086,0.0047,-0.028833,-0.005396,0.013111,-0.016375,0.007222,-0.00996,-0.02814,...,0.024303,-0.011538,0.00447,-0.022611,-0.00888,0.030938,0.008394,-0.003977,-0.022477,-0.015109
1,0.005526,-0.004997,0.014958,-0.010917,-0.002466,0.021154,-0.008922,-0.02431,-0.01805,-0.039742,...,0.017947,-0.019807,0.026837,-0.032636,-0.019397,0.020012,-0.025593,0.010494,-0.007736,0.00072
2,-0.004518,0.02006,-0.002007,-0.024115,-0.01669,0.020476,-0.041274,-0.00194,-0.001276,-0.035474,...,0.002878,-0.022356,0.038562,-0.026344,-0.032439,0.021174,-0.00027,-0.020436,-0.011561,-0.03346
3,-0.004589,-0.010032,-0.009249,-0.010006,-0.007794,0.023091,-0.014398,-0.016183,-0.001687,-0.036532,...,0.030944,-0.012121,0.022496,-0.011804,-0.005365,0.010601,-0.021759,-0.001688,-0.014049,-0.020672
4,0.013222,-0.019922,0.002163,-0.010105,-0.018612,0.022279,-0.018114,-0.015285,-0.0089,-0.020511,...,0.02021,-0.034447,0.031513,-0.010989,-0.019371,0.000302,0.010269,0.001256,-0.001141,0.012417


In [None]:
# Write the embedding frame as tab-separated values, omitting both the index
# column and the header row.
df_embeddings.to_csv(
    'embedding_food.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Write the descriptive columns as tab-separated values, with a header row and
# without the index column.
df.loc[:, ['FOOD NAME', 'GROUP', 'SUB GROUP']].to_csv(
    'labels_food.tsv',
    sep='\t',
    header=True,
    index=False,
)