# Uso de modelos de embeddings de OpenAI

## Instalación y carga de librerías

In [1]:
# !pip install openai
# !pip install tiktoken

In [1]:
import openai
import os
import pandas as pd

In [3]:
from config import settings

In [11]:
# Name of the embedding model. The helper functions defined further down carry
# the same string as their own default and do not read this variable.
model = "text-embedding-3-small"

# API key taken from the project-local settings mapping and registered on the
# openai module as well.
OPENAI_API_KEY = settings["openai"]
openai.api_key = OPENAI_API_KEY

## Cargar dataset

In [6]:
from paths import RAW_DIR

In [7]:
# Read the raw food table from RAW_DIR into the working DataFrame.
df = pd.read_csv(
    str(RAW_DIR / "generic-food.csv"),
)

In [23]:
df.shape

(906, 5)

In [24]:
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6


## Evaluar cantidad de tokens a procesar

In [9]:
import tiktoken

In [12]:

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` yields under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding identifier passed to `tiktoken.get_encoding`
            (the notebook uses "cl100k_base").

    Returns:
        Length of the token sequence produced by encoding `string`.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

num_tokens_from_string("tiktoken is great!", "cl100k_base")

6

In [13]:
# Token count of every food name under the cl100k_base encoding, one value per row.
df['total_tokens'] = [
    num_tokens_from_string(food_name, 'cl100k_base')
    for food_name in df['FOOD NAME']
]

In [14]:
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6


In [15]:
# Total number of tokens that will be sent to the embeddings endpoint.
# Uses the vectorized Series reduction instead of iterating with the builtin sum().
df['total_tokens'].sum()

2947

## Generando embeddings

In [20]:
from openai import OpenAI
# Client object that get_embedding and get_embeddings_batch call for every request.
client = OpenAI(api_key=OPENAI_API_KEY)

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Args:
        text: String to embed; newline characters are replaced by spaces
            before the request is made.
        model: Embedding model name forwarded to the API.

    Returns:
        The `embedding` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [25]:
from tqdm import tqdm

def get_embeddings_batch(texts, model="text-embedding-3-small"):
    """Return one embedding vector per input text, in input order.

    Args:
        texts: Iterable of strings to embed in a single request; newline
            characters in each string are replaced by spaces first.
        model: Embedding model name forwarded to the API.

    Returns:
        A list of embedding vectors aligned positionally with `texts`.
        An empty input yields an empty list without calling the API.
    """
    cleaned = [t.replace("\n", " ") for t in texts]
    if not cleaned:
        # Nothing to embed; skip the request instead of sending an empty input.
        return []
    response = client.embeddings.create(input=cleaned, model=model)
    # Each returned item carries the position of its input; sort on it so the
    # result lines up with `texts` regardless of the order items come back in.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

batch_size = 100
embeddings = []

# One request per consecutive slice of at most `batch_size` food names;
# results are accumulated in row order so they can be attached as a column.
for start in tqdm(range(0, len(df), batch_size)):
    stop = start + batch_size
    names_chunk = df['FOOD NAME'].iloc[start:stop].tolist()
    embeddings += get_embeddings_batch(names_chunk)

df['ada_embedding'] = embeddings

100%|██████████| 10/10 [07:52<00:00, 47.27s/it]


In [26]:
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens,ada_embedding
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2,"[0.019295772537589073, -0.012197259813547134, ..."
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4,"[0.004985727369785309, -0.00917512271553278, 0..."
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3,"[-0.0038337542209774256, -0.02693667821586132,..."
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2,"[0.019479969516396523, 0.004792865831404924, -..."
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6,"[0.015512133948504925, -0.029778411611914635, ..."


In [27]:
# Smoke test: embed one Spanish sentence with the single-text helper and
# display the length of the returned vector.
embedding_prueba = get_embedding('esto es una prueba de embeddings para openAI')
len(embedding_prueba)

1536

In [28]:
from paths import PREPROCESSED_DIR

In [29]:
# Persist the full table, embedding column included, without the row index.
# NOTE(review): the file name says "reviews" but df was loaded from
# generic-food.csv — confirm the intended name before anything depends on it.
df.to_csv(
    str(PREPROCESSED_DIR / 'embedded_1k_reviews.csv'),
    index=False,
)

## Creando datasets para visualizar

In [31]:
# Expand the per-row embedding lists into a frame with one column per vector component.
df_embeddings = pd.DataFrame(
    data=list(df['ada_embedding']),
)

In [32]:
df_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.019296,-0.012197,-0.003164,-0.000139,-0.019481,-0.022991,0.007277,0.018038,0.013455,-0.022607,...,0.041585,-0.01572,-0.012138,0.001753,0.027573,0.022938,0.030698,-0.012608,0.013667,0.020474
1,0.004986,-0.009175,0.003261,-0.037171,-0.050494,-0.016896,0.002824,-0.048528,-0.002728,-0.021979,...,0.003014,0.010886,0.015996,0.017228,0.009722,-0.002993,-0.012381,-0.00707,0.004858,-0.049691
2,-0.003834,-0.026937,0.009787,-0.025479,0.016629,-0.036263,0.021106,0.03305,-0.01764,-0.010657,...,0.018905,0.019916,0.029153,0.026624,0.003319,0.017224,-0.010293,-0.015484,-0.014197,0.032931
3,0.01948,0.004793,-0.039019,0.037256,-0.019348,0.005142,0.019818,0.040576,0.01804,-0.021302,...,0.005208,-0.000274,0.01685,-0.001807,0.016351,0.00017,-0.001457,-0.00494,0.007544,-0.009255
4,0.015512,-0.029778,0.021143,0.026676,-0.018492,-0.055404,-0.011365,-0.00921,-0.019213,-0.01379,...,-0.014217,-0.000514,0.022352,0.012581,0.035031,0.037058,0.025699,0.024722,-0.005557,-0.037449


In [34]:
df_embeddings.shape

(906, 1536)

In [35]:
# Embedding matrix as bare tab-separated values: no header row, no index column.
df_embeddings.to_csv(
    str(PREPROCESSED_DIR / 'embedding_food.tsv'),
    sep='\t',
    header=False,
    index=False,
)

In [36]:
# Label table (name, group, sub group) taken from the same frame,
# tab-separated, with a header row and no index column.
df[['FOOD NAME', 'GROUP', 'SUB GROUP']].to_csv(
    str(PREPROCESSED_DIR / 'labels_food.tsv'),
    sep='\t',
    header=True,
    index=False,
)