In [1]:
import pandas as pd
from openai import OpenAI
from functools import partial

In [2]:
%load_ext dotenv
%dotenv

In [3]:
# Load the "SELECCION" sheet of the raw workbook into the working frame.
raw_workbook_path = "../data/raw/WP sugerencia de outputs.xlsx"
outputs_df = pd.read_excel(raw_workbook_path, sheet_name="SELECCION")

In [4]:
# Shared OpenAI API client used by get_embedding below.
# NOTE(review): constructed with no explicit arguments, so credentials are
# presumably picked up from the environment populated by %dotenv — confirm.
client = OpenAI()

def get_embedding(text: str, model="text-embedding-3-small"):
    """Request an embedding for ``text`` through the module-level ``client``.

    Parameters
    ----------
    text : str
        Text to embed; it is sent as a single-element ``input`` list.
    model : str
        Name of the embedding model passed to the API.

    Returns
    -------
    The ``embedding`` attribute of the first entry in the response's ``data``.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_entry = response.data[0]
    return first_entry.embedding

# PPA processing

In [5]:
# Split each row's ", "-separated PPA string into a list of labels.
outputs_df["PPAs_list"] = outputs_df["PPAs"].map(lambda raw: raw.split(", "))
# Distinct PPA labels found across all rows.
classes = outputs_df["PPAs_list"].explode().unique()

# One boolean indicator column per label: True when the row lists that label.
for ppa in classes:
    outputs_df[ppa] = [ppa in labels for labels in outputs_df["PPAs_list"]]

# First four characters of "Primary PPA", and the first two characters of that.
outputs_df["primary_ppa"] = outputs_df["Primary PPA"].str.slice(stop=4)
outputs_df["primary_better"] = outputs_df["primary_ppa"].str.slice(stop=2)

# Text processing

In [6]:
# Numbering label to strip from a statement: one of the listed keywords,
# a whitespace character, then digits followed by one or more ".digits" groups
# (e.g. "Producto 1.2"). The pattern is not anchored, so it matches anywhere.
output_prefix = r"\b(Producto|MPP\sProducto|Output|Resultado\sMPP|Producto\sMPP|MPP\sOutput)\s\d+(\.\d+)+\b"
raw_statements = outputs_df["Output Statement"]
statements_without_label = raw_statements.replace(
    to_replace=output_prefix,
    value="",
    regex=True,
)
# Trim the whitespace left behind once the label has been removed.
outputs_df["output_text_clean"] = statements_without_label.str.strip()


# Embeddings

In [7]:
def list_col_to_df(list_col: pd.Series, prefix: str):
    """Expand a Series of lists into a DataFrame with one column per position.

    Parameters
    ----------
    list_col : pd.Series
        Series whose values are list-like; its index is reused for the result.
    prefix : str
        String prepended to every positional column label.

    Returns
    -------
    pd.DataFrame
        Frame indexed like ``list_col`` with columns ``<prefix>0``,
        ``<prefix>1``, ...
    """
    rows = list_col.to_list()
    expanded = pd.DataFrame(rows, index=list_col.index)
    # Positional integer labels become prefixed string labels.
    return expanded.rename(columns=lambda label: f"{prefix}{label}")

In [8]:
# Embed every cleaned statement with the small model (get_embedding is applied
# row by row) and keep the raw vectors in a list column.
small_prefix = "openai_embedding_small_"
outputs_df["embedding_small"] = outputs_df["output_text_clean"].apply(
    partial(get_embedding, model="text-embedding-3-small")
)
# Remove expanded columns left by an earlier run of this cell; otherwise the
# join below raises "columns overlap but no suffix specified" on re-execution.
stale_small_cols = [c for c in outputs_df.columns if str(c).startswith(small_prefix)]
outputs_df = outputs_df.drop(columns=stale_small_cols).join(
    list_col_to_df(outputs_df["embedding_small"], small_prefix)
)

In [10]:
# Embed every cleaned statement with the large model (get_embedding is applied
# row by row) and keep the raw vectors in a list column.
large_prefix = "openai_embedding_large_"
outputs_df["embedding_large"] = outputs_df["output_text_clean"].apply(
    partial(get_embedding, model="text-embedding-3-large")
)
# Remove expanded columns left by an earlier run of this cell; otherwise the
# join below raises "columns overlap but no suffix specified" on re-execution.
stale_large_cols = [c for c in outputs_df.columns if str(c).startswith(large_prefix)]
outputs_df = outputs_df.drop(columns=stale_large_cols).join(
    list_col_to_df(outputs_df["embedding_large"], large_prefix)
)

# Train test sets

- Test set: at least two rows per PPA
- Train set: the remaining rows


In [11]:
# Draw two rows from every primary_ppa group with a fixed seed; their index
# labels define the test set.
sampled_rows = outputs_df.groupby("primary_ppa").sample(n=2, random_state=100)
test_set_idx = sampled_rows.index
# Flag each row by whether its index label was sampled.
outputs_df["test_set"] = outputs_df.index.isin(test_set_idx)
outputs_df["test_set"].value_counts()

test_set
False    73
True     40
Name: count, dtype: int64

# Save data

In [18]:
# Remove the raw list-valued embedding columns in place before saving.
for list_column in ("embedding_small", "embedding_large"):
    del outputs_df[list_column]

In [21]:
# Write the processed frame to parquet, without storing the index.
processed_path = "../data/processed/outputs_openai_embeddings_v1.parquet"
outputs_df.to_parquet(processed_path, index=False)