In [1]:
import pandas as pd
import tiktoken
import os
import openai
from ast import literal_eval
import numpy as np
from dotenv import load_dotenv
# https://cookbook.openai.com/examples/get_embeddings_from_dataset
from utils.embeddings_utils import get_embedding

In [2]:
# Runtime configuration (no dataset is read in this cell).
# emb_path: location of a topic-embeddings CSV.
emb_path = "data/css_topic_embeddings.csv"
# load_dotenv() is expected to populate os.environ from a local .env file,
# so the API key never has to be written into the notebook.
load_dotenv()
# Indexing os.environ raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# Settings for the embedding step.
embedding_model = "text-embedding-3-small"  # model name handed to get_embedding
embedding_encoding = "cl100k_base"  # name of the tiktoken encoding to load
max_tokens = 8_000  # stays below the 8191-token limit of text-embedding-3-small

In [4]:
# Load the talks table and list its columns as a quick schema check.
# The cells below rely on the Type, Speaker, Topic and Year columns.
input_datapath = "css.csv"  # relative path: resolved against the notebook's working directory
df = pd.read_csv(input_datapath)
print(df.columns)

Index(['Type', 'Speaker', 'Topic', 'Year'], dtype='object')


In [7]:
# Keep only the columns used downstream, drop rows with any missing value,
# and build the text that will be embedded ("Speaker: ...; Topic: ...").
# The whole step is a single chain ending in .assign(), which returns a new
# frame; the new column is therefore never written into a filtered slice of
# the original, the pattern that triggers pandas' SettingWithCopyWarning.
df = (
    df[["Type", "Speaker", "Topic", "Year"]]
    .dropna()
    .assign(
        # Surrounding whitespace is stripped from both fields before joining.
        combined=lambda talks: (
            "Speaker: "
            + talks["Speaker"].str.strip()
            + "; Topic: "
            + talks["Topic"].str.strip()
        )
    )
)

# Preview the first two rows to confirm the combined text looks right.
df.head(2)

Unnamed: 0,Type,Speaker,Topic,Year,combined
0,Keynote,Michael Macy,Opportunities and challenges for computational...,2015,Speaker: Michael Macy; Topic: Opportunities an...
1,Keynote,Jure Leskovec,Structure and dynamics of information propagation,2015,Speaker: Jure Leskovec; Topic: Structure and d...


In [8]:
# Tokenizer object for the encoding named in embedding_encoding.
# NOTE(review): neither `encoding` nor `max_tokens` is used in any cell visible
# here — presumably a token-length filter on df.combined was intended before
# embedding; confirm, or remove both.
encoding = tiktoken.get_encoding(embedding_encoding)

In [None]:
# This may take a few minutes: get_embedding is called once per row of df.
# NOTE(review): emb_path from the configuration cell names a different file
# ('data/css_topic_embeddings.csv'); confirm which output name is intended.
output_path = "data/css_speaker_topic_embeddings.csv"
# Create the output directory up front so a missing folder cannot make the
# save fail after every embedding has already been computed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, model=embedding_model))
# The DataFrame index is written as the first (unnamed) CSV column.
df.to_csv(output_path)