# Keyword Embedding Generation

## Keyword Extraction

In [None]:
import json

import pandas as pd

from llm import compute_embeddings

df = pd.read_csv('data/stories_deduplicated_meta.csv')

# Both keyword columns are decoded with json.loads and then exploded so that
# each decoded element ends up on its own row.
llm_keywords, dev_keywords = (
    df[column].apply(json.loads).explode()
    for column in ('llm_keywords', 'dev_keywords')
)

# Pool the two sources, discard missing entries, and keep each distinct
# keyword once.
combined_keywords = pd.concat([llm_keywords, dev_keywords])
all_keywords = combined_keywords.dropna().unique()
all_keywords

## Embedding Generation

In [None]:
import numpy as np
from tqdm import tqdm
from embeddingio import EmbeddingStore

BATCH_SIZE = 20
DB_PATH = 'data/embeddings.db'

# Maps each keyword to its embedding, whether it was read back from the store
# or computed during this run.
final_keyword_embeddings = {}
keywords_to_process = all_keywords.tolist()

with EmbeddingStore(db_path=DB_PATH) as store:
    batch_starts = range(0, len(keywords_to_process), BATCH_SIZE)
    for start in tqdm(batch_starts, desc="Processing keyword embeddings"):
        batch_keywords = keywords_to_process[start:start + BATCH_SIZE]

        # Reuse whatever the store already holds for this batch.
        cached = store.get_embeddings(batch_keywords)
        final_keyword_embeddings.update(cached)

        # Only keywords absent from the store's answer still need computing.
        cached_keys = set(cached.keys())
        missing = [kw for kw in batch_keywords if kw not in cached_keys]
        if not missing:
            continue

        # Coerce every new vector to a float32 ndarray before it is used
        # anywhere, so the in-memory result and the store get the same object.
        fresh_pairs = [
            (kw, np.asarray(vec, dtype=np.float32))
            for kw, vec in zip(missing, compute_embeddings(missing))
        ]
        final_keyword_embeddings.update(fresh_pairs)

        # Skip the write if nothing came back from compute_embeddings.
        if fresh_pairs:
            store.add_embeddings(fresh_pairs)

final_keyword_embeddings