In [1]:
!pip install ipykernel umap-learn plotly numpy

Collecting umap-learn
  Using cached umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting plotly
  Using cached plotly-5.24.1-py3-none-any.whl.metadata (7.3 kB)
Collecting scipy>=1.3.1 (from umap-learn)
  Using cached scipy-1.14.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting scikit-learn>=0.22 (from umap-learn)
  Using cached scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting numba>=0.51.2 (from umap-learn)
  Using cached numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Using cached pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba>=0.51.2->umap-learn)
  Using cached llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.8 kB)
Collecting joblib>=0.11 (from pynndescent>=0.5->umap-learn)
  Using cached job

In [13]:
# 1. Producing a batch request dataset and saving to disk.
import json
import os
from datetime import datetime

EMBEDDING_DIM = 512
EMBEDDING_MODEL = "text-embedding-3-large"
MAX_LINES = 50_000
SKIP = True

def write_embedding_batch_dataset(story_filepath, model):
    # Create embeddings path if non-existent
    out_filename = f"data/embeddings/embedding_batch_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.jsonl"  
    if not os.path.exists("data/embeddings"):
        os.makedirs("data/embeddings")
    with open(story_filepath, "r") as in_f, open(out_filename, "w") as out_f:
        
        lines = []
        for k, line in enumerate(in_f):
            entry = json.loads(line)
            lines.append(json.dumps(
                {
                    "custom_id": entry["generation_id"],
                    "method": "POST",
                    "url": "/v1/embeddings",
                    "body": {"model": model, "input": entry["story"], "dimensions": EMBEDDING_DIM}
                }
            ))
            if len(lines) + 1 >= MAX_LINES:
                break
            if SKIP and (k % 5 != 0):
                continue
            

        print(f"Expect a disk size of {len(lines) * (EMBEDDING_DIM / 3072) * 0.0414:0f} MB for the embeddings.")

        out_f.write("\n".join(lines))

        return out_filename

stories_file = "data/batches_2024-11-12-17-00-38/processed.jsonl"
write_embedding_batch_dataset(stories_file, EMBEDDING_MODEL)


Expect a disk size of 344.993100 MB for the embeddings.


'data/embeddings/embedding_batch_2024-11-26-17-29-02.jsonl'

In [36]:
# Alternatively, generate a mock response:

import json
import numpy as np

# Input and output file paths
stories_file = "data/batches_2024-11-12-17-00-38/processed.jsonl"
embeddings_file = "data/embeddings/mock_embeddings_file.jsonl"  # Path to generate the embeddings_file

# Parameters
embedding_dim = 100
MAX_LINES = 5_000

# Read stories_data
stories_data = []
with open(stories_file, 'r') as f:
    for k, line in enumerate(f):
        stories_data.append(json.loads(line))
        if len(stories_data) + 1 >= MAX_LINES:
                break
        if SKIP and (k % 5 != 0):
            continue

# Generate embeddings and write to embeddings_file
with open(embeddings_file, 'w') as f:
    for story in stories_data:
        generation_id = story['generation_id']
        embedding = np.random.normal(size=embedding_dim).tolist()
        
        # Create the embedding entry
        embedding_entry = {
            "custom_id": generation_id,
            "response": {
                "body": {
                    "data": [
                        {"embedding": embedding}
                    ]
                }
            }
        }
        
        # Write to file as a JSON object
        f.write(json.dumps(embedding_entry) + '\n')

print(f"Mock embeddings file created at: {embeddings_file}")


Mock embeddings file created at: data/embeddings/mock_embeddings_file.jsonl


2. Retrieving the batch through the OpenAI Web UI and saving to disk (To be implemented in code if needed)

In [8]:
# 3. Analyzing the embeddings

import json
import pandas as pd
import numpy as np
import umap
import plotly.express as px
import textwrap
from tqdm import tqdm
import re

def story_start(input_string):
    for delimiter in [",", "."]:
        if delimiter in input_string:
            first_part = input_string.split(delimiter, 1)[0]
            break
    words = first_part.split()
    if len(words) <= 10:
        return ' '.join(word.capitalize() for word in words)
    else:
        return ' '.join(word.capitalize() for word in words[:3])

MAX_LINES = 50_000
SKIP = False
embeddings_file = "data/embeddings/batch_6745f7c140588190b1dcb1a1fd9ae532_output.jsonl"
stories_file = "data/batches_2024-11-12-17-00-38/processed.jsonl"

embeddings_data = []
with open(embeddings_file, 'r') as f:
    for line in f:
        embeddings_data.append(json.loads(line))

stories_data = []
with open(stories_file, 'r') as f:
    for k, line in enumerate(f):
        stories_data.append(json.loads(line))
        if len(stories_data) + 1 >= MAX_LINES:
                break
        if SKIP and (k % 5 != 0):
            continue

story_dict = {story['generation_id']: k for k, story in enumerate(stories_data)}
matched_data = []
for embedding_entry in tqdm(embeddings_data):
    story_index = story_dict[embedding_entry['custom_id']]
    embedding = embedding_entry['response']['body']['data'][0]['embedding']
    story_info = stories_data[story_index]
    wrapped_story = '\n'.join(textwrap.wrap(story_info['story'], width=50))
    matched_data.append({
        'embedding': embedding,
        'story': wrapped_story,
        'theme': story_info['theme'],
        'topic': story_info['topic'],
        'persona': story_info['persona'],
        'grammar': story_info['grammar'],
        'id': story_info['generation_id'],
        'feature': story_info['feature'],
        'style': story_info['style'],
        'model': story_info['model'],
        'title': story_start(story_info['story'])
    })

embeddings = np.array([item['embedding'] for item in matched_data])
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding_2d = umap_reducer.fit_transform(embeddings)

100%|██████████| 49999/49999 [00:06<00:00, 7602.33it/s]
  warn(


In [9]:
# Adapt to the format required by the web app
MAX_LINES_WEB = 5_000

final_data = []
for i, item in enumerate(matched_data):
    final_data.append({
        "id": item['id'],
        "embedding1": float(embedding_2d[i][0]),
        "embedding2": float(embedding_2d[i][1]),
        "title": item['title'].capitalize(),
        "style": item['style'].capitalize(),
        "topic": item['topic'].capitalize(),
        "theme": item['theme'].capitalize(),
        "persona": item['persona'].capitalize(),
        "grammar": item['grammar'].capitalize(),
        "feature": item['feature'].capitalize(),
        "model": item['model'],
        "story": item['story'].replace('\n', ' ').replace('  ', '\n\n')
    })
    if i + 1 >= MAX_LINES_WEB:
        break

output_file = 'stories.json'
with open(output_file, 'w') as f:
    json.dump(final_data, f)

In [None]:
# Getting unique values for each feature
unique_features = {key: list(set([item[key] for item in final_data])) for key in ['topic', 'style', 'theme', 'persona', 'grammar', 'feature']}
print(unique_features)

{'style': ['Lighthearted', 'Tragic', 'Modern', 'Mystical', 'Mythological', 'Adventurous', 'Melancholic', 'Humorous', 'Romantic', 'Action-packed', 'Heartwarming', 'Epic', 'Lyric', 'Minimalist', 'Whimsical', 'Classic', 'Noir', 'Fable-like', 'Suspenseful', 'Surreal', 'Playful', 'Fairy tale-like', 'Philosophical'], 'topic': ['Superheroes', 'Space exploration', 'Snowy adventures', 'Holidays', 'Gardens', 'Dream worlds', 'Sibling rivalry', 'Alien encounters', 'A deadline or time limit', 'Mysterious maps', 'Fantasy worlds', 'Robots and technology', 'Unusual vehicles', 'Lost civilizations', 'Virtual worlds', 'Magical objects', 'Haunted places', 'Miniature worlds', 'Bygone eras', 'Hidden treasures', 'Mystical creatures', 'Subterranean worlds', 'Invisibility', 'Pirates', 'Dinosaurs', 'Underwater adventures', 'Enchanted forests', 'Cultural traditions', 'Talking animals', 'Magical lands', 'The sky', 'School life', 'Living objects', 'Giant creatures', 'Shape-shifting', 'Undercover missions', 'Outer 

In [45]:
# Find closest Stories (Marked for deletion, too slow)

from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Convert the list of embeddings into a numpy array
embeddings = np.array([entry['embedding'] for entry in matched_data])

max_distance = -1
story_pair = (None, None)

# Iterate over all pairs of embeddings to compute the cosine distance
for i in tqdm(range(len(embeddings))):
    for j in range(i + 1, len(embeddings)):
        similarity = cosine_similarity([embeddings[i]], [embeddings[j]])[0][0]
        distance = 1 - similarity
        if distance > max_distance:
            max_distance = distance
            story_pair = (i, j)

if story_pair[0] is not None and story_pair[1] is not None:
    story1 = matched_data[story_pair[0]]['story']
    story2 = matched_data[story_pair[1]]['story']
    print("Story 1 with maximal cosine distance:")
    print(story1)
    print("\nStory 2 with maximal cosine distance:")
    print(story2)
else:
    print("No stories to compare.")


  0%|          | 16/4999 [00:06<35:41,  2.33it/s]


KeyboardInterrupt: 

In [21]:
# 4. Visualization with Plotly

df = pd.DataFrame(embedding_2d, columns=['x', 'y'])
df['story'] = [item['story'].replace("\n", "<br>").strip() for item in matched_data]
df['theme'] = [item['theme'] for item in matched_data]
df['topic'] = [item['topic'] for item in matched_data]

hover_template = "<b>Story:</b><br>%{customdata[0]}<br><extra></extra>"

fig = px.scatter(df, x='x', y='y', symbol='theme', color='topic', 
                 hover_data={'story': True, 'theme': False, 'x': False, 'y': False},
                 custom_data=['story'],
                 title="UMAP of Story Embeddings")

fig.update_traces(hovertemplate=hover_template)

fig.update_layout(
    title={'x': 0.5},
    xaxis_title=None,
    yaxis_title=None,
    margin=dict(l=0, r=0, t=50, b=0),
    legend_title_text='',
    hoverlabel=dict(font_size=11),
)

if not os.path.exists("data/embeddings/web"):
    os.makedirs("data/embeddings/web")
fig.write_html("data/embeddings/web/index.html")


Collecting nbformat
  Downloading nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat)
  Downloading fastjsonschema-2.20.0-py3-none-any.whl.metadata (2.1 kB)
Downloading nbformat-5.10.4-py3-none-any.whl (78 kB)
Downloading fastjsonschema-2.20.0-py3-none-any.whl (23 kB)
Installing collected packages: fastjsonschema, nbformat
Successfully installed fastjsonschema-2.20.0 nbformat-5.10.4
