In [1]:
import json
from tqdm import tqdm

In [2]:
with open('conversations.json', 'r') as f:
    conversations = json.load(f)

In [3]:
def get_msg(convo):
    mapping = convo['mapping']
    user_msgs = []
    
    for node in mapping.keys():
        if mapping[node]['message']:
            if 'parts' in mapping[node]['message']['content'].keys() and mapping[node]['message']['author']['role'] == 'user':
                if isinstance(mapping[node]['message']['content']['parts'][0], str):
                    user_msgs.append(mapping[node]['message']['content']['parts'])
    return user_msgs

In [4]:
all_user_msgs = []
for convo in tqdm(conversations):
    all_user_msgs.extend(get_msg(convo))

len(all_user_msgs)

100%|██████████| 3261/3261 [00:00<00:00, 37910.63it/s]


14157

In [None]:
from openai import OpenAI
import os 
from dotenv import load_dotenv
import tiktoken

load_dotenv()

MAX_CONTEXT_LENGTH = 8192
EMBEDDING_MODEL = "text-embedding-3-small"

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
embeddings = []

for t in tqdm(all_user_msgs):
    string_t = t[0]
    embedding = tiktoken.encoding_for_model(EMBEDDING_MODEL).encode(string_t)
    num_tokens = len(embedding)
    try:
        if num_tokens > MAX_CONTEXT_LENGTH:
            string_t = string_t[-MAX_CONTEXT_LENGTH:]
        embedding = client.embeddings.create(input=string_t, model=EMBEDDING_MODEL).data[0].embedding
        embeddings.append(embedding)
    except Exception as e:
        print(e)
        break


In [9]:
from openai import OpenAI
import os 
from dotenv import load_dotenv
import tiktoken

load_dotenv()

MAX_CONTEXT_LENGTH = 8192
EMBEDDING_MODEL = "text-embedding-3-small"
BATCH_SIZE = 100

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def truncate_msg(msg, max_length=MAX_CONTEXT_LENGTH, embedding_model=EMBEDDING_MODEL):
    msg = msg[0]
    encoding = tiktoken.encoding_for_model(embedding_model)
    tokens = encoding.encode(msg, disallowed_special=set())
    if len(tokens) > max_length:
        tokens = tokens[-max_length:]
    return encoding.decode(tokens)

texts_to_embed = [truncate_msg(msg) for msg in all_user_msgs]

embeddings = []
for i in tqdm(range(0, len(texts_to_embed), BATCH_SIZE), desc="Embedding batches"):
    batch = texts_to_embed[i:i+BATCH_SIZE]
    try:
        response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
        batch_embeddings = [data.embedding for data in response.data]
        embeddings.extend(batch_embeddings)
    except Exception as e:
        print(f"Error embedding batch {i}: {e}")
        break

print(f"Total embeddings: {len(embeddings)}")

Embedding batches:   1%|▏         | 2/142 [00:04<05:15,  2.25s/it]

Error embedding batch 200: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Total embeddings: 200



