In [1]:
import pandas as pd 
from dotenv import load_dotenv
import json
from openai import OpenAI
import os 
import numpy as np
import base64
import time 

MIN_COUNT = 25
STRIDE = 20
LENGTH = 40

MODEL = "text-embedding-3-small"
EMBED_DIM = 512
PRECISION = np.float16
PRECISION_STR = 'f16'

IN_FILE = f'data/combined_chunks_top{MIN_COUNT}_len{LENGTH}_stride{STRIDE}.jsonl'
OUT_FILE = f'D:/embed_top{MIN_COUNT}_len{LENGTH}_stride{STRIDE}_dim{EMBED_DIM}_type{PRECISION_STR}.jsonl'

total_tokens = 0

# Calc total chunks
# with open(IN_FILE, 'r') as file:
#     for total_chunks, line in enumerate(file):
#         pass
# print(total_chunks)
total_chunks = 17582505

COST_PER_TOKEN = .01 / 1000000

CHUNKS_PER_BATCH = 1000

In [None]:
def embed_chunk(df, i, client, window):
    window = df.iloc[i : i + window]
    chunk_ids = window['chunk_id'].values.tolist()
    article_ids = window['article_id'].values.tolist()

    try:
        result = client.embeddings.create(
            input=window['chunk'].values.tolist(),
            model=MODEL,
            dimensions=EMBED_DIM
        )
    except Exception as e:
        print(e)
        return [
            {'chunk_id': x, 'embedding': None} 
            for x in chunk_ids
        ]

    global total_tokens
    total_tokens += result.usage.total_tokens

    return [
        {
            'chunk_id': chunk_ids[i], 
            'article_id': article_ids[i],
            'embedding': base64.b64encode(np.array(x.embedding, dtype=PRECISION)).decode('utf-8'),
            'model': MODEL
        } 
        for i, x in enumerate(result.data)
    ]

def append_to_disk(embeddings):
    with open(OUT_FILE, 'a', encoding='utf-8') as file:
        for row in embeddings:
            file.write(json.dumps(row) + '\n')

def embed_df(df, client, last_idx, start):
    global total_tokens
    i = 0
    while i + CHUNKS_PER_BATCH < len(df):
        append_to_disk(embed_chunk(df, i, client, CHUNKS_PER_BATCH))
        i += CHUNKS_PER_BATCH
        print(f"\nChunks embedded: {last_idx} ({round(last_idx / total_chunks, 5)}%). Total tokens spent: {total_tokens}. (${round(total_tokens * COST_PER_TOKEN, 3)}). Total time: {round((time.time() - start) / 60, 2)} minutes. Chunks per second: {round(last_idx / (time.time() - start), 1)}. Projected total hours: {round((((time.time() - start) / last_idx) * total_chunks) / 3600, 2)}")
    
    if i < len(df):
        append_to_disk(embed_chunk(df, i, client, CHUNKS_PER_BATCH))

In [None]:
load_dotenv()

client = OpenAI(api_key=os.environ.get("API_KEY"))

df = []
start = time.time()
with open(IN_FILE, 'r') as file:
    for idx, line in enumerate(file):
        df.append(json.loads(line.strip()))
        if idx != 0 and idx % 10000 == 0:
            embed_df(pd.DataFrame(df), client, idx, start)
            df = []


Chunks embedded: 1000 (0.00057%). Total tokens spent: 53303. ($0.001). Total time: 0.04 minutes. Chunks per second: 4102.4. Projected total hours: 1.19

Chunks embedded: 2000 (0.00057%). Total tokens spent: 107940. ($0.001). Total time: 0.08 minutes. Chunks per second: 2192.8. Projected total hours: 2.23

Chunks embedded: 3000 (0.00057%). Total tokens spent: 160257. ($0.002). Total time: 0.12 minutes. Chunks per second: 1415.1. Projected total hours: 3.45

Chunks embedded: 4000 (0.00057%). Total tokens spent: 212050. ($0.002). Total time: 0.16 minutes. Chunks per second: 1048.3. Projected total hours: 4.66

Chunks embedded: 5000 (0.00057%). Total tokens spent: 267630. ($0.003). Total time: 0.2 minutes. Chunks per second: 821.1. Projected total hours: 5.95

Chunks embedded: 6000 (0.00057%). Total tokens spent: 323728. ($0.003). Total time: 0.24 minutes. Chunks per second: 688.9. Projected total hours: 7.09

Chunks embedded: 7000 (0.00057%). Total tokens spent: 379509. ($0.004). Total t

In [None]:
# df = []
# with open(OUT_FILE, 'r') as file:
#     for line in file:
#         df.append(json.loads(line.strip()))

# df = pd.DataFrame(df)

# df['d'] = df.embedding.apply(lambda x: np.frombuffer(base64.b64decode(x), dtype=PRECISION))
# df.d.apply(lambda x: x.dtype)
# df