In [6]:
# Default model name that `get_embedding` passes to `openai.Embedding.create`.
EMBEDDING_MODEL = "text-embedding-ada-002"
import openai

def set_pactum_api_key():
    """Configure the `openai` module to use the Pactum API key.

    The key is read from the ``PACTUM_OPENAI_API_KEY`` environment variable
    at call time rather than being written into the notebook, so the secret
    is not stored in the file or its history. If the variable is unset,
    `openai.api_key` is set to the empty string, which is the value this
    function assigned previously.
    """
    import os  # local import so this cell does not depend on a later cell's imports

    openai.api_key = os.environ.get('PACTUM_OPENAI_API_KEY', '')

def set_my_api_key():
    """Configure the `openai` module to use the personal API key.

    The key is read from the ``OPENAI_API_KEY`` environment variable at call
    time rather than being written into the notebook, so the secret is not
    stored in the file or its history. If the variable is unset,
    `openai.api_key` is set to the empty string, which is the value this
    function assigned previously.
    """
    import os  # local import so this cell does not depend on a later cell's imports

    openai.api_key = os.environ.get('OPENAI_API_KEY', '')

# Use the Pactum key (not the personal one) for subsequent OpenAI requests.
set_pactum_api_key()


def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """Request an embedding for `text` via `openai.Embedding.create`.

    Args:
        text: The string to embed; sent as the request's single input.
        model: Name of the embedding model to request.

    Returns:
        The ``"embedding"`` value of the first entry in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]


In [9]:
from psycopg2 import extras
import os
from typing import List
import psycopg2
import urllib
from dotenv import load_dotenv
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a clear message when the database URL is missing.
if os.getenv('DATABASE_URL') is None:
    raise Exception('DATABASE_URL environment variable not set')

# Connection URL handed to `connect` by the database helpers below.
DATABASE_URL = os.environ['DATABASE_URL']

class Article:
    """In-memory copy of one row selected from the `article` table.

    Attributes:
        id: Value of the row's `id` column.
        markdown: Value of the row's `markdown` column (the text that gets
            embedded).
        md_ada_002_embedding: Value of the row's `md_ada_002_embedding`
            column; None while no embedding has been stored.
    """

    def __init__(self, id, markdown, md_ada_002_embedding):
        """Store the three column values unchanged on the instance."""
        self.id = id
        self.markdown = markdown
        self.md_ada_002_embedding = md_ada_002_embedding


def connect(db_url):
    """Open a psycopg2 connection to the database described by a URL.

    Args:
        db_url: Database URL of the form
            ``scheme://user:password@host:port/dbname``. The port and the
            credentials are optional.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.

    Raises:
        ValueError: If the URL contains a port that is not a valid number
            (raised by `urllib.parse` when reading ``.port``).
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import unquote, urlparse

    url = urlparse(db_url)
    params = {
        'host': url.hostname,
        'database': url.path[1:],  # drop the leading "/" of the URL path
        # Credentials inside a URL are percent-encoded (e.g. "@" as "%40");
        # decode them so the real values are sent to the server.
        'user': unquote(url.username) if url.username else url.username,
        'password': unquote(url.password) if url.password else url.password,
    }
    # Honor an explicit port from the URL; previously it was silently dropped,
    # so URLs pointing at a non-default port connected to the wrong place.
    if url.port is not None:
        params['port'] = url.port
    return psycopg2.connect(**params)

def get_articles() -> List[Article]:
    """Fetch the non-deleted articles that have no ada-002 embedding yet.

    The "embedding is missing" filter is applied in SQL so that rows which
    already carry an embedding vector are not transferred only to be
    discarded in Python.

    Returns:
        One `Article` per matching row; `md_ada_002_embedding` is None on
        every returned article.
    """
    conn = connect(DATABASE_URL)
    try:
        # `with conn` only scopes the transaction (commit/rollback); psycopg2
        # does not close the connection on exit, hence the explicit close().
        with conn:
            with conn.cursor(cursor_factory=extras.DictCursor) as cur:
                cur.execute(
                    "SELECT id, markdown, md_ada_002_embedding FROM article "
                    "WHERE deleted_at IS NULL AND md_ada_002_embedding IS NULL"
                )
                rows = cur.fetchall()
    finally:
        conn.close()

    return [Article(*row) for row in rows]


# Load the articles that still need an embedding; the bare `len(...)` displays how many.
articles = get_articles()
len(articles)

133

In [10]:
from tqdm import tqdm


def update_article_embedding(article: Article):
    """Write `article.md_ada_002_embedding` to the article's database row.

    Args:
        article: The article whose `md_ada_002_embedding` value is stored in
            the row with the matching `id`.
    """
    conn = connect(DATABASE_URL)
    try:
        # `with conn` commits on success and rolls back on an exception, but
        # psycopg2 does not close the connection on exit. Close it explicitly
        # so calling this once per article does not leak a connection each time.
        with conn:
            with conn.cursor() as cur:
                cur.execute(
                    "UPDATE article SET md_ada_002_embedding = %s WHERE id = %s",
                    (article.md_ada_002_embedding, article.id),
                )
    finally:
        conn.close()

# Embed each article and persist the vector right away, one article at a time,
# so that embeddings computed before an interruption are already stored.
for article in tqdm(articles):
    embedding = get_embedding(article.markdown)
    article.md_ada_002_embedding = embedding
    update_article_embedding(article)

100%|██████████| 133/133 [01:44<00:00,  1.27it/s]


In [8]:
# NOTE(review): this cell's execution count (In [8]) is lower than that of the
# cells above it (In [9], In [10]), so the 213 displayed below comes from an
# earlier run and does not match the 133 shown after `get_articles()`.
len(articles)

213