# Put the embeddings in the database
This notebook will compute the embeddings for the articles in the database. It will then insert the embeddings into the database.

## Steps

1. Find the articles that have a text but have not been embedded yet
2. Compute the embeddings for those articles
3. Insert the embeddings into the database

In [1]:
import psycopg2

# Open the connection and the cursor; the cells below reuse both.
conn = psycopg2.connect("dbname=hn user=julien")
cur = conn.cursor()

# Anti-join: keep only the articles that have a text and no row in hn_embeddings yet.
PENDING_ARTICLES_QUERY = """
    SELECT id, title, text
    FROM hn_article
    LEFT JOIN hn_embeddings USING(id)
    WHERE hn_embeddings.id IS NULL
    AND text IS NOT NULL
    LIMIT 100000;
            """
cur.execute(PENDING_ARTICLES_QUERY)

# One (id, title, text) row per article that still needs an embedding.
articles = cur.fetchall()

# Last expression of the cell: shows how many articles are pending.
len(articles)

438

In [2]:
from openai import AsyncAzureOpenAI
from os import getenv
from tiktoken import get_encoding
import dotenv

# Read the .env file; override=True lets its values replace variables that are already set.
dotenv.load_dotenv(override=True)

# Tokenizer shared by the whole notebook, built once instead of on every call.
enc = get_encoding("cl100k_base")

# Proxy mapping: the same URL serves both schemes.
# NOTE(review): `proxies` is not referenced by the cells visible here - confirm it is still needed.
_proxy_url = getenv("PROXY_URL_USA")
proxies = dict(http=_proxy_url, https=_proxy_url)

# Constants
MAX_TOKENS = 3072  # Upper bound on the tokens sent per text when computing embeddings.
DIMENSIONS = 1536  # Number of dimensions of the embeddings.
MODEL_ID = "text-embedding-ada-002"  # ID of the embedding model.

# Asynchronous Azure OpenAI client; every setting comes from the environment.
_azure_settings = {
    "api_version": getenv("AZURE_AI_VERSION"),
    "azure_endpoint": getenv("AZURE_AI_ENDPOINT"),
    "azure_deployment": getenv("AZURE_DEPLOYMENT_ID"),
    "api_key": getenv("AZURE_AI_API_KEY"),
}
client = AsyncAzureOpenAI(**_azure_settings)

def get_text_truncated_tokenized(text: str, max_tokens: int) -> str:
    """
    Cap a text at a given number of cl100k_base tokens.
    This keeps the cost of computing embeddings bounded.

    Args:
        text (str): The text to shorten.
        max_tokens (int): How many leading tokens to keep.

    Returns:
        str: The decoded leading tokens, with every newline replaced by a space.
    """
    # Encode, then keep only the leading tokens.
    # disallowed_special=() lets text that looks like a special token be encoded
    # as ordinary text instead of raising.
    kept_tokens = enc.encode(text, disallowed_special=())[:max_tokens]

    # Back to a string.
    truncated = enc.decode(kept_tokens)

    # As stated here: https://learn.microsoft.com/en-us/azure/cognitive-services/openai/reference#embeddings
    # newlines are best replaced with spaces before embedding.
    return " ".join(truncated.split("\n"))

async def compute_embeddings(text: str) -> list[float]:
    """
    Compute the embeddings of a text.
    First, we shrink the text to MAX_TOKENS tokens.
    Then, we compute the embeddings of the shrunk text with MODEL_ID.

    The dimension for text-embedding-ada-002 is 1536.

    Args:
        text (str): The text to embed.

    Returns:
        list[float]: The embeddings of the text.

    Raises:
        ValueError: If the truncated text is empty or contains only whitespace.
    """
    text = get_text_truncated_tokenized(text, MAX_TOKENS)

    # A plain length check is not enough: newlines have been turned into spaces
    # and the caller joins title and text with a space, so a contentless input
    # still has a non-zero length. ValueError remains catchable as Exception.
    if not text.strip():
        raise ValueError("Text extracted is empty.")

    # We compute the embeddings; a single input yields a single item in `data`.
    response = await client.embeddings.create(input=text, model=MODEL_ID, encoding_format="float")

    return response.data[0].embedding

In [3]:
async def compute_embeddings_for_article(article: tuple[int, str, str]) -> None:
    """
    Compute the embeddings of one article and insert them into hn_embeddings.

    Errors are printed and swallowed so that one failing article does not
    interrupt the processing of the others.

    Args:
        article (tuple[int, str, str]): The (id, title, text) row of the article.
            A title of None is treated as an empty title.
    """
    article_id, title, text = article

    # The query only filters on `text IS NOT NULL`, so the title may be NULL.
    # Without this, the concatenation below would raise outside of any `try`.
    title = title or ""
    to_embed = title + " " + (text or "")

    try:
        embeddings = await compute_embeddings(to_embed)
    except Exception as e:
        print(f"Error while computing embeddings for article {article_id}: {e}")
        return

    # We insert the embeddings in the database, one transaction per article.
    try:
        cur.execute("""
            INSERT INTO hn_embeddings(id, embedding)
            VALUES (%s, %s);
            """, (article_id, embeddings))
        conn.commit()
        print(f"Successfully inserted embeddings for article {article_id} {title}")
    except Exception as e:
        # Roll back so the shared connection stays usable for the next article.
        conn.rollback()
        print(f"Error while inserting embeddings for article {article_id}: {e}")

        

In [4]:
import aiometer

# We compute the embeddings for the articles.
await aiometer.run_on_each(compute_embeddings_for_article, articles, max_per_second=5)

Successfully inserted embeddings for article 38637853 microsoft/windows-ai-studio
Successfully inserted embeddings for article 38722246 Beeper - Moving Forward
Successfully inserted embeddings for article 38759257 GTA 5 source code leaks online, giving Rockstar a huge blow on Christmas: Report
Successfully inserted embeddings for article 38719736 US homelessness up 12% to highest reported level as rents soar and coronavirus pandemic aid lapses
Successfully inserted embeddings for article 38755412 Did English ever have a formal version of "you"?
Successfully inserted embeddings for article 155400 Some Heroes
Successfully inserted embeddings for article 38730107 LABS.GOOGLE
Successfully inserted embeddings for article 38687997 Providing HTML Content Using Htmx
Successfully inserted embeddings for article 38629630 Bash One-Liners for LLMs
Successfully inserted embeddings for article 38581959 Scrambling Eggs for Spotify with Knuth's Fibonacci Hashing
Successfully inserted embeddings for ar

In [5]:
# Token statistics for the fetched articles.
# Each count is capped at MAX_TOKENS because the text is truncated to that many
# tokens before it is embedded.
tokenArr = sorted(
    min(
        # A NULL title is counted as an empty title.
        len(enc.encode((article[1] or "") + " " + article[2], disallowed_special=())),
        MAX_TOKENS,
    )
    for article in articles
)
countTokens = sum(tokenArr)

if not tokenArr:
    # Nothing was fetched: avoid dividing by zero and indexing an empty list.
    print("No articles to compute token statistics for.")
else:
    print(f"Average number of tokens per article: {countTokens / len(tokenArr)}")

    # Median and quartiles are read directly from the sorted counts (no interpolation).
    print(f"Median number of tokens per article: {tokenArr[len(tokenArr) // 2]}")
    # Quartiles
    print(f"First quartile of tokens per article: {tokenArr[len(tokenArr) // 4]}")
    print(f"Third quartile of tokens per article: {tokenArr[len(tokenArr) // 4 * 3]}")

    # Max
    print(f"Max number of tokens per article: {tokenArr[-1]}")

    # Sum
    print(f"Total number of tokens: {countTokens}")

Average number of tokens per article: 1561.9597701149426
Median number of tokens per article: 1373
First quartile of tokens per article: 654
Third quartile of tokens per article: 2711
Max number of tokens per article: 3072
Total number of tokens: 1087124
