In [4]:
import asyncio
from itertools import product
from os import getenv
from time import sleep

import dotenv
import psycopg2
from aiometer import run_on_each
from openai import AsyncAzureOpenAI

# Load the environment variables from the local .env file.
# override=True is passed so that values from the file replace variables
# already present in the process environment.
dotenv.load_dotenv(override=True)

# Name passed as `model` on every chat completion request.
MODEL_ID = "tags"

# Shared asynchronous Azure OpenAI client, configured from the environment.
# NOTE(review): `getenv("tags")` reads an environment variable literally named
# "tags"; it returns None when that variable is unset. Possibly MODEL_ID or a
# dedicated deployment variable was intended -- confirm.
client = AsyncAzureOpenAI(
    api_version= getenv("AZURE_AI_VERSION"),
    azure_endpoint=getenv("AZURE_AI_ENDPOINT"),
    azure_deployment=getenv("tags"),
    api_key=getenv("AZURE_AI_API_KEY"),
)

# Connect to the database. This single connection is shared by every helper
# defined below.
conn = psycopg2.connect("dbname=hn user=julien")

# No module-level cursor is opened: each helper creates its own with
# conn.cursor().

# Presumably clears any transaction left open or aborted by a previous run of
# this cell -- TODO confirm.
conn.rollback()

In [5]:
async def getKeywordsFromChatGPT(keywords: list[str]) -> str:
    """Ask the chat model to summarise a list of keywords.

    The system prompt requests 5 comma-separated keywords. Callers that want
    fewer keywords trim the reply themselves.

    The request is sent up to 3 times for as long as the reply is missing,
    blank, or a lone comma.

    Args:
        keywords (list[str]): Keywords to summarise. They are joined with
            ", " and sent as the user message.

    Returns:
        str: The content of the last reply received. It is an empty string
            when the model never returned any content.
    """

    messages = [
        {
            "role": "system",
            "content": "The assistant must give 5 precise keywords summarising a list of keywords. The response must be comma separated. The keywords returned by the assistant can be different or identical from the keywords in the list."
        },
        {
            "role": "user",
            "content": ", ".join(keywords)
        }
    ]

    content = ""

    # Because of the randomness of the model, we try 3 times to get a valid response
    for _ in range(3):
        response = await client.chat.completions.create(
            model=MODEL_ID,
            messages=messages,
            temperature=0.2,
        )
        # The API may return no content at all; treat that as an empty reply
        # so that it is retried and the declared `str` return type holds.
        content = response.choices[0].message.content or ""

        # Pause after each request without blocking the event loop, so the
        # other concurrent tasks keep running (time.sleep would freeze them).
        await asyncio.sleep(0.3)

        if content.strip() not in ("", ","):
            break

    return content


def getLeafPathFromDB() -> list[list[int]]:
    """
    Get the distinct paths (e.g. [1, 7, 0, 2, 4]) stored in hn_embeddings
    that have no row in hn_paths yet.

    Returns:
        list[list[int]]: One entry per path, taken from the first column of
            each result row.
    """

    # NOTE(review): NOT IN matches nothing if hn_paths.path can be NULL;
    # presumably path is the primary key of hn_paths -- confirm.
    query = """
    SELECT DISTINCT path
    from hn_embeddings
    WHERE path NOT IN (
        SELECT path
        FROM hn_paths
    )
    """

    # The context manager closes the cursor once the rows have been read.
    with conn.cursor() as cur:
        cur.execute(query)
        return [row[0] for row in cur.fetchall()]


def insertDirectoryInDB(path: list[int], keywords: str):
    """
    Insert a directory in the database, or update its title if the path
    already exists.

    The change is committed immediately. On a database error the transaction
    is rolled back so that the shared connection stays usable.

    Args:
        path (list[int]): The path of the directory
        keywords (str): The keywords of the directory, stored as its title

    Raises:
        psycopg2.Error: Any database error other than a unique violation,
            re-raised after the rollback.
    """

    query = """
    INSERT INTO hn_paths (path, title)
    VALUES (%s, %s)
    ON CONFLICT ON CONSTRAINT hn_paths_pkey DO UPDATE
    SET title = EXCLUDED.title

    """

    try:
        with conn.cursor() as cur:
            cur.execute(query, (path, keywords))
        conn.commit()
    except psycopg2.errors.UniqueViolation:
        # Deliberately ignored, as before. The rollback must actually be
        # called, otherwise the connection stays in an aborted transaction.
        conn.rollback()
    except psycopg2.Error:
        # Leave the connection in a clean state, then let the caller decide.
        conn.rollback()
        raise


def getTagsForPath(path: list[int]) -> list[str]:
    """
    Get the keywords in the DB for a given leaf path.

    The tags of every article whose embedding has this path are concatenated
    by the database, then split back on commas.

    Args:
        path (list[int]): The path of the leaf directory

    Returns:
        list[str]: The stripped, non-blank keywords. The list is empty when
            no tags are found for the path.
    """

    query = """
    SELECT STRING_AGG(tags, ', ')
    FROM hn_article
    INNER JOIN hn_embeddings ON hn_article.id = hn_embeddings.id
    WHERE hn_embeddings.path = %s
    """

    with conn.cursor() as cur:
        cur.execute(query, (path,))
        row = cur.fetchone()

    # STRING_AGG yields NULL when no row (or only NULL tags) matches.
    aggregated = row[0] if row else None
    if not aggregated:
        return []

    return [tag.strip() for tag in aggregated.split(",") if tag.strip()]


async def computeKeywordForOneLeaf(path):
    """Compute and store the title of one leaf path.

    The tags of the articles under the path are summarised by the chat model.
    The reply is trimmed to its first 3 keywords, each one capitalized, and
    stored as the title of the path.

    If the first attempt raises, the error is printed and a single retry is
    made after 2 seconds; an error on the retry propagates to the caller.

    Args:
        path (list[int]): The leaf path to title
    """
    # Get the tags of the articles stored under this path
    keywords = getTagsForPath(path)

    if not keywords:
        # Nothing to summarise: do not send an empty request to the model.
        print("No keywords for ", path)
        return

    attempts = 2
    for attempt in range(attempts):
        try:
            # Get the title from ChatGPT
            raw = await getKeywordsFromChatGPT(keywords)
            # Trim the title to 3 keywords and capitalize the first letter of each keyword
            title = ", ".join(text.strip().capitalize()
                              for text in raw.split(",")[:3])
            break
        except Exception as e:
            if attempt == attempts - 1:
                raise
            print(e)
            # Non-blocking pause: the other concurrent tasks keep running.
            await asyncio.sleep(2)

    insertDirectoryInDB(path, title)
    print("Done for {}".format(path))


async def computeKeywordForOneNonLeaf(path: list[int], N=8):
    """ Get the keywords for a non-leaf path.

    While we compute the keywords of a leaf from the article,
    this function computes the keywords using the titles already stored in
    hn_paths for the direct children of the path (path + [0] ... path + [N-1]).

    Nothing is stored when none of the children has a title.

    NOTE(review): unlike the leaf case, the reply of the model is stored
    as-is (not trimmed to 3 keywords nor capitalized) -- confirm this is
    intended.

    Args:
        path (list[int]): The path of the directory to title
        N (int, optional): The number of subfolder per folder. Defaults to 8.
    """

    # We have to convert it to a tuple so that psycopg2 doesn't interpret it as an array but
    # as a list in postgres with the () notation
    childPaths = tuple(list(path) + [i] for i in range(N))

    if not childPaths:
        # An empty tuple would render as an invalid empty IN () clause.
        print("No keywords for ", path)
        return

    # STRING_AGG concat all the string with ", " separator
    query = """
    SELECT STRING_AGG(title, ', ')
    FROM hn_paths
    WHERE path IN %s
    """

    with conn.cursor() as cur:
        cur.execute(query, (childPaths,))
        keywords = cur.fetchone()[0]

    if keywords is None:
        print("No keywords for ", path)
        return

    # Blank entries (children stored with an empty title) carry no information.
    childKeywords = [row.strip() for row in keywords.split(",") if row.strip()]
    if not childKeywords:
        print("No keywords for ", path)
        return

    title = await getKeywordsFromChatGPT(childKeywords)

    insertDirectoryInDB(path, title)
    print("Done for ", path)
    


async def setKeywordsForLeafPath():
    """Compute the titles of all the leaf paths that are not titled yet.

    The paths are processed concurrently, at most 6 at once. When a path
    fails, the whole process restarts after 10 seconds with the paths that
    are still missing, and keeps restarting until a run completes.
    """
    # A loop is used rather than recursion so that repeated restarts cannot
    # exhaust the call stack.
    while True:
        paths = getLeafPathFromDB()
        print("There are {} paths to compute".format(len(paths)))
        try:
            await run_on_each(computeKeywordForOneLeaf, paths, max_at_once=6)
        except Exception as e:
            # Non-blocking pause before restarting.
            await asyncio.sleep(10)
            print("Restarting the process")
            print(e)
            # A failed statement leaves the transaction aborted; reset it so
            # the next queries on the shared connection can run.
            conn.rollback()
        else:
            return


async def setKeywordsForNonLeafPath(depth: int, N=8):
    """Compute the keywords for all the non-leaf paths of a given depth.
    e.g if depth = 2, we compute the keywords for all the directories at depth 2 (e.g. 1,2; 3,7; 7,1)

    Paths that already have a row in hn_paths are skipped. The remaining
    ones are processed concurrently, at most 6 at once. When a path fails,
    the whole process restarts after 10 seconds and keeps restarting until a
    run completes.

    Args:
        depth (int): The depth of the directories
        N (int, optional): The number of subfolder per folder. Defaults to 8.
    """

    async def computeOne(path):
        # Forward N so that the children looked up match the paths generated here.
        await computeKeywordForOneNonLeaf(path, N)

    query = """
    SELECT path
    FROM hn_paths
    WHERE path = %s
    """

    # A loop is used rather than recursion so that repeated restarts cannot
    # exhaust the call stack.
    while True:
        # Get all the paths of the given depth, as lists
        candidates = [list(path) for path in product(range(N), repeat=depth)]
        print("There are {} paths to compute for depth {}".format(len(candidates), depth))

        # Keep only the paths that are not in the database
        paths = []
        with conn.cursor() as cur:
            for candidate in candidates:
                cur.execute(query, (candidate,))
                if cur.fetchone() is None:
                    paths.append(candidate)

        print("There are {} paths left to compute".format(len(paths)))
        try:
            await run_on_each(computeOne, paths, max_at_once=6)
        except Exception as e:
            # Non-blocking pause before restarting.
            await asyncio.sleep(10)
            print("Restarting the process")
            print(e)
            # A failed statement leaves the transaction aborted; reset it so
            # the next queries on the shared connection can run.
            conn.rollback()
        else:
            return

In [6]:
await setKeywordsForLeafPath()
await setKeywordsForNonLeafPath(3)
await setKeywordsForNonLeafPath(2)
await setKeywordsForNonLeafPath(1)

There are 3970 paths to compute
Done for [7, 5, 2, 1]
Done for [4, 5, 6, 1]
Done for [1, 3, 4, 6]
Done for [4, 0, 4, 3]
Done for [3, 6, 7, 4]
Done for [0, 2, 4, 5]
Done for [3, 0, 3, 1]
Done for [4, 7, 3, 4]
Done for [0, 1, 6, 1]
Done for [2, 6, 4, 4]
Done for [6, 7, 2, 1]
Done for [6, 1, 1, 4]
Done for [6, 1, 4, 1]
Done for [7, 3, 7, 4]
Done for [6, 2, 5, 2]
Done for [4, 1, 6, 3]
Done for [3, 1, 5, 0]
Done for [7, 2, 6, 1]
Done for [5, 2, 2, 6]
Done for [4, 1, 3, 6]
Done for [2, 0, 3, 0]
Done for [4, 4, 7, 0]
Done for [7, 1, 0, 5]
Done for [1, 4, 3, 5]
Done for [2, 1, 5, 4]
Done for [3, 6, 0, 0]
Done for [1, 6, 2, 2]
Done for [3, 4, 6, 4]
Done for [6, 6, 0, 2]
Done for [6, 7, 7, 1]
Done for [4, 7, 7, 4]
Done for [6, 3, 2, 5]
Done for [5, 2, 5, 1]
Done for [5, 0, 7, 1]
Done for [5, 3, 2, 7]
Done for [2, 4, 4, 2]
Done for [0, 0, 7, 1]
Done for [3, 0, 0, 2]
Done for [4, 2, 2, 6]
Done for [2, 4, 5, 7]
Done for [7, 2, 6, 0]
Done for [6, 7, 2, 5]
Done for [7, 6, 5, 1]
Done for [0, 1, 4, 7]
