In [None]:
import os

# Move up to the project root when the kernel was started inside the
# "notebooks" directory, so relative paths such as "src/assets/..." resolve.
# Only the final path component is compared: a substring test on the whole
# path would also fire when some ancestor directory merely contains
# "notebooks", and would climb one more level each time the cell is re-run.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

In [None]:
from openai import OpenAI

# OpenAI API client used for the chat-completion calls in later cells.
# NOTE(review): no credentials are passed here; presumably the SDK reads the
# API key from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

In [None]:
from functools import lru_cache

from chromadb.api.models.Collection import Collection

from src.config import settings
from src.extract_embeddings import (
    create_chroma_client,
    create_thesis_collection,
)

# Name forwarded as `chromadb_name` to both factory calls in
# load_collection(). Swap in the commented-out value to target the
# "instructor" variant instead (presumably a different embedding setup —
# confirm against src.extract_embeddings).
chromadb_name = "default"
# chromadb_name = "instructor"


@lru_cache
def load_collection() -> Collection:
    """Connect to Chroma and return the thesis collection.

    Connection parameters are read from ``settings``; the module-level
    ``chromadb_name`` is forwarded to both factory calls. Because the function
    takes no arguments, ``lru_cache`` makes every call after the first return
    the same cached object.

    Returns:
        Whatever ``create_thesis_collection.fn`` produces for the connected
        client (annotated as a Chroma ``Collection``).
    """
    # NOTE(review): `.fn` presumably reaches the plain function behind a
    # task/flow decorator — confirm in src.extract_embeddings.
    connection_kwargs = {
        "host": settings.CHROMA_CLIENT_HOSTNAME,
        "port": settings.CHROMA_CLIENT_PORT,
        "auth_provider": settings.CHROMA_CLIENT_AUTH_PROVIDER,
        "auth_credentials": (
            settings.CHROMA_CLIENT_AUTH_CREDENTIALS.get_secret_value()
        ),
        "chromadb_name": chromadb_name,
    }
    # Named `chroma_client` so it is not confused with the notebook-level
    # OpenAI `client`.
    chroma_client = create_chroma_client.fn(**connection_kwargs)
    return create_thesis_collection.fn(
        chroma_client, chromadb_name=chromadb_name
    )


# Collection handle used by the query cell further down; load_collection()
# is memoized, so re-running this line reuses the first result.
collection = load_collection()

In [None]:
import textwrap

# System prompt for the first chat completion. dedent() removes any
# whitespace prefix common to every line of the file.
with open("src/assets/prompt-text-to-chroma.txt", encoding="utf-8") as f:
    prompt_text_to_chroma = textwrap.dedent(f.read())

In [None]:
# Natural-language request sent as the user message in both chat-completion
# calls. Keep exactly one assignment active; the commented-out lines are
# alternative sample queries (intentionally left in their original language).
user_query = "Gostaria de trabalhos sobre tribos indígenas da Amazônia desenvolvidas na UFPA"
# user_query = "Pesquisas desafios do câncer de mama nas relações familiares"
# user_query = "encontre teses e dissertações sobre políticas públicas voltadas a proteção animal"
# user_query = "Liste-me documentos sobre o trabalho infantil no Brasil"
# user_query = "Quero um resumo de trabalhos que fale sobre os impactos ambientais da exploração de minério de ferro no Quadrilátero Ferrífero"
# user_query = "Me dê um resumo de trabalhos que falam sobre o turismo sustentável no Brasil produzidos desde 2020"


# First LLM call: the system prompt plus the user's request, with the reply
# constrained to a JSON object. The reply is decoded in the next cell and its
# "query" / "where" entries drive the Chroma lookup.
query_messages = [
    {"role": "system", "content": prompt_text_to_chroma},
    {"role": "user", "content": user_query},
]
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=query_messages,
    response_format={"type": "json_object"},
)

print(completion.choices[0].message)

In [None]:
import json


def extract_json_from_message(message):
    """Parse the JSON payload carried in a chat message's ``content``.

    Args:
        message: Any object exposing a ``content`` attribute that holds JSON
            text (here, the message of a chat-completion choice).

    Returns:
        The decoded JSON value (a ``dict`` when the text is a JSON object).

    Raises:
        ValueError: If ``content`` is ``None`` or empty, i.e. there is
            nothing to parse. Malformed JSON raises
            ``json.JSONDecodeError``, which is itself a ``ValueError``
            subclass, so callers can catch a single exception type.
    """
    content = message.content
    # Fail with a clear message instead of the opaque TypeError that
    # json.loads(None) would raise when the message carries no text.
    if not content:
        raise ValueError("message has no content to parse as JSON")
    return json.loads(content)


# Decode the first completion's reply; the query cell below reads its
# "query" entry and its optional "where" entry.
response_dict = extract_json_from_message(completion.choices[0].message)

print(response_dict)

In [None]:
# Search the collection with the model-generated query text. A missing, null
# or empty "where" is normalized to None so the query runs without a metadata
# filter: some chromadb versions reject an empty filter dict ({}), whereas
# None is the documented "no filter" default.
query_results = collection.query(
    query_texts=response_dict["query"],
    where=response_dict.get("where") or None,
    n_results=20,
)

# "metadatas" is nested (one inner list per query text); flatten it into a
# single list of metadata records. The raw response keeps its own name so
# `results` always refers to the flat list.
results = [
    metadata
    for metadatas in query_results["metadatas"]
    for metadata in metadatas
]
results

In [None]:
# System prompt for the second (RAG) chat completion. dedent() removes any
# whitespace prefix common to every line of the file.
with open("src/assets/prompt-rag.txt", encoding="utf-8") as f:
    prompt_rag = textwrap.dedent(f.read())

# Second LLM call: the RAG system prompt plus a user message containing the
# original request followed by the retrieved metadata records. `results` is
# interpolated with str(), i.e. as the Python repr of the list. The
# indentation inside the triple-quoted f-string is part of the text sent to
# the model, so it must stay exactly as written.
rag_messages = [
    {"role": "system", "content": prompt_rag},
    {
        "role": "user",
        "content": f"""
            {user_query}
            {results}
            """,
    },
]
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=rag_messages,
    response_format={"type": "json_object"},
)

print(completion.choices[0].message)

In [None]:
# Decode the RAG reply; the final cell uses its "ids" entry to select rows.
response_2 = extract_json_from_message(completion.choices[0].message)
response_2

In [None]:
import pandas as pd

# One row per retrieved metadata record.
results_df = pd.DataFrame(results)

# Keep only the rows whose "id" appears in the ids returned by the RAG step.
is_selected = results_df["id"].isin(response_2["ids"])
df = results_df.loc[is_selected]

df