# Setup

In [1]:
import os, base64
from dotenv import load_dotenv
from openai import OpenAI
from pdf2image import convert_from_path

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

In [2]:
from IPython.display import Markdown, display

# Perform OCR and transform to images

In [3]:
def pdf_to_images(pdf_path, output_folder):
    """Save every page returned by ``convert_from_path`` as a JPEG file.

    Args:
        pdf_path: Path of the PDF to convert.
        output_folder: Directory that receives the images; it is created
            (including missing parents) when it does not exist yet.

    Returns:
        list[str]: One file path per page, in page order, named
        ``<pdf file name without extension>_pg<N>.jpg`` with ``N`` starting at 1.
    """
    # exist_ok=True replaces the separate exists()/makedirs() pair, which could
    # fail if the folder was created between the check and the call.
    os.makedirs(output_folder, exist_ok=True)

    pages = convert_from_path(pdf_path)
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    image_paths = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f"{pdf_name}_pg{page_number}.jpg")
        page.save(image_path, "JPEG")
        image_paths.append(image_path)

    return image_paths

In [4]:
# Convert the cookbook PDF into one image per page.
# Every path is resolved relative to the current working directory.
main_folder = os.path.join(os.getcwd(), "5.rag_openai")
receips_folder = os.path.join(main_folder, "receips")
output_folder = os.path.join(main_folder, "img")

pdf_path = os.path.join(receips_folder, "Southern Cookbook of Fine Recipes.pdf")
image_paths = pdf_to_images(pdf_path, output_folder)

In [5]:
# OpenAI client (key loaded from the environment) and the chat model name
client = OpenAI(api_key=openai_api_key)
model = "gpt-4o-mini"

In [6]:
# Read one page image (page 9) and encode its bytes as base64 text
image_path = os.path.join(output_folder, "Southern Cookbook of Fine Recipes_pg9.jpg")
with open(image_path, "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode('utf-8')

In [7]:
# First, minimal system prompt for the image-extraction call
system_prompt = """
Analyse the content of this image and extract any related recipe information.
"""

In [8]:
def generate(client, system_prompt, image, model="gpt-4o-mini"):
    """Send one base64-encoded JPEG to a chat model and return its text reply.

    Args:
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client in this notebook).
        system_prompt: Instructions sent as the system message.
        image: Base64 text of the image, without the ``data:`` URL prefix.
        model: Name of the chat model to call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_content = [
        # Each content part is a typed object; the text part was previously a
        # bare string mixed into the list, which is not the documented shape.
        {"type": "text", "text": "This is the image from the recipe page."},
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image}",
                "detail": "high",
            },
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=0,
    )
    return response.choices[0].message.content

In [9]:
#response = generate(client, system_prompt, image_data)
#display(Markdown(response))

In [10]:
def get_gpt_response(client, system_prompt, imagem):
    """Run ``generate`` on one image and return the reply in raw and renderable form.

    Args:
        client: Client object forwarded to ``generate``.
        system_prompt: System prompt forwarded to ``generate``.
        imagem: Base64-encoded image forwarded to ``generate``.

    Returns:
        list: ``[text, markdown]`` where ``text`` is the string returned by
        ``generate`` and ``markdown`` is a ``Markdown`` object wrapping it.
        Evaluating ``result[1]`` as the last expression of a cell (or passing
        it to ``display``) renders the reply.
    """
    response = generate(client, system_prompt, imagem)
    # display() returns None, so storing its return value left the second slot
    # always empty. Return the Markdown object itself and let the caller render it.
    return [response, Markdown(response)]

In [11]:
# Improved system prompt: asks for named fields (title, ingredients, steps,
# cuisine type, dish type, tags) in a form suited to a RAG system
system_prompt2 = """
Analyse the content of this image and extract any related recipe information into structure components.
Specifically, extract the recipe title, list of ingredients, step by step instructions, cousine type, dish type, any relevant tags or metadata.
The output must be in a way suited for embedding in a Retrieval Augmented Generation (RAG) system.
"""

In [12]:
# Test
#resposta = get_gpt_response(client, system_prompt2, image_data)
#resposta[1]

---
#### Precisa Melhorar
No resultado, não foram mostradas todas as receitas da imagem. É preciso continuar testando os prompts e os parâmetros.
<img src="5.rag_openai/img/Southern%20Cookbook%20of%20Fine%20Recipes_pg9.jpg" width="700"/>

Devido ao custo e ao tempo, vamos continuar com o que temos.

In [13]:
def converter_imagens_b64(imagens_path):
    """Read each image file and return its bytes as base64 text.

    Args:
        imagens_path: Iterable of image file paths.

    Returns:
        list[str]: One base64 string per path, in input order.
    """
    def _encode_file(path):
        # Binary read, then base64 bytes decoded to a str
        with open(path, "rb") as handle:
            return base64.b64encode(handle.read()).decode('utf-8')

    return [_encode_file(path) for path in imagens_path]

In [14]:
def gerar_respostas(client, system_prompt, lista_imagens_b64):
    """Call ``generate`` once per encoded image and collect the replies.

    Args:
        client: Client object forwarded to ``generate``.
        system_prompt: System prompt forwarded to ``generate``.
        lista_imagens_b64: Iterable of base64-encoded images.

    Returns:
        list: The value returned by ``generate`` for each image, in input order.
    """
    return [
        generate(client, system_prompt, imagem_b64)
        for imagem_b64 in lista_imagens_b64
    ]

In [15]:
def filtrar_receitas(receitas):
    """Keep only the records whose text contains a recipe marker word.

    Args:
        receitas: Iterable of dicts with a ``"conteudo"`` string and an
            ``"imagem_path"`` value.

    Returns:
        list: The records whose lower-cased ``"conteudo"`` contains
        ``"ingredients"``, ``"instructions"`` or ``"recipe title"``, in input
        order. One line is printed for every record that is dropped.
    """
    keywords = ("ingredients", "instructions", "recipe title")
    receitas_filtradas = []
    for receita in receitas:
        conteudo = receita["conteudo"].lower()
        if any(keyword in conteudo for keyword in keywords):
            receitas_filtradas.append(receita)
        else:
            # Single quotes inside the replacement field: reusing the outer
            # double quotes is a SyntaxError on Python versions before 3.12.
            print(f"Ignorando receita: {receita['imagem_path']}")
    return receitas_filtradas

In [16]:
def converter_gerar_filtrar(client, system_prompt, imagens_path):
    """Encode the images, query the model for each one and filter the results.

    Args:
        client: Client object forwarded to ``gerar_respostas``.
        system_prompt: System prompt forwarded to ``gerar_respostas``.
        imagens_path: Sequence of image file paths.

    Returns:
        The value returned by ``filtrar_receitas`` for the list of
        ``{"imagem_path": ..., "conteudo": ...}`` records, one per image.
    """
    encoded = converter_imagens_b64(imagens_path)
    replies = gerar_respostas(client, system_prompt, encoded)
    # Pair every path with the reply at the same position
    registros = [
        {"imagem_path": caminho, "conteudo": replies[posicao]}
        for posicao, caminho in enumerate(imagens_path)
    ]
    return filtrar_receitas(registros)

In [41]:
# Test with three cookbook pages (pages 9 to 11)
images_path = [
    "5.rag_openai/img/Southern Cookbook of Fine Recipes_pg9.jpg",
    "5.rag_openai/img/Southern Cookbook of Fine Recipes_pg10.jpg",
    "5.rag_openai/img/Southern Cookbook of Fine Recipes_pg11.jpg"
]
receitas_filtradas = converter_gerar_filtrar(client, system_prompt2, images_path)
receitas_filtradas

[{'imagem_path': '5.rag_openai/img/Southern Cookbook of Fine Recipes_pg9.jpg',
  'conteudo': 'Here’s the structured recipe information extracted from the image:\n\n### Recipe Title\nChicken Gumbo\n\n### Cuisine Type\nSouthern\n\n### Dish Type\nSoup\n\n### Ingredients\n- 1 small stewing chicken\n- 2 tablespoons flour, melted\n- 3 tablespoons butter\n- 1 onion, chopped\n- 4 cups okra, sliced and chopped\n- 2 cups tomato pulp\n- Few sprigs parsley, chopped\n- Salt and pepper to taste\n- 4 cups water\n\n### Instructions\n1. Clean and dress the chicken, cutting it into serving portions.\n2. Dredge the chicken lightly with flour and sauté in melted butter along with the chopped onion.\n3. Once the chicken is nicely browned, add the okra, tomatoes, parsley, and water.\n4. Season to taste with salt and pepper.\n5. Cook slowly until the chicken is tender and the okra is well-cooked (about 2½ hours).\n6. If a thinner soup is preferred, increase the quantity of water as needed.\n\n### Relevant Ta

## Salvando dados em JSON

In [19]:
import json

In [42]:
# Write the filtered page records to a JSON file
output_file = "5.rag_openai/json/recipe_info.json"

with open(output_file, "w") as json_file:
    json.dump(receitas_filtradas, json_file, indent = 4)

## Embeddings
Hora de fazer os embeddings

In [43]:
import numpy as np

In [44]:
# Load the filtered recipes back from the JSON file
with open("5.rag_openai/json/recipe_info.json", "r") as json_file:
    receitas_filtradas = json.load(json_file)

Os dados estão organizados por página. Poderiam estar organizados por receita, mas isso deveria ser feito no pré-processamento.

In [57]:
# Generate the embeddings for the filtered records: every "conteudo" text is
# sent in a single request
receitas_conteudo = [receita["conteudo"] for receita in receitas_filtradas]
embedding_response = client.embeddings.create(
    input = receitas_conteudo,
    model = "text-embedding-3-large"
)

Página dos modelos de embeddings: https://platform.openai.com/docs/guides/embeddings#embedding-models

In [58]:
# Extract the embedding vectors from the response
embeddings = [data.embedding for data in embedding_response.data]

In [59]:
# Convert the embeddings to a float32 NumPy matrix: FAISS works on 32-bit
# floats, while np.array() on Python floats defaults to float64.
embedding_matrix = np.array(embeddings, dtype="float32")

In [60]:
# Check the embeddings: number of records and length of one vector
print(f"Gerado embeddings de {len(receitas_filtradas)} receitas")
print(f"Cada embedding tem {len(embeddings[0])} de tamanho")

Gerado embeddings de 3 receitas
Cada embedding tem 3072 de tamanho


## Retrieval System

In [61]:
import faiss

In [62]:
# Initialize a flat L2 FAISS index sized to the embedding dimension
# (number of columns of the matrix) and add every embedding to it
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

In [63]:
# Save the index to disk
faiss.write_index(index, "5.rag_openai/receitas_filtradas_index.index")

In [64]:
# Save the metadata (recipe text and source image path, in the same order as
# receitas_filtradas).
# It is used to check whether the data pulled by the RAG is relevant to the
# query and whether retrieval is working well.
metadata = [{'receita_conteudo': receita['conteudo'],
             'imagem_path': receita['imagem_path']} for receita in receitas_filtradas]
with open("5.rag_openai/json/receita_metadata.json", "w") as json_file:
    json.dump(metadata, json_file, indent = 4)

In [71]:
def query_embeddings(query, index, metadada, k=5):
    """Embed a query and return the nearest recipe texts from the index.

    The embedding request uses the notebook-level ``client``.

    Args:
        query: Text to search for.
        index: Index object whose ``search(vectors, k)`` returns
            ``(distances, indices)``.
        metadada: Sequence of dicts, one per index row, each with a
            ``'receita_conteudo'`` entry. The name is misspelled in the
            original signature and kept so keyword callers keep working.
        k: Maximum number of neighbours to return.

    Returns:
        list[tuple]: ``(receita_conteudo, distance)`` pairs in the order the
        index returns them; empty when there is nothing to search.
    """
    # The body used to read the notebook-global `metadata` and silently ignore
    # this argument; bind the argument to the correctly spelled local name.
    metadata = metadada

    top_k = min(k, len(metadata))
    if top_k <= 0:
        # Nothing to retrieve; skip the embedding request entirely.
        return []

    # Embed the query with the same model used for the indexed texts
    query_embedding = client.embeddings.create(
        input=[query],
        model="text-embedding-3-large"
    ).data[0].embedding
    # FAISS works on 32-bit float matrices of shape (n_queries, dimension)
    query_vector = np.array(query_embedding, dtype="float32").reshape(1, -1)

    # Search the FAISS index
    distances, indices = index.search(query_vector, top_k)

    # Drop positions outside the metadata range (e.g. negative placeholders)
    return [
        (metadata[i]['receita_conteudo'], dist)
        for i, dist in zip(indices[0].tolist(), distances[0].tolist())
        if 0 <= i < len(metadata)
    ]

In [70]:
# Test the retrieval system with a sample query and print the
# (text, distance) pairs it returns
query = "How to make bread?"
resultados = query_embeddings(query, index, metadata)
print(f"Os resultados são {resultados}")

The query embedding is [-0.019710248336195946, -0.028066067025065422, -0.02203328348696232, 0.016628669574856758, -0.04790668934583664, -0.048831161111593246, 0.03799822926521301, 0.017932415008544922, 0.0015822718851268291, 0.004142352379858494, -0.0047971876338124275, -0.013878954574465752, -0.012883367016911507, -0.0016044947551563382, 0.03806934133172035, -0.02381111867725849, 0.017019793391227722, -0.016676079481840134, -0.007277265656739473, 0.002324517350643873, -0.03185877948999405, 0.02010137215256691, 0.015004915185272694, -0.007810615468770266, 0.006263900548219681, 0.01783759705722332, 0.0033926991745829582, -0.011206277646124363, -0.03970494866371155, 0.04233614355325699, -0.003641595831140876, 0.023704446852207184, -0.03463219851255417, -0.02117992378771305, -0.015893831849098206, 0.011307021602988243, -0.014767871238291264, 0.005274239461869001, 0.008503970690071583, 0.019413942471146584, -0.012444835156202316, 0.007792836986482143, 0.00036093726521357894, -0.00826692581

In [75]:
# Combine the results
def combined_retrieved_content(results):
    """Join the first element of every result into one string.

    Args:
        results: Iterable of indexable items whose element ``0`` is a string.

    Returns:
        str: The texts separated by a blank line; empty string for no results.
    """
    separator = "\n\n"
    return separator.join(entry[0] for entry in results)
    
# Merge the texts retrieved for the test query into a single string
conteudo_combinado = combined_retrieved_content(resultados)

In [86]:
conteudo_combinado

'Here’s the structured recipe information extracted from the image:\n\n### Recipe Title\nCreole Soup à la Madame Begue\n\n### Ingredients\n- 1 tablespoon butter, melted\n- 1 tablespoon chopped green pepper\n- 1 tablespoon chopped red pepper\n- 1 tablespoon flour\n- 1½ cups soup stock\n- 1 cup tomato pulp\n- ½ cup corn\n- Salt and pepper\n\n### Instructions\n1. Lightly brown the peppers in the melted butter.\n2. Add the flour and stir.\n3. Slowly add the soup stock and tomato pulp; continue to stir until soup boils.\n4. Reduce the heat, cover, and let cook slowly for 20 minutes.\n5. Strain into another pot, add the corn, and season to taste with salt and pepper.\n\n### Cuisine Type\nSouthern\n\n### Dish Type\nSoup\n\n### Tags/Metadata\n- Comfort Food\n- Vegetarian Option (if using vegetable stock)\n- Quick Preparation\n\n---\n\n### Additional Recipes Mentioned\n1. **Plantation Soup**\n   - Ingredients: Carrot, celery, onion, soup stock, butter, flour, milk, grated cheese.\n   - Instruct

## Generative System

In [80]:
# System prompt for the generation step: answer only from the provided content.
# Plain string: the text has no placeholders, so no f-prefix is needed.
system_prompt3 = """
You are highly experienced and expert chef specialized in providing cooking advice.
Your main taks is to provide information precise and accurate on the combined content.
You answer directly to the query using only information from the provided content.
If you don't know the answer, just say that you don't know.
Your goal is to help the user and answer the query.
"""

In [82]:
# Generation step: answer the query from the retrieved content
def generate_response(query, combined_content, system_prompt):
    """Ask the chat model to answer ``query`` from ``combined_content``.

    Uses the notebook-level ``client`` and ``model``.

    Args:
        query: The user's question.
        combined_content: Retrieved text the answer should be based on.
        system_prompt: Instructions sent as the system message.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    # The retrieved text travels inside the user turn, separated from the
    # question. Sending it as an "assistant" message presents it to the model
    # as its own previous reply rather than as source material.
    user_message = (
        f"Provided content:\n{combined_content}\n\n"
        f"Query: {query}"
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        temperature=0
    )
    return response.choices[0].message.content

In [85]:
# Get the results: ask the model using the content merged from the earlier
# retrieval test, then render the reply as Markdown
query = "Get me the best chiken gumbo recipe"
response = generate_response(query, conteudo_combinado, system_prompt3)
display(Markdown(response))

I don't know.

In [87]:
# End-to-end RAG function: retrieve, merge, generate
def rag_system(query, index, metadata, system_prompt, k=5):
    """Answer ``query`` by retrieving recipe text and passing it to the chat model.

    Args:
        query: The user's question.
        index: Index object forwarded to ``query_embeddings``.
        metadata: Metadata records forwarded to ``query_embeddings``.
        system_prompt: System prompt forwarded to ``generate_response``.
        k: Maximum number of records to retrieve.

    Returns:
        The value returned by ``generate_response``.
    """
    # Retrieval: forward k so the caller's limit is applied (it was dropped before)
    results = query_embeddings(query, index, metadata, k=k)

    # Content merge
    combined_content = combined_retrieved_content(results)

    # Generation: use the prompt passed in, not the notebook-global system_prompt3
    return generate_response(query, combined_content, system_prompt)

In [89]:
# Test the RAG system
query1 = "How to make the best chicken gumbo?"
resposta = rag_system(query1, index, metadata, system_prompt3)
display(Markdown(resposta))

To make the best chicken gumbo, follow this recipe:

### Ingredients
- 1 small stewing chicken
- 2 tablespoons flour, melted
- 3 tablespoons butter
- 1 onion, chopped
- 4 cups okra, sliced and chopped
- 2 cups tomato pulp
- Few sprigs parsley, chopped
- Salt and pepper to taste
- 4 cups water

### Instructions
1. Clean and dress the chicken, cutting it into serving portions.
2. Dredge the chicken lightly with flour and sauté in melted butter along with the chopped onion.
3. Once the chicken is nicely browned, add the okra, tomatoes, parsley, and water.
4. Season to taste with salt and pepper.
5. Cook slowly until the chicken is tender and the okra is well-cooked (about 2½ hours).
6. If a thinner soup is preferred, increase the quantity of water as needed.

Enjoy your delicious chicken gumbo!

In [91]:
# Second test query for the RAG system
query2 = "i want something vegan"
resposta = rag_system(query2, index, metadata, system_prompt3)
display(Markdown(resposta))

I don't know.