In [1]:
import numpy as np
import pandas as pd
import json
from dotenv import load_dotenv
import os
import re
import csv
load_dotenv()

True

In [2]:
api_key = os.environ.get('OPENAI_API_KEY')

from openai import OpenAI
client = OpenAI(api_key=api_key)

# Pre Processing

In [3]:
# Load the dataset
df = pd.read_csv('Dishes.csv')

# Split the multiline dishes and descriptions into lists
dish_list = df['dishes'].str.split('\n\n', expand=False)
description_list = df['Description'].str.split('\n\n', expand=False)

# Initialize lists to hold individual dish names, ingredients, summaries, and descriptions
all_dishes = []
all_ingredients = []
all_summaries = []
all_descriptions = []

# Regex patterns to match ingredients and summary
ingredients_pattern = r'Ingredients: (.+)'
summary_pattern = r'Summary: (.+)'

# Iterate over each row
for i in range(len(dish_list)):
    # Get each dish and corresponding description
    dishes = dish_list[i]
    descriptions = description_list[i]
    
    # Loop through each dish entry
    for j in range(len(dishes)):
        # Extract dish name, ingredients, and summary
        dish_lines = dishes[j].split('\n')
        dish_name = dish_lines[0].strip()  # First line is the dish name
        ingredients = re.search(ingredients_pattern, dishes[j])
        summary = re.search(summary_pattern, dishes[j])
        
        all_dishes.append(dish_name)
        all_ingredients.append(ingredients.group(1) if ingredients else '')
        all_summaries.append(summary.group(1) if summary else '')
        all_descriptions.append(descriptions[j])

# Create a new DataFrame with separate columns for dish name, ingredients, summary, and description
new_df = pd.DataFrame({
    'Dish': all_dishes,
    'Ingredients': all_ingredients,
    'Summary': all_summaries,
    'Description': all_descriptions
})

In [4]:
# Save the new dataset
new_df.to_csv('data/dishes_col.csv', index=False)

print("Data cleaned and saved to 'dishes_col.csv'")

Data cleaned and saved to 'dishes_col.csv'


In [5]:
dishes = pd.read_csv('data/dishes_col.csv')

In [6]:
df.nunique()

dishes         10
Description    10
dtype: int64

## EDA

In [7]:
dishes.describe()

Unnamed: 0,Dish,Ingredients,Summary,Description
count,50,50,50,50
unique,50,50,50,50
top,6. Sinigang na Bangus (Milkfish in Sour Soup),"Bangus, tomatoes, onions, string beans, eggpla...","Boil water with tomatoes and onions, add tamar...","6. Sinigang na Bangus:\nA tangy, aromatic soup..."
freq,1,1,1,1


In [8]:
dishes.nunique()

Dish           50
Ingredients    50
Summary        50
Description    50
dtype: int64

In [9]:
dishes.head()

Unnamed: 0,Dish,Ingredients,Summary,Description
0,6. Sinigang na Bangus (Milkfish in Sour Soup),"Bangus, tomatoes, onions, string beans, eggpla...","Boil water with tomatoes and onions, add tamar...","6. Sinigang na Bangus:\nA tangy, aromatic soup..."
1,7. Chicken Afritada,"Chicken, potatoes, carrots, bell peppers, toma...","Sauté garlic and onions, add chicken, vegetabl...","7. Chicken Afritada:\nA vibrant, hearty stew w..."
2,8. Pork Menudo,"Pork, potatoes, carrots, bell peppers, tomato ...","Sauté garlic and onions, add pork and vegetabl...","8. Pork Menudo:\nA rustic, colorful dish with ..."
3,9. Beef Caldereta,"Beef, potatoes, carrots, bell peppers, tomato ...","Sauté garlic and onions, add beef and simmer u...","9. Beef Caldereta:\nA luxurious, deep red stew..."
4,10. Pinakbet,"String beans, eggplant, bitter gourd, squash, ...","Sauté garlic, onions, and tomatoes. Add vegeta...","10. Pinakbet:\nA rustic, colorful vegetable me..."


## Cleaning

In [10]:
# Function to remove newline characters and numbers using re
def preprocess(text):
    if isinstance(text, str):
        # Remove newlines and numbers
        return re.sub(r'[\n\d.]', '', text)  # Replace newlines and numbers with a space
    return text  # Return as is if not a string

def remove_dish_name_from_description(description):
    if isinstance(description, str):
        # Remove the text before the first colon (:) and any space after it
        return re.sub(r'^.*?:\s*', '', description)
    return description  # Return as is if not a string

In [11]:
df_cleaned = dishes.map(preprocess)
df_cleaned['Description'] = df_cleaned['Description'].apply(remove_dish_name_from_description)

In [12]:
df_cleaned.head()

Unnamed: 0,Dish,Ingredients,Summary,Description
0,Sinigang na Bangus (Milkfish in Sour Soup),"Bangus, tomatoes, onions, string beans, eggpla...","Boil water with tomatoes and onions, add tamar...","A tangy, aromatic soup that's clear and light ..."
1,Chicken Afritada,"Chicken, potatoes, carrots, bell peppers, toma...","Sauté garlic and onions, add chicken, vegetabl...","A vibrant, hearty stew with golden-brown chick..."
2,Pork Menudo,"Pork, potatoes, carrots, bell peppers, tomato ...","Sauté garlic and onions, add pork and vegetabl...","A rustic, colorful dish with cubes of pork swi..."
3,Beef Caldereta,"Beef, potatoes, carrots, bell peppers, tomato ...","Sauté garlic and onions, add beef and simmer u...","A luxurious, deep red stew with chunks of tend..."
4,Pinakbet,"String beans, eggplant, bitter gourd, squash, ...","Sauté garlic, onions, and tomatoes Add vegetab...","A rustic, colorful vegetable medley Green stri..."


## JSON

In [13]:
# Transform the DataFrame into a list of dictionaries
data_for_embedding = df_cleaned.to_dict(orient='records')

# Convert to JSON
import json
json_data = json.dumps(data_for_embedding, indent=4)

In [14]:
# Save to a JSON file
with open('data/dishes_embeddings.json', 'w') as f:
    f.write(json_data)

print("Data saved to 'dishes_embeddings.json'")

Data saved to 'dishes_embeddings.json'


In [15]:
file_path = 'data/dishes_embeddings.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
    # Convert values to a list
    dishes = list(data)

    # Print the loaded JSON data (nicely formatted)
    print("Loaded JSON data:")
    print(json.dumps(data, indent=4))

Loaded JSON data:
[
    {
        "Dish": " Sinigang na Bangus (Milkfish in Sour Soup)",
        "Ingredients": "Bangus, tomatoes, onions, string beans, eggplant, tamarind powder, salt",
        "Summary": "Boil water with tomatoes and onions, add tamarind powder, vegetables, and fish Simmer until cooked",
        "Description": "A tangy, aromatic soup that's clear and light The milkfish chunks float in a broth that's a pale, cloudy pink from the tomatoes Green string beans and purple eggplant add pops of color The soup has a sour, mouth-puckering taste from the tamarind, balanced by the mild, flaky fish It smells fresh and slightly acidic, with a hint of fish and vegetables"
    },
    {
        "Dish": " Chicken Afritada",
        "Ingredients": "Chicken, potatoes, carrots, bell peppers, tomato sauce, garlic, onions, cooking oil",
        "Summary": "Saut\u00e9 garlic and onions, add chicken, vegetables, and tomato sauce Simmer until chicken is cooked",
        "Description": "A vibr