# __Recipe Recommendation__

### Project Overview

#### Inputs:
- **Ingredients that you have**
- **Ingredients that you are allergic to**

#### Outputs:
- **Table of all data**
- **List of Information for Best Recipe:**
  - Type of Recipe (Food, Drink, Other)
  - Difficulty Level (Easy, Medium, Hard, Unknown)
  - Title
  - Ingredients
  - Ingredients Missing *(optional if completed)*
  - Full Recipe Steps


# Initialization

In [None]:
# Initialize Libraries
import ast
import nltk
import numpy as np
import os
import pandas as pd
import re

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from transformers import pipeline

!pip install openai

In [None]:
# To save and read data files from your Google drive
# Mounts Drive at /content/drive (Colab only; `google.colab` is not
# importable elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import openai
# Take the key from the environment when it is provided so the real secret
# never has to be written into the notebook; the original placeholder is kept
# as the fallback so behaviour is unchanged when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'N/A')

# Import Recipes and Make Updated Data Frame (updated_df)

In [None]:
# Import the github and kaggle files
# Base folder for every data file; file names are appended with plain string
# concatenation (e.g. path + 'kaggle_recipes.csv'), so keep the trailing slash.
path = '/content/drive/MyDrive/2024 Spring/Text Mining/Projects/FinalFolder/'

### Kaggle recipes

In [None]:
# Kaggle recipe dataset, read from the project folder
# Source: https://www.kaggle.com/datasets/pes12017000148/food-ingredients-and-recipe-dataset-with-images
kaggle_df = pd.read_csv(f'{path}kaggle_recipes.csv')

# Column summary first, then a preview of the first rows
kaggle_df.info()
kaggle_df.head()

### GitHub recipes

In [None]:
# GitHub cookbook recipes, read from the project folder with the Python parser engine
# Source: https://github.com/cweber/cookbook/blob/master/recipes.csv
github_df = pd.read_csv(f'{path}github_recipes.csv', engine='python')

# Column summary first, then a preview of the first rows
github_df.info()
github_df.head()

### Updated recipes

In [None]:
# Start updated_df from the GitHub recipes' title and directions
updated_df = github_df[['Title', 'Directions']].copy()

# GitHub spreads ingredients over several "Ingredient*" columns; gather each
# row's non-blank string cells into one list of stripped ingredient strings
ingredient_cols = [name for name in github_df.columns if name.startswith('Ingredient')]
ingredients_column = [
    [
        cell.strip()
        for cell in (github_df.at[row_label, name] for name in ingredient_cols)
        if isinstance(cell, str) and cell.strip()
    ]
    for row_label in range(len(github_df))
]
updated_df['Ingredients'] = ingredients_column

# Kaggle keeps its ingredient list as a Python-literal string; rename the
# columns to match updated_df and parse that string into a real list
kaggle_temp = (
    kaggle_df[['Title', 'Cleaned_Ingredients', 'Instructions']]
    .copy()
    .rename(columns={'Cleaned_Ingredients': 'Ingredients', 'Instructions': 'Directions'})
)
kaggle_temp['Ingredients'] = kaggle_temp['Ingredients'].apply(ast.literal_eval)

# Append the Kaggle recipes to updated_df
updated_df = pd.concat([updated_df, kaggle_temp], ignore_index=True)
updated_df

### Preprocess the Ingredients

In [None]:
unicode_fracs = r'[\u00BC-\u00BE\u2150-\u215E]'

# Measurement, size and preparation words that say nothing about which
# ingredient is meant. A frozenset gives constant-time membership tests.
COMMON_WORDS = frozenset([
    'additional', 'accompaniment',
    'basic', 'bit', 'blend',
    'chopped', 'chunk', 'chunks', 'cook', 'cooked', 'crosswise', 'cubed', 'cup', 'cups', 'cut',
    'diced', 'divided',
    'fine', 'finely',
    'g', 'good', 'gram', 'grams',
    'half',
    'inch',
    'kg',
    'large', 'lb', 'like',
    'medium', 'minute', 'ml',
    'optional', 'ounce', 'ounces',
    'pan', 'patted', 'pieces', 'plus', 'pound', 'pounds', 'precooked',
    'quality', 'quart', 'quartered', 'qt',
    'room',
    'serving', 'sliced', 'slices', 'size', 'small', 'smooth', 'spoon',
    'tablespoon', 'tablespoons', 'taste', 'tbsp', 'teaspoon', 'teaspoons', 'temp', 'temperature', 'thermometer', 'tsp',
    'whole',
])

# Built once instead of once per ingredient string.
_UNICODE_FRAC_RE = re.compile(unicode_fracs)
# A digit run plus any trailing digits, whitespace, '/', '.' or '-'
# (covers "1", "1/2", "1 1/2", "2.5", "3-4").
_QUANTITY_RE = re.compile(r'\d+[\d\s\/\.\-]*')
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')

_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words, loading the corpus only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def my_preprocessor(list_of_strings):
    """Clean a recipe's raw ingredient strings.

    Parameters:
      list_of_strings: (list(str)) one raw ingredient line per element

    Returns:
      list(str): one cleaned string per input element, in the same order.
      Elements that end up empty after cleaning are left out, so the result
      can be shorter than the input.

    Changes applied to each element:
      Converts text to lowercase
      Removes unicode fractions
      Removes numbers / numeric quantities
      Removes punctuation (only word tokens are kept)
      Removes English stop words
      Removes common measurement and preparation words
    """
    stop_words = _english_stop_words()
    text_processed_list = []

    for text in list_of_strings:
        text_clean = _UNICODE_FRAC_RE.sub('', text.lower())

        # This pattern consumes every digit run, so the separate
        # "remove numbers" pass that used to follow it could never match.
        text_clean = _QUANTITY_RE.sub('', text_clean)

        # Tokens never contain whitespace, so filtering stop words and common
        # words in one pass gives the same result as the old
        # filter -> join -> split -> filter sequence.
        kept_tokens = [
            token
            for token in _WORD_TOKENIZER.tokenize(text_clean)
            if token not in stop_words and token not in COMMON_WORDS
        ]

        if kept_tokens:
            text_processed_list.append(' '.join(kept_tokens))

    return text_processed_list

In [None]:
# Make a list of processed ingredients (strings) (will be used later for counting the # of ingredients)
updated_df['Ingredients'] = updated_df['Ingredients'].apply(my_preprocessor)

# Turn those into strings for embedding
updated_df['Ingredients_Text'] = updated_df['Ingredients'].apply(', '.join)

# Delete rows that have NaN in any of the columns.
# dropna() leaves gaps in the index; reset it so row labels equal row
# positions again. Later cells mix positional (iloc) and label-based (loc)
# access and pair rows positionally with embedding arrays, which only lines
# up when the index is 0..n-1.
updated_df = updated_df.dropna().reset_index(drop=True)

updated_df

# Predict Recipe Type (Food, Drink, Other)

In [None]:
# Reload the saved table when it exists. Guarded so that a first run (no file
# yet) keeps the freshly built updated_df instead of stopping with
# FileNotFoundError.
if os.path.exists(path + 'updated_df.csv'):
    updated_df = pd.read_csv(path + 'updated_df.csv')

In [None]:
batch_size = 40  # Keep at 40 for token safety

# Matches one numbered answer line such as "1. food", "2) Drink" or "3. other."
# and captures the label word.
_LABEL_LINE_RE = re.compile(r'^\s*\d+\s*[.)]\s*([a-z]+)')


def classify_batch_with_gpt(batch_df):
    """Ask the chat model to label every recipe in batch_df.

    Parameters:
      batch_df: DataFrame with 'Title' and 'Directions' columns.

    Returns:
      list(str): one lowercase label per row of batch_df, in row order.

    Raises:
      ValueError: if the number of parsed labels differs from len(batch_df).
    """
    prompt_intro = f"""Classify each recipe below as one of the following: food, drink, or other.
  If it is a soup, dessert, or cake, classify as other.

  Respond with exactly {len(batch_df)} numbered lines like this:
  1. food
  2. drink
  ...

  Only return the classification word per line. Do not include titles or directions in your response.

  """

    prompt_body = "".join(
        f"{i}) Title: {row.Title}\nDirections: {row.Directions}\n\n"
        for i, row in enumerate(batch_df.itertuples(), start=1)
    )

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt_intro + prompt_body}],
        temperature=0
    )

    # Parse "<number>. <label>" lines. Taking the text after the last "."
    # returned an empty label whenever the model ended a line with a period.
    cleaned = []
    for line in response.choices[0].message.content.strip().splitlines():
        match = _LABEL_LINE_RE.match(line.lower())
        if match:
            cleaned.append(match.group(1))

    if len(cleaned) != len(batch_df):
        raise ValueError(f"Mismatched response length: expected {len(batch_df)}, got {len(cleaned)}")

    return cleaned


# If updated_df exists in drive, load it
if os.path.exists(path + 'updated_df.csv'):
    updated_df = pd.read_csv(path + 'updated_df.csv')
else:  # Apply classifier and save it
    # Create the column up front; rows whose batch fails stay empty and are
    # removed by the later dropna().
    updated_df['Recipe_Type'] = None

    for start in range(0, len(updated_df), batch_size):
        end = min(start + batch_size, len(updated_df))
        batch_df = updated_df.iloc[start:end]
        try:
            classifications = classify_batch_with_gpt(batch_df)
        except (ValueError, openai.OpenAIError) as err:
            # Best effort: report the failed batch and carry on with the rest.
            print(f"Skipped rows {start}-{end - 1}: {err}")
            continue
        # Write back by the batch's own labels. A start:end-1 label slice only
        # matches the positional batch when the index has no gaps.
        updated_df.loc[batch_df.index, 'Recipe_Type'] = classifications
        print(f"Processed {end} rows")

    # Only cache a complete result. A partially classified table would
    # otherwise be reloaded forever by the branch above.
    if updated_df['Recipe_Type'].notna().all():
        updated_df.to_csv(path + 'updated_df.csv', index=False)
    else:
        print("Some batches failed; updated_df.csv was not written.")

updated_df.head()

In [None]:
# Safety net: discard every row in which at least one column is NaN
updated_df = updated_df.dropna(axis=0, how='any')

# Choose ingredients you have, are allergic to, and the recipe type

In [None]:
# User inputs. Each list holds comma-separated ingredient names; the scoring
# cell joins the list elements with ', ' and embeds the result as one sentence.
# have = ['chicken, water, rice, olive oil, onion, garlic, salt, pepper, pasta, beef, and tomatoes, butter, oil, celery, carrot']
have = ['bread, butter, cheese']
allergic = ['peanut, nut, tree nut, cashew, hazelnut, macaroon, pistachio, almond, coconuts, adhesive']

# Choose recipe type ('all', 'food', 'drink', 'other')
# Any other value makes the "Determine Embedding Type" cell raise ValueError.
recipe_type = 'food'

# Start Embedding

### Make model

In [None]:
# Make model
# The same model instance encodes both the recipe ingredient text and the
# user's have/allergic strings.
# model = SentenceTransformer('all-MiniLM-L6-v2') # does not do well
model = SentenceTransformer('paraphrase-mpnet-base-v2')

In [None]:
# If embeddings are saved, use them. Otherwise, make and save them

def _load_or_build_embedding(cache_name, texts):
    """Return one embedding row per entry of texts, reusing the saved .npy when it still fits.

    Parameters:
      cache_name: (str) file name of the .npy cache inside `path`
      texts: (list(str)) ingredient strings to embed

    Returns:
      numpy array whose first dimension equals len(texts)
    """
    cache_file = path + cache_name
    if os.path.exists(cache_file):
        cached = np.load(cache_file)
        # A cache built from a different table has a different row count and
        # would fail later when scores are written back into the DataFrame.
        # Equal counts do not prove the rows are identical; delete the file by
        # hand after editing recipes in place.
        if cached.shape[0] == len(texts):
            return cached
        print(f"{cache_name} has {cached.shape[0]} rows but {len(texts)} recipes were given; rebuilding.")

    embedding = model.encode(texts, batch_size=64, show_progress_bar=True)
    np.save(cache_file, embedding)
    return embedding


# All recipe types
if recipe_type == 'all':
    recipe_text = updated_df['Ingredients_Text'].tolist()
    all_recipe_embedding = _load_or_build_embedding('all_recipe_embedding.npy', recipe_text)

# Food type
elif recipe_type == 'food':
    food_recipe_text = updated_df[updated_df['Recipe_Type'] == 'food']['Ingredients_Text'].tolist()
    food_recipe_embedding = _load_or_build_embedding('food_recipe_embedding.npy', food_recipe_text)

# Drink type
elif recipe_type == 'drink':
    drink_recipe_text = updated_df[updated_df['Recipe_Type'] == 'drink']['Ingredients_Text'].tolist()
    drink_recipe_embedding = _load_or_build_embedding('drink_recipe_embedding.npy', drink_recipe_text)

# Other type
elif recipe_type == 'other':
    other_recipe_text = updated_df[updated_df['Recipe_Type'] == 'other']['Ingredients_Text'].tolist()
    other_recipe_embedding = _load_or_build_embedding('other_recipe_embedding.npy', other_recipe_text)

In [None]:
# Determine Embedding Type
if recipe_type == 'all':
    recipe_embedding = all_recipe_embedding
elif recipe_type == 'food':
    recipe_embedding = food_recipe_embedding
elif recipe_type == 'drink':
    recipe_embedding = drink_recipe_embedding
elif recipe_type == 'other':
    recipe_embedding = other_recipe_embedding
else:
    raise ValueError("Invalid recipe type. Choose from 'all', 'food', 'drink', 'other'.")

# scores_df is always a separate frame with a fresh 0..n-1 index, so its rows
# line up with the embedding rows. 'all' used to alias updated_df itself, which
# let the score columns leak into updated_df and kept whatever index it had.
if recipe_type == 'all':
    scores_df = updated_df.reset_index(drop=True)
else:
    scores_df = updated_df[updated_df['Recipe_Type'] == recipe_type].reset_index(drop=True)

### Determine raw and normalized scores

In [None]:
# Build one query sentence for what we have and one for what we are allergic to
have_string = ', '.join(have)
allergic_string = ', '.join(allergic)

# Embed both query sentences
have_embedding = model.encode([have_string])
allergic_embedding = model.encode([allergic_string])

# Cosine similarity of each query against every recipe embedding
# (row 0 is the only query row)
have_scores = cosine_similarity(have_embedding, recipe_embedding)[0]
allergic_scores = cosine_similarity(allergic_embedding, recipe_embedding)[0]

# Raw scores and their difference
for column, values in (('Have_Score_Raw', have_scores), ('Allergic_Score_Raw', allergic_scores)):
    scores_df.loc[:, column] = values
scores_df.loc[:, 'Combined_Score_Raw'] = scores_df['Have_Score_Raw'] - scores_df['Allergic_Score_Raw']

# Min-max scale the two score columns independently
scaler = MinMaxScaler()
raw = np.column_stack((have_scores, allergic_scores))  # shape (n_recipes, 2)
normed = scaler.fit_transform(raw)

# Normalized scores and their difference
for position, column in enumerate(('Have_Score', 'Allergic_Score')):
    scores_df.loc[:, column] = normed[:, position]
scores_df.loc[:, 'Combined_Score'] = scores_df['Have_Score'] - scores_df['Allergic_Score']

# Best normalized combined score first
sorted_df = scores_df.sort_values(by='Combined_Score', ascending=False)
sorted_df.head()

### Remove recipes the user is likely allergic to (normalized Allergic Score ≥ 0.5)

In [None]:
# Drop the raw score columns, then keep only recipes whose normalized
# allergic score is below 0.5
no_allergies_df = (
    sorted_df
    .drop(columns=['Have_Score_Raw', 'Allergic_Score_Raw', 'Combined_Score_Raw'])
    .loc[lambda frame: frame['Allergic_Score'] < 0.5]
)
no_allergies_df.head()

# Use HuggingFace to see what Ingredients I don't have

### Import Hugging Face and start the chat

In [None]:
!pip install huggingface_hub
!pip install hugchat

In [None]:
from huggingface_hub import InferenceApi, InferenceClient
from hugchat import hugchat
from hugchat.login import Login
import os

# Take the credentials from the environment when they are provided so real
# secrets stay out of the notebook; the original placeholder values are kept
# as fallbacks so behaviour is unchanged when the variables are not set.
Email = os.environ.get('HUGGINGFACE_EMAIL', 'lhosk')
Password = os.environ.get('HUGGINGFACE_PASSWORD', 'NA')

# Log in to huggingface and grant authorization to huggingchat
sign = Login(Email, Password)
cookies = sign.login()

In [None]:
# Start a chatbot using the cookies returned by the login cell
chatbot = hugchat.ChatBot(cookies=cookies.get_dict())  # or cookie_path="usercookies.json"

In [None]:
# Start a new conversation
conversation_id = chatbot.new_conversation() # Create a new conversation and keep its handle
chatbot.change_conversation(conversation_id) # Switch the chatbot to that conversation

### Test HuggingFace

In [None]:
# First three rows of the allergy-filtered ranking
top3_df = no_allergies_df.iloc[:3]

In [None]:
# Remove the columns that are not shown or sent to the chatbot
top3_df = top3_df.drop(['Ingredients', 'Allergic_Score', 'Combined_Score'], axis=1)

top3_df

In [None]:
# Ask the chatbot, one recipe at a time, which ingredients are missing
recipe_fields = zip(top3_df["Title"], top3_df["Directions"], top3_df["Ingredients_Text"])
for recipe_title, recipe_directions, recipe_ingredients in recipe_fields:
    prompt = f"""I have the following ingredients: {', '.join(have)}.
  The recipe is {recipe_title}.
  These are the directions: {recipe_directions}.
  These are the ingredients: {recipe_ingredients}

  Also group all cheese together. Like you can generalize a bit and get rid of cooking sprays and stuff

  Print: Name of Recipe
  Print missing ingredients: You are missing: ingredient, ingredient, ...
  Next Line Print: You have (percentage %) of the ingredients.
  Next Line Print if there are alternatives.
  """

    # Send the prompt and print the reply followed by spacing
    response = chatbot.chat(prompt)
    print(f"{response} \n \n")