Using Word2Vec

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This import only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import Libraries
!pip install nltk spacy unidecode gensim

import pandas as pd
import numpy as np
import re
from unidecode import unidecode
from gensim.models import Word2Vec
from tqdm import tqdm



In [None]:
# Download NLTK and spaCy models
# NOTE(review): none of the cells visible in this notebook call nltk or spacy;
# confirm these downloads are still required before keeping this cell.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import spacy
spacy.cli.download("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load data from Google Drive
# The path lives under /content/drive, i.e. it relies on the Drive mount above.
file_path = "/content/drive/MyDrive/NLP - Group Project/preprocessed_recipes.csv"

# Load the dataset
# NOTE(review): the column list printed later in this notebook contains
# 'Unnamed: 0', which suggests the CSV was written together with its index;
# reading with index_col=0 would avoid the extra column — confirm before changing.
df_preprocessed = pd.read_csv(file_path)
print("Original shape:", df_preprocessed.shape)

Original shape: (996675, 11)


In [None]:
from gensim.models import Word2Vec

class RecipeCorpus:
    """Re-iterable stream of tokenised recipes, suitable as a Word2Vec corpus.

    Every pass yields one list of whitespace-separated tokens per row, built
    from the ``clean_ingredients`` and ``clean_directions`` columns. Each call
    to ``iter()`` starts a fresh pass, so the same object can be handed to both
    the vocabulary scan and the training epochs.

    Args:
        df: DataFrame holding ``clean_ingredients`` and ``clean_directions``.
        batch_size: Number of rows sliced out of ``df`` at a time.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, df, batch_size=100000):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.df = df
        self.batch_size = batch_size

    def __iter__(self):
        """Yield one token list per row, walking the frame batch by batch."""
        for start in range(0, len(self.df), self.batch_size):
            batch = self.df.iloc[start:start + self.batch_size]
            # Missing values become empty strings. Calling str() on a missing
            # value would instead inject a literal "nan"/"None" token into the
            # training corpus.
            ingredients = batch['clean_ingredients'].fillna('').astype(str)
            directions = batch['clean_directions'].fillna('').astype(str)
            # Zipping the two columns avoids DataFrame.iterrows(), which
            # builds a new Series object for every row.
            for ingredient_text, direction_text in zip(ingredients, directions):
                yield f"{ingredient_text} {direction_text}".split()


In [None]:
# Stream the whole dataframe; RecipeCorpus yields one token list per recipe.
corpus = RecipeCorpus(df_preprocessed)

# NOTE(review): seed=42 on its own does not make training reproducible with
# workers=4; gensim's documentation states that a single worker thread (and a
# fixed PYTHONHASHSEED) is needed for fully deterministic runs — confirm
# whether reproducibility matters here.
w2v_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=4, seed=42)
# First pass over the corpus: collect the vocabulary (min_count=2 is set above).
w2v_model.build_vocab(corpus)

# Train using corpus
# total_examples reuses the count gathered by build_vocab; the corpus is
# iterated again for each of the 10 epochs.
w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs=10)


(864154130, 1137149380)

In [None]:
# Cooking Time Extraction
# A quantity followed by a time unit, matched against lower-cased text:
#   groups 1-3: optional whole part plus fraction ("1/2", "1 1/2")
#   group 4:    integer or decimal ("45", "1.5")
#   group 5:    unit; the trailing \b stops "2 minced" from counting as "2 min"
_COOK_TIME_RE = re.compile(
    r'(?:(?:(\d+)\s+)?(\d+)/(\d+)|(\d+(?:\.\d+)?))'
    r'\s*(hours?|hrs?|minutes?|mins?)\b'
)


def extract_cook_time(text, max_minutes=360, default_minutes=30):
    """Estimate a recipe's total cooking time from its free-text directions.

    Every "<quantity> <unit>" mention is converted to minutes and summed.
    Recognised units are hour(s), hr(s), minute(s) and min(s); quantities may
    be integers, decimals ("1.5 hours") or fractions ("1 1/2 hours").

    Args:
        text: Directions text. Non-string values (e.g. NaN) are treated as
            text with no time information.
        max_minutes: Upper bound applied to the summed time (default 6 hours).
        default_minutes: Value used when no time is found (sum of zero).

    Returns:
        A label such as "45 min" or "1 hr 30 min".
    """
    if not isinstance(text, str):
        text = ""

    total_minutes = 0.0
    for whole, numerator, denominator, plain, unit in _COOK_TIME_RE.findall(text.lower()):
        if denominator:
            if int(denominator) == 0:
                # Malformed fraction such as "1/0"; ignore it instead of crashing.
                continue
            amount = int(whole or 0) + int(numerator) / int(denominator)
        else:
            amount = float(plain)
        if unit.startswith('h'):
            amount *= 60
        total_minutes += amount

    # Cap first, then fall back to the default when nothing was found.
    total_minutes = min(int(round(total_minutes)), max_minutes)
    if total_minutes == 0:
        total_minutes = default_minutes

    hours, minutes = divmod(total_minutes, 60)
    return f"{hours} hr {minutes} min" if hours > 0 else f"{minutes} min"

# Add a 'cooking_time' label per recipe, derived from the 'directions' column.
df_preprocessed['cooking_time'] = df_preprocessed['directions'].apply(extract_cook_time)


In [None]:
# Cuisine Tagging
# Seed keywords per cuisine; their averaged word vectors act as cuisine centroids.
# All keywords are lower-case ("medan" was previously capitalised, unlike every
# other entry and unlike the same keyword in cuisine_overrides).
# NOTE(review): if the corpus was ASCII-folded during preprocessing, "spätzle"
# will never be found in the vocabulary — confirm against the cleaned text.
cuisine_keywords = {
    "Italian": {"parmesan", "mozzarella", "basil", "oregano", "pasta", "risotto", "lasagna"},
    "Mexican": {"tortilla", "salsa", "jalapeno", "chipotle", "queso", "avocado", "enchilada"},
    "Indian": {"turmeric", "curry", "cumin", "masala", "paneer", "garam", "dal", "coriander"},
    "Chinese": {"soy", "ginger", "scallion", "noodle", "tofu", "hoisin", "bok choy"},
    "Japanese": {"miso", "sushi", "nori", "sake", "wasabi", "teriyaki", "mirin"},
    "Middle Eastern": {"couscous", "hummus", "tahini", "zaatar", "sumac", "falafel"},
    "Thai": {"lemongrass", "curry", "coconut", "fish sauce", "galangal", "kaffir"},
    "American": {
        "steak", "grill", "bbq", "barbecue", "cheddar", "ranch", "macaroni",
        "sloppy joe", "brisket", "meatloaf", "cornbread", "bacon", "buttermilk",
        "burger", "fried chicken", "coleslaw", "hotdog", "fries", "ketchup",
    },
    "French": {"thyme", "brie", "wine", "butter", "shallot", "tarragon", "ratatouille"},
    "Indonesian": {"tempeh", "sambal", "gula", "rendang", "ketumbar", "kecap", "medan"},
    "German": {"sauerkraut", "strudel", "pumpernickel", "bratwurst", "spätzle"},
}

In [None]:
# Normalise 'clean_ingredients' only: replace NaN with "" and cast to str, so the
# tagging functions applied to this column always receive a string.
df_preprocessed['clean_ingredients'] = df_preprocessed['clean_ingredients'].fillna("").astype(str)

def get_avg_vector(words):
    """Return the mean Word2Vec embedding of the in-vocabulary tokens in `words`.

    Tokens missing from ``w2v_model.wv`` are skipped. When none of the tokens
    is known, a zero vector of length ``w2v_model.vector_size`` is returned.
    """
    embeddings = w2v_model.wv
    known = [embeddings[token] for token in words if token in embeddings]
    if not known:
        return np.zeros(w2v_model.vector_size)
    return np.mean(known, axis=0)

# Calculate cuisine keyword vectors
# Keywords are split on whitespace because the Word2Vec corpus was built from
# single whitespace-delimited tokens: a phrase such as "bok choy" is never in
# the vocabulary as one entry and would be dropped silently by get_avg_vector.
# Keywords are lower-cased (assumes the training text is lower-case, as the
# preview output suggests) and sorted so the result does not depend on set
# iteration order.
cuisine_vectors = {
    cuisine: get_avg_vector(
        [token for keyword in sorted(keywords) for token in keyword.lower().split()]
    )
    for cuisine, keywords in cuisine_keywords.items()
}


In [None]:
# Tagging cuisine based on cosine similarity
def tag_cuisine(text):
    """Return the cuisine whose keyword vector is most similar to `text`.

    `text` is stringified, split on whitespace and averaged into one vector via
    get_avg_vector; that vector is compared with every entry of
    ``cuisine_vectors`` using cosine similarity (a 1e-10 term in the
    denominator prevents division by zero).

    If no token of `text` is in the vocabulary the averaged vector is all
    zeros, every score is 0 and the first cuisine in ``cuisine_vectors`` wins.
    """
    doc_vector = get_avg_vector(str(text).split())
    doc_norm = np.linalg.norm(doc_vector)

    scores = {}
    for name, reference in cuisine_vectors.items():
        denominator = doc_norm * np.linalg.norm(reference) + 1e-10
        scores[name] = np.dot(doc_vector, reference) / denominator

    # max() keeps the first key on ties, i.e. dict order breaks ties.
    return max(scores, key=scores.get)

# Apply cuisine tagging
# NOTE(review): the next cell assigns 'cuisine_tag' again using
# tag_cuisine_with_overrides, so the result of this full pass over the frame is
# overwritten — consider dropping this line.
df_preprocessed['cuisine_tag'] = df_preprocessed['clean_ingredients'].apply(tag_cuisine)

In [None]:
# Cuisine Overrides
# Keyword -> cuisine lookup, written grouped by cuisine and flattened into a
# flat dict. Insertion order is group by group, keyword by keyword; code that
# iterates this dict and stops at the first hit gives earlier entries priority.
cuisine_overrides = {
    keyword: cuisine
    for cuisine, keywords in (
        ("American", (
            "steak", "brisket", "sloppy joe", "bbq", "barbecue", "pie crust",
            "ranch", "bacon", "meatloaf", "macaroni", "cheddar", "hotdog",
            "cornbread", "buttermilk", "fried chicken", "chili", "ketchup",
            "coleslaw", "graham cracker", "cranberry sauce", "peanut butter",
            "crisco",
        )),
        ("Italian", (
            "pasta", "lasagna", "risotto", "bolognese", "mozzarella",
            "parmesan", "oregano", "basil", "anchovy", "fettuccine", "gnocchi",
            "prosciutto", "carbonara", "caprese", "marinara", "penne", "ziti",
        )),
        ("French", (
            "tart", "quiche", "ratatouille", "vinaigrette", "brie", "croissant",
            "thyme", "shallot", "crepe", "bouillabaisse", "bechamel",
            "duxelles", "souffle",
        )),
        ("German", (
            "pretzel", "portzelky", "strudel", "sauerkraut", "bratwurst",
            "spätzle", "wiener",
        )),
        ("Indonesian", (
            "rendang", "sambal", "medan", "tempeh", "gula jawa", "kecap",
            "nasi goreng", "bumbu", "ikan bakar", "balado",
        )),
        ("Mexican", (
            "tortilla", "quesadilla", "ranchero", "enchilada", "salsa",
            "jalapeno", "chipotle", "avocado", "taco", "guacamole",
            "refried beans", "queso",
        )),
        ("Japanese", (
            "nori", "miso", "sushi", "teriyaki", "sashimi", "mirin", "wasabi",
            "dashi", "udon", "tempura",
        )),
        ("Middle Eastern", (
            "falafel", "tahini", "zaatar", "sumac", "hummus", "couscous",
            "shawarma", "pita", "lamb kebab",
        )),
        ("Chinese", (
            "soy sauce", "hoisin", "bok choy", "scallion", "dumpling", "wonton",
            "chow mein", "szechuan", "five-spice", "ginger",
        )),
        ("Thai", (
            "lemongrass", "galangal", "kaffir", "coconut milk", "fish sauce",
            "green curry", "red curry", "thai basil",
        )),
        ("Indian", (
            "masala", "garam", "paneer", "dal", "turmeric", "curry", "tikka",
            "ghee", "biryani",
        )),
    )
    for keyword in keywords
}



# Compiled whole-word patterns, keyed by override keyword, built lazily.
_OVERRIDE_PATTERN_CACHE = {}


def _override_pattern(keyword):
    """Return a compiled pattern matching `keyword` as a whole word or phrase, optionally plural."""
    pattern = _OVERRIDE_PATTERN_CACHE.get(keyword)
    if pattern is None:
        # (?<!\w) / (?!\w) stop matches inside longer words ("tart" in "tartar",
        # "dal" in "medallions", "ranch" in "ranchero"); the optional "s"/"es"
        # keeps plurals such as "pretzels" matching.
        pattern = re.compile(r'(?<!\w)' + re.escape(keyword) + r'(?:e?s)?(?!\w)')
        _OVERRIDE_PATTERN_CACHE[keyword] = pattern
    return pattern


def tag_cuisine_with_overrides(text, overrides=None):
    """Tag a recipe's cuisine, preferring explicit keyword overrides.

    The text is lower-cased and checked against the override keywords in dict
    order; the first keyword found as a whole word (or its simple plural)
    decides the cuisine. If no keyword matches, the decision falls back to
    the embedding-similarity tagger ``tag_cuisine``.

    Args:
        text: Ingredient text; any value is converted with ``str()``.
        overrides: Optional keyword -> cuisine mapping. Defaults to the
            module-level ``cuisine_overrides``.

    Returns:
        The cuisine name.
    """
    text = str(text).lower()
    if overrides is None:
        overrides = cuisine_overrides

    for keyword, cuisine in overrides.items():
        # Cheap substring test first; the regex only runs on candidates.
        if keyword in text and _override_pattern(keyword).search(text):
            return cuisine

    # Same similarity computation as before, without duplicating it here.
    return tag_cuisine(text)

# Final cuisine labels: keyword overrides first, embedding similarity as fallback.
# This replaces any 'cuisine_tag' column written earlier.
df_preprocessed['cuisine_tag'] = df_preprocessed['clean_ingredients'].apply(tag_cuisine_with_overrides)


In [None]:
# Diet Tagging
# Substrings whose presence in the ingredient text places a recipe in a group.
diet_keywords = {
    "Meat": [
        "beef", "chicken", "pork", "bacon", "sausage",
        "lamb", "turkey", "ham", "steak",
    ],
    "Dairy": [
        "milk", "cheese", "butter", "cream", "yogurt", "egg", "ghee",
    ],
    "Gluten": [
        "wheat", "barley", "rye", "flour",
        "bread", "pasta", "cracker", "noodle",
    ],
}


def tag_diet(text):
    """Build a "<diet>, <gluten>" label from ingredient text.

    The diet part is "Non-Vegetarian" if any Meat keyword occurs, otherwise
    "Vegetarian" if any Dairy keyword occurs, otherwise "Vegan". The gluten
    part is "Contains Gluten" if any Gluten keyword occurs, else "Gluten-Free".

    NOTE(review): matching is a plain substring test on the lower-cased text,
    so "ham" also matches "graham" and "egg" matches "eggplant". Switching to
    whole-word matching would lose compounds such as "buttermilk" or
    "breadcrumbs", so the behaviour is left as is — decide which trade-off is
    wanted.
    """
    lowered = str(text).lower()
    found = {
        group: any(term in lowered for term in diet_keywords[group])
        for group in ("Meat", "Dairy", "Gluten")
    }

    # Later assignments take precedence: meat overrides dairy overrides vegan.
    diet_label = "Vegan"
    if found["Dairy"]:
        diet_label = "Vegetarian"
    if found["Meat"]:
        diet_label = "Non-Vegetarian"

    gluten_label = "Contains Gluten" if found["Gluten"] else "Gluten-Free"
    return ", ".join((diet_label, gluten_label))

# Apply to every recipe. Series.apply calls tag_diet once per row; this is not
# a vectorized operation.
df_preprocessed['diet_tag'] = df_preprocessed['clean_ingredients'].apply(tag_diet)


In [None]:
# Sanity check: list all columns, including the tag columns added above.
print(df_preprocessed.columns.tolist())

['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER', 'clean_ingredients', 'clean_directions', 'clean_text', 'simhash', 'cooking_time', 'cuisine_tag', 'diet_tag']


In [None]:
# Preview the first rows of the tagged columns; 'pre-wrap' lets the long
# ingredient strings wrap inside the rendered table.
df_preprocessed[['title', 'clean_ingredients', 'cuisine_tag', 'diet_tag', 'cooking_time']].head().style.set_properties(**{'white-space': 'pre-wrap'})

Unnamed: 0,title,clean_ingredients,cuisine_tag,diet_tag,cooking_time
0,Marinated Flank Steak Recipe,pound flank steak c finely minced green onions scallions c dry red wine c soy sauce tbsp salad oil teaspoon sesame seeds teaspoon packed brown sugar teaspoon grnd black pepper teaspoon grnd ginger clove garlic chopped,American,"Non-Vegetarian, Gluten-Free",6 hr 0 min
1,French Chicken Stew,tablespoon rosemary teaspoon thyme bay leaves teaspoon smoked paprika teaspoon pepper cup red wine cups chicken broth cups button mushrooms sliced cups mushroom mix oyster shiitake baby bella sliced medium carrots sliced diagonally onion medium chopped red potato medium cut in inch pieces cup frozen green beans inch pieces can black olives pitted ripe halved handful grape tomatoes halved chicken thighs with bones and skin lbs stalks celery cups water,French,"Non-Vegetarian, Gluten-Free",6 hr 0 min
2,Glazed Carrots,to carrots tbsp butter c brown sugar grated lemon rind and juice,French,"Vegetarian, Gluten-Free",15 min
3,Moms Pie Dough,cups flour tsp salt pinch baking powder tbls sugar cup crisco egg lightly beaten tsp vinegar ice water,American,"Vegetarian, Contains Gluten",30 min
4,Pretzel Salad Or Dessert,c crushed small thin pretzels sticks c margarine,German,"Vegan, Gluten-Free",8 min


In [None]:
# Define output path (same Drive folder as the input CSV)
output_path = "/content/drive/MyDrive/NLP - Group Project/tagged_recipes.csv"

# Save the result
# index=False keeps the dataframe index out of the file; every column of
# df_preprocessed, including the new tag columns, is written.
df_preprocessed.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to /content/drive/MyDrive/NLP - Group Project/tagged_recipes.csv
