In [8]:
import json

In [9]:
with open('train.json', 'r') as f:
    train_data = json.load(f)

In [10]:
print("Total recipes:", len(train_data))
print("Example recipe:", train_data[0])

Total recipes: 39774
Example recipe: {'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}


In [11]:
import pandas as pd


In [12]:
df = pd.DataFrame(train_data)

In [13]:
df.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [15]:
import re


In [16]:
def clean_ingredient(ing):
    ing = ing.lower()
    ing = re.sub(r'[^a-z\s]', '', ing)
    ing = re.sub(r'\s+', ' ', ing)
    return ing.strip()

In [17]:
def suggest_recipes(user_ingredients, df):
    suggestions = []
    user_cleaned = [clean_ingredient(i) for i in user_ingredients]

    for _, row in df.iterrows():
        recipe_ingredients = [clean_ingredient(i) for i in row['ingredients']]
        if any(ing in recipe_ingredients for ing in user_cleaned):
            recipe_id = row['id']
            cuisine = row['cuisine'].title()
            name = f"{cuisine} Dish #{recipe_id}"

            suggestions.append({
                'Name': name,
                'ID': recipe_id,
                'Cuisine': cuisine,
                'Ingredients': recipe_ingredients
            })

    return suggestions

In [18]:
user_input = ["milk", "egg", "sugar"]
results = suggest_recipes(user_input, df)

print(f"Found {len(results)} matching recipes!\n")

for r in results[:5]:
    print(f"🍽️ {r['Name']} (Cuisine: {r['Cuisine']})")
    print("🧂 Ingredients:", ', '.join(r['Ingredients']))
    print("------")


Found 8099 matching recipes!

🍽️ Southern_Us Dish #25693 (Cuisine: Southern_Us)
🧂 Ingredients: plain flour, ground pepper, salt, tomatoes, ground black pepper, thyme, eggs, green tomatoes, yellow corn meal, milk, vegetable oil
------
🍽️ Indian Dish #13162 (Cuisine: Indian)
🧂 Ingredients: black pepper, shallots, cornflour, cayenne pepper, onions, garlic paste, milk, butter, salt, lemon juice, water, chili powder, passata, oil, ground cumin, boneless chicken skinless thigh, garam masala, double cream, natural yogurt, bay leaf
------
🍽️ Jamaican Dish #6602 (Cuisine: Jamaican)
🧂 Ingredients: plain flour, sugar, butter, eggs, fresh ginger root, salt, ground cinnamon, milk, vanilla extract, ground ginger, powdered sugar, baking powder
------
🍽️ Italian Dish #3735 (Cuisine: Italian)
🧂 Ingredients: sugar, pistachio nuts, white almond bark, flour, vanilla extract, olive oil, almond extract, eggs, baking powder, dried cranberries
------
🍽️ Chinese Dish #45887 (Cuisine: Chinese)
🧂 Ingredients: lo

In [19]:
!pip install scikit-learn




In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# 1. Convert ingredients into joined strings
df['ingredient_text'] = df['ingredients'].apply(lambda x: ' '.join(clean_ingredient(i) for i in x))

# 2. Fit the TF-IDF model on recipe ingredients
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['ingredient_text'])

# 3. Function to get top N recipes based on cosine similarity
def smart_suggest(user_ingredients, df, tfidf_matrix, top_n=5):
    user_input = ' '.join(clean_ingredient(i) for i in user_ingredients)
    user_vec = vectorizer.transform([user_input])

    cosine_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]

    results = []
    for i in top_indices:
        row = df.iloc[i]
        name = f"{row['cuisine'].title()} Dish #{row['id']}"
        results.append({
            'Name': name,
            'ID': row['id'],
            'Cuisine': row['cuisine'].title(),
            'Ingredients': row['ingredients'],
            'Score': round(cosine_similarities[i], 3)
        })
    return results


In [21]:
user_input = ["milk", "sugar", "flour"]

results = smart_suggest(user_input, df, tfidf_matrix, top_n=5)

print(f"Top {len(results)} recipe suggestions:\n")

for r in results:
    print(f"🍽️ {r['Name']} (Cuisine: {r['Cuisine']})")
    print("🧂 Ingredients:", ', '.join(r['Ingredients']))
    print("📈 Match Score:", r['Score'])
    print("------")


Top 5 recipe suggestions:

🍽️ French Dish #38948 (Cuisine: French)
🧂 Ingredients: milk, sugar, eggs, flour
📈 Match Score: 0.85
------
🍽️ Russian Dish #42427 (Cuisine: Russian)
🧂 Ingredients: flour, sugar, oil, milk, eggs, salt
📈 Match Score: 0.784
------
🍽️ Russian Dish #35624 (Cuisine: Russian)
🧂 Ingredients: eggs, butter, milk, oil, sugar, salt, flour
📈 Match Score: 0.717
------
🍽️ Indian Dish #32171 (Cuisine: Indian)
🧂 Ingredients: sugar, all-purpose flour, milk, water, oil, salt
📈 Match Score: 0.705
------
🍽️ British Dish #32978 (Cuisine: British)
🧂 Ingredients: milk, salt, eggs, flour
📈 Match Score: 0.671
------
