In [141]:
import pandas as pd 
import numpy as np
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score

import seaborn as sns
import matplotlib.pyplot as plt

In [142]:
# Read the recipes table and derive a cleaned ingredient list for every dish.
df = pd.read_csv("Dish data_Final.csv")
df = df.astype({"Name": 'str', "description": 'str'})

# The description separates ingredients with '|' or ','; normalise both to ',',
# lowercase everything and trim surrounding whitespace from each ingredient.
df["ingredients_list"] = df["description"].apply(
    lambda text: [part.strip() for part in text.lower().replace('|', ',').split(',')]
)

# Keep an independent copy: a plain `recipie_data = df` would only alias the same
# object, so columns added to one frame would silently appear on the other.
recipie_data = df.copy()

In [143]:
# Label each dish 'non-veg' when any of its ingredients is in the non-veg list, else 'veg'.
nv = pd.read_csv('Non_Veg_Ingredients_list.csv')
list_of_nv = nv['Non veg Ingredients list'].tolist()

# The recipe ingredients were lowercased and stripped when ingredients_list was built,
# so normalise the lookup terms the same way (and skip blank cells). A set also makes
# each membership test constant-time instead of a scan over the whole list.
non_veg_lookup = {str(term).strip().lower() for term in list_of_nv if not pd.isnull(term)}

vg_nv = [
    'veg' if non_veg_lookup.isdisjoint(ingredients) else 'non-veg'
    for ingredients in recipie_data.ingredients_list
]

recipie_data['Veg_Non'] = vg_nv

In [144]:
# The ingredient list was exported to Excel, where each ingredient was mapped to an
# allergy with an approximate-match VLOOKUP. Load that mapping table here.

my_allergy = []

# Drop incomplete rows, then lowercase the ingredient names so they can be compared
# with the lowercased recipe ingredients.
allergiesData = pd.read_csv('AllergiesData.csv').dropna()
allergiesData['Ingredient'] = allergiesData['Ingredient'].map(lambda name: name.lower())

In [145]:
# Map allergies from ingredients: one row per (dish, ingredient), left-joined to the
# allergy table, then collected back into one list of allergies per dish name.
df = df.explode('ingredients_list')
df_allergies = df.merge(allergiesData, how='left', left_on='ingredients_list', right_on='Ingredient')
df_allergies = df_allergies.groupby(['Name']).agg({'Allergy': lambda allergies: allergies.tolist()})

# Attach the per-dish allergy lists to recipie_data.
recipie_data = recipie_data.merge(df_allergies, how='left', on='Name')

# Ingredients without a mapped allergy come through the left join as NaN: drop those,
# then remove duplicates while keeping first-seen order.
recipie_data['Allergy'] = recipie_data['Allergy'].apply(
    lambda allergies: list(dict.fromkeys(a for a in allergies if pd.notnull(a)))
)


In [146]:
# map nutrition for each ingredient
#
# Nutritions maps a nutrient label to a list of keywords. The matching loop that
# follows lowercases every keyword and tests it as a substring of each (already
# lowercased and stripped) ingredient; the first nutrient, in the order written
# here, that has a matching keyword is the only one recorded for that ingredient,
# so the order of the keys matters.
#
# NOTE(review): the key spellings ('flouride', 'phsoporous', 'maginesum', 'protien')
# are runtime values that end up in the 'Nutrient' column, so renaming them changes
# the exported data.
# NOTE(review): keywords with a leading space (e.g. ' potato', ' milk', ' kale') can
# only match in the middle of an ingredient, because ingredients are stripped before
# matching; confirm whether that is intentional.
Nutritions = { 'vitamin_C' : ['margarita','lemon juice','Citrus fruit',' potato', 'broccoli', 'bell pepper', 'spinach','strawberry','tomato', 'tomatoes','Brussels sprouts'], 
              'vitamin_A' : ['beef', 'liver', 'eggs','egg', 'shrimp', 'fish', 'fortified milk',' sweet potatoes', 'carrots', 'pumpkins', 'spinach',' mangoes'], 
              'vitamin_D' : ['milk' ,'cereals',' fatty','fish'],
              'vitamin_E' : ['oil','extra virgin olive oil' ,'sunflower oil','leafy green', 'whole grain', 'nuts'], 
              'vitamin_K' : ['Cabbage', 'eggs','egg',' milk', 'spinach', 'broccoli',' kale'], 
              'potassium' : ['cake','meat', 'milk', 'fruit', 'vegetable', 'grain', 'legume'], 
              'chromium' : ['meat', 'poultry', 'fish', 'nuts', 'cheese'] , 
              'copper' : ['shellfish', 'nuts', 'seeds', 'beans',' prunes'] , 
              'flouride' : ['fish', 'teas'], 
              'manganese' : ['nut', 'legume', 'whole grain', 'tea','coffee','chai'] , 
              'zinc' : ['meat', 'shellfish', 'legume', 'whole grain'] , 
              'phsoporous' : ['paneer','kebab'], 
              'calcium' : ['keer', 'yogurt', 'cheese', 'milk', 'salmon', 'leafy green', 'Cappuccino', 'Chia seed', 'Soy milk', 'Almonds', 'Dried figs', 'Tofu', 'White beans', 'Sunflower seeds', 'Broccoli rabe', 'Edamame', 'Kale', 'Sesame seeds', 'Broccoli', 'Sweet potatoes', 'Mustard and collard greens', 'Okra', 'Oranges and orange juice', 'Butternut squash', 'Arugula'], 
              'maginesum' : ['chocolate', 'Spinach', 'broccoli', 'legume', 'seeds', 'Dark Chocolate', 'Avocado', 'Nut', 'Legume', 'Tofu', 'Seeds', 'whole grain', 'Some Fatty Fish', 'Bananas', 'Leafy Greens'], 
              'sodium' : ['salt', 'soy sauce', 'vegetables', 'Shrimp', 'Soup', 'Ham', 'Instant pudding', 'Cottage cheese', 'Vegetable juice', 'Salad dressing', 'Pizza', 'Sandwiches', 'Broths and stocks', 'Boxed potato casseroles', 'Pork rinds', 'Canned vegetables', 'Processed cheese', 'Jerky and other dried meats', 'Tortillas', 'Cold cuts and salami', 'Pretzels', 'Pickles', 'Sauces', 'Hot dogs and bratwurst', 'Tomato sauce', 'Bagels and other breads', 'Canned meats, poultry and seafood', 'Boxed meal helpers', 'Biscuits', 'Macaroni and cheese', 'Frozen meals', 'Baked beans', 'Sausage, bacon and salt pork'], 
              'iodine' : ['salt', 'seafood'], 
              'iron' : ['chicken', 'turkey', 'egg', 'fruits', 'palak', 'spinach', 'bread', 'Beef', 'Lamb', 'Ham', 'Turkey', 'Chicken', 'Veal', 'Pork', 'Dried beef', 'Liver', 'Liverwurst', 'Eggs (any style)', 'Shrimp', 'Clams', 'Scallops', 'Oysters', 'Tuna', 'Sardines', 'Haddock', 'Mackerel', 'Spinach', 'Sweet potatoes', 'Peas', 'Broccoli', 'String beans', 'Beet greens', 'Dandelion greens', 'Collards', 'Kale', 'Chard', 'White bread (enriched)', 'Whole wheat bread', 'Enriched pasta', 'Wheat products', 'Bran cereals', 'Corn meal', 'Oat cereal', 'Cream of Wheat', 'Rye bread', 'Enriched rice', 'Strawberries', 'Watermelon', 'Raisins', 'Dates', 'Figs', 'Prunes', 'Prune juice', 'Dried apricots', 'Dried peaches', 'Tofu', 'Beans (kidney, garbanzo, or white, canned)', 'Tomato products (e.g., paste)', 'Dried peas', 'Dried beans', 'Lentils', 'Instant breakfast', 'Corn syrup', 'Maple syrup', 'Molasses'] ,
              'selenium' : ['banana', 'Organ meat', 'seafood', 'walnut', 'Brazil nuts', 'Fish', 'Ham', 'Enriched foods', 'Pork', 'Beef', 'Turkey', 'Chicken', 'Cottage cheese', 'Eggs', 'Brown rice', 'Sunflower seeds', 'Baked beans', 'Mushrooms', 'Oatmeal', 'Spinach', 'Milk and yogurt', 'Lentils', 'Cashews', 'Bananas'] ,
              'protien' : ['Eggs','whole eggs','arhar dal (split toor dal)' ,'Almonds', 'Chicken', 'Cottage cheese', 'Greek yogurt', 'Milk', 'Lentils', 'beef', 'Fish', 'Quinoa', 'Protein powders', 'Ezekiel bread', 'Pumpkin seeds', 'Turkey breast', 'Shellfish', 'Peanuts and peanut butter', 'cookie', 'cookies', 'mutton', 'meat', 'gosht', 'Chicken', 'Wings', 'Meat', 'Salmon', 'Cob', 'Kebab', 'Fish', 'Snake', 'Gosht', 'Bacon', 'Mutton', 'Lamb', 'Black beans', 'Lima beans', 'Corn', 'Salmon', 'Potatoes', 'Broccoli', 'Cauliflower', 'Chinese cabbage', 'Eggs', 'Beef', 'Chicken breast', 'Oats', 'Tuna', 'Tempeh', 'Spirulina', 'legume', 'Hemp seeds', 'Sun-dried tomatoes', 'Guava', 'Artichokes', 'Peas', 'Bison', 'Pork', 'Turkey', 'Chickpeas', 'Quinoa', 'Greek yogurt', 'Cottage cheese', 'Almonds', 'Milk', 'Lentils', 'Pumpkin seeds', 'Avocado', 'Pistachios', 'Chia seeds', 'Nut butters', 'Halibut', 'Asparagus', 'Watercress', 'Brussel sprouts', 'Spelt', 'Teff', 'Whey protein powder'],
              'carbs' : ['sugar','caster sugar','whole wheat flour','butter (unsalted)', 'all purpose flour (maida)','Sweet potatoes', 'Beet', 'Corn', 'Quinoa', 'rice', 'Oats', 'Bananas', 'Apples', 'Mangos', 'Dates', 'Raisins', 'Goji berries', 'Kidney beans', 'Garbanzo beans', 'Lentils', 'Corn flakes', 'Maize Biscuit', 'Whole-grain toast', 'Water biscuit', 'French bread roll', 'Rye Bread', 'Rice', 'Spaghetti', 'potato', 'Peas', 'Chickpeas', 'Lentils', 'Beans', 'Soy beans', 'Poha', 'aloo', 'butter', 'ghee', 'rice', 'biryani', 'pulao']
             }
    
# Lowercase every keyword list once up front; doing it inside the row loop repeats
# the same work for every ingredient row and every nutrient.
nutrient_keywords = [
    (nutrient_name, [keyword.lower() for keyword in keywords])
    for nutrient_name, keywords in Nutritions.items()
]


def _first_matching_nutrient(ingredient):
    """Return the first nutrient (in Nutritions order) with a keyword contained in `ingredient`.

    Returns '' when no keyword matches. Matching is a plain substring test against the
    lowercased keywords, and the scan stops at the first nutrient that matches.
    """
    if not isinstance(ingredient, str):
        # explode() produces NaN for an empty ingredient list; treat that as "no match"
        # instead of failing on the substring test.
        return ''
    for nutrient_name, keywords in nutrient_keywords:
        if any(keyword in ingredient for keyword in keywords):
            return nutrient_name
    return ''


# One nutrient label (or '') per exploded ingredient row.
nutrient = [_first_matching_nutrient(value) for value in df['ingredients_list']]

df['Nutrient'] = nutrient
df_nutrient = df.groupby(['Name']).agg({'Nutrient': lambda labels: labels.tolist()})

# add the nutrients column to the recipie_data df
recipie_data = pd.merge(recipie_data, df_nutrient, how='left', left_on='Name', right_on='Name')

# Drop NaN entries and de-duplicate while keeping first-seen order. The '' placeholder
# used for unmatched ingredients is not NaN, so it stays in the list exactly as before.
recipie_data['Nutrient'] = recipie_data['Nutrient'].apply(
    lambda labels: list(dict.fromkeys(label for label in labels if not pd.isnull(label)))
)

In [147]:
# Map diseases (bad for)

foods_bad_for = {'vitamin_E' : 'High Cholestrol', 'sodium' : 'Blood pressure','carbs' : 'Diabetes,Obesity,Heart-related ailments,PCOD'}

# For every exploded ingredient row take the warning of the first foods_bad_for key
# contained in that row's nutrient label, or '' when none of the keys is present.
disease_bad_for = [
    next((warning for nutrient_name, warning in foods_bad_for.items() if nutrient_name in row_nutrient), '')
    for row_nutrient in df['Nutrient']
]

df['Disease (Bad for)'] = disease_bad_for
df_disease_bad_for = df.groupby(['Name']).agg({'Disease (Bad for)': lambda warnings: warnings.tolist()})

# Attach the per-dish warning lists to recipie_data, de-duplicated in first-seen order.
recipie_data = recipie_data.merge(df_disease_bad_for, how='left', on='Name')
recipie_data['Disease (Bad for)'] = recipie_data['Disease (Bad for)'].apply(
    lambda warnings: list(dict.fromkeys(warnings))
)


In [148]:
# Silences pandas' chained-assignment warning for the whole notebook (unchanged).
# predictor_model itself does not need it: it writes into an explicit copy.
pd.options.mode.chained_assignment = None


def predictor_model(recipie_data, column_name, predict_by, random_state=None):
    """Fill the missing values of `column_name` with labels predicted from a text column.

    Rows where `column_name` is present are used to fit a TF-IDF + SVC pipeline on the
    text in `predict_by`; the fitted model then labels the rows where it is missing.

    Parameters
    ----------
    recipie_data : pandas.DataFrame
        Table containing both `column_name` and `predict_by`.
    column_name : str
        Target column whose null entries are to be filled.
    predict_by : str
        Text column used as the model input.
    random_state : int or None, optional
        Forwarded to train_test_split so the 80/20 split (and with it the fitted
        model) can be made reproducible. None keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The originally labelled rows followed by the newly labelled rows; the original
        index values are kept, so the row order can differ from the input.
    """
    has_label = recipie_data[column_name].notnull()
    recepie_subcat_model_df = recipie_data[has_label]
    # Explicit copy: the predictions are written into this frame below.
    recepie_subcat_apply_df = recipie_data[~has_label].copy()

    # Nothing to fill: predicting on an empty frame would raise, so return the labelled
    # rows as they are without fitting a model.
    if recepie_subcat_apply_df.empty:
        return recepie_subcat_model_df.copy()

    # Only the 80% training part is used; the held-out 20% is kept out of the fit so the
    # trained model matches what this function has always produced.
    x_train, _x_test, y_train, _y_test = train_test_split(
        recepie_subcat_model_df[predict_by],
        recepie_subcat_model_df[column_name],
        test_size=0.20,
        random_state=random_state,
    )
    model = make_pipeline(TfidfVectorizer(), svm.SVC())
    model.fit(x_train, y_train)

    predicted_new_labels = model.predict(recepie_subcat_apply_df[predict_by])
    recepie_subcat_apply_df[column_name] = predicted_new_labels.tolist()
    return pd.concat([recepie_subcat_model_df, recepie_subcat_apply_df], axis=0)

In [149]:
# Fill the missing labels one column at a time, each from the text column listed with it:
# sub-category and cuisine are predicted from the dish name, diet from the description.
for target_column, text_column in (
    ('sub_catagory', 'Name'),
    ('Cusine', 'Name'),
    ('Diet', 'description'),
):
    recipie_data = predictor_model(recipie_data, target_column, text_column)

#display recipie data
#recipie_data


In [150]:
# Map categories through a lookup on the sub-category (VLOOKUP-style left join).
categoryData = pd.read_csv('Subcat_to_cat_mapping.csv')
recipie_data = pd.merge(recipie_data, categoryData, how='left', on='sub_catagory')

# Map seasons through the same kind of lookup.
SeasonData = pd.read_csv('Subcat_to_season_mapping.csv')
recipie_data = pd.merge(recipie_data, SeasonData, how='left', on='sub_catagory')

# Map moods through the same kind of lookup.
moodData = pd.read_csv('subcat_to mood_mapping.csv')
recipie_data = pd.merge(recipie_data, moodData, how='left', on='sub_catagory')

# Fill missing reviews with the mean review truncated to a whole number (int() drops the
# fractional part). Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form does not update the frame under pandas
# Copy-on-Write.
review_fill_value = int(recipie_data['Review'].mean())
recipie_data['Review'] = recipie_data['Review'].fillna(review_fill_value)

recipie_data

Unnamed: 0,Name,description,sub_catagory,Veg_Non,Review,Cusine,Diet,ingredients_list,Allergy,Nutrient,Disease (Bad for),Category,Season,Mood
0,Himachali Dry Raw Mango Chutney Recipe,Aam Papad (Sun Dried Mango)|Mint Leaves (Pudin...,Indian Chutney,veg,4.909474,Himachal,Vegetarian,"[aam papad (sun dried mango), mint leaves (pud...","[Oral Allergy Syndrome, Sugar Allergy / Intole...","[, carbs, sodium]","[, Diabetes,Obesity,Heart-related ailments,PCO...",Side dish,no season,Salt craving
1,Aamras Ki Kadhi Recipe - Mango Kadhi Recipe,Aamras|Gram flour (besan)|Turmeric powder (Hal...,Indian Curry,veg,4.873846,Gujarati Recipes﻿,Vegetarian,"[aamras, gram flour (besan), turmeric powder (...","[Legume Allergy, Histamine Allergy]","[, copper, sodium]","[, Blood pressure]",Lunch / Dinner,no season,Hungry
2,Acorn Squash Sambal Recipe,Acorn Squash|Sambal paste|Turmeric powder (Hal...,Continental Food,veg,4.936475,Asian,Non Vegeterian,"[acorn squash, sambal paste, turmeric powder (...","[Histamine Allergy, Allium Allergy, Peanut All...","[protien, , vitamin_E, sodium]","[, High Cholestrol, Blood pressure]",Lunch / Dinner,School vacations,Hungry
3,Kerala Palada Pradhaman Recipe,Ada|Milk|Sugar|Cardamom Powder (Elaichi),Sweet Recipes (Indian Mithai / Indian,veg,4.901949,Kerala Recipes,Vegetarian,"[ada, milk, sugar, cardamom powder (elaichi)]","[Milk allergy / Lactose intolerance, Sugar All...","[, vitamin_D, carbs]","[, Diabetes,Obesity,Heart-related ailments,PCOD]",Dessert,"Diwali, Dusshera","Sweet Craving,Celebrate"
4,Agathi Keerai Recipe,Agathi keerai|Yellow Moong Dal (Split)|Cumin s...,Poriyal Recipes,veg,4.919255,South Indian Recipes,Vegetarian,"[agathi keerai, yellow moong dal (split), cumi...",[],"[calcium, , copper, manganese, vitamin_E, sodium]","[, High Cholestrol, Blood pressure]",Breakfast,no season,Healthy eating
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7215,Gatta curry,"Yogurt, besan, sauce, garam masala powder, gra...",Sabzi,veg,4.000000,Indian,Vegetarian,"[yogurt, besan, sauce, garam masala powder, gr...",[Milk allergy / Lactose intolerance],"[calcium, ]",[],Lunch / Dinner,no season,Healthy eating
7216,Khaman,"Yogurt, fresh coconut, sesame seeds, semolina,...",Indian Snack,veg,4.000000,Indian,Vegetarian,"[yogurt, fresh coconut, sesame seeds, semolina...","[Milk allergy / Lactose intolerance, Seed Alle...","[calcium, manganese, copper, ]",[],Tea time snack,School vacations,Bored
7217,Paniyaram,"Yogurt, ginger, curry leaves, baking soda, gre...",Indian Snack,veg,4.000000,South Indian Recipes,Vegetarian,"[yogurt, ginger, curry leaves, baking soda, gr...","[Milk allergy / Lactose intolerance, Histamine...","[calcium, ]",[],Tea time snack,School vacations,Bored
7218,Lassi,"Yogurt, milk, nuts, sugar",Smoothies,veg,4.000000,Indian,Vegetarian,"[yogurt, milk, nuts, sugar]","[Milk allergy / Lactose intolerance, Nut Aller...","[calcium, vitamin_D, vitamin_E, carbs]","[, High Cholestrol, Diabetes,Obesity,Heart-rel...",Dessert,Summer,"Stressed,Sweet Craving"


In [151]:
# Persist the enriched recipe table as JSON (the CSV export stays disabled).
#recipie_data.to_csv('Prepare Food DB output.csv')
output_json_path = 'Prepare Food DB output.json'
recipie_data.to_json(output_json_path)