<a href="https://colab.research.google.com/github/kavyajeetbora/recipe_recommender/blob/master/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from glob import glob
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import streamlit as st

In [2]:
files = glob(r"data\*.parquet")
df = pd.read_parquet(files)
df['steps'] = df['steps'].str.strip("[]")
df.iloc[0]['steps']

nan

In [3]:
df['name']

0         arriba   baked winter squash mexican style
1                   a bit different  breakfast pizza
2                          all in the kitchen  chili
3                                 alouette  potatoes
4                 amish  tomato ketchup  for canning
                             ...                    
231632        egyptian slow cooked eggs  beid hamine
231633                  egyptian spiced carrot puree
231634                        egyptian spiced prawns
231635     egyptian spicy meat pie in a phyllo crust
231636                       egyptian spinach omelet
Name: name, Length: 231637, dtype: object

In [5]:
files = glob(r"data\*.parquet")
df = pd.read_parquet(files)
df['ingredients'] = df['ingredients'].str.strip("[]").str.replace("'","").str.replace('"',"").str.split("',").apply(lambda x: [y.strip() for y in x])
print("Shape of the dataframe",df.shape)
df.head(3)

Shape of the dataframe (231637, 15)


Unnamed: 0,name,minutes,n_steps,steps,description,ingredients,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),embedding
0,arriba baked winter squash mexican style,55,11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0,"[-0.008845049, 0.009866926, 0.028063796, 0.101..."
1,a bit different breakfast pizza,30,9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0,"[-0.054575536, 0.027983457, 0.065263726, 0.032..."
2,all in the kitchen chili,130,6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0,"[-0.10401253, -0.024388006, 0.06653514, 0.0346..."


In [6]:
def cosine_similarity(vec1,vec2):
    '''
    Returns the cosine similarity between two vectors of n dimension
    '''
    denom = np.sqrt(np.sum(np.square(vec1))) * np.sqrt(np.sum(np.square(vec2)))
    return np.round(np.dot(vec1,vec2) / denom * 100, 2)

In [7]:
index = 999
data = df.iloc[index]
recipe, vector = data['name'], data['embedding']

print("Name of the dish:", recipe)

Name of the dish: 1890 cream cake


In [8]:
%%time

df_result = df.copy()
df_result['similarity'] = df_result['embedding'].apply(lambda x : cosine_similarity(vector, x))
df_result.drop('embedding', axis=1, inplace=True)
df_result.sort_values(by="similarity", ascending=False).iloc[1:4]

CPU times: total: 3.7 s
Wall time: 3.69 s


Unnamed: 0,name,minutes,n_steps,steps,description,ingredients,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),similarity
170691,white sponge cake,70,21,"[preheat oven to 400 degrees f, grease 3- 12 i...",nice light white cake. delicious served filled...,"[egg yolk, eggs, sugar, salt, baking powder, c...",10,584.2,30.0,168.0,26.0,24.0,13.0,29.0,93.06
3952,almond torta,90,17,"[preheat oven to 325 f, butter and flour a 10 ...",posted for zaar world tour 2005. recipe sourc...,"[almonds, flour, salt, egg yolks, amaretto, va...",11,205.3,17.0,48.0,5.0,15.0,7.0,6.0,92.76
136066,south african beesting cake with custard filling,105,24,"[preheat oven to 350 deg f / 180 deg celsius, ...","i do not know where the ""bee-sting"" comes from...","[flour, baking powder, salt, butter, superfine...",13,500.9,49.0,83.0,15.0,16.0,83.0,15.0,92.58


In [9]:
df_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   name                 231636 non-null  object 
 1   minutes              231637 non-null  int64  
 2   n_steps              231637 non-null  int64  
 3   steps                231637 non-null  object 
 4   description          226658 non-null  object 
 5   ingredients          231637 non-null  object 
 6   n_ingredients        231637 non-null  int64  
 7   calories             231637 non-null  float64
 8   total fat (PDV)      231637 non-null  float64
 9   sugar (PDV)          231637 non-null  float64
 10  sodium (PDV)         231637 non-null  float64
 11  protein (PDV)        231637 non-null  float64
 12  saturated fat (PDV)  231637 non-null  float64
 13  carbohydrates (PDV)  231637 non-null  float64
 14  similarity           231637 non-null  float64
dtypes: float64(8), in

## Plotting the nutrition values

In [73]:
def setColor(pdv):
    if pdv<5:
        return '#8ADAB2'
    
    elif pdv>=5 and pdv<20:
        return '#D0F288'
    
    elif pdv>20:
        return "#DF826C"
    
def plot_nutrition(data):
    
    x = data.index[8:13]
    y = data.values[8:13]

    fig = go.Figure(
        go.Bar(
            name="",
            x = x,
            y = y,
            width = 0.2,
            uirevision = True,
            marker=dict(color = list(map(setColor,y))),
            hovertemplate =
                '<br><b>%{x}</b>: %{y:.2f}'
            ),
            
    )
    fig.update_layout(
        template="plotly_dark",
        margin=dict(l=20, r=20, t=20, b=20)
    )
    fig.update_xaxes(
        showgrid=False,
    )
    # fig.update_yaxes(
    #     showgrid=False,
    #     showticklabels=False
    # )
    fig.layout.xaxis.fixedrange = True
    fig.layout.yaxis.fixedrange = True

    return fig

In [63]:
data.index

Index(['name', 'minutes', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients', 'calories', 'total fat (PDV)', 'sugar (PDV)',
       'sodium (PDV)', 'protein (PDV)', 'saturated fat (PDV)',
       'carbohydrates (PDV)', 'embedding'],
      dtype='object')

In [74]:
fig = plot_nutrition(df_result.iloc[3])
fig.show()