In [1]:
import pandas as pd 
import sys
import os
import json

In [2]:
# Resolve <parent of the working directory>/src so that project-local modules
# can be imported from this notebook.
current_dir = os.getcwd()
project_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
src_path = os.path.join(project_dir, 'src')
# Register the path only once: re-running this cell must not keep growing sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

# Project-local import; it only resolves after src_path is on sys.path.
from utils import load_env_vars
# NOTE(review): presumably populates os.environ (a later cell reads a path via
# os.getenv) -- confirm against src/utils.
load_env_vars()

# Importing the Dataset

In [3]:
# The dataset location is read from the environment rather than hard-coded.
cuisines_ingredients_path = os.getenv('cuisines_ingredients_json')
if not cuisines_ingredients_path:
    # Fail with an actionable message instead of the opaque error raised by open(None).
    raise RuntimeError(
        "Environment variable 'cuisines_ingredients_json' is not set; "
        "it must point to the cuisines/ingredients JSON file."
    )

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and parse straight from the file handle.
with open(cuisines_ingredients_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)
df = pd.DataFrame(data)

In [4]:
# Preview the first five rows of the raw dataset
df.head(n=5)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


# Initial Observations

In [5]:
# (number of rows, number of columns) of the raw dataset
df.shape

(39774, 3)

In [6]:
# Data type of each column
df.dtypes

id              int64
cuisine        object
ingredients    object
dtype: object

In [7]:
# Collect the distinct cuisine labels once, report how many there are, then show them
cuisine_names = df['cuisine'].unique()
print('Number of unique cuisines in the dataset:', len(cuisine_names))
cuisine_names

Number of unique cuisines in the dataset: 20


array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

In [8]:
# Count missing values per column
df.isna().sum()

id             0
cuisine        0
ingredients    0
dtype: int64

# Reformatting the Data

Before anything else, we can drop the `id` column, because it won't be helpful for any of the analyses.

In [9]:
# Rebind instead of mutating in place, and tolerate an already-dropped column,
# so this cell can be re-run without raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

Currently, the ingredients are stored in a column of lists. This isn't useful for the type of analysis I want to do, so I'll need to make the ingredients numerical. That can be done with one-hot encoding. Each row will still be a dish, but the columns will be all the ingredients in the dataset. If the dish has the ingredient, the value will be 1, and 0 otherwise

In [10]:
# Spread each dish's ingredient list across positional columns (0, 1, 2, ...).
# Building the frame from the list of lists avoids constructing one Series per
# row, which is what makes `.apply(pd.Series)` slow on a dataset this size.
# Shorter lists are padded with missing values; get_dummies ignores those by default.
df_expanded = pd.DataFrame(df['ingredients'].tolist(), index=df.index)
df_expanded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,romaine lettuce,black olives,grape tomatoes,garlic,pepper,purple onion,seasoning,garbanzo beans,feta cheese crumbles,,...,,,,,,,,,,
1,plain flour,ground pepper,salt,tomatoes,ground black pepper,thyme,eggs,green tomatoes,yellow corn meal,milk,...,,,,,,,,,,
2,eggs,pepper,salt,mayonaise,cooking oil,green chilies,grilled chicken breasts,garlic powder,yellow onion,soy sauce,...,,,,,,,,,,
3,water,vegetable oil,wheat,salt,,,,,,,...,,,,,,,,,,
4,black pepper,shallots,cornflour,cayenne pepper,onions,garlic paste,milk,butter,salt,lemon juice,...,,,,,,,,,,


Each row is a dish, and the columns are the ingredients in order. Now one-hot encoding can be applied. In the resulting dataset, there will be many duplicate columns. This happens because the same ingredient can appear at different positions in different dishes' ingredient lists, and one-hot encoding creates a separate column for every position an ingredient appears in.

However, this issue can easily be resolved by grouping the columns that share a name and summing them. If a dish contains an ingredient that has duplicate columns, a 1 will only show up in one of those columns (assuming the dish doesn't list the same ingredient twice). After grouping, the ingredients that a dish contains will have a value of 1, and the ones that the dish doesn't have will have a value of 0.

In [11]:
# One-hot encode every positional column. The empty prefix and separator make
# each resulting column name just the ingredient itself, so an ingredient that
# occurs in several positional columns yields several columns with the same name.
df_expanded = pd.get_dummies(df_expanded, prefix='', prefix_sep='')
df_expanded.head()

Unnamed: 0,( oz.) tomato sauce,(14.5 oz.) diced tomatoes,1% low-fat buttermilk,1% low-fat cottage cheese,1% low-fat milk,2% reduced-fat milk,40% less sodium taco seasoning,Accent Seasoning,Alfredo sauce,American cheese,...,butter,panko breadcrumbs,low-fat milk,multi-grain penne pasta,extra-virgin olive oil,cilantro leaves,green pepper,shredded mozzarella cheese,fresh parsley,spaghetti
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


At the current size of the dataset, grouping all the columns at once would use too much memory. I'll split the dataset row-wise into pieces, group the columns of each piece individually, then put the dataset back together.

In [12]:
# Makes a list containing each piece of the one-hot encoded dataset
N_PIECES = 21  # target number of row-wise pieces
n_rows = len(df_expanded.index)
# Number of rows for each piece, rounded UP so that no trailing rows are silently
# dropped when the row count is not an exact multiple of N_PIECES.
x = max(1, -(-n_rows // N_PIECES))
# Step through the rows in strides of x; the last piece may be shorter.
# max(n_rows, 1) keeps a single (empty) piece for an empty frame.
split_df_list = [
    df_expanded.iloc[start : start + x, :]
    for start in range(0, max(n_rows, 1), x)
]

In [None]:
# For each piece, group the columns that share an ingredient name and sum them.
# `groupby(..., axis=1)` is deprecated in pandas 2.1 and removed in 3.0, so the
# piece is transposed, grouped on its (now row) labels, and transposed back.
# NOTE(review): summing yields a count, so a dish listing the same ingredient
# twice would get 2 rather than 1 -- confirm whether that occurs in this dataset.
for i, piece in enumerate(split_df_list):
    split_df_list[i] = piece.T.groupby(level=0).sum().T

In [14]:
# Stack the grouped pieces row-wise to rebuild the full dataset
df_expanded = pd.concat(objs=split_df_list, axis='index')

Currently, df_expanded is missing the cuisine column, so I'll add that back

In [16]:
# Re-attach the cuisine labels as the first column (rows are aligned on the index)
cuisine = df['cuisine'].to_frame()
df_expanded = pd.concat([cuisine, df_expanded], axis='columns')
print('New dimensions:', df_expanded.shape)
df_expanded.head()

New dimensions: (39774, 6715)


Unnamed: 0,cuisine,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,...,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,greek,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,southern_us,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,filipino,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The number of columns went from about 43,000 down to 6,715 (including `cuisine`), which is a big improvement.