In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import json as js
import os as os
import matplotlib.pyplot as plt

# Cleaning the dataset

In [2]:
json_data = pd.read_json('../data/train.json')
json_data

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


From the above, we can see that the train data is presented as a json file format with 39774 rows and 3 columns. 

In [3]:
#convert the json file into csv
json_data.to_csv('data_csv.csv', encoding='utf-8')

#import the csv file as a pandas dataframe
data_csv = pd.read_csv('data_csv.csv')

#rename columns properly 
data_csv.columns = ["Index","ID","Cuisine","Ingredients"]

#store the renamed dataframe as a csv file
data_csv.to_csv('data_csv.csv', encoding='utf-8')
data_csv

Unnamed: 0,Index,ID,Cuisine,Ingredients
0,0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom..."
1,1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']"
4,4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay..."
...,...,...,...,...
39769,39769,29109,irish,"['light brown sugar', 'granulated sugar', 'but..."
39770,39770,11462,italian,"['KRAFT Zesty Italian Dressing', 'purple onion..."
39771,39771,2238,irish,"['eggs', 'citrus fruit', 'raisins', 'sourdough..."
39772,39772,41882,chinese,"['boneless chicken skinless thigh', 'minced ga..."


In [4]:
test_str = data_csv.Ingredients[0]
print("The data type is: ",type(test_str))
print("The output looks like: ",test_str)

The data type is:  <class 'str'>
The output looks like:  ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']


From the above, we can see that under the ingredients column, it is currently treated as a string, hence we need to split up each of the ingredients from each cell. 

In [5]:
#now we test the removal of [] and ''
test_str = data_csv.Ingredients[0].replace("'",'').strip('[]')
print("The data type is: ",type(test_str))
print("The output looks like: ",test_str)

The data type is:  <class 'str'>
The output looks like:  romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles


In [6]:
#create new dataframe for removal of [] and '' for all rows under ingredients column
test_csv = pd.DataFrame(data_csv)
test_csv

Unnamed: 0,Index,ID,Cuisine,Ingredients
0,0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom..."
1,1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']"
4,4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay..."
...,...,...,...,...
39769,39769,29109,irish,"['light brown sugar', 'granulated sugar', 'but..."
39770,39770,11462,italian,"['KRAFT Zesty Italian Dressing', 'purple onion..."
39771,39771,2238,irish,"['eggs', 'citrus fruit', 'raisins', 'sourdough..."
39772,39772,41882,chinese,"['boneless chicken skinless thigh', 'minced ga..."


In [7]:
#removal process
test_csv.loc[:, 'Ingredients'] = (
    test_csv['Ingredients']
    .str.replace("'", "", regex=False)
    .str.strip("[]")
)


In [8]:
test_csv.to_csv('data_csv.csv', index = False, encoding='utf-8')
data_csv 
#all ingredients do not have quotations and square brackets now

Unnamed: 0,Index,ID,Cuisine,Ingredients
0,0,10259,greek,"romaine lettuce, black olives, grape tomatoes,..."
1,1,25693,southern_us,"plain flour, ground pepper, salt, tomatoes, gr..."
2,2,20130,filipino,"eggs, pepper, salt, mayonaise, cooking oil, gr..."
3,3,22213,indian,"water, vegetable oil, wheat, salt"
4,4,13162,indian,"black pepper, shallots, cornflour, cayenne pep..."
...,...,...,...,...
39769,39769,29109,irish,"light brown sugar, granulated sugar, butter, w..."
39770,39770,11462,italian,"KRAFT Zesty Italian Dressing, purple onion, br..."
39771,39771,2238,irish,"eggs, citrus fruit, raisins, sourdough starter..."
39772,39772,41882,chinese,"boneless chicken skinless thigh, minced garlic..."


In [9]:
#compilation of all ingredients.
ingredient_list = ", ".join(data_csv["Ingredients"].astype(str)) + ", "
ingredient_list

'romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles, plain flour, ground pepper, salt, tomatoes, ground black pepper, thyme, eggs, green tomatoes, yellow corn meal, milk, vegetable oil, eggs, pepper, salt, mayonaise, cooking oil, green chilies, grilled chicken breasts, garlic powder, yellow onion, soy sauce, butter, chicken livers, water, vegetable oil, wheat, salt, black pepper, shallots, cornflour, cayenne pepper, onions, garlic paste, milk, butter, salt, lemon juice, water, chili powder, passata, oil, ground cumin, boneless chicken skinless thigh, garam masala, double cream, natural yogurt, bay leaf, plain flour, sugar, butter, eggs, fresh ginger root, salt, ground cinnamon, milk, vanilla extract, ground ginger, powdered sugar, baking powder, olive oil, salt, medium shrimp, pepper, garlic, chopped cilantro, jalapeno chilies, flat leaf parsley, skirt steak, white vinegar, sea salt, bay leaf, chorizo sausage, su

From the above, we can see that there are issues with duplicate ingredients.

In [10]:
#change the string of ingredients into a list instead
ingredient_list_non_unique = list(ingredient_list.split(", "))
ingredient_list_non_unique

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles',
 'plain flour',
 'ground pepper',
 'salt',
 'tomatoes',
 'ground black pepper',
 'thyme',
 'eggs',
 'green tomatoes',
 'yellow corn meal',
 'milk',
 'vegetable oil',
 'eggs',
 'pepper',
 'salt',
 'mayonaise',
 'cooking oil',
 'green chilies',
 'grilled chicken breasts',
 'garlic powder',
 'yellow onion',
 'soy sauce',
 'butter',
 'chicken livers',
 'water',
 'vegetable oil',
 'wheat',
 'salt',
 'black pepper',
 'shallots',
 'cornflour',
 'cayenne pepper',
 'onions',
 'garlic paste',
 'milk',
 'butter',
 'salt',
 'lemon juice',
 'water',
 'chili powder',
 'passata',
 'oil',
 'ground cumin',
 'boneless chicken skinless thigh',
 'garam masala',
 'double cream',
 'natural yogurt',
 'bay leaf',
 'plain flour',
 'sugar',
 'butter',
 'eggs',
 'fresh ginger root',
 'salt',
 'ground cinnamon',
 'milk',
 'vanilla extract',
 'ground ginger',
 'po

In [11]:
#change the list of ingredients into a set because set cannot have duplicate values
ingredients_set = set(ingredient_list_non_unique)
final_ingredient_list = list(ingredients_set)
final_ingredient_list

['',
 'oyster-flavor sauc',
 'chicken breast halves',
 'celery ribs',
 'grated cauliflower',
 'mango juice',
 'sofrito',
 'anjou pears',
 'pink salmon',
 'pisco',
 'Reblochon',
 'ice cream',
 'manioc flour',
 'sweet italian sausage',
 'frozen crabmeat',
 'pickle juice',
 'slider rolls',
 'fat free reduced sodium chicken broth',
 'cumin',
 'praline topping',
 'pepper leaves',
 'dill',
 'hard cider',
 'Pepperidge Farm Puff Pastry Sheets',
 'uncooked ziti',
 'pasta sheets',
 'ragu cheesi doubl cheddar sauc',
 'pigeon peas',
 'regular cucumber',
 'cocktail pumpernickel bread',
 'baguette',
 'tri-tip steak',
 'quinces',
 'ale',
 'xanthan gum',
 'rice milk',
 'italian style stewed tomatoes',
 'duck drumsticks',
 'smoked chorizo',
 'veal',
 'organic buttermilk',
 'whole milk',
 'coffee',
 'frozen popcorn chicken',
 'toffee bits',
 'olive tapenade',
 'harissa paste',
 'lop chong',
 'soba',
 'honey liqueur',
 'mojo marinade',
 'canned chopped tomatoes',
 'whole wheat breadcrumbs',
 'kumquats',


In [12]:
final_ingredient_list.remove('') #the first data point is removed
number = len(final_ingredient_list)
print('Number of unique ingredients are: '+str(number)) 
final_ingredient_list

Number of unique ingredients are: 6724


['oyster-flavor sauc',
 'chicken breast halves',
 'celery ribs',
 'grated cauliflower',
 'mango juice',
 'sofrito',
 'anjou pears',
 'pink salmon',
 'pisco',
 'Reblochon',
 'ice cream',
 'manioc flour',
 'sweet italian sausage',
 'frozen crabmeat',
 'pickle juice',
 'slider rolls',
 'fat free reduced sodium chicken broth',
 'cumin',
 'praline topping',
 'pepper leaves',
 'dill',
 'hard cider',
 'Pepperidge Farm Puff Pastry Sheets',
 'uncooked ziti',
 'pasta sheets',
 'ragu cheesi doubl cheddar sauc',
 'pigeon peas',
 'regular cucumber',
 'cocktail pumpernickel bread',
 'baguette',
 'tri-tip steak',
 'quinces',
 'ale',
 'xanthan gum',
 'rice milk',
 'italian style stewed tomatoes',
 'duck drumsticks',
 'smoked chorizo',
 'veal',
 'organic buttermilk',
 'whole milk',
 'coffee',
 'frozen popcorn chicken',
 'toffee bits',
 'olive tapenade',
 'harissa paste',
 'lop chong',
 'soba',
 'honey liqueur',
 'mojo marinade',
 'canned chopped tomatoes',
 'whole wheat breadcrumbs',
 'kumquats',
 'red

In [13]:
#create a new dataframe consisting of ingredients only
ingredients_df = pd.DataFrame(final_ingredient_list)

#convert to csv file
ingredients_df.to_csv('ingredients.csv',index=False, encoding='utf-8')
ingredients_df = pd.read_csv('ingredients.csv')
ingredients_df.columns=['Ingredients'] #name empty columns
ingredients_df['Ingredients'] = ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
ingredients_df=ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
ingredients_df=ingredients_df.reset_index(drop=True) 
ingredients_df.to_csv('ingredients.csv', index = True, encoding='utf-8')
ingredients_df = pd.read_csv('ingredients.csv')

#rename the index column
ingredients_df.columns=['Index','Ingredients']  
ingredients_df.to_csv('ingredients.csv', index = False, encoding='utf-8')
ingredients_df

Unnamed: 0,Index,Ingredients
0,0,"""best foods mayonnaise with lime juice"""
1,1,"""breakstones sour cream"""
2,2,"""campbells condensed cheddar cheese soup"""
3,3,"""campbells condensed cream of chicken soup"""
4,4,"""campbells condensed cream of mushroom soup"""
...,...,...
6719,6719,Zesty italian dressing
6720,6720,Zinfandel
6721,6721,Ziti
6722,6722,Zucchini


In [14]:
#compilation of all cuisines 
cuisine_list = "" 
for x in data_csv.Index:
    cuisine_list = cuisine_list+data_csv.Cuisine[x]+', '
cuisine_list

'greek, southern_us, filipino, indian, indian, jamaican, spanish, italian, mexican, italian, italian, chinese, italian, mexican, italian, indian, british, italian, thai, vietnamese, thai, mexican, southern_us, chinese, italian, chinese, cajun_creole, italian, chinese, mexican, italian, cajun_creole, mexican, thai, italian, cajun_creole, italian, filipino, southern_us, southern_us, italian, brazilian, mexican, indian, mexican, chinese, french, southern_us, southern_us, southern_us, japanese, southern_us, italian, southern_us, italian, jamaican, japanese, indian, italian, irish, thai, thai, indian, jamaican, italian, thai, korean, french, french, southern_us, spanish, indian, moroccan, italian, italian, moroccan, moroccan, vietnamese, japanese, mexican, cajun_creole, southern_us, french, indian, indian, italian, italian, indian, korean, spanish, italian, southern_us, southern_us, greek, mexican, mexican, southern_us, indian, italian, italian, italian, japanese, italian, greek, korean, in

From the above we can see that there are issues with duplicate cuisines. 

In [15]:
#change the string of cuisines into a list instead
cuisine_list_non_unique = list(cuisine_list.split(", "))
cuisine_list_non_unique

['greek',
 'southern_us',
 'filipino',
 'indian',
 'indian',
 'jamaican',
 'spanish',
 'italian',
 'mexican',
 'italian',
 'italian',
 'chinese',
 'italian',
 'mexican',
 'italian',
 'indian',
 'british',
 'italian',
 'thai',
 'vietnamese',
 'thai',
 'mexican',
 'southern_us',
 'chinese',
 'italian',
 'chinese',
 'cajun_creole',
 'italian',
 'chinese',
 'mexican',
 'italian',
 'cajun_creole',
 'mexican',
 'thai',
 'italian',
 'cajun_creole',
 'italian',
 'filipino',
 'southern_us',
 'southern_us',
 'italian',
 'brazilian',
 'mexican',
 'indian',
 'mexican',
 'chinese',
 'french',
 'southern_us',
 'southern_us',
 'southern_us',
 'japanese',
 'southern_us',
 'italian',
 'southern_us',
 'italian',
 'jamaican',
 'japanese',
 'indian',
 'italian',
 'irish',
 'thai',
 'thai',
 'indian',
 'jamaican',
 'italian',
 'thai',
 'korean',
 'french',
 'french',
 'southern_us',
 'spanish',
 'indian',
 'moroccan',
 'italian',
 'italian',
 'moroccan',
 'moroccan',
 'vietnamese',
 'japanese',
 'mexican',

In [16]:
#change the list of cuisines into a set because set cannot have duplicate values
cuisine_set = set(cuisine_list_non_unique)
final_cuisine_list = list(cuisine_set)
final_cuisine_list

['',
 'japanese',
 'spanish',
 'italian',
 'mexican',
 'british',
 'vietnamese',
 'brazilian',
 'thai',
 'chinese',
 'french',
 'southern_us',
 'moroccan',
 'jamaican',
 'cajun_creole',
 'indian',
 'irish',
 'korean',
 'greek',
 'filipino',
 'russian']

In [17]:
final_cuisine_list.remove('') #the first data point is removed
number = len(final_cuisine_list)
print('Number of unique cuisines are: '+str(number)) 
final_cuisine_list

Number of unique cuisines are: 20


['japanese',
 'spanish',
 'italian',
 'mexican',
 'british',
 'vietnamese',
 'brazilian',
 'thai',
 'chinese',
 'french',
 'southern_us',
 'moroccan',
 'jamaican',
 'cajun_creole',
 'indian',
 'irish',
 'korean',
 'greek',
 'filipino',
 'russian']

In [18]:
#create a new dataframe consisting of cusines only
cuisines_df = pd.DataFrame(final_cuisine_list)

#convert to csv file
cuisines_df.to_csv('cuisines.csv', index=False, encoding='utf-8')
cuisines_df = pd.read_csv('cuisines.csv')
cuisines_df.columns=['Cuisine'] #name empty columns
cuisines_df['Cuisine'] = cuisines_df['Cuisine'].str.capitalize() #make nicer, caps first letter of each word
cuisines_df=cuisines_df.sort_values(by="Cuisine")

#drop the old index and relabel the index column, starting from 0 onwards
cuisines_df=cuisines_df.reset_index(drop=True) 
cuisines_df.to_csv('cuisines.csv', index = True, encoding='utf-8')
cuisines_df = pd.read_csv('cuisines.csv')

#rename the index column
cuisines_df.columns=['Index','Cuisines']  
cuisines_df.to_csv('cuisines.csv', index = False, encoding='utf-8')
cuisines_df

Unnamed: 0,Index,Cuisines
0,0,Brazilian
1,1,British
2,2,Cajun_creole
3,3,Chinese
4,4,Filipino
5,5,French
6,6,Greek
7,7,Indian
8,8,Irish
9,9,Italian


Next, we are going to do additional data cleaning to sort out the frequency of ingredients for each cuisine.

In [19]:
#refer back to original dataframe
data_csv

Unnamed: 0,Index,ID,Cuisine,Ingredients
0,0,10259,greek,"romaine lettuce, black olives, grape tomatoes,..."
1,1,25693,southern_us,"plain flour, ground pepper, salt, tomatoes, gr..."
2,2,20130,filipino,"eggs, pepper, salt, mayonaise, cooking oil, gr..."
3,3,22213,indian,"water, vegetable oil, wheat, salt"
4,4,13162,indian,"black pepper, shallots, cornflour, cayenne pep..."
...,...,...,...,...
39769,39769,29109,irish,"light brown sugar, granulated sugar, butter, w..."
39770,39770,11462,italian,"KRAFT Zesty Italian Dressing, purple onion, br..."
39771,39771,2238,irish,"eggs, citrus fruit, raisins, sourdough starter..."
39772,39772,41882,chinese,"boneless chicken skinless thigh, minced garlic..."


In [20]:
#test with greek:find out all the ingredients in a greek cuisine 
greek_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'greek':
        greek_list = greek_list+data_csv.Ingredients[x]+', '
greek_list

'romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles, ground pork, finely chopped fresh parsley, onions, salt, vinegar, caul fat, minced garlic, dried oregano, red wine vinegar, olive oil, boneless chop pork, lemon juice, orange, anise, cinnamon sticks, unflavored gelatin, zinfandel, orange blossom honey, sugar, lemon, calimyrna figs, clove, honey, whipping cream, plain whole-milk yogurt, fresh dill, yoghurt, salt, myzithra, large eggs, cheese, feta cheese, phyllo, kefalotyri, ground black pepper, extra-virgin olive oil, onions, olive oil, salt, hamburger buns, paprika, chopped fresh mint, ground cinnamon, balsamic vinegar, feta cheese crumbles, baby spinach leaves, purple onion, ground lamb, pepper, dried mint flakes, salt, dried oregano, tomatoes, ground black pepper, garlic, dried dillweed, olive oil, red wine, lamb, plain yogurt, pita bread rounds, purple onion, cucumber, garbanzo beans, liquid, black pepper, 

Again, from the above, we can see that there are duplicate ingredients in greek suisine 

In [21]:
#change the string of ingredients in greek into a list instead
greek_list_non_unique = list(greek_list.split(", "))
greek_list_non_unique

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles',
 'ground pork',
 'finely chopped fresh parsley',
 'onions',
 'salt',
 'vinegar',
 'caul fat',
 'minced garlic',
 'dried oregano',
 'red wine vinegar',
 'olive oil',
 'boneless chop pork',
 'lemon juice',
 'orange',
 'anise',
 'cinnamon sticks',
 'unflavored gelatin',
 'zinfandel',
 'orange blossom honey',
 'sugar',
 'lemon',
 'calimyrna figs',
 'clove',
 'honey',
 'whipping cream',
 'plain whole-milk yogurt',
 'fresh dill',
 'yoghurt',
 'salt',
 'myzithra',
 'large eggs',
 'cheese',
 'feta cheese',
 'phyllo',
 'kefalotyri',
 'ground black pepper',
 'extra-virgin olive oil',
 'onions',
 'olive oil',
 'salt',
 'hamburger buns',
 'paprika',
 'chopped fresh mint',
 'ground cinnamon',
 'balsamic vinegar',
 'feta cheese crumbles',
 'baby spinach leaves',
 'purple onion',
 'ground lamb',
 'pepper',
 'dried mint flakes',
 'salt',
 'dried orega

In [22]:
#change the list of ingredients into a set because set cannot have duplicate values
greek_set = set(greek_list_non_unique)
final_greek_list = list(greek_set)
final_greek_list

['asian eggplants',
 'low-fat plain greek yogurt',
 'smoked paprika',
 '',
 'pitted date',
 'golden zucchini',
 'bechamel',
 'chicken breast halves',
 'parsley flakes',
 'beet greens',
 'stevia extract',
 'celery ribs',
 'egg noodles',
 'curly leaf spinach',
 'hazelnuts',
 'caster sugar',
 'white onion',
 'round loaf',
 'reduced fat milk',
 'mint',
 'farmer cheese',
 'green lentil',
 'frozen mixed berries',
 'mango juice',
 'potatoes',
 'pure vanilla extract',
 'olive oil cooking spray',
 'tuna packed in water',
 'italian plum tomatoes',
 'bacon',
 'portabello mushroom',
 'sour cherries',
 'chili sauce',
 'spanish onion',
 'plain low fat greek yogurt',
 'Mazola Canola Oil',
 'stuffing mix',
 'dill weed',
 'pomegranate seeds',
 'lemon',
 'burger buns',
 'pickle relish',
 'phyllo dough',
 'branzino',
 'farro',
 'chicken breasts',
 'basil',
 'whole milk greek yogurt',
 'fresh spinach leaves',
 'center cut loin pork chop',
 'pecans',
 'toasted pine nuts',
 'sour cream',
 'havarti cheese',


In [23]:
final_greek_list.remove('') #the first data point is removed
number = len(final_greek_list)
print('Number of unique ingredients in greek cuisine are: '+str(number)) 
final_greek_list

Number of unique ingredients in greek cuisine are: 1203


['asian eggplants',
 'low-fat plain greek yogurt',
 'smoked paprika',
 'pitted date',
 'golden zucchini',
 'bechamel',
 'chicken breast halves',
 'parsley flakes',
 'beet greens',
 'stevia extract',
 'celery ribs',
 'egg noodles',
 'curly leaf spinach',
 'hazelnuts',
 'caster sugar',
 'white onion',
 'round loaf',
 'reduced fat milk',
 'mint',
 'farmer cheese',
 'green lentil',
 'frozen mixed berries',
 'mango juice',
 'potatoes',
 'pure vanilla extract',
 'olive oil cooking spray',
 'tuna packed in water',
 'italian plum tomatoes',
 'bacon',
 'portabello mushroom',
 'sour cherries',
 'chili sauce',
 'spanish onion',
 'plain low fat greek yogurt',
 'Mazola Canola Oil',
 'stuffing mix',
 'dill weed',
 'pomegranate seeds',
 'lemon',
 'burger buns',
 'pickle relish',
 'phyllo dough',
 'branzino',
 'farro',
 'chicken breasts',
 'basil',
 'whole milk greek yogurt',
 'fresh spinach leaves',
 'center cut loin pork chop',
 'pecans',
 'toasted pine nuts',
 'sour cream',
 'havarti cheese',
 'pit

In [24]:
#create a new dataframe consisting of greek ingredients only
greek_ingredients_df = pd.DataFrame(final_greek_list)

#convert to csv file
greek_ingredients_df.to_csv('greek_ingredients.csv',index=False, encoding='utf-8')
greek_ingredients_df = pd.read_csv('greek_ingredients.csv')
greek_ingredients_df.columns=['Ingredients'] #name empty columns
greek_ingredients_df['Ingredients'] = greek_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
greek_ingredients_df=greek_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
greek_ingredients_df=greek_ingredients_df.reset_index(drop=True) 
greek_ingredients_df.to_csv('greek_ingredients.csv', index = True, encoding='utf-8')
greek_ingredients_df = pd.read_csv('greek_ingredients.csv')

#rename the index column
greek_ingredients_df.columns=['Index','Ingredients']  
greek_ingredients_df.to_csv('greek_ingredients.csv', index = False, encoding='utf-8')
greek_ingredients_df

Unnamed: 0,Index,Ingredients
0,0,"""hellmann or best food light mayonnais"""
1,1,"""hellmann or best food real mayonnais"""
2,2,"""i cant believ it not butter! made with olive ..."
3,3,"""uncle bens original converted brand rice"""
4,4,(10 oz.) frozen chopped spinach
...,...,...
1198,1198,Zest
1199,1199,Zesty italian dressing
1200,1200,Zinfandel
1201,1201,Ziti


In [25]:
#test with brazilian:find out all the ingredients in a brazilian cuisine 
brazilian_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'brazilian':
        brazilian_list = brazilian_list+data_csv.Ingredients[x]+', '
brazilian_list

#change the string of ingredients in brazilian into a list instead
brazilian_list_non_unique = list(brazilian_list.split(", "))
brazilian_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
brazilian_set = set(brazilian_list_non_unique)
final_brazilian_list = list(brazilian_set)
final_brazilian_list

final_brazilian_list.remove('') #the first data point is removed
number = len(final_brazilian_list)
print('Number of unique ingredients in brazilian cuisine are: '+str(number)) 
final_brazilian_list

#create a new dataframe consisting of brazilian ingredients only
brazilian_ingredients_df = pd.DataFrame(final_brazilian_list)

#convert to csv file
brazilian_ingredients_df.to_csv('brazilian_ingredients.csv',index=False, encoding='utf-8')
brazilian_ingredients_df = pd.read_csv('brazilian_ingredients.csv')
brazilian_ingredients_df.columns=['Ingredients'] #name empty columns
brazilian_ingredients_df['Ingredients'] = brazilian_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
brazilian_ingredients_df=brazilian_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
brazilian_ingredients_df=brazilian_ingredients_df.reset_index(drop=True) 
brazilian_ingredients_df.to_csv('brazilian_ingredients.csv', index = True, encoding='utf-8')
brazilian_ingredients_df = pd.read_csv('brazilian_ingredients.csv')

#rename the index column
brazilian_ingredients_df.columns=['Index','Ingredients']  
brazilian_ingredients_df.to_csv('brazilian_ingredients.csv', index = False, encoding='utf-8')
brazilian_ingredients_df


Number of unique ingredients in brazilian cuisine are: 853


Unnamed: 0,Index,Ingredients
0,0,1% low-fat milk
1,1,2% reduced-fat milk
2,2,Acai juice
3,3,Active dry yeast
4,4,Adobo style seasoning
...,...,...
848,848,Yuca
849,849,Yucca
850,850,Yucca root
851,851,Yukon gold potatoes


In [26]:
#test with british:find out all the ingredients in a british cuisine 
british_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'british':
        british_list = british_list+data_csv.Ingredients[x]+', '
british_list

#change the string of ingredients in british into a list instead
british_list_non_unique = list(british_list.split(", "))
british_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
british_set = set(british_list_non_unique)
final_british_list = list(british_set)
final_british_list

final_british_list.remove('') #the first data point is removed
number = len(final_british_list)
print('Number of unique ingredients in british cuisine are: '+str(number)) 
final_british_list

#create a new dataframe consisting of british ingredients only
british_ingredients_df = pd.DataFrame(final_british_list)

#convert to csv file
british_ingredients_df.to_csv('british_ingredients.csv',index=False, encoding='utf-8')
british_ingredients_df = pd.read_csv('british_ingredients.csv')
british_ingredients_df.columns=['Ingredients'] #name empty columns
british_ingredients_df['Ingredients'] = british_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
british_ingredients_df=british_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
british_ingredients_df=british_ingredients_df.reset_index(drop=True) 
british_ingredients_df.to_csv('british_ingredients.csv', index = True, encoding='utf-8')
british_ingredients_df = pd.read_csv('british_ingredients.csv')

#rename the index column
british_ingredients_df.columns=['Index','Ingredients']  
british_ingredients_df.to_csv('british_ingredients.csv', index = False, encoding='utf-8')
british_ingredients_df


Number of unique ingredients in british cuisine are: 1166


Unnamed: 0,Index,Ingredients
0,0,"""colmans mustard powder"""
1,1,"""coxs orange pippin"""
2,2,2% reduced-fat milk
3,3,Active dry yeast
4,4,Aged cheddar cheese
...,...,...
1161,1161,Yellow split peas
1162,1162,Yoghurt
1163,1163,Yukon gold
1164,1164,Yukon gold potatoes


In [27]:
#test with cajun_creole:find out all the ingredients in a cajun_creole cuisine 
cajun_creole_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'cajun_creole':
        cajun_creole_list = cajun_creole_list+data_csv.Ingredients[x]+', '
cajun_creole_list

#change the string of ingredients in cajun_creole into a list instead
cajun_creole_list_non_unique = list(cajun_creole_list.split(", "))
cajun_creole_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
cajun_creole_set = set(cajun_creole_list_non_unique)
final_cajun_creole_list = list(cajun_creole_set)
final_cajun_creole_list

final_cajun_creole_list.remove('') #the first data point is removed
number = len(final_cajun_creole_list)
print('Number of unique ingredients in cajun_creole cuisine are: '+str(number)) 
final_cajun_creole_list

#create a new dataframe consisting of cajun_creole ingredients only
cajun_creole_ingredients_df = pd.DataFrame(final_cajun_creole_list)

#convert to csv file
cajun_creole_ingredients_df.to_csv('cajun_creole_ingredients.csv',index=False, encoding='utf-8')
cajun_creole_ingredients_df = pd.read_csv('cajun_creole_ingredients.csv')
cajun_creole_ingredients_df.columns=['Ingredients'] #name empty columns
cajun_creole_ingredients_df['Ingredients'] = cajun_creole_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
cajun_creole_ingredients_df=cajun_creole_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
cajun_creole_ingredients_df=cajun_creole_ingredients_df.reset_index(drop=True) 
cajun_creole_ingredients_df.to_csv('cajun_creole_ingredients.csv', index = True, encoding='utf-8')
cajun_creole_ingredients_df = pd.read_csv('cajun_creole_ingredients.csv')

#rename the index column
cajun_creole_ingredients_df.columns=['Index','Ingredients']  
cajun_creole_ingredients_df.to_csv('cajun_creole_ingredients.csv', index = False, encoding='utf-8')
cajun_creole_ingredients_df


Number of unique ingredients in cajun_creole cuisine are: 1580


Unnamed: 0,Index,Ingredients
0,0,"""hellmann or best food real mayonnais"""
1,1,"""hellmanns® real mayonnaise"""
2,2,"""johnsonville® hot n spicy brats"""
3,3,"""potatoes obrien"""
4,4,"""tony chacheres seasoning"""
...,...,...
1575,1575,Yukon gold
1576,1576,Yukon gold potatoes
1577,1577,Zatarains creole seasoning
1578,1578,Zatarain’s jambalaya mix


In [28]:
#test with chinese:find out all the ingredients in a chinese cuisine 
chinese_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'chinese':
        chinese_list = chinese_list+data_csv.Ingredients[x]+', '
chinese_list

#change the string of ingredients in chinese into a list instead
chinese_list_non_unique = list(chinese_list.split(", "))
chinese_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
chinese_set = set(chinese_list_non_unique)
final_chinese_list = list(chinese_set)
final_chinese_list

final_chinese_list.remove('') #the first data point is removed
number = len(final_chinese_list)
print('Number of unique ingredients in chinese cuisine are: '+str(number)) 
final_chinese_list

#create a new dataframe consisting of chinese ingredients only
chinese_ingredients_df = pd.DataFrame(final_chinese_list)

#convert to csv file
chinese_ingredients_df.to_csv('chinese_ingredients.csv',index=False, encoding='utf-8')
chinese_ingredients_df = pd.read_csv('chinese_ingredients.csv')
chinese_ingredients_df.columns=['Ingredients'] #name empty columns
chinese_ingredients_df['Ingredients'] = chinese_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
chinese_ingredients_df=chinese_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
chinese_ingredients_df=chinese_ingredients_df.reset_index(drop=True) 
chinese_ingredients_df.to_csv('chinese_ingredients.csv', index = True, encoding='utf-8')
chinese_ingredients_df = pd.read_csv('chinese_ingredients.csv')

#rename the index column
chinese_ingredients_df.columns=['Index','Ingredients']  
chinese_ingredients_df.to_csv('chinese_ingredients.csv', index = False, encoding='utf-8')
chinese_ingredients_df


Number of unique ingredients in chinese cuisine are: 1796


Unnamed: 0,Index,Ingredients
0,0,"""egglands best® eggs"""
1,1,"""i cant believe its not butter!® spread"""
2,2,"""pigs trotters"""
3,3,33% less sodium smoked fully cooked ham
4,4,Abalone
...,...,...
1791,1791,Yellow squash
1792,1792,Yolk
1793,1793,Yu choy
1794,1794,Zest


In [29]:
#test with filipino:find out all the ingredients in a filipino cuisine 
filipino_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'filipino':
        filipino_list = filipino_list+data_csv.Ingredients[x]+', '
filipino_list

#change the string of ingredients in filipino into a list instead
filipino_list_non_unique = list(filipino_list.split(", "))
filipino_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
filipino_set = set(filipino_list_non_unique)
final_filipino_list = list(filipino_set)
final_filipino_list

final_filipino_list.remove('') #the first data point is removed
number = len(final_filipino_list)
print('Number of unique ingredients in filipino cuisine are: '+str(number)) 
final_filipino_list

#create a new dataframe consisting of filipino ingredients only
filipino_ingredients_df = pd.DataFrame(final_filipino_list)

#convert to csv file
filipino_ingredients_df.to_csv('filipino_ingredients.csv',index=False, encoding='utf-8')
filipino_ingredients_df = pd.read_csv('filipino_ingredients.csv')
filipino_ingredients_df.columns=['Ingredients'] #name empty columns
filipino_ingredients_df['Ingredients'] = filipino_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
filipino_ingredients_df=filipino_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
filipino_ingredients_df=filipino_ingredients_df.reset_index(drop=True) 
filipino_ingredients_df.to_csv('filipino_ingredients.csv', index = True, encoding='utf-8')
filipino_ingredients_df = pd.read_csv('filipino_ingredients.csv')

#rename the index column
filipino_ingredients_df.columns=['Index','Ingredients']  
filipino_ingredients_df.to_csv('filipino_ingredients.csv', index = False, encoding='utf-8')
filipino_ingredients_df


Number of unique ingredients in filipino cuisine are: 949


Unnamed: 0,Index,Ingredients
0,0,"""franks® redhot® original cayenne pepper sauce"""
1,1,7 up
2,2,Accent seasoning
3,3,Achiote powder
4,4,Acorn squash
...,...,...
944,944,Yellow squash
945,945,Young coconut meat
946,946,Yucca
947,947,Yukon gold potatoes


In [30]:
#test with french:find out all the ingredients in a french cuisine 
french_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'french':
        french_list = french_list+data_csv.Ingredients[x]+', '
french_list

#change the string of ingredients in french into a list instead
french_list_non_unique = list(french_list.split(", "))
french_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
french_set = set(french_list_non_unique)
final_french_list = list(french_set)
final_french_list

final_french_list.remove('') #the first data point is removed
number = len(final_french_list)
print('Number of unique ingredients in french cuisine are: '+str(number)) 
final_french_list

#create a new dataframe consisting of french ingredients only
french_ingredients_df = pd.DataFrame(final_french_list)

#convert to csv file
french_ingredients_df.to_csv('french_ingredients.csv',index=False, encoding='utf-8')
french_ingredients_df = pd.read_csv('french_ingredients.csv')
french_ingredients_df.columns=['Ingredients'] #name empty columns
french_ingredients_df['Ingredients'] = french_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
french_ingredients_df=french_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
french_ingredients_df=french_ingredients_df.reset_index(drop=True) 
french_ingredients_df.to_csv('french_ingredients.csv', index = True, encoding='utf-8')
french_ingredients_df = pd.read_csv('french_ingredients.csv')

#rename the index column
french_ingredients_df.columns=['Index','Ingredients']  
french_ingredients_df.to_csv('french_ingredients.csv', index = False, encoding='utf-8')
french_ingredients_df


Number of unique ingredients in french cuisine are: 2106


Unnamed: 0,Index,Ingredients
0,0,"""i cant believe its not butter!® spread"""
1,1,"""piment despelette"""
2,2,"""soft goats cheese"""
3,3,( oz.) tomato sauce
4,4,1% low-fat chocolate milk
...,...,...
2101,2101,Young leeks
2102,2102,Yukon gold
2103,2103,Yukon gold potatoes
2104,2104,Zinfandel


In [31]:
#test with indian:find out all the ingredients in a indian cuisine 
indian_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'indian':
        indian_list = indian_list+data_csv.Ingredients[x]+', '
indian_list

#change the string of ingredients in indian into a list instead
indian_list_non_unique = list(indian_list.split(", "))
indian_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
indian_set = set(indian_list_non_unique)
final_indian_list = list(indian_set)
final_indian_list

final_indian_list.remove('') #the first data point is removed
number = len(final_indian_list)
print('Number of unique ingredients in indian cuisine are: '+str(number)) 
final_indian_list

#create a new dataframe consisting of indian ingredients only
indian_ingredients_df = pd.DataFrame(final_indian_list)

#convert to csv file
indian_ingredients_df.to_csv('indian_ingredients.csv',index=False, encoding='utf-8')
indian_ingredients_df = pd.read_csv('indian_ingredients.csv')
indian_ingredients_df.columns=['Ingredients'] #name empty columns
indian_ingredients_df['Ingredients'] = indian_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
indian_ingredients_df=indian_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
indian_ingredients_df=indian_ingredients_df.reset_index(drop=True) 
indian_ingredients_df.to_csv('indian_ingredients.csv', index = True, encoding='utf-8')
indian_ingredients_df = pd.read_csv('indian_ingredients.csv')

#rename the index column
indian_ingredients_df.columns=['Index','Ingredients']  
indian_ingredients_df.to_csv('indian_ingredients.csv', index = False, encoding='utf-8')
indian_ingredients_df


Number of unique ingredients in indian cuisine are: 1667


Unnamed: 0,Index,Ingredients
0,0,"""quorn chikn tenders"""
1,1,( oz.) tomato paste
2,2,(14 oz.) sweetened condensed milk
3,3,1% low-fat milk
4,4,2% reduced-fat milk
...,...,...
1662,1662,Yoghurt natural low fat
1663,1663,Yogurt cheese
1664,1664,Yogurt dressing
1665,1665,Yukon gold potatoes


In [32]:
#test with irish:find out all the ingredients in a irish cuisine 
irish_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'irish':
        irish_list = irish_list+data_csv.Ingredients[x]+', '
irish_list

#change the string of ingredients in irish into a list instead
irish_list_non_unique = list(irish_list.split(", "))
irish_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
irish_set = set(irish_list_non_unique)
final_irish_list = list(irish_set)
final_irish_list

final_irish_list.remove('') #the first data point is removed
number = len(final_irish_list)
print('Number of unique ingredients in irish cuisine are: '+str(number)) 
final_irish_list

#create a new dataframe consisting of irish ingredients only
irish_ingredients_df = pd.DataFrame(final_irish_list)

#convert to csv file
irish_ingredients_df.to_csv('irish_ingredients.csv',index=False, encoding='utf-8')
irish_ingredients_df = pd.read_csv('irish_ingredients.csv')
irish_ingredients_df.columns=['Ingredients'] #name empty columns
irish_ingredients_df['Ingredients'] = irish_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
irish_ingredients_df=irish_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
irish_ingredients_df=irish_ingredients_df.reset_index(drop=True) 
irish_ingredients_df.to_csv('irish_ingredients.csv', index = True, encoding='utf-8')
irish_ingredients_df = pd.read_csv('irish_ingredients.csv')

#rename the index column
irish_ingredients_df.columns=['Index','Ingredients']  
irish_ingredients_df.to_csv('irish_ingredients.csv', index = False, encoding='utf-8')
irish_ingredients_df


Number of unique ingredients in irish cuisine are: 1002


Unnamed: 0,Index,Ingredients
0,0,"""hellmann or best food real mayonnais"""
1,1,"""i cant believe its not butter!® spread"""
2,2,"""m&ms candy"""
3,3,"""pigs trotters"""
4,4,"""soft goats cheese"""
...,...,...
997,997,Yellow peppers
998,998,Yellow squash
999,999,Young nettle
1000,1000,Yukon gold potatoes


In [33]:
#test with italian:find out all the ingredients in a italian cuisine 
italian_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'italian':
        italian_list = italian_list+data_csv.Ingredients[x]+', '
italian_list

#change the string of ingredients in italian into a list instead
italian_list_non_unique = list(italian_list.split(", "))
italian_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
italian_set = set(italian_list_non_unique)
final_italian_list = list(italian_set)
final_italian_list

final_italian_list.remove('') #the first data point is removed
number = len(final_italian_list)
print('Number of unique ingredients in italian cuisine are: '+str(number)) 
final_italian_list

#create a new dataframe consisting of italian ingredients only
italian_ingredients_df = pd.DataFrame(final_italian_list)

#convert to csv file
italian_ingredients_df.to_csv('italian_ingredients.csv',index=False, encoding='utf-8')
italian_ingredients_df = pd.read_csv('italian_ingredients.csv')
italian_ingredients_df.columns=['Ingredients'] #name empty columns
italian_ingredients_df['Ingredients'] = italian_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
italian_ingredients_df=italian_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
italian_ingredients_df=italian_ingredients_df.reset_index(drop=True) 
italian_ingredients_df.to_csv('italian_ingredients.csv', index = True, encoding='utf-8')
italian_ingredients_df = pd.read_csv('italian_ingredients.csv')

#rename the index column
italian_ingredients_df.columns=['Index','Ingredients']  
italian_ingredients_df.to_csv('italian_ingredients.csv', index = False, encoding='utf-8')
italian_ingredients_df


Number of unique ingredients in italian cuisine are: 2936


Unnamed: 0,Index,Ingredients
0,0,"""campbells condensed cream of mushroom soup"""
1,1,"""hellmann or best food light mayonnais"""
2,2,"""hellmann or best food real mayonnais"""
3,3,"""hellmanns dijonnaise creamy dijon mustard"""
4,4,"""i cant believe its not butter!® all purpose s..."
...,...,...
2931,2931,Zesty italian dressing
2932,2932,Zinfandel
2933,2933,Ziti
2934,2934,Zucchini


In [34]:
#test with jamaican:find out all the ingredients in a jamaican cuisine 
jamaican_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'jamaican':
        jamaican_list = jamaican_list+data_csv.Ingredients[x]+', '
jamaican_list

#change the string of ingredients in jamaican into a list instead
jamaican_list_non_unique = list(jamaican_list.split(", "))
jamaican_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
jamaican_set = set(jamaican_list_non_unique)
final_jamaican_list = list(jamaican_set)
final_jamaican_list

final_jamaican_list.remove('') #the first data point is removed
number = len(final_jamaican_list)
print('Number of unique ingredients in jamaican cuisine are: '+str(number)) 
final_jamaican_list

#create a new dataframe consisting of jamaican ingredients only
jamaican_ingredients_df = pd.DataFrame(final_jamaican_list)

#convert to csv file
jamaican_ingredients_df.to_csv('jamaican_ingredients.csv',index=False, encoding='utf-8')
jamaican_ingredients_df = pd.read_csv('jamaican_ingredients.csv')
jamaican_ingredients_df.columns=['Ingredients'] #name empty columns
jamaican_ingredients_df['Ingredients'] = jamaican_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
jamaican_ingredients_df=jamaican_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
jamaican_ingredients_df=jamaican_ingredients_df.reset_index(drop=True) 
jamaican_ingredients_df.to_csv('jamaican_ingredients.csv', index = True, encoding='utf-8')
jamaican_ingredients_df = pd.read_csv('jamaican_ingredients.csv')

#rename the index column
jamaican_ingredients_df.columns=['Index','Ingredients']  
jamaican_ingredients_df.to_csv('jamaican_ingredients.csv', index = False, encoding='utf-8')
jamaican_ingredients_df


Number of unique ingredients in jamaican cuisine are: 878


Unnamed: 0,Index,Ingredients
0,0,"""hellmann or best food light mayonnais"""
1,1,Ackee
2,2,Active dry yeast
3,3,Adobo sauce
4,4,Adobo seasoning
...,...,...
873,873,Yoghurt
874,874,Yuca
875,875,Yucca
876,876,Yukon gold potatoes


In [35]:
#test with japanese:find out all the ingredients in a japanese cuisine 
japanese_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'japanese':
        japanese_list = japanese_list+data_csv.Ingredients[x]+', '
japanese_list

#change the string of ingredients in japanese into a list instead
japanese_list_non_unique = list(japanese_list.split(", "))
japanese_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
japanese_set = set(japanese_list_non_unique)
final_japanese_list = list(japanese_set)
final_japanese_list

final_japanese_list.remove('') #the first data point is removed
number = len(final_japanese_list)
print('Number of unique ingredients in japanese cuisine are: '+str(number)) 
final_japanese_list

#create a new dataframe consisting of japanese ingredients only
japanese_ingredients_df = pd.DataFrame(final_japanese_list)

#convert to csv file
japanese_ingredients_df.to_csv('japanese_ingredients.csv',index=False, encoding='utf-8')
japanese_ingredients_df = pd.read_csv('japanese_ingredients.csv')
japanese_ingredients_df.columns=['Ingredients'] #name empty columns
japanese_ingredients_df['Ingredients'] = japanese_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
japanese_ingredients_df=japanese_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
japanese_ingredients_df=japanese_ingredients_df.reset_index(drop=True) 
japanese_ingredients_df.to_csv('japanese_ingredients.csv', index = True, encoding='utf-8')
japanese_ingredients_df = pd.read_csv('japanese_ingredients.csv')

#rename the index column
japanese_ingredients_df.columns=['Index','Ingredients']  
japanese_ingredients_df.to_csv('japanese_ingredients.csv', index = False, encoding='utf-8')
japanese_ingredients_df


Number of unique ingredients in japanese cuisine are: 1442


Unnamed: 0,Index,Ingredients
0,0,1% low-fat milk
1,1,2% reduced-fat milk
2,2,A taste of thai rice noodles
3,3,Abura age
4,4,Active dry yeast
...,...,...
1437,1437,Yuzu
1438,1438,Yuzu juice
1439,1439,Yuzukosho
1440,1440,Zest


In [36]:
#test with korean:find out all the ingredients in a korean cuisine 
korean_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'korean':
        korean_list = korean_list+data_csv.Ingredients[x]+', '
korean_list

#change the string of ingredients in korean into a list instead
korean_list_non_unique = list(korean_list.split(", "))
korean_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
korean_set = set(korean_list_non_unique)
final_korean_list = list(korean_set)
final_korean_list

final_korean_list.remove('') #the first data point is removed
number = len(final_korean_list)
print('Number of unique ingredients in korean cuisine are: '+str(number)) 
final_korean_list

#create a new dataframe consisting of korean ingredients only
korean_ingredients_df = pd.DataFrame(final_korean_list)

#convert to csv file
korean_ingredients_df.to_csv('korean_ingredients.csv',index=False, encoding='utf-8')
korean_ingredients_df = pd.read_csv('korean_ingredients.csv')
korean_ingredients_df.columns=['Ingredients'] #name empty columns
korean_ingredients_df['Ingredients'] = korean_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
korean_ingredients_df=korean_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
korean_ingredients_df=korean_ingredients_df.reset_index(drop=True) 
korean_ingredients_df.to_csv('korean_ingredients.csv', index = True, encoding='utf-8')
korean_ingredients_df = pd.read_csv('korean_ingredients.csv')

#rename the index column
korean_ingredients_df.columns=['Index','Ingredients']  
korean_ingredients_df.to_csv('korean_ingredients.csv', index = False, encoding='utf-8')
korean_ingredients_df


Number of unique ingredients in korean cuisine are: 898


Unnamed: 0,Index,Ingredients
0,0,"""franks® redhot® original cayenne pepper sauce"""
1,1,Agave nectar
2,2,All purpose unbleached flour
3,3,All-purpose flour
4,4,Anchovies
...,...,...
893,893,Yellow peppers
894,894,Yellow squash
895,895,Yellow summer squash
896,896,Yellowfin tuna


In [37]:
#test with mexican:find out all the ingredients in a mexican cuisine 
mexican_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'mexican':
        mexican_list = mexican_list+data_csv.Ingredients[x]+', '
mexican_list

#change the string of ingredients in mexican into a list instead
mexican_list_non_unique = list(mexican_list.split(", "))
mexican_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
mexican_set = set(mexican_list_non_unique)
final_mexican_list = list(mexican_set)
final_mexican_list

final_mexican_list.remove('') #the first data point is removed
number = len(final_mexican_list)
print('Number of unique ingredients in mexican cuisine are: '+str(number)) 
final_mexican_list

#create a new dataframe consisting of mexican ingredients only
mexican_ingredients_df = pd.DataFrame(final_mexican_list)

#convert to csv file
mexican_ingredients_df.to_csv('mexican_ingredients.csv',index=False, encoding='utf-8')
mexican_ingredients_df = pd.read_csv('mexican_ingredients.csv')
mexican_ingredients_df.columns=['Ingredients'] #name empty columns
mexican_ingredients_df['Ingredients'] = mexican_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
mexican_ingredients_df=mexican_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
mexican_ingredients_df=mexican_ingredients_df.reset_index(drop=True) 
mexican_ingredients_df.to_csv('mexican_ingredients.csv', index = True, encoding='utf-8')
mexican_ingredients_df = pd.read_csv('mexican_ingredients.csv')

#rename the index column
mexican_ingredients_df.columns=['Index','Ingredients']  
mexican_ingredients_df.to_csv('mexican_ingredients.csv', index = False, encoding='utf-8')
mexican_ingredients_df


Number of unique ingredients in mexican cuisine are: 2695


Unnamed: 0,Index,Ingredients
0,0,"""best foods mayonnaise with lime juice"""
1,1,"""breakstones sour cream"""
2,2,"""campbells condensed cheddar cheese soup"""
3,3,"""campbells condensed cream of chicken soup"""
4,4,"""campbells condensed cream of mushroom soup"""
...,...,...
2690,2690,Yukon gold potatoes
2691,2691,Zest
2692,2692,Zesty italian dressing
2693,2693,Zucchini


In [38]:
#test with moroccan:find out all the ingredients in a moroccan cuisine 
moroccan_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'moroccan':
        moroccan_list = moroccan_list+data_csv.Ingredients[x]+', '
moroccan_list

#change the string of ingredients in moroccan into a list instead
moroccan_list_non_unique = list(moroccan_list.split(", "))
moroccan_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
moroccan_set = set(moroccan_list_non_unique)
final_moroccan_list = list(moroccan_set)
final_moroccan_list

final_moroccan_list.remove('') #the first data point is removed
number = len(final_moroccan_list)
print('Number of unique ingredients in moroccan cuisine are: '+str(number)) 
final_moroccan_list

#create a new dataframe consisting of moroccan ingredients only
moroccan_ingredients_df = pd.DataFrame(final_moroccan_list)

#convert to csv file
moroccan_ingredients_df.to_csv('moroccan_ingredients.csv',index=False, encoding='utf-8')
moroccan_ingredients_df = pd.read_csv('moroccan_ingredients.csv')
moroccan_ingredients_df.columns=['Ingredients'] #name empty columns
moroccan_ingredients_df['Ingredients'] = moroccan_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
moroccan_ingredients_df=moroccan_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
moroccan_ingredients_df=moroccan_ingredients_df.reset_index(drop=True) 
moroccan_ingredients_df.to_csv('moroccan_ingredients.csv', index = True, encoding='utf-8')
moroccan_ingredients_df = pd.read_csv('moroccan_ingredients.csv')

#rename the index column
moroccan_ingredients_df.columns=['Index','Ingredients']  
moroccan_ingredients_df.to_csv('moroccan_ingredients.csv', index = False, encoding='utf-8')
moroccan_ingredients_df


Number of unique ingredients in moroccan cuisine are: 975


Unnamed: 0,Index,Ingredients
0,0,"""i cant believ it not butter! made with olive ..."
1,1,"""piment despelette"""
2,2,1% low-fat milk
3,3,Acorn squash
4,4,Active dry yeast
...,...,...
970,970,Yellow peppers
971,971,Yellow squash
972,972,Yoghurt
973,973,Yukon gold potatoes


In [39]:
#test with russian:find out all the ingredients in a russian cuisine 
russian_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'russian':
        russian_list = russian_list+data_csv.Ingredients[x]+', '
russian_list

#change the string of ingredients in russian into a list instead
russian_list_non_unique = list(russian_list.split(", "))
russian_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
russian_set = set(russian_list_non_unique)
final_russian_list = list(russian_set)
final_russian_list

final_russian_list.remove('') #the first data point is removed
number = len(final_russian_list)
print('Number of unique ingredients in russian cuisine are: '+str(number)) 
final_russian_list

#create a new dataframe consisting of russian ingredients only
russian_ingredients_df = pd.DataFrame(final_russian_list)

#convert to csv file
russian_ingredients_df.to_csv('russian_ingredients.csv',index=False, encoding='utf-8')
russian_ingredients_df = pd.read_csv('russian_ingredients.csv')
russian_ingredients_df.columns=['Ingredients'] #name empty columns
russian_ingredients_df['Ingredients'] = russian_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
russian_ingredients_df=russian_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
russian_ingredients_df=russian_ingredients_df.reset_index(drop=True) 
russian_ingredients_df.to_csv('russian_ingredients.csv', index = True, encoding='utf-8')
russian_ingredients_df = pd.read_csv('russian_ingredients.csv')

#rename the index column
russian_ingredients_df.columns=['Index','Ingredients']  
russian_ingredients_df.to_csv('russian_ingredients.csv', index = False, encoding='utf-8')
russian_ingredients_df


Number of unique ingredients in russian cuisine are: 872


Unnamed: 0,Index,Ingredients
0,0,"""hellmanns® real mayonnaise"""
1,1,"""soft goats cheese"""
2,2,1% low-fat milk
3,3,2% reduced-fat milk
4,4,Active dry yeast
...,...,...
867,867,Xanthan gum
868,868,Yeast
869,869,Yellow onion
870,870,Yukon gold potatoes


In [40]:
#test with southern_us:find out all the ingredients in a southern_us cuisine 
southern_us_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'southern_us':
        southern_us_list = southern_us_list+data_csv.Ingredients[x]+', '
southern_us_list

#change the string of ingredients in southern_us into a list instead
southern_us_list_non_unique = list(southern_us_list.split(", "))
southern_us_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
southern_us_set = set(southern_us_list_non_unique)
final_southern_us_list = list(southern_us_set)
final_southern_us_list

final_southern_us_list.remove('') #the first data point is removed
number = len(final_southern_us_list)
print('Number of unique ingredients in southern_us cuisine are: '+str(number)) 
final_southern_us_list

#create a new dataframe consisting of southern_us ingredients only
southern_us_ingredients_df = pd.DataFrame(final_southern_us_list)

#convert to csv file
southern_us_ingredients_df.to_csv('southern_us_ingredients.csv',index=False, encoding='utf-8')
southern_us_ingredients_df = pd.read_csv('southern_us_ingredients.csv')
southern_us_ingredients_df.columns=['Ingredients'] #name empty columns
southern_us_ingredients_df['Ingredients'] = southern_us_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
southern_us_ingredients_df=southern_us_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
southern_us_ingredients_df=southern_us_ingredients_df.reset_index(drop=True) 
southern_us_ingredients_df.to_csv('southern_us_ingredients.csv', index = True, encoding='utf-8')
southern_us_ingredients_df = pd.read_csv('southern_us_ingredients.csv')

#rename the index column
southern_us_ingredients_df.columns=['Index','Ingredients']  
southern_us_ingredients_df.to_csv('southern_us_ingredients.csv', index = False, encoding='utf-8')
southern_us_ingredients_df


Number of unique ingredients in southern_us cuisine are: 2468


Unnamed: 0,Index,Ingredients
0,0,"""devils food cake mix"""
1,1,"""egglands best® eggs"""
2,2,"""franks® redhot® original cayenne pepper sauce"""
3,3,"""hellmann or best food real mayonnais"""
4,4,"""hellmanns® real mayonnaise"""
...,...,...
2463,2463,Yoplait
2464,2464,Yukon gold potatoes
2465,2465,Zest
2466,2466,Zesty italian dressing


In [41]:
#test with spanish:find out all the ingredients in a spanish cuisine 
spanish_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'spanish':
        spanish_list = spanish_list+data_csv.Ingredients[x]+', '
spanish_list

#change the string of ingredients in spanish into a list instead
spanish_list_non_unique = list(spanish_list.split(", "))
spanish_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
spanish_set = set(spanish_list_non_unique)
final_spanish_list = list(spanish_set)
final_spanish_list

final_spanish_list.remove('') #the first data point is removed
number = len(final_spanish_list)
print('Number of unique ingredients in spanish cuisine are: '+str(number)) 
final_spanish_list

#create a new dataframe consisting of spanish ingredients only
spanish_ingredients_df = pd.DataFrame(final_spanish_list)

#convert to csv file
spanish_ingredients_df.to_csv('spanish_ingredients.csv',index=False, encoding='utf-8')
spanish_ingredients_df = pd.read_csv('spanish_ingredients.csv')
spanish_ingredients_df.columns=['Ingredients'] #name empty columns
spanish_ingredients_df['Ingredients'] = spanish_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
spanish_ingredients_df=spanish_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
spanish_ingredients_df=spanish_ingredients_df.reset_index(drop=True) 
spanish_ingredients_df.to_csv('spanish_ingredients.csv', index = True, encoding='utf-8')
spanish_ingredients_df = pd.read_csv('spanish_ingredients.csv')

#rename the index column
spanish_ingredients_df.columns=['Index','Ingredients']  
spanish_ingredients_df.to_csv('spanish_ingredients.csv', index = False, encoding='utf-8')
spanish_ingredients_df


Number of unique ingredients in spanish cuisine are: 1268


Unnamed: 0,Index,Ingredients
0,0,"""piment despelette"""
1,1,"""soft goats cheese"""
2,2,1% low-fat milk
3,3,Active dry yeast
4,4,Adobo
...,...,...
1263,1263,Yolk
1264,1264,Yukon gold
1265,1265,Yukon gold potatoes
1266,1266,Zinfandel


In [42]:
#test with thai:find out all the ingredients in a thai cuisine 
thai_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'thai':
        thai_list = thai_list+data_csv.Ingredients[x]+', '
thai_list

#change the string of ingredients in thai into a list instead
thai_list_non_unique = list(thai_list.split(", "))
thai_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
thai_set = set(thai_list_non_unique)
final_thai_list = list(thai_set)
final_thai_list

final_thai_list.remove('') #the first data point is removed
number = len(final_thai_list)
print('Number of unique ingredients in thai cuisine are: '+str(number)) 
final_thai_list

#create a new dataframe consisting of thai ingredients only
thai_ingredients_df = pd.DataFrame(final_thai_list)

#convert to csv file
thai_ingredients_df.to_csv('thai_ingredients.csv',index=False, encoding='utf-8')
thai_ingredients_df = pd.read_csv('thai_ingredients.csv')
thai_ingredients_df.columns=['Ingredients'] #name empty columns
thai_ingredients_df['Ingredients'] = thai_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
thai_ingredients_df=thai_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
thai_ingredients_df=thai_ingredients_df.reset_index(drop=True) 
thai_ingredients_df.to_csv('thai_ingredients.csv', index = True, encoding='utf-8')
thai_ingredients_df = pd.read_csv('thai_ingredients.csv')

#rename the index column
thai_ingredients_df.columns=['Index','Ingredients']  
thai_ingredients_df.to_csv('thai_ingredients.csv', index = False, encoding='utf-8')
thai_ingredients_df


Number of unique ingredients in thai cuisine are: 1380


Unnamed: 0,Index,Ingredients
0,0,"""hellmann or best food real mayonnais"""
1,1,"""hellmanns® real mayonnaise"""
2,2,"""i cant believe its not butter!® spread"""
3,3,( oz.) tomato sauce
4,4,Acorn squash
...,...,...
1375,1375,Yellow squash
1376,1376,Yoghurt
1377,1377,Young coconut meat
1378,1378,Yukon gold potatoes


In [43]:
#test with vietnamese:find out all the ingredients in a vietnamese cuisine 
vietnamese_list = "" 
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'vietnamese':
        vietnamese_list = vietnamese_list+data_csv.Ingredients[x]+', '
vietnamese_list

#change the string of ingredients in vietnamese into a list instead
vietnamese_list_non_unique = list(vietnamese_list.split(", "))
vietnamese_list_non_unique

#change the list of ingredients into a set because set cannot have duplicate values
vietnamese_set = set(vietnamese_list_non_unique)
final_vietnamese_list = list(vietnamese_set)
final_vietnamese_list

final_vietnamese_list.remove('') #the first data point is removed
number = len(final_vietnamese_list)
print('Number of unique ingredients in vietnamese cuisine are: '+str(number)) 
final_vietnamese_list

#create a new dataframe consisting of vietnamese ingredients only
vietnamese_ingredients_df = pd.DataFrame(final_vietnamese_list)

#convert to csv file
vietnamese_ingredients_df.to_csv('vietnamese_ingredients.csv',index=False, encoding='utf-8')
vietnamese_ingredients_df = pd.read_csv('vietnamese_ingredients.csv')
vietnamese_ingredients_df.columns=['Ingredients'] #name empty columns
vietnamese_ingredients_df['Ingredients'] = vietnamese_ingredients_df['Ingredients'].str.capitalize() #make nicer, caps first letter of each word
vietnamese_ingredients_df=vietnamese_ingredients_df.sort_values(by="Ingredients")

#drop the old index and relabel the index column, starting from 0 onwards
vietnamese_ingredients_df=vietnamese_ingredients_df.reset_index(drop=True) 
vietnamese_ingredients_df.to_csv('vietnamese_ingredients.csv', index = True, encoding='utf-8')
vietnamese_ingredients_df = pd.read_csv('vietnamese_ingredients.csv')

#rename the index column
vietnamese_ingredients_df.columns=['Index','Ingredients']  
vietnamese_ingredients_df.to_csv('vietnamese_ingredients.csv', index = False, encoding='utf-8')
vietnamese_ingredients_df


Number of unique ingredients in vietnamese cuisine are: 1111


Unnamed: 0,Index,Ingredients
0,0,"""franks® redhot® original cayenne pepper sauce"""
1,1,"""hellmann or best food real mayonnais"""
2,2,Acorn squash
3,3,Active dry yeast
4,4,Adzuki beans
...,...,...
1106,1106,Yellow peppers
1107,1107,Yellow rock sugar
1108,1108,Yellow squash
1109,1109,Yukon gold potatoes


We now have finished all 20 csv files which contains all the number of unique ingredients in each cuisine. Next, we are going to count the number of frequency of those ingredients in each cuisine. We will start with the counts of the total ingredients for all cuisine.

In [44]:
ingredient_list_testing = []
ingredient_list_testing = ingredient_list
ingredient_list_testing = list(ingredient_list_testing.split(", "))
ingredient_list_testing

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles',
 'plain flour',
 'ground pepper',
 'salt',
 'tomatoes',
 'ground black pepper',
 'thyme',
 'eggs',
 'green tomatoes',
 'yellow corn meal',
 'milk',
 'vegetable oil',
 'eggs',
 'pepper',
 'salt',
 'mayonaise',
 'cooking oil',
 'green chilies',
 'grilled chicken breasts',
 'garlic powder',
 'yellow onion',
 'soy sauce',
 'butter',
 'chicken livers',
 'water',
 'vegetable oil',
 'wheat',
 'salt',
 'black pepper',
 'shallots',
 'cornflour',
 'cayenne pepper',
 'onions',
 'garlic paste',
 'milk',
 'butter',
 'salt',
 'lemon juice',
 'water',
 'chili powder',
 'passata',
 'oil',
 'ground cumin',
 'boneless chicken skinless thigh',
 'garam masala',
 'double cream',
 'natural yogurt',
 'bay leaf',
 'plain flour',
 'sugar',
 'butter',
 'eggs',
 'fresh ginger root',
 'salt',
 'ground cinnamon',
 'milk',
 'vanilla extract',
 'ground ginger',
 'po

In [45]:
#count number of appearance of each ingredient (non unique)
count = {} 
for item in ingredient_list_testing:
    if (item in count):
        count[item] += 1
    else:
        count[item] = 1    
count

{'romaine lettuce': 270,
 'black olives': 229,
 'grape tomatoes': 228,
 'garlic': 7380,
 'pepper': 4438,
 'purple onion': 1896,
 'seasoning': 137,
 'garbanzo beans': 148,
 'feta cheese crumbles': 358,
 'plain flour': 154,
 'ground pepper': 385,
 'salt': 18049,
 'tomatoes': 3058,
 'ground black pepper': 4785,
 'thyme': 361,
 'eggs': 3388,
 'green tomatoes': 108,
 'yellow corn meal': 341,
 'milk': 2263,
 'vegetable oil': 4385,
 'mayonaise': 781,
 'cooking oil': 483,
 'green chilies': 768,
 'grilled chicken breasts': 5,
 'garlic powder': 1442,
 'yellow onion': 1184,
 'soy sauce': 3296,
 'butter': 4848,
 'chicken livers': 65,
 'water': 7457,
 'wheat': 26,
 'black pepper': 2627,
 'shallots': 1477,
 'cornflour': 103,
 'cayenne pepper': 1523,
 'onions': 7972,
 'garlic paste': 282,
 'lemon juice': 1395,
 'chili powder': 2036,
 'passata': 24,
 'oil': 1970,
 'ground cumin': 2747,
 'boneless chicken skinless thigh': 343,
 'garam masala': 925,
 'double cream': 40,
 'natural yogurt': 18,
 'bay leaf

In [46]:
#write frequency of occurrences of ingredients to a csv file
import csv

with open('ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for key in count:
        writer.writerow({'Ingredient': key, 'Count': count[key]})
        
count_df = pd.read_csv('ingredients_count.csv')
count_df =count_df.dropna() #drop any null entries
count_df.to_csv('ingredients_count.csv', encoding = 'utf-8', index = False)
count_df = pd.read_csv('ingredients_count.csv')
count_df

Unnamed: 0,Ingredient,Count
0,romaine lettuce,270
1,black olives,229
2,grape tomatoes,228
3,garlic,7380
4,pepper,4438
...,...,...
6719,Lipton® Iced Tea Brew Family Size Tea Bags,1
6720,Hidden Valley® Greek Yogurt Original Ranch® Di...,1
6721,lop chong,1
6722,tomato garlic pasta sauce,1


In [47]:
#obtain all the ingredients from greek cuisine
greek_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'greek':
        greek_ingredient_str=greek_ingredient_str + data_csv.Ingredients[x]+', '
greek_ingredient_str        

'romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles, ground pork, finely chopped fresh parsley, onions, salt, vinegar, caul fat, minced garlic, dried oregano, red wine vinegar, olive oil, boneless chop pork, lemon juice, orange, anise, cinnamon sticks, unflavored gelatin, zinfandel, orange blossom honey, sugar, lemon, calimyrna figs, clove, honey, whipping cream, plain whole-milk yogurt, fresh dill, yoghurt, salt, myzithra, large eggs, cheese, feta cheese, phyllo, kefalotyri, ground black pepper, extra-virgin olive oil, onions, olive oil, salt, hamburger buns, paprika, chopped fresh mint, ground cinnamon, balsamic vinegar, feta cheese crumbles, baby spinach leaves, purple onion, ground lamb, pepper, dried mint flakes, salt, dried oregano, tomatoes, ground black pepper, garlic, dried dillweed, olive oil, red wine, lamb, plain yogurt, pita bread rounds, purple onion, cucumber, garbanzo beans, liquid, black pepper, 

In [48]:
#count number of appearance of each ingredient in greek cuisine (non unique)
greek_ingredient_list = list(greek_ingredient_str.split(', '))
count_greek_ingredient = {} 
for item in greek_ingredient_list:
    if (item in count_greek_ingredient):
        count_greek_ingredient[item] += 1
    else:
        count_greek_ingredient[item] = 1    
count_greek_ingredient

{'romaine lettuce': 39,
 'black olives': 31,
 'grape tomatoes': 26,
 'garlic': 216,
 'pepper': 203,
 'purple onion': 186,
 'seasoning': 3,
 'garbanzo beans': 23,
 'feta cheese crumbles': 252,
 'ground pork': 4,
 'finely chopped fresh parsley': 8,
 'onions': 185,
 'salt': 572,
 'vinegar': 3,
 'caul fat': 1,
 'minced garlic': 68,
 'dried oregano': 267,
 'red wine vinegar': 99,
 'olive oil': 504,
 'boneless chop pork': 1,
 'lemon juice': 183,
 'orange': 12,
 'anise': 5,
 'cinnamon sticks': 31,
 'unflavored gelatin': 2,
 'zinfandel': 2,
 'orange blossom honey': 3,
 'sugar': 77,
 'lemon': 129,
 'calimyrna figs': 1,
 'clove': 14,
 'honey': 67,
 'whipping cream': 2,
 'plain whole-milk yogurt': 5,
 'fresh dill': 108,
 'yoghurt': 10,
 'myzithra': 2,
 'large eggs': 72,
 'cheese': 15,
 'feta cheese': 191,
 'phyllo': 12,
 'kefalotyri': 9,
 'ground black pepper': 221,
 'extra-virgin olive oil': 229,
 'hamburger buns': 6,
 'paprika': 20,
 'chopped fresh mint': 68,
 'ground cinnamon': 73,
 'balsamic 

In [49]:
with open('greek_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_greek_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_greek_ingredient[key]})

In [50]:
greek_ingredients_count_df = pd.read_csv('greek_ingredients_count.csv')
greek_ingredients_count_df = greek_ingredients_count_df.dropna()
greek_ingredients_count_df['Ingredient'] = greek_ingredients_count_df['Ingredient'].str.capitalize()
greek_ingredients_count_df = greek_ingredients_count_df.sort_values(by="Ingredient")
greek_ingredients_count_df = greek_ingredients_count_df.reset_index(drop=True)
greek_ingredients_count_df.to_csv('greek_ingredients_count.csv', index = True, encoding='utf-8')
greek_ingredients_count_df = pd.read_csv('greek_ingredients_count.csv')
greek_ingredients_count_df.columns=['Index','Ingredient','Count']
greek_ingredients_count_df.to_csv('greek_ingredients_count.csv', index = False, encoding='utf-8')
greek_ingredients_count_df

Unnamed: 0,Index,Ingredient,Count
0,0,"""hellmann or best food light mayonnais""",1
1,1,"""hellmann or best food real mayonnais""",1
2,2,"""i cant believ it not butter! made with olive ...",1
3,3,"""uncle bens original converted brand rice""",1
4,4,(10 oz.) frozen chopped spinach,1
...,...,...,...
1198,1198,Zest,2
1199,1199,Zesty italian dressing,1
1200,1200,Zinfandel,2
1201,1201,Ziti,3


Next,we continue the same steps with the other 19 cuisines.

In [51]:
#obtain all the ingredients from brazilian cuisine
brazilian_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'brazilian':
        brazilian_ingredient_str=brazilian_ingredient_str + data_csv.Ingredients[x]+', '
brazilian_ingredient_str 

#count number of appearance of each ingredient in brazilian cuisine (non unique)
brazilian_ingredient_list = list(brazilian_ingredient_str.split(', '))
count_brazilian_ingredient = {} 
for item in brazilian_ingredient_list:
    if (item in count_brazilian_ingredient):
        count_brazilian_ingredient[item] += 1
    else:
        count_brazilian_ingredient[item] = 1    
count_brazilian_ingredient

with open('brazilian_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_brazilian_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_brazilian_ingredient[key]})
        
brazilian_ingredients_count_df = pd.read_csv('brazilian_ingredients_count.csv')
brazilian_ingredients_count_df = brazilian_ingredients_count_df.dropna()
brazilian_ingredients_count_df['Ingredient'] = brazilian_ingredients_count_df['Ingredient'].str.capitalize()
brazilian_ingredients_count_df = brazilian_ingredients_count_df.sort_values(by="Ingredient")
brazilian_ingredients_count_df = brazilian_ingredients_count_df.reset_index(drop=True)
brazilian_ingredients_count_df.to_csv('brazilian_ingredients_count.csv', index = True, encoding='utf-8')
brazilian_ingredients_count_df = pd.read_csv('brazilian_ingredients_count.csv')
brazilian_ingredients_count_df.columns=['Index','Ingredient','Count']
brazilian_ingredients_count_df.to_csv('brazilian_ingredients_count.csv', index = False, encoding='utf-8')
brazilian_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,1% low-fat milk,1
1,1,2% reduced-fat milk,1
2,2,Acai juice,1
3,3,Active dry yeast,3
4,4,Adobo style seasoning,1
...,...,...,...
848,848,Yuca,1
849,849,Yucca,1
850,850,Yucca root,1
851,851,Yukon gold potatoes,1


In [52]:
#obtain all the ingredients from british cuisine
british_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'british':
        british_ingredient_str=british_ingredient_str + data_csv.Ingredients[x]+', '
british_ingredient_str 

#count number of appearance of each ingredient in british cuisine (non unique)
british_ingredient_list = list(british_ingredient_str.split(', '))
count_british_ingredient = {} 
for item in british_ingredient_list:
    if (item in count_british_ingredient):
        count_british_ingredient[item] += 1
    else:
        count_british_ingredient[item] = 1    
count_british_ingredient

with open('british_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_british_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_british_ingredient[key]})
        
british_ingredients_count_df = pd.read_csv('british_ingredients_count.csv')
british_ingredients_count_df = british_ingredients_count_df.dropna()
british_ingredients_count_df['Ingredient'] = british_ingredients_count_df['Ingredient'].str.capitalize()
british_ingredients_count_df = british_ingredients_count_df.sort_values(by="Ingredient")
british_ingredients_count_df = british_ingredients_count_df.reset_index(drop=True)
british_ingredients_count_df.to_csv('british_ingredients_count.csv', index = True, encoding='utf-8')
british_ingredients_count_df = pd.read_csv('british_ingredients_count.csv')
british_ingredients_count_df.columns=['Index','Ingredient','Count']
british_ingredients_count_df.to_csv('british_ingredients_count.csv', index = False, encoding='utf-8')
british_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""colmans mustard powder""",1
1,1,"""coxs orange pippin""",1
2,2,2% reduced-fat milk,3
3,3,Active dry yeast,12
4,4,Aged cheddar cheese,1
...,...,...,...
1161,1161,Yellow split peas,1
1162,1162,Yoghurt,2
1163,1163,Yukon gold,1
1164,1164,Yukon gold potatoes,8


In [53]:
#obtain all the ingredients from cajun_creole cuisine
cajun_creole_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'cajun_creole':
        cajun_creole_ingredient_str=cajun_creole_ingredient_str + data_csv.Ingredients[x]+', '
cajun_creole_ingredient_str 

#count number of appearance of each ingredient in cajun_creole cuisine (non unique)
cajun_creole_ingredient_list = list(cajun_creole_ingredient_str.split(', '))
count_cajun_creole_ingredient = {} 
for item in cajun_creole_ingredient_list:
    if (item in count_cajun_creole_ingredient):
        count_cajun_creole_ingredient[item] += 1
    else:
        count_cajun_creole_ingredient[item] = 1    
count_cajun_creole_ingredient

with open('cajun_creole_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_cajun_creole_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_cajun_creole_ingredient[key]})
        
cajun_creole_ingredients_count_df = pd.read_csv('cajun_creole_ingredients_count.csv')
cajun_creole_ingredients_count_df = cajun_creole_ingredients_count_df.dropna()
cajun_creole_ingredients_count_df['Ingredient'] = cajun_creole_ingredients_count_df['Ingredient'].str.capitalize()
cajun_creole_ingredients_count_df = cajun_creole_ingredients_count_df.sort_values(by="Ingredient")
cajun_creole_ingredients_count_df = cajun_creole_ingredients_count_df.reset_index(drop=True)
cajun_creole_ingredients_count_df.to_csv('cajun_creole_ingredients_count.csv', index = True, encoding='utf-8')
cajun_creole_ingredients_count_df = pd.read_csv('cajun_creole_ingredients_count.csv')
cajun_creole_ingredients_count_df.columns=['Index','Ingredient','Count']
cajun_creole_ingredients_count_df.to_csv('cajun_creole_ingredients_count.csv', index = False, encoding='utf-8')
cajun_creole_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""hellmann or best food real mayonnais""",1
1,1,"""hellmanns® real mayonnaise""",1
2,2,"""johnsonville® hot n spicy brats""",1
3,3,"""potatoes obrien""",1
4,4,"""tony chacheres seasoning""",1
...,...,...,...
1575,1575,Yukon gold,1
1576,1576,Yukon gold potatoes,1
1577,1577,Zatarains creole seasoning,1
1578,1578,Zatarain’s jambalaya mix,1


In [54]:
#obtain all the ingredients from chinese cuisine
chinese_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'chinese':
        chinese_ingredient_str=chinese_ingredient_str + data_csv.Ingredients[x]+', '
chinese_ingredient_str 

#count number of appearance of each ingredient in chinese cuisine (non unique)
chinese_ingredient_list = list(chinese_ingredient_str.split(', '))
count_chinese_ingredient = {} 
for item in chinese_ingredient_list:
    if (item in count_chinese_ingredient):
        count_chinese_ingredient[item] += 1
    else:
        count_chinese_ingredient[item] = 1    
count_chinese_ingredient

with open('chinese_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_chinese_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_chinese_ingredient[key]})
        
chinese_ingredients_count_df = pd.read_csv('chinese_ingredients_count.csv')
chinese_ingredients_count_df = chinese_ingredients_count_df.dropna()
chinese_ingredients_count_df['Ingredient'] = chinese_ingredients_count_df['Ingredient'].str.capitalize()
chinese_ingredients_count_df = chinese_ingredients_count_df.sort_values(by="Ingredient")
chinese_ingredients_count_df = chinese_ingredients_count_df.reset_index(drop=True)
chinese_ingredients_count_df.to_csv('chinese_ingredients_count.csv', index = True, encoding='utf-8')
chinese_ingredients_count_df = pd.read_csv('chinese_ingredients_count.csv')
chinese_ingredients_count_df.columns=['Index','Ingredient','Count']
chinese_ingredients_count_df.to_csv('chinese_ingredients_count.csv', index = False, encoding='utf-8')
chinese_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""egglands best® eggs""",1
1,1,"""i cant believe its not butter!® spread""",1
2,2,"""pigs trotters""",1
3,3,33% less sodium smoked fully cooked ham,1
4,4,Abalone,2
...,...,...,...
1791,1791,Yellow squash,3
1792,1792,Yolk,1
1793,1793,Yu choy,3
1794,1794,Zest,1


In [55]:
#obtain all the ingredients from filipino cuisine
filipino_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'filipino':
        filipino_ingredient_str=filipino_ingredient_str + data_csv.Ingredients[x]+', '
filipino_ingredient_str 

#count number of appearance of each ingredient in filipino cuisine (non unique)
filipino_ingredient_list = list(filipino_ingredient_str.split(', '))
count_filipino_ingredient = {} 
for item in filipino_ingredient_list:
    if (item in count_filipino_ingredient):
        count_filipino_ingredient[item] += 1
    else:
        count_filipino_ingredient[item] = 1    
count_filipino_ingredient

with open('filipino_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_filipino_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_filipino_ingredient[key]})
        
filipino_ingredients_count_df = pd.read_csv('filipino_ingredients_count.csv')
filipino_ingredients_count_df = filipino_ingredients_count_df.dropna()
filipino_ingredients_count_df['Ingredient'] = filipino_ingredients_count_df['Ingredient'].str.capitalize()
filipino_ingredients_count_df = filipino_ingredients_count_df.sort_values(by="Ingredient")
filipino_ingredients_count_df = filipino_ingredients_count_df.reset_index(drop=True)
filipino_ingredients_count_df.to_csv('filipino_ingredients_count.csv', index = True, encoding='utf-8')
filipino_ingredients_count_df = pd.read_csv('filipino_ingredients_count.csv')
filipino_ingredients_count_df.columns=['Index','Ingredient','Count']
filipino_ingredients_count_df.to_csv('filipino_ingredients_count.csv', index = False, encoding='utf-8')
filipino_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""franks® redhot® original cayenne pepper sauce""",1
1,1,7 up,2
2,2,Accent seasoning,6
3,3,Achiote powder,3
4,4,Acorn squash,1
...,...,...,...
944,944,Yellow squash,1
945,945,Young coconut meat,1
946,946,Yucca,1
947,947,Yukon gold potatoes,2


In [56]:
#obtain all the ingredients from french cuisine
french_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'french':
        french_ingredient_str=french_ingredient_str + data_csv.Ingredients[x]+', '
french_ingredient_str 

#count number of appearance of each ingredient in french cuisine (non unique)
french_ingredient_list = list(french_ingredient_str.split(', '))
count_french_ingredient = {} 
for item in french_ingredient_list:
    if (item in count_french_ingredient):
        count_french_ingredient[item] += 1
    else:
        count_french_ingredient[item] = 1    
count_french_ingredient

with open('french_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_french_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_french_ingredient[key]})
        
french_ingredients_count_df = pd.read_csv('french_ingredients_count.csv')
french_ingredients_count_df = french_ingredients_count_df.dropna()
french_ingredients_count_df['Ingredient'] = french_ingredients_count_df['Ingredient'].str.capitalize()
french_ingredients_count_df = french_ingredients_count_df.sort_values(by="Ingredient")
french_ingredients_count_df = french_ingredients_count_df.reset_index(drop=True)
french_ingredients_count_df.to_csv('french_ingredients_count.csv', index = True, encoding='utf-8')
french_ingredients_count_df = pd.read_csv('french_ingredients_count.csv')
french_ingredients_count_df.columns=['Index','Ingredient','Count']
french_ingredients_count_df.to_csv('french_ingredients_count.csv', index = False, encoding='utf-8')
french_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""i cant believe its not butter!® spread""",1
1,1,"""piment despelette""",2
2,2,"""soft goats cheese""",5
3,3,( oz.) tomato sauce,1
4,4,1% low-fat chocolate milk,1
...,...,...,...
2101,2101,Young leeks,1
2102,2102,Yukon gold,2
2103,2103,Yukon gold potatoes,41
2104,2104,Zinfandel,5


In [57]:
#obtain all the ingredients from greek cuisine
greek_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'greek':
        greek_ingredient_str=greek_ingredient_str + data_csv.Ingredients[x]+', '
greek_ingredient_str 

#count number of appearance of each ingredient in greek cuisine (non unique)
greek_ingredient_list = list(greek_ingredient_str.split(', '))
count_greek_ingredient = {} 
for item in greek_ingredient_list:
    if (item in count_greek_ingredient):
        count_greek_ingredient[item] += 1
    else:
        count_greek_ingredient[item] = 1    
count_greek_ingredient

with open('greek_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_greek_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_greek_ingredient[key]})
        
greek_ingredients_count_df = pd.read_csv('greek_ingredients_count.csv')
greek_ingredients_count_df = greek_ingredients_count_df.dropna()
greek_ingredients_count_df['Ingredient'] = greek_ingredients_count_df['Ingredient'].str.capitalize()
greek_ingredients_count_df = greek_ingredients_count_df.sort_values(by="Ingredient")
greek_ingredients_count_df = greek_ingredients_count_df.reset_index(drop=True)
greek_ingredients_count_df.to_csv('greek_ingredients_count.csv', index = True, encoding='utf-8')
greek_ingredients_count_df = pd.read_csv('greek_ingredients_count.csv')
greek_ingredients_count_df.columns=['Index','Ingredient','Count']
greek_ingredients_count_df.to_csv('greek_ingredients_count.csv', index = False, encoding='utf-8')
greek_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""hellmann or best food light mayonnais""",1
1,1,"""hellmann or best food real mayonnais""",1
2,2,"""i cant believ it not butter! made with olive ...",1
3,3,"""uncle bens original converted brand rice""",1
4,4,(10 oz.) frozen chopped spinach,1
...,...,...,...
1198,1198,Zest,2
1199,1199,Zesty italian dressing,1
1200,1200,Zinfandel,2
1201,1201,Ziti,3


In [58]:
#obtain all the ingredients from indian cuisine
indian_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'indian':
        indian_ingredient_str=indian_ingredient_str + data_csv.Ingredients[x]+', '
indian_ingredient_str 

#count number of appearance of each ingredient in indian cuisine (non unique)
indian_ingredient_list = list(indian_ingredient_str.split(', '))
count_indian_ingredient = {} 
for item in indian_ingredient_list:
    if (item in count_indian_ingredient):
        count_indian_ingredient[item] += 1
    else:
        count_indian_ingredient[item] = 1    
count_indian_ingredient

with open('indian_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_indian_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_indian_ingredient[key]})
        
indian_ingredients_count_df = pd.read_csv('indian_ingredients_count.csv')
indian_ingredients_count_df = indian_ingredients_count_df.dropna()
indian_ingredients_count_df['Ingredient'] = indian_ingredients_count_df['Ingredient'].str.capitalize()
indian_ingredients_count_df = indian_ingredients_count_df.sort_values(by="Ingredient")
indian_ingredients_count_df = indian_ingredients_count_df.reset_index(drop=True)
indian_ingredients_count_df.to_csv('indian_ingredients_count.csv', index = True, encoding='utf-8')
indian_ingredients_count_df = pd.read_csv('indian_ingredients_count.csv')
indian_ingredients_count_df.columns=['Index','Ingredient','Count']
indian_ingredients_count_df.to_csv('indian_ingredients_count.csv', index = False, encoding='utf-8')
indian_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""quorn chikn tenders""",1
1,1,( oz.) tomato paste,2
2,2,(14 oz.) sweetened condensed milk,1
3,3,1% low-fat milk,9
4,4,2% reduced-fat milk,1
...,...,...,...
1662,1662,Yoghurt natural low fat,1
1663,1663,Yogurt cheese,2
1664,1664,Yogurt dressing,1
1665,1665,Yukon gold potatoes,27


In [59]:
#obtain all the ingredients from irish cuisine
irish_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'irish':
        irish_ingredient_str=irish_ingredient_str + data_csv.Ingredients[x]+', '
irish_ingredient_str 

#count number of appearance of each ingredient in irish cuisine (non unique)
irish_ingredient_list = list(irish_ingredient_str.split(', '))
count_irish_ingredient = {} 
for item in irish_ingredient_list:
    if (item in count_irish_ingredient):
        count_irish_ingredient[item] += 1
    else:
        count_irish_ingredient[item] = 1    
count_irish_ingredient

with open('irish_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_irish_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_irish_ingredient[key]})
        
irish_ingredients_count_df = pd.read_csv('irish_ingredients_count.csv')
irish_ingredients_count_df = irish_ingredients_count_df.dropna()
irish_ingredients_count_df['Ingredient'] = irish_ingredients_count_df['Ingredient'].str.capitalize()
irish_ingredients_count_df = irish_ingredients_count_df.sort_values(by="Ingredient")
irish_ingredients_count_df = irish_ingredients_count_df.reset_index(drop=True)
irish_ingredients_count_df.to_csv('irish_ingredients_count.csv', index = True, encoding='utf-8')
irish_ingredients_count_df = pd.read_csv('irish_ingredients_count.csv')
irish_ingredients_count_df.columns=['Index','Ingredient','Count']
irish_ingredients_count_df.to_csv('irish_ingredients_count.csv', index = False, encoding='utf-8')
irish_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""hellmann or best food real mayonnais""",1
1,1,"""i cant believe its not butter!® spread""",1
2,2,"""m&ms candy""",1
3,3,"""pigs trotters""",1
4,4,"""soft goats cheese""",1
...,...,...,...
997,997,Yellow peppers,1
998,998,Yellow squash,1
999,999,Young nettle,1
1000,1000,Yukon gold potatoes,24


In [60]:
#obtain all the ingredients from italian cuisine
italian_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'italian':
        italian_ingredient_str=italian_ingredient_str + data_csv.Ingredients[x]+', '
italian_ingredient_str 

#count number of appearance of each ingredient in italian cuisine (non unique)
italian_ingredient_list = list(italian_ingredient_str.split(', '))
count_italian_ingredient = {} 
for item in italian_ingredient_list:
    if (item in count_italian_ingredient):
        count_italian_ingredient[item] += 1
    else:
        count_italian_ingredient[item] = 1    
count_italian_ingredient

with open('italian_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_italian_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_italian_ingredient[key]})
        
italian_ingredients_count_df = pd.read_csv('italian_ingredients_count.csv')
italian_ingredients_count_df = italian_ingredients_count_df.dropna()
italian_ingredients_count_df['Ingredient'] = italian_ingredients_count_df['Ingredient'].str.capitalize()
italian_ingredients_count_df = italian_ingredients_count_df.sort_values(by="Ingredient")
italian_ingredients_count_df = italian_ingredients_count_df.reset_index(drop=True)
italian_ingredients_count_df.to_csv('italian_ingredients_count.csv', index = True, encoding='utf-8')
italian_ingredients_count_df = pd.read_csv('italian_ingredients_count.csv')
italian_ingredients_count_df.columns=['Index','Ingredient','Count']
italian_ingredients_count_df.to_csv('italian_ingredients_count.csv', index = False, encoding='utf-8')
italian_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""campbells condensed cream of mushroom soup""",1
1,1,"""hellmann or best food light mayonnais""",1
2,2,"""hellmann or best food real mayonnais""",8
3,3,"""hellmanns dijonnaise creamy dijon mustard""",1
4,4,"""i cant believe its not butter!® all purpose s...",1
...,...,...,...
2931,2931,Zesty italian dressing,11
2932,2932,Zinfandel,1
2933,2933,Ziti,26
2934,2934,Zucchini,326


In [61]:
#obtain all the ingredients from jamaican cuisine
jamaican_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'jamaican':
        jamaican_ingredient_str=jamaican_ingredient_str + data_csv.Ingredients[x]+', '
jamaican_ingredient_str 

#count number of appearance of each ingredient in jamaican cuisine (non unique)
jamaican_ingredient_list = list(jamaican_ingredient_str.split(', '))
count_jamaican_ingredient = {} 
for item in jamaican_ingredient_list:
    if (item in count_jamaican_ingredient):
        count_jamaican_ingredient[item] += 1
    else:
        count_jamaican_ingredient[item] = 1    
count_jamaican_ingredient

with open('jamaican_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_jamaican_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_jamaican_ingredient[key]})
        
jamaican_ingredients_count_df = pd.read_csv('jamaican_ingredients_count.csv')
jamaican_ingredients_count_df = jamaican_ingredients_count_df.dropna()
jamaican_ingredients_count_df['Ingredient'] = jamaican_ingredients_count_df['Ingredient'].str.capitalize()
jamaican_ingredients_count_df = jamaican_ingredients_count_df.sort_values(by="Ingredient")
jamaican_ingredients_count_df = jamaican_ingredients_count_df.reset_index(drop=True)
jamaican_ingredients_count_df.to_csv('jamaican_ingredients_count.csv', index = True, encoding='utf-8')
jamaican_ingredients_count_df = pd.read_csv('jamaican_ingredients_count.csv')
jamaican_ingredients_count_df.columns=['Index','Ingredient','Count']
jamaican_ingredients_count_df.to_csv('jamaican_ingredients_count.csv', index = False, encoding='utf-8')
jamaican_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""hellmann or best food light mayonnais""",3
1,1,Ackee,9
2,2,Active dry yeast,2
3,3,Adobo sauce,1
4,4,Adobo seasoning,1
...,...,...,...
873,873,Yoghurt,1
874,874,Yuca,1
875,875,Yucca,1
876,876,Yukon gold potatoes,2


In [62]:
#obtain all the ingredients from japanese cuisine
japanese_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'japanese':
        japanese_ingredient_str=japanese_ingredient_str + data_csv.Ingredients[x]+', '
japanese_ingredient_str 

#count number of appearance of each ingredient in japanese cuisine (non unique)
japanese_ingredient_list = list(japanese_ingredient_str.split(', '))
count_japanese_ingredient = {} 
for item in japanese_ingredient_list:
    if (item in count_japanese_ingredient):
        count_japanese_ingredient[item] += 1
    else:
        count_japanese_ingredient[item] = 1    
count_japanese_ingredient

with open('japanese_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_japanese_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_japanese_ingredient[key]})
        
japanese_ingredients_count_df = pd.read_csv('japanese_ingredients_count.csv')
japanese_ingredients_count_df = japanese_ingredients_count_df.dropna()
japanese_ingredients_count_df['Ingredient'] = japanese_ingredients_count_df['Ingredient'].str.capitalize()
japanese_ingredients_count_df = japanese_ingredients_count_df.sort_values(by="Ingredient")
japanese_ingredients_count_df = japanese_ingredients_count_df.reset_index(drop=True)
japanese_ingredients_count_df.to_csv('japanese_ingredients_count.csv', index = True, encoding='utf-8')
japanese_ingredients_count_df = pd.read_csv('japanese_ingredients_count.csv')
japanese_ingredients_count_df.columns=['Index','Ingredient','Count']
japanese_ingredients_count_df.to_csv('japanese_ingredients_count.csv', index = False, encoding='utf-8')
japanese_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,1% low-fat milk,1
1,1,2% reduced-fat milk,1
2,2,A taste of thai rice noodles,1
3,3,Abura age,4
4,4,Active dry yeast,2
...,...,...,...
1437,1437,Yuzu,5
1438,1438,Yuzu juice,3
1439,1439,Yuzukosho,2
1440,1440,Zest,1


In [63]:
#obtain all the ingredients from korean cuisine
korean_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'korean':
        korean_ingredient_str=korean_ingredient_str + data_csv.Ingredients[x]+', '
korean_ingredient_str 

#count number of appearance of each ingredient in korean cuisine (non unique)
korean_ingredient_list = list(korean_ingredient_str.split(', '))
count_korean_ingredient = {} 
for item in korean_ingredient_list:
    if (item in count_korean_ingredient):
        count_korean_ingredient[item] += 1
    else:
        count_korean_ingredient[item] = 1    
count_korean_ingredient

with open('korean_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_korean_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_korean_ingredient[key]})
        
korean_ingredients_count_df = pd.read_csv('korean_ingredients_count.csv')
korean_ingredients_count_df = korean_ingredients_count_df.dropna()
korean_ingredients_count_df['Ingredient'] = korean_ingredients_count_df['Ingredient'].str.capitalize()
korean_ingredients_count_df = korean_ingredients_count_df.sort_values(by="Ingredient")
korean_ingredients_count_df = korean_ingredients_count_df.reset_index(drop=True)
korean_ingredients_count_df.to_csv('korean_ingredients_count.csv', index = True, encoding='utf-8')
korean_ingredients_count_df = pd.read_csv('korean_ingredients_count.csv')
korean_ingredients_count_df.columns=['Index','Ingredient','Count']
korean_ingredients_count_df.to_csv('korean_ingredients_count.csv', index = False, encoding='utf-8')
korean_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""franks® redhot® original cayenne pepper sauce""",1
1,1,Agave nectar,6
2,2,All purpose unbleached flour,2
3,3,All-purpose flour,31
4,4,Anchovies,17
...,...,...,...
893,893,Yellow peppers,1
894,894,Yellow squash,1
895,895,Yellow summer squash,1
896,896,Yellowfin tuna,1


In [64]:
#obtain all the ingredients from mexican cuisine
mexican_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'mexican':
        mexican_ingredient_str=mexican_ingredient_str + data_csv.Ingredients[x]+', '
mexican_ingredient_str 

#count number of appearance of each ingredient in mexican cuisine (non unique)
mexican_ingredient_list = list(mexican_ingredient_str.split(', '))
count_mexican_ingredient = {} 
for item in mexican_ingredient_list:
    if (item in count_mexican_ingredient):
        count_mexican_ingredient[item] += 1
    else:
        count_mexican_ingredient[item] = 1    
count_mexican_ingredient

with open('mexican_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_mexican_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_mexican_ingredient[key]})
        
mexican_ingredients_count_df = pd.read_csv('mexican_ingredients_count.csv')
mexican_ingredients_count_df = mexican_ingredients_count_df.dropna()
mexican_ingredients_count_df['Ingredient'] = mexican_ingredients_count_df['Ingredient'].str.capitalize()
mexican_ingredients_count_df = mexican_ingredients_count_df.sort_values(by="Ingredient")
mexican_ingredients_count_df = mexican_ingredients_count_df.reset_index(drop=True)
mexican_ingredients_count_df.to_csv('mexican_ingredients_count.csv', index = True, encoding='utf-8')
mexican_ingredients_count_df = pd.read_csv('mexican_ingredients_count.csv')
mexican_ingredients_count_df.columns=['Index','Ingredient','Count']
mexican_ingredients_count_df.to_csv('mexican_ingredients_count.csv', index = False, encoding='utf-8')
mexican_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""best foods mayonnaise with lime juice""",2
1,1,"""breakstones sour cream""",1
2,2,"""campbells condensed cheddar cheese soup""",6
3,3,"""campbells condensed cream of chicken soup""",10
4,4,"""campbells condensed cream of mushroom soup""",1
...,...,...,...
2690,2690,Yukon gold potatoes,13
2691,2691,Zest,2
2692,2692,Zesty italian dressing,4
2693,2693,Zucchini,140


In [65]:
#obtain all the ingredients from moroccan cuisine
moroccan_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'moroccan':
        moroccan_ingredient_str=moroccan_ingredient_str + data_csv.Ingredients[x]+', '
moroccan_ingredient_str 

#count number of appearance of each ingredient in moroccan cuisine (non unique)
moroccan_ingredient_list = list(moroccan_ingredient_str.split(', '))
count_moroccan_ingredient = {} 
for item in moroccan_ingredient_list:
    if (item in count_moroccan_ingredient):
        count_moroccan_ingredient[item] += 1
    else:
        count_moroccan_ingredient[item] = 1    
count_moroccan_ingredient

with open('moroccan_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_moroccan_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_moroccan_ingredient[key]})
        
moroccan_ingredients_count_df = pd.read_csv('moroccan_ingredients_count.csv')
moroccan_ingredients_count_df = moroccan_ingredients_count_df.dropna()
moroccan_ingredients_count_df['Ingredient'] = moroccan_ingredients_count_df['Ingredient'].str.capitalize()
moroccan_ingredients_count_df = moroccan_ingredients_count_df.sort_values(by="Ingredient")
moroccan_ingredients_count_df = moroccan_ingredients_count_df.reset_index(drop=True)
moroccan_ingredients_count_df.to_csv('moroccan_ingredients_count.csv', index = True, encoding='utf-8')
moroccan_ingredients_count_df = pd.read_csv('moroccan_ingredients_count.csv')
moroccan_ingredients_count_df.columns=['Index','Ingredient','Count']
moroccan_ingredients_count_df.to_csv('moroccan_ingredients_count.csv', index = False, encoding='utf-8')
moroccan_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""i cant believ it not butter! made with olive ...",1
1,1,"""piment despelette""",1
2,2,1% low-fat milk,1
3,3,Acorn squash,1
4,4,Active dry yeast,8
...,...,...,...
970,970,Yellow peppers,1
971,971,Yellow squash,5
972,972,Yoghurt,1
973,973,Yukon gold potatoes,10


In [66]:
#obtain all the ingredients from russian cuisine
russian_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'russian':
        russian_ingredient_str=russian_ingredient_str + data_csv.Ingredients[x]+', '
russian_ingredient_str 

#count number of appearance of each ingredient in russian cuisine (non unique)
russian_ingredient_list = list(russian_ingredient_str.split(', '))
count_russian_ingredient = {} 
for item in russian_ingredient_list:
    if (item in count_russian_ingredient):
        count_russian_ingredient[item] += 1
    else:
        count_russian_ingredient[item] = 1    
count_russian_ingredient

with open('russian_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_russian_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_russian_ingredient[key]})
        
russian_ingredients_count_df = pd.read_csv('russian_ingredients_count.csv')
russian_ingredients_count_df = russian_ingredients_count_df.dropna()
russian_ingredients_count_df['Ingredient'] = russian_ingredients_count_df['Ingredient'].str.capitalize()
russian_ingredients_count_df = russian_ingredients_count_df.sort_values(by="Ingredient")
russian_ingredients_count_df = russian_ingredients_count_df.reset_index(drop=True)
russian_ingredients_count_df.to_csv('russian_ingredients_count.csv', index = True, encoding='utf-8')
russian_ingredients_count_df = pd.read_csv('russian_ingredients_count.csv')
russian_ingredients_count_df.columns=['Index','Ingredient','Count']
russian_ingredients_count_df.to_csv('russian_ingredients_count.csv', index = False, encoding='utf-8')
russian_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""hellmanns® real mayonnaise""",1
1,1,"""soft goats cheese""",2
2,2,1% low-fat milk,1
3,3,2% reduced-fat milk,1
4,4,Active dry yeast,35
...,...,...,...
867,867,Xanthan gum,2
868,868,Yeast,17
869,869,Yellow onion,9
870,870,Yukon gold potatoes,6


In [67]:
#obtain all the ingredients from southern_us cuisine
southern_us_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'southern_us':
        southern_us_ingredient_str=southern_us_ingredient_str + data_csv.Ingredients[x]+', '
southern_us_ingredient_str 

#count number of appearance of each ingredient in southern_us cuisine (non unique)
southern_us_ingredient_list = list(southern_us_ingredient_str.split(', '))
count_southern_us_ingredient = {} 
for item in southern_us_ingredient_list:
    if (item in count_southern_us_ingredient):
        count_southern_us_ingredient[item] += 1
    else:
        count_southern_us_ingredient[item] = 1    
count_southern_us_ingredient

with open('southern_us_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_southern_us_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_southern_us_ingredient[key]})
        
southern_us_ingredients_count_df = pd.read_csv('southern_us_ingredients_count.csv')
southern_us_ingredients_count_df = southern_us_ingredients_count_df.dropna()
southern_us_ingredients_count_df['Ingredient'] = southern_us_ingredients_count_df['Ingredient'].str.capitalize()
southern_us_ingredients_count_df = southern_us_ingredients_count_df.sort_values(by="Ingredient")
southern_us_ingredients_count_df = southern_us_ingredients_count_df.reset_index(drop=True)
southern_us_ingredients_count_df.to_csv('southern_us_ingredients_count.csv', index = True, encoding='utf-8')
southern_us_ingredients_count_df = pd.read_csv('southern_us_ingredients_count.csv')
southern_us_ingredients_count_df.columns=['Index','Ingredient','Count']
southern_us_ingredients_count_df.to_csv('southern_us_ingredients_count.csv', index = False, encoding='utf-8')
southern_us_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""devils food cake mix""",1
1,1,"""egglands best® eggs""",1
2,2,"""franks® redhot® original cayenne pepper sauce""",1
3,3,"""hellmann or best food real mayonnais""",3
4,4,"""hellmanns® real mayonnaise""",2
...,...,...,...
2463,2463,Yoplait,1
2464,2464,Yukon gold potatoes,8
2465,2465,Zest,2
2466,2466,Zesty italian dressing,1


In [68]:
#obtain all the ingredients from spanish cuisine
spanish_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'spanish':
        spanish_ingredient_str=spanish_ingredient_str + data_csv.Ingredients[x]+', '
spanish_ingredient_str 

#count number of appearance of each ingredient in spanish cuisine (non unique)
spanish_ingredient_list = list(spanish_ingredient_str.split(', '))
count_spanish_ingredient = {} 
for item in spanish_ingredient_list:
    if (item in count_spanish_ingredient):
        count_spanish_ingredient[item] += 1
    else:
        count_spanish_ingredient[item] = 1    
count_spanish_ingredient

with open('spanish_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_spanish_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_spanish_ingredient[key]})
        
spanish_ingredients_count_df = pd.read_csv('spanish_ingredients_count.csv')
spanish_ingredients_count_df = spanish_ingredients_count_df.dropna()
spanish_ingredients_count_df['Ingredient'] = spanish_ingredients_count_df['Ingredient'].str.capitalize()
spanish_ingredients_count_df = spanish_ingredients_count_df.sort_values(by="Ingredient")
spanish_ingredients_count_df = spanish_ingredients_count_df.reset_index(drop=True)
spanish_ingredients_count_df.to_csv('spanish_ingredients_count.csv', index = True, encoding='utf-8')
spanish_ingredients_count_df = pd.read_csv('spanish_ingredients_count.csv')
spanish_ingredients_count_df.columns=['Index','Ingredient','Count']
spanish_ingredients_count_df.to_csv('spanish_ingredients_count.csv', index = False, encoding='utf-8')
spanish_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""piment despelette""",2
1,1,"""soft goats cheese""",1
2,2,1% low-fat milk,7
3,3,Active dry yeast,4
4,4,Adobo,2
...,...,...,...
1263,1263,Yolk,1
1264,1264,Yukon gold,1
1265,1265,Yukon gold potatoes,21
1266,1266,Zinfandel,2


In [69]:
#obtain all the ingredients from thai cuisine
thai_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'thai':
        thai_ingredient_str=thai_ingredient_str + data_csv.Ingredients[x]+', '
thai_ingredient_str 

#count number of appearance of each ingredient in thai cuisine (non unique)
thai_ingredient_list = list(thai_ingredient_str.split(', '))
count_thai_ingredient = {} 
for item in thai_ingredient_list:
    if (item in count_thai_ingredient):
        count_thai_ingredient[item] += 1
    else:
        count_thai_ingredient[item] = 1    
count_thai_ingredient

with open('thai_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_thai_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_thai_ingredient[key]})
        
thai_ingredients_count_df = pd.read_csv('thai_ingredients_count.csv')
thai_ingredients_count_df = thai_ingredients_count_df.dropna()
thai_ingredients_count_df['Ingredient'] = thai_ingredients_count_df['Ingredient'].str.capitalize()
thai_ingredients_count_df = thai_ingredients_count_df.sort_values(by="Ingredient")
thai_ingredients_count_df = thai_ingredients_count_df.reset_index(drop=True)
thai_ingredients_count_df.to_csv('thai_ingredients_count.csv', index = True, encoding='utf-8')
thai_ingredients_count_df = pd.read_csv('thai_ingredients_count.csv')
thai_ingredients_count_df.columns=['Index','Ingredient','Count']
thai_ingredients_count_df.to_csv('thai_ingredients_count.csv', index = False, encoding='utf-8')
thai_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""hellmann or best food real mayonnais""",2
1,1,"""hellmanns® real mayonnaise""",1
2,2,"""i cant believe its not butter!® spread""",1
3,3,( oz.) tomato sauce,1
4,4,Acorn squash,3
...,...,...,...
1375,1375,Yellow squash,5
1376,1376,Yoghurt,1
1377,1377,Young coconut meat,1
1378,1378,Yukon gold potatoes,4


In [70]:
#obtain all the ingredients from vietnamese cuisine
vietnamese_ingredient_str=""
for x in data_csv.Index:
    if data_csv.Cuisine[x] == 'vietnamese':
        vietnamese_ingredient_str=vietnamese_ingredient_str + data_csv.Ingredients[x]+', '
vietnamese_ingredient_str 

#count number of appearance of each ingredient in vietnamese cuisine (non unique)
vietnamese_ingredient_list = list(vietnamese_ingredient_str.split(', '))
count_vietnamese_ingredient = {} 
for item in vietnamese_ingredient_list:
    if (item in count_vietnamese_ingredient):
        count_vietnamese_ingredient[item] += 1
    else:
        count_vietnamese_ingredient[item] = 1    
count_vietnamese_ingredient

with open('vietnamese_ingredients_count.csv', 'w', newline='', encoding = 'utf-8') as csvfile:
    fieldnames = ['Ingredient', 'Count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key in count_vietnamese_ingredient:
        writer.writerow({'Ingredient': key, 'Count': count_vietnamese_ingredient[key]})
        
vietnamese_ingredients_count_df = pd.read_csv('vietnamese_ingredients_count.csv')
vietnamese_ingredients_count_df = vietnamese_ingredients_count_df.dropna()
vietnamese_ingredients_count_df['Ingredient'] = vietnamese_ingredients_count_df['Ingredient'].str.capitalize()
vietnamese_ingredients_count_df = vietnamese_ingredients_count_df.sort_values(by="Ingredient")
vietnamese_ingredients_count_df = vietnamese_ingredients_count_df.reset_index(drop=True)
vietnamese_ingredients_count_df.to_csv('vietnamese_ingredients_count.csv', index = True, encoding='utf-8')
vietnamese_ingredients_count_df = pd.read_csv('vietnamese_ingredients_count.csv')
vietnamese_ingredients_count_df.columns=['Index','Ingredient','Count']
vietnamese_ingredients_count_df.to_csv('vietnamese_ingredients_count.csv', index = False, encoding='utf-8')
vietnamese_ingredients_count_df


Unnamed: 0,Index,Ingredient,Count
0,0,"""franks® redhot® original cayenne pepper sauce""",1
1,1,"""hellmann or best food real mayonnais""",1
2,2,Acorn squash,1
3,3,Active dry yeast,1
4,4,Adzuki beans,1
...,...,...,...
1106,1106,Yellow peppers,1
1107,1107,Yellow rock sugar,8
1108,1108,Yellow squash,1
1109,1109,Yukon gold potatoes,1


### After putting and categorizing the respective cuisines and their ingredients into individual csv files and dataframes, data cleaning has concluded.