# Primary EDA 

Primary EDA for the food-rec-system dataset. This notebook looks at how the data is structured and what needs to be done to it in order to build the recommendation system.

In [39]:
# load and import packages for analysis

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import ast

In [40]:
# load the dataset
# (relative path: the raw CSV is read from the notebook's working directory)

food = pd.read_csv('raw-data_recipe.csv')

In [3]:
# view first five rows to get a feel for the columns and their raw formats
food.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,cooking_directions,nutritions,reviews
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,{'directions': u'Prep\n5 m\nCook\n2 h 45 m\nRe...,"{u'niacin': {u'hasCompleteData': False, u'name...","{8542392: {'rating': 5, 'followersCount': 11, ..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,{'directions': u'Prep\n15 m\nCook\n2 h 30 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name...","{3574785: {'rating': 5, 'followersCount': 0, '..."
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{'directions': u""Prep\n20 m\nCook\n40 m\nReady...","{u'niacin': {u'hasCompleteData': True, u'name'...","{13774946: {'rating': 5, 'followersCount': 0, ..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,{'directions': u'Prep\n15 m\nCook\n5 m\nReady ...,"{u'niacin': {u'hasCompleteData': True, u'name'...","{1563136: {'rating': 5, 'followersCount': 0, '..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,{'directions': u'Prep\n20 m\nCook\n45 m\nReady...,"{u'niacin': {u'hasCompleteData': True, u'name'...","{2945555: {'rating': 5, 'followersCount': 6690..."


In [4]:
# look at the column labels of the loaded frame
food.columns

Index(['recipe_id', 'recipe_name', 'aver_rate', 'image_url', 'review_nums',
       'ingredients', 'cooking_directions', 'nutritions', 'reviews'],
      dtype='object')

# Analysis of Information in the data

- recipe_id: A recipe ID 
- recipe_name: Name of the recipe
- aver_rate: Average rating for the recipe 
- image_url: Image of the recipe
- review_nums: Number of reviews
- ingredients: Ingredient list
- cooking_directions: Directions for recipe
- nutritions: Nutritional information for the recipe (per-nutrient name, amount, unit, and percent daily value)
- reviews: Information about reviews, including text, the user who wrote it, and number of followers

In [5]:
# checking if there are any null rows:
# info() reports the non-null count and dtype of every column
food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49698 entries, 0 to 49697
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   recipe_id           49698 non-null  int64  
 1   recipe_name         49698 non-null  object 
 2   aver_rate           49698 non-null  float64
 3   image_url           49698 non-null  object 
 4   review_nums         49698 non-null  int64  
 5   ingredients         49698 non-null  object 
 6   cooking_directions  49698 non-null  object 
 7   nutritions          49698 non-null  object 
 8   reviews             49698 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 3.4+ MB


# Looking into the recipe name, ingredients, directions, nutritions, and reviews columns 

This is where most of the important information is stored for this project.

In [9]:
# name

# print the first 10 recipe names, one per line, to see how they are formatted
for recipe_name in food['recipe_name'].head(10):
    print(recipe_name)


Homemade Bacon
Pork Loin, Apples, and Sauerkraut
Foolproof Rosemary Chicken Wings
Chicken Pesto Paninis
Potato Bacon Pizza
Latin-Inspired Spicy Cream Chicken Stew
Reuben Sandwich I
Turkey Black Bean Burgers
Cranberry Pork Chops II
Schnitzel Sandwich


# Analysis on Ingredients

Each ingredient appears to be separated by a "^". These strings will need to be split on that character and the pieces stored in a list/array (preferably an array, because everything is a single data type and there is a LOT of data).

In [14]:
# ingredients

# print the first 10 raw ingredient strings to see how they are formatted
for ingredient_text in food['ingredients'].head(10):
    print(ingredient_text)

pork belly^smoked paprika^kosher salt^ground black pepper
sauerkraut drained^Granny Smith apples sliced^large onion^caraway seeds^apple cider divided^brown sugar^Rub:^Thai seasoning^salt^garlic powder^ground black pepper^boneless pork loin roast
chicken wings^sprigs rosemary^head garlic^olive oil^lemon pepper^seasoned salt
focaccia bread quartered^prepared basil pesto^diced cooked chicken^diced green bell pepper^diced red onion^shredded Monterey Jack cheese
red potatoes^strips bacon^Sauce:^heavy whipping cream^butter^minced garlic^grated Parmesan cheese^Crust:^warm water (125 degrees F 52 degrees C)^honey^active dry yeast^vegetable oil^all-purpose flour^shredded mozzarella cheese
skinless boneless chicken breast halves^diced tomatoes^green salsa^black beans rinsed and drained^pinto beans drained and rinsed^kernel corn^taco seasoning^chopped fresh cilantro^ground red chile pepper^ground cumin^cream cheese
rye bread^butter^thinly sliced corned beef^sauerkraut^mozzarella cheese
extra lean

In [13]:
# first ingredient list (row 0), shown as its raw string repr
food['ingredients'].iloc[0]

'pork belly^smoked paprika^kosher salt^ground black pepper'

# Analysis on Directions

In [17]:
# directions

# for each of the first 10 rows, print the raw directions value
# followed by its Python type
for directions_value in food['cooking_directions'].head(10):
    print(directions_value)
    print(type(directions_value))

{'directions': u'Prep\n5 m\nCook\n2 h 45 m\nReady In\n11 h 50 m\nPreheat oven to 200 degrees F (95 degrees C).\nSeason pork belly with paprika, salt, and pepper. Tightly wrap pork twice in heavy-duty aluminum foil. Place on a baking sheet and bake in the preheated oven for 2 1/2 hours. Turn off the oven; let pork rest in the oven for 1 hour. Remove meat from oven, leaving it wrapped in aluminum foil, and refrigerate at least 8 hours or overnight.\nRemove pork from foil and slice across the grain in 1/4-inch thick slices. Working in batches, cook pork in a non-stick skillet over medium heat until golden and crisped, 6 to 8 minutes per slice.'}
<class 'str'>
{'directions': u'Prep\n15 m\nCook\n2 h 30 m\nReady In\n2 h 45 m\nPreheat oven to 325 degrees F (165 degrees C).\nMix sauerkraut, apples, onion, and caraway seeds in a large roasting pan. Stir 1/4 cup apple cider and brown sugar together in a separate bowl; pour over sauerkraut mixture.\nStir Thai seasoning, salt, garlic powder, and b

# Looking at the directions

Each entry in the directions column is a dictionary stored as a string (the printed type is `str`). The text inside the "dictionary" also contains line breaks (`\n`) that need to be broken out and parsed. It could be worth keeping the line breaks so that the directions print cleanly. The `u` prefix before each string is harmless: it marks a unicode string literal, which is already the default string type in Python 3, so to save time the prefixes can be left in place.


# Analysis on Nutrition

In [20]:
# nutrition

# for each of the first 10 rows, print the raw nutrition value
# followed by its Python type
for nutrition_value in food['nutritions'].head(10):
    print(nutrition_value)
    print(type(nutrition_value))

{u'niacin': {u'hasCompleteData': False, u'name': u'Niacin Equivalents', u'amount': 9.319291, u'percentDailyValue': u'72', u'displayValue': u'9', u'unit': u'mg'}, u'sugars': {u'hasCompleteData': True, u'name': u'Sugars', u'amount': 0.09355932, u'percentDailyValue': u'0', u'displayValue': u'0.1', u'unit': u'g'}, u'sodium': {u'hasCompleteData': True, u'name': u'Sodium', u'amount': 2017.13, u'percentDailyValue': u'81', u'displayValue': u'2017', u'unit': u'mg'}, u'carbohydrates': {u'hasCompleteData': True, u'name': u'Carbohydrates', u'amount': 1.797819, u'percentDailyValue': u'< 1', u'displayValue': u'1.8', u'unit': u'g'}, u'vitaminB6': {u'hasCompleteData': False, u'name': u'Vitamin B6', u'amount': 0.2329798, u'percentDailyValue': u'15', u'displayValue': u'< 1', u'unit': u'mg'}, u'calories': {u'hasCompleteData': True, u'name': u'Calories', u'amount': 308.1481, u'percentDailyValue': u'15', u'displayValue': u'308', u'unit': u'kcal'}, u'thiamin': {u'hasCompleteData': False, u'name': u'Thiamin'

# Looking at the nutrition

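Each nutrition entry is a dictionary stored as a string (the printed type is `str`). It is nested: each nutrient key (e.g. `niacin`, `sugars`, `sodium`) maps to its own dictionary with fields such as `name`, `amount`, `unit`, `displayValue`, `percentDailyValue`, and `hasCompleteData`. Any line breaks (`\n`) inside the values would need to be broken out and parsed, although none appear in the sample printed above; where they occur, it could be worth keeping them so that the nutrition information prints cleanly. The `u` prefix before each string is harmless: it marks a unicode string literal, which is already the default string type in Python 3, so to save time the prefixes can be left in place.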

**One thing to note is to make sure that each dictionary has the same keys (consistency in terms of showing this information).**


# Analysis on Reviews

In [21]:
# reviews

# for each of the first 10 rows, print the raw reviews value
# followed by its Python type
for reviews_value in food['reviews'].head(10):
    print(reviews_value)
    print(type(reviews_value))

{8542392: {'rating': 5, 'followersCount': 11, 'madeRecipesCount': 18, 'favoritesCount': 200, 'dateLastModified': u'2017-04-22T12:46:43.663', 'text': u"Best breakfast ever! I ran out of paprika while seasoning, so I used garlic piercer on the other half of the batch. Very good! Can't wait to make it again... and figure out how to use the drippings!", 'followingCount': 0}, 11174581: {'rating': 5, 'followersCount': 8, 'madeRecipesCount': 55, 'favoritesCount': 101, 'dateLastModified': u'2013-06-20T15:50:25.96', 'text': u"Awesome!\nIt's amazing.", 'followingCount': 0}, 8262477: {'rating': 5, 'followersCount': 0, 'madeRecipesCount': 1, 'favoritesCount': 52, 'dateLastModified': u'2015-02-14T07:27:51.307', 'text': u'The flavors came together well and it really was simple to prepare. My husband and I both enjoyed it!', 'followingCount': 0}}

<class 'str'>
{3574785: {'rating': 5, 'followersCount': 0, 'madeRecipesCount': 4, 'favoritesCount': 118, 'dateLastModified': u'2017-10-07T18:20:08.973', 't

# Looking at the reviews

Each entry in the reviews column is a dictionary stored as a string (the printed type is `str`): each numeric key maps to a dictionary describing one review (rating, text, follower counts, and so on). The review text also contains line breaks (`\n`) that need to be broken out and parsed. It could be worth keeping the line breaks so that the reviews print cleanly. The `u` prefix before each string is harmless: it marks a unicode string literal, which is already the default string type in Python 3, so to save time the prefixes can be left in place.

**One thing to note is to make sure that each dictionary has the same keys (consistency in terms of showing this information).**


# Reformatting Problematic Columns

Columns to be reformatted:
- directions
- reviews
- nutrition

In [24]:
# using the ast package to convert strings to dictionaries

# small self-contained example: a dictionary written out as text
original_String = '{"John" : 1, "Rick" : 2, "Sam" : 3}'

# show the input text
print(f"The original string is : {original_String}")

# parse the text as a Python literal with ast.literal_eval()
result = ast.literal_eval(original_String)

# show the parsed object
print(f"The converted dictionary is : {result}")

The original string is : {"John" : 1, "Rick" : 2, "Sam" : 3}
The converted dictionary is : {'John': 1, 'Rick': 2, 'Sam': 3}


In [34]:
# trying it out on a nutrition string: take the value from row 0
nutrition_string = food['nutritions'].iloc[0]
print(nutrition_string)

# parse the text as a Python literal to get the dictionary it describes
result = ast.literal_eval(nutrition_string)

# show the parsed object, then its type, on separate lines
print(result, type(result), sep='\n')

{u'niacin': {u'hasCompleteData': False, u'name': u'Niacin Equivalents', u'amount': 9.319291, u'percentDailyValue': u'72', u'displayValue': u'9', u'unit': u'mg'}, u'sugars': {u'hasCompleteData': True, u'name': u'Sugars', u'amount': 0.09355932, u'percentDailyValue': u'0', u'displayValue': u'0.1', u'unit': u'g'}, u'sodium': {u'hasCompleteData': True, u'name': u'Sodium', u'amount': 2017.13, u'percentDailyValue': u'81', u'displayValue': u'2017', u'unit': u'mg'}, u'carbohydrates': {u'hasCompleteData': True, u'name': u'Carbohydrates', u'amount': 1.797819, u'percentDailyValue': u'< 1', u'displayValue': u'1.8', u'unit': u'g'}, u'vitaminB6': {u'hasCompleteData': False, u'name': u'Vitamin B6', u'amount': 0.2329798, u'percentDailyValue': u'15', u'displayValue': u'< 1', u'unit': u'mg'}, u'calories': {u'hasCompleteData': True, u'name': u'Calories', u'amount': 308.1481, u'percentDailyValue': u'15', u'displayValue': u'308', u'unit': u'kcal'}, u'thiamin': {u'hasCompleteData': False, u'name': u'Thiamin'

In [46]:
# Convert the dictionary-like string columns into real Python objects.

def to_dict_lambda(x):
    """Parse a Python-literal string into the object it describes.

    Parameters
    ----------
    x : str or object
        A cell value from one of the dictionary-like columns.

    Returns
    -------
    object
        ``ast.literal_eval(x)`` when ``x`` is a string; otherwise ``x``
        itself, unchanged.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        Python literal.
    """
    # This cell overwrites the columns in place, so running it a second time
    # hands already-parsed values back to this function. ast.literal_eval
    # rejects non-string objects, so those are passed through untouched to
    # keep the cell safe to re-run.
    if isinstance(x, str):
        return ast.literal_eval(x)
    return x


# Apply the parser to the nutritions, cooking directions, and reviews columns
for column in ('nutritions', 'cooking_directions', 'reviews'):
    food[column] = food[column].apply(to_dict_lambda)


In [51]:
# process ingredients

def format_ingredients(x):
    """Return the '^'-delimited ingredient string with one ingredient per line."""
    return x.replace('^', '\n')


# Rewrite the 'ingredients' column with the newline-separated form
food['ingredients'] = food['ingredients'].map(format_ingredients)



In [52]:
# save the progress!

# The output file is listed in .gitignore; it will be zipped instead.
# NOTE(review): to_csv also writes the row index by default, so reloading this
# file will presumably show an extra unnamed first column — confirm whether
# downstream readers expect it before changing this.
# NOTE(review): CSV is a text format, so the columns parsed into dictionaries
# earlier in this notebook are presumably written back as text and would need
# parsing again after reload — confirm.
food.to_csv(path_or_buf='data_recipe.csv')