In [19]:
# import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
df_json = pd.read_json('../data/raw/full_format_recipes.json')
df_json.shape

(20130, 11)

## Data Dictionary

| Column      | Non-Null Count | Dtype               | Description                                     |
|-------------|----------------|---------------------|-------------------------------------------------|
| ------      | -------------- | -----               | ------                                          |
| title       | 15969 non-null | object              | Title of the recipe                             |
| directions  | 15969 non-null | object              | Steps for the recipe                            |
| ingredients | 15969 non-null | object              | Ingredients plus description of how to cut them |
| categories  | 15969 non-null | object              | Array of categories                             |
| calories    | 15969 non-null | float64             | Calories                                        |
| rating      | 15969 non-null | float64             | Rating on a scale of 0 to 5                     |
| desc        | 10636 non-null | object              | Extra tidbits about the recipe etc.             |
| date        | 15969 non-null | datetime64[ns, UTC] | Date the recipe was created                     |
| sodium      | 15967 non-null | float64             | Sodium content                                  |
| fat         | 15901 non-null | float64             | Fat content                                     |
| protein     | 15922 non-null | float64             | Protein content                                 |

In [21]:
cols_to_drop = ["desc",
                "date",
                "sodium",
                "fat",
                "protein"]

df_small = df_json.drop(columns=cols_to_drop)

In [22]:
df_small.shape

(20130, 6)

In [23]:
df_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20130 entries, 0 to 20129
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   directions   20111 non-null  object 
 1   categories   20111 non-null  object 
 2   calories     15976 non-null  float64
 3   rating       20100 non-null  float64
 4   title        20111 non-null  object 
 5   ingredients  20111 non-null  object 
dtypes: float64(2), object(4)
memory usage: 943.7+ KB


In [24]:
df_small.sample(5)

Unnamed: 0,directions,categories,calories,rating,title,ingredients
19124,[Whisk lemon juice and mustard in small bowl. ...,"[Broil, High Fiber, Dinner, Halibut, Asparagus...",,4.375,Slow-Roasted Halibut with Shaved Asparagus and...,"[4 1/2 teaspoons fresh lemon juice, 1 tablespo..."
12345,[Preheat oven to 200°F. and oil a baking sheet...,"[Egg, Fruit, Dessert, Bake, Cocktail Party, Pi...",327.0,3.75,Passion-Fruit Meringues,"[2 large egg whites, 1/8 teaspoon cream of tar..."
13157,[Bring sugar and 1/2 cup water to a boil in a ...,"[Cake, Milk/Cream, Egg, Dessert, Bake, Valenti...",451.0,4.375,Caramel Pecan Cakes,"[2 cups sugar, 1 1/2 cups water, divided, 1/2 ..."
14530,[Puree the blueberries with the sugar and lemo...,"[Ice Cream Machine, Berry, Fruit, Dessert, Fre...",288.0,3.125,Blues-Busting Blueberry Ice Cream,"[2 1/2 cups fresh blueberries, rinsed, 1 cup s..."
3159,[Line large bowl with kitchen towel. Coarsely ...,"[Cheese, Potato, Vegetable, Side, Winter, Bon ...",303.0,3.75,Crispy Potato Galette,"[1 1/2 pounds russet potatoes, peeled, 3 table..."


In [25]:
print(f"Directions contains entries of type: {type(df_small['directions'][0])}")
print(f"Categories contains entries of type: {type(df_small['categories'][0])}")
print(f"Title contains entries of type: {type(df_small['title'][0])}")
print(f"Ingredients contains entries of type: {type(df_small['ingredients'][0])}")

Directions contains entries of type: <class 'list'>
Categories contains entries of type: <class 'list'>
Title contains entries of type: <class 'str'>
Ingredients contains entries of type: <class 'list'>


Let's convert these list types into strings

In [26]:
df_small['directionsStr'] = df_small['directions'].astype(str)
df_small['categoriesStr'] = df_small['categories'].astype(str)
df_small['ingredientsStr'] = df_small['ingredients'].astype(str)


In [27]:
print(f"Directions contains entries of type: {type(df_small['directionsStr'][0])}")
print(f"Categories contains entries of type: {type(df_small['categoriesStr'][0])}")
print(f"Title contains entries of type: {type(df_small['title'][0])}")
print(f"Ingredients contains entries of type: {type(df_small['ingredientsStr'][0])}")

Directions contains entries of type: <class 'str'>
Categories contains entries of type: <class 'str'>
Title contains entries of type: <class 'str'>
Ingredients contains entries of type: <class 'str'>


In [28]:
df_small['title'][0]

'Lentil, Apple, and Turkey Wrap '

There is an extra space at the end of the titles, let's remove that.

In [29]:
# Remove extra space from the end of the titles
df_small['title'] = df_small['title'].str.strip()

In [30]:
# verify that the space was removed
df_small['title'][0]

'Lentil, Apple, and Turkey Wrap'

In [31]:
# drop the original columns that contained list types
df_small.drop(columns=['directions','categories','ingredients'], inplace=True)

In [32]:
df_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20130 entries, 0 to 20129
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   calories        15976 non-null  float64
 1   rating          20100 non-null  float64
 2   title           20111 non-null  object 
 3   directionsStr   20130 non-null  object 
 4   categoriesStr   20130 non-null  object 
 5   ingredientsStr  20130 non-null  object 
dtypes: float64(2), object(4)
memory usage: 943.7+ KB


# Let's check for null values

In [33]:
df_small.isna().sum().sum()

4203

There are 4203 missing values.

In [34]:
df_small.isna().sum()

calories          4154
rating              30
title               19
directionsStr        0
categoriesStr        0
ingredientsStr       0
dtype: int64

Let's drop these rows that have null values

In [35]:
df_small = df_small.dropna(axis=0)

In [36]:
df_small.isna().sum()

calories          0
rating            0
title             0
directionsStr     0
categoriesStr     0
ingredientsStr    0
dtype: int64

In [37]:
df_small.shape

(15969, 6)

# check for duplicates

In [38]:
df_small.duplicated().sum()

1443

In [39]:
df_small.drop_duplicates(keep='first', inplace=True)

In [40]:
df_small.duplicated().sum()

0

In [41]:
df_small.shape

(14526, 6)

In [42]:
df_small.sample(5)

Unnamed: 0,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
19703,2631.0,3.75,Baked Chicken with White Beans and Tomatoes,['Put oven rack in middle position and preheat...,"['Bean', 'Chicken', 'Poultry', 'Tomato', 'Bake...","['6 bacon slices (1/4 lb total), cut into 1-in..."
6207,226.0,0.0,"""Seethed"" Mussels with Parsley and Vinegar",['Place mussels in cold water and scrub them c...,"['Garlic', 'Herb', 'Shellfish', 'Quick & Easy'...","['4 pounds of mussels', '2 tablespoons butter'..."
19526,330.0,4.375,Whipped Ricotta with Honey and Mixed Berries,"['Blend ricotta, cream cheese, 2 tablespoons s...","['Berry', 'Dessert', 'Ricotta', 'Summer', 'Hon...","['2 cups whole-milk ricotta cheese', '4 ounces..."
8261,620.0,3.75,Fish-and-Chips,['1. Peel the potatoes and cut them into strip...,"['Egg', 'Potato', 'Vegetable', 'Kid-Friendly',...","['6 large russet potatoes', 'Vegetable oil, fo..."
11534,299.0,3.75,"Cauliflower with Bacon, Capers, Peppers, and R...",['Cook bacon in a 10- to 12-inch heavy skillet...,"['Fruit', 'Pepper', 'Side', 'Quick & Easy', 'R...","['3 oz bacon (3 or 4 slices), cut crosswise in..."


In [43]:
df_small.to_csv("../data/interim/full_recipes_cleaned_2.csv",index=True,index_label='recipeId')