In [107]:
# import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [108]:
df_json = pd.read_json('../data/raw/full_format_recipes.json')
df_json.shape

(20130, 11)

## Data Dictionary

| Column      | Non-Null Count | Dtype               | Description                                     |
|-------------|----------------|---------------------|-------------------------------------------------|
| ------      | -------------- | -----               | ------                                          |
| title       | 15969 non-null | object              | Title of the recipe                             |
| directions  | 15969 non-null | object              | Steps for the recipe                            |
| ingredients | 15969 non-null | object              | Ingredients plus description of how to cut them |
| categories  | 15969 non-null | object              | Array of categories                             |
| calories    | 15969 non-null | float64             | Calories                                        |
| rating      | 15969 non-null | float64             | Rating on a scale of 0 to 5                     |
| desc        | 10636 non-null | object              | Extra tidbits about the recipe etc.             |
| date        | 15969 non-null | datetime64[ns, UTC] | Date the recipe was created                     |
| sodium      | 15967 non-null | float64             | Sodium content                                  |
| fat         | 15901 non-null | float64             | Fat content                                     |
| protein     | 15922 non-null | float64             | Protein content                                 |

In [109]:
cols_to_drop = ["desc",
                "date",
                "sodium",
                "fat",
                "protein"]

df_small = df_json.drop(columns=cols_to_drop)

In [110]:
df_small.shape

(20130, 6)

In [111]:
df_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20130 entries, 0 to 20129
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   directions   20111 non-null  object 
 1   categories   20111 non-null  object 
 2   calories     15976 non-null  float64
 3   rating       20100 non-null  float64
 4   title        20111 non-null  object 
 5   ingredients  20111 non-null  object 
dtypes: float64(2), object(4)
memory usage: 943.7+ KB


In [112]:
df_small.sample(5)

Unnamed: 0,directions,categories,calories,rating,title,ingredients
6399,[Heat oil in large skillet over medium-high he...,"[Side, Sauté, Stir-Fry, Quick & Easy, Low/No S...",57.0,3.75,Stir-Fried Vegetables,"[1 1/2 tablespoons vegetable oil, 4 ounces gre..."
3881,[Seed and dice 10 tomato halves; transfer to l...,"[Condiment/Spread, Sauce, Blender, Onion, Toma...",38.0,3.75,"Tomato, Onion, and Serrano Chile Salsa","[1 1/4 pounds plum tomatoes, cut in half, 1/3 ..."
12120,"[Pat neck, giblets, and wing tips dry, then co...","[turkey, Thanksgiving, Quick & Easy, Gourmet]",59.0,4.375,Quick and Rich Turkey Stock,"[Neck, giblets (excluding liver), and wing tip..."
3259,[In a large (5-to 6-quart) kettle combine wate...,"[Soup/Stew, Milk/Cream, Herb, Onion, Tomato, L...",214.0,4.375,Lobster and Shrimp Bisque,"[16 cups water, 1 1/2 cups dry white wine, two..."
19998,[Preheat oven to 450° F. and lightly coat a sh...,"[Onion, Side, Roast, Low Fat, Low Sodium, Whea...",,4.375,Roasted Balsamic Red Onions,"[vegetable-oil cooking spray, 1 1/2 pounds red..."


In [113]:
print(f"Directions contains entries of type: {type(df_small['directions'][0])}")
print(f"Categories contains entries of type: {type(df_small['categories'][0])}")
print(f"Title contains entries of type: {type(df_small['title'][0])}")
print(f"Ingredients contains entries of type: {type(df_small['ingredients'][0])}")

Directions contains entries of type: <class 'list'>
Categories contains entries of type: <class 'list'>
Title contains entries of type: <class 'str'>
Ingredients contains entries of type: <class 'list'>


Let's convert these list types into strings

In [114]:
df_small['directionsStr'] = df_small['directions'].astype(str)
df_small['categoriesStr'] = df_small['categories'].astype(str)
df_small['ingredientsStr'] = df_small['ingredients'].astype(str)


In [115]:
print(f"Directions contains entries of type: {type(df_small['directionsStr'][0])}")
print(f"Categories contains entries of type: {type(df_small['categoriesStr'][0])}")
print(f"Title contains entries of type: {type(df_small['title'][0])}")
print(f"Ingredients contains entries of type: {type(df_small['ingredientsStr'][0])}")

Directions contains entries of type: <class 'str'>
Categories contains entries of type: <class 'str'>
Title contains entries of type: <class 'str'>
Ingredients contains entries of type: <class 'str'>


In [116]:
# drop the original columns that contained list types
df_small.drop(columns=['directions','categories','ingredients'], inplace=True)

In [117]:
df_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20130 entries, 0 to 20129
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   calories        15976 non-null  float64
 1   rating          20100 non-null  float64
 2   title           20111 non-null  object 
 3   directionsStr   20130 non-null  object 
 4   categoriesStr   20130 non-null  object 
 5   ingredientsStr  20130 non-null  object 
dtypes: float64(2), object(4)
memory usage: 943.7+ KB


# Let's check for null values

In [118]:
df_small.isna().sum().sum()

4203

There are 4203 missing values.

In [119]:
df_small.isna().sum()

calories          4154
rating              30
title               19
directionsStr        0
categoriesStr        0
ingredientsStr       0
dtype: int64

Let's drop these rows that have null values

In [120]:
df_small = df_small.dropna(axis=0)

In [121]:
df_small.isna().sum()

calories          0
rating            0
title             0
directionsStr     0
categoriesStr     0
ingredientsStr    0
dtype: int64

In [122]:
df_small.shape

(15969, 6)

# check for duplicates

In [123]:
df_small.duplicated().sum()

1443

In [124]:
df_small.drop_duplicates(keep='first', inplace=True)

In [125]:
df_small.duplicated().sum()

0

In [126]:
df_small.shape

(14526, 6)

In [127]:
df_small.sample(5)

Unnamed: 0,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
9584,374.0,3.75,Clams with Fennel and Pernod,['Heat oil in heavy large pot over medium-high...,"['Shellfish', 'Tomato', 'Appetizer', 'Sauté', ...","['1/4 cup olive oil', '2 cups chopped onions',..."
19277,599.0,0.0,Pineapple Mango Chutney Dip with Curried Walnuts,"['Stir together cream cheese, chutney, and may...","['Condiment/Spread', 'Cocktail Party', 'Quick ...","['8 ounces cream cheese, softened', '1/2 cup t..."
16860,745.0,3.75,Hot Browns,['In a large heavy skillet cook bacon in batch...,"['Sandwich', 'Milk/Cream', 'Cheese', 'turkey',...","['12 bacon slices (about 10 ounces)', '12 slic..."
394,910.0,4.375,Spaghetti and Swiss Chard With Garlic Chips,['Heat oil in a 12-inch heavy skillet over med...,"['Garlic', 'Pasta', 'Sauté', 'Vegetarian', 'Qu...","['1/3 cup extra-virgin olive oil', '1 head gar..."
11552,1103.0,4.375,Pelau,"['If using dried peas, soak them overnight in ...","['Milk/Cream', 'Chicken', 'Onion', 'Rice', 'Sa...","['1 cup dry or 1 (12-ounce) can pigeon peas, p..."


In [129]:
df_small.to_csv("../data/interim/full_recipes_cleaned_2.csv",index=True,index_label='recipeId')