We need to figure out which of the columns in the csv file are 'categories' and which are 'ingredients'. By extracting what values are listed in categories from the JSON file, 

In [94]:
import pandas as pd

In [95]:
# read json file in as a dataframe
rec_df = pd.read_json('full_format_recipes.json', orient='columns')
rec_df.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01 04:00:00,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
2,165.0,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",2004-08-20 04:00:00,,[In a large heavy saucepan cook diced fennel a...,7.0,"[1 fennel bulb (sometimes called anise), stalk...",6.0,3.75,165.0,Potato and Fennel Soup Hodge
3,,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",2009-03-27 04:00:00,The Sicilian-style tomato sauce has tons of Me...,[Heat oil in heavy large skillet over medium-h...,,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,5.0,,Mahi-Mahi in Tomato Olive Sauce
4,547.0,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",2004-08-20 04:00:00,,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,"[1 12-ounce package frozen spinach soufflé, th...",20.0,3.125,452.0,Spinach Noodle Casserole


In [96]:
# extract all categories by appending them to the empty list categories_initial
categories_initial = []
for l in rec_df.categories:
    #print(str(type(l)) +" " + str(l))
    if type(l) == list:
        for cat in l:
            if cat not in categories_initial:
                categories_initial.append(cat)
                
# format categories
categories_initial = [cat.title() if cat.lower() == cat else cat for cat in categories_initial]                

# sort categories list and check it
categories_initial = sorted(categories_initial)
print('First 40 initial categories:', categories_initial[:40])
print('Number of initial categories:', len(categories_initial))

First 40 initial categories: ['#CAKEWEEK', '#WasteLess', '22-Minute Meals', '3-Ingredient Recipes', '30 Days of Groceries', 'Advance Prep Required', 'Alabama', 'Alaska', 'Alcoholic', 'Almond', 'Amaretto', 'Anchovy', 'Anise', 'Anniversary', 'Anthony Bourdain', 'Aperitif', 'Appetizer', 'Apple', 'Apple Juice', 'Apricot', 'Arizona', 'Artichoke', 'Arugula', 'Asian Pear', 'Asparagus', 'Aspen', 'Atlanta', 'Australia', 'Avocado', 'Back to School', 'Backyard BBQ', 'Bacon', 'Bake', 'Banana', 'Barley', 'Basil', 'Bass', 'Bastille Day', 'Bean', 'Beef']
Number of initial categories: 674


In [110]:
# there are too many categories, mostly due to the fact that ingredients are included as categories
# identify which categories are ingredients by searching for the category substring in the ingredients
#    list for that recipe

# extract all categories that are ingredients by appending them to the empty list ingredients_initial
ingredients_initial = []

for i, row in rec_df.iterrows():
    #print(str(type(l)) +" " + str(l))
    temp_cats = row.categories
    temp_ingr = row.ingredients
    if type(temp_cats) == list and type(temp_ingr) == list:
        for cat in temp_cats:
            cat = cat.lower()
            cat_title = cat.title()
            for ingr in temp_ingr:
                ingr = ingr.lower()
                if ingr.find(cat) != -1 and cat_title not in ingredients_initial:
                    ingredients_initial.append(cat_title)

ingredients_initial = sorted(ingredients_initial)
# print('Ingredients that are categories:', ingredients_initial)
print('Number of ingredients that are categories:', len(ingredients_initial))

Number of ingredients that are categories: 398


In [111]:
# some ingredients are indeed categories and not ingredients
# the initial ingredients list is easy to scan though, so remove the few categories that exist in it
# note: full food items (like Salad and Sandwich) are not ingredients from my perspective

not_ingredients = ['Bake', 'Blender', 'Boil', 'Bon Appétit', 'Braise', 'Breakfast', 'Broil', 'California', 
                   'Cocktail', 'Deep-Fry', 'Dessert', 'Dinner', 'Drink', 'Easter', 'Fall', 'Food Processor', 
                   'Fry', 'Game', 'Grill', 'Healthy', 'Ice Cream Machine', 'Juicer', 'Kosher', 'Mandoline', 
                   'Marinate', 'Microwave', 'Mixer', 'Mortar And Pestle', 'New York', 'Passover', 'Pasta Maker', 
                   'Pastry', 'Pie', 'Pizza', 'Poach', 'Pressure Cooker', 'Ramekin', 'Raw', 'Salad', 'Sandwich', 
                   'Sauté', 'Seafood', 'Side', 'Simmer', 'Skewer', 'Slow Cooker', 'Smoker', 'Snack', 'Spring',
                   'Steam', 'Stew', 'Stir-Fry', 'Summer', 'Tart', 'Thanksgiving', 'Vegan', 'Vegetarian',
                   'Winter']

# remove items from not_ingredients from the ingredients_initial list
ingredients = [ingr for ingr in ingredients_initial if ingr not in not_ingredients]

# check lengtih of ingredients list to make sure it's less than 398
print('Number of ingredients that are categories:', len(ingredients))

Number of ingredients that are categories: 340


In [113]:

categories = [cat for cat in categories_initial if cat not in ingredients]
print(categories)
print(len(categories))

['#CAKEWEEK', '#WasteLess', '22-Minute Meals', '3-Ingredient Recipes', '30 Days of Groceries', 'Advance Prep Required', 'Alabama', 'Alaska', 'Alcoholic', 'Anniversary', 'Anthony Bourdain', 'Aperitif', 'Appetizer', 'Arizona', 'Aspen', 'Atlanta', 'Australia', 'Back to School', 'Backyard BBQ', 'Bake', 'Bastille Day', 'Beverly Hills', 'Birthday', 'Biscuit', 'Blender', 'Boil', 'Bon Appétit', 'Bon App��tit', 'Boston', 'Braise', 'Breakfast', 'Broil', 'Brooklyn', 'Brownie', 'Brunch', 'Buffet', 'Bulgaria', 'Burrito', 'Butterscotch/Caramel', 'California', 'Cambridge', 'Camping', 'Canada', 'Casserole/Gratin', 'Caviar', 'Chicago', 'Christmas', 'Christmas Eve', 'Cinco de Mayo', 'Cobbler/Crumble', 'Cocktail', 'Cocktail Party', 'Coffee Grinder', 'Cognac/Armagnac', 'Colorado', 'Columbus', 'Condiment', 'Condiment/Spread', 'Connecticut', 'Cook Like a Diner', 'Cookbook Critic', 'Cookbooks', 'Cookie', 'Costa Mesa', 'Créme de Cacao', 'Crêpe', 'Cr��me de Cacao', 'Cuba', 'Cupcake', 'Custard', 'Dairy', 'Dairy

In [None]:
# this list is adequate for now