# Categories

Finds the unique categories and sorts them, finding the most frequent ones.

In [1]:
import pandas as pd
import json
import numpy as np
import csv

from tqdm import tqdm
import inflect

recipes_filename_parsed_ingredients_final = "../../data/full_format_recipes_parsed_ingredients_final.json"

In [2]:
df = pd.read_json(recipes_filename_parsed_ingredients_final)

In [3]:
all_categories_list = []
for idx in tqdm(range(len(df['categories']))):
    all_categories_list = all_categories_list + df['categories'][idx] 
all_categories = np.array(all_categories_list)

100%|███████████████████████████████████████████████████████████████████████████| 20111/20111 [00:55<00:00, 360.46it/s]


In [4]:
unique, counts = np.unique(all_categories, return_counts=True)
unique_df = pd.DataFrame(data = zip(unique, counts), columns = ['Unique', 'Counts'])
unique_df = unique_df.sort_values(by = 'Counts', ascending = False)
print(len(unique))

674


In [5]:
unique_df.head(20)

Unnamed: 0,Unique,Counts
57,Bon Appétit,9355
453,Peanut Free,8390
574,Soy Free,8088
624,Tree Nut Free,7044
636,Vegetarian,6846
240,Gourmet,6648
310,Kosher,6175
462,Pescatarian,6042
502,Quick & Easy,5372
654,Wheat/Gluten-Free,4906


In [6]:
n_most_frequent = 100
most_frequent_categories = unique_df['Unique'][:n_most_frequent].values.astype(str)
most_frequent_categories

array(['Bon Appétit', 'Peanut Free', 'Soy Free', 'Tree Nut Free',
       'Vegetarian', 'Gourmet', 'Kosher', 'Pescatarian', 'Quick & Easy',
       'Wheat/Gluten-Free', 'Bake', 'Summer', 'Dessert', 'Dairy Free',
       'Side', 'No Sugar Added', 'Winter', 'Fall', 'Dinner',
       'Sugar Conscious', 'Healthy', 'Kidney Friendly', 'Onion', 'Tomato',
       'Vegetable', 'Sauté', 'Milk/Cream', 'Fruit', 'Vegan',
       'Kid-Friendly', 'Egg', 'Spring', 'Herb', 'Garlic', 'Salad',
       'Dairy', 'Thanksgiving', 'Appetizer', 'Lunch', 'Cheese', 'Chicken',
       'Roast', 'No-Cook', 'Soup/Stew', 'Cocktail Party', 'Ginger',
       'Potato', 'Chill', 'Lemon', 'Grill/Barbecue', 'Drink', 'Sauce',
       'Low Cal', 'Christmas', 'High Fiber', 'Food Processor', 'Pasta',
       'Backyard BBQ', 'Fish', 'Low Fat', 'Pork', 'Condiment/Spread',
       'Nut', 'Leafy Green', 'Party', 'Simmer', 'Citrus', 'Chocolate',
       'Mushroom', 'Orange', 'Alcoholic', 'Brunch', 'Beef', 'Paleo',
       'Bell Pepper', 'Cake', 

## Keeps only handpicked categories

In [7]:
picked_categories = ['Peanut Free', 'Soy Free', 'Tree Nut Free',
       'Vegetarian', 'Gourmet', 'Kosher', 'Pescatarian',
       'Wheat/Gluten-Free', 'Dairy Free', 'No Sugar Added']

In [8]:
df['categories_preformatted'] = [[] for i in range(len(df['categories']))]
for j in tqdm(range(len(df['categories']))):
    df['categories_preformatted'][j] = [x for x in df['categories'][j] if x in picked_categories]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████████████████████████████████████████████████████████████████████| 20111/20111 [00:06<00:00, 3240.15it/s]


In [9]:
print(df['categories'][3])
print(df['categories_preformatted'][3])

['Fish', 'Olive', 'Tomato', 'Sauté', 'Low Fat', 'Low Cal', 'High Fiber', 'Dinner', 'Healthy', 'Simmer', 'Bon Appétit', 'Pescatarian', 'Dairy Free', 'Peanut Free', 'Tree Nut Free', 'Soy Free', 'Kosher']
['Pescatarian', 'Dairy Free', 'Peanut Free', 'Tree Nut Free', 'Soy Free', 'Kosher']


In [10]:
df.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,ingredients_individually,ingredients_preformatted,ingredients_to_cnn,categories_preformatted
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0,"[chicken stock, lentils, lentils, celery, carr...","[turkey, tomato, oil, apple, bibb lettuce, chi...","[turkey, tomato, oil, apple, chicken, carrot, ...",[]
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0,"[cream, onions, salt, bay leaves, cloves, garl...","[cream, shallot, clove, salt, parsley, currant...","[cream, shallot, butter, onion, chicken, pork,...",[]
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0,"[fennel bulb, onion, butter, potatoes, chicken...","[fennel, milk, chicken, potato, butter, onion]","[fennel, milk, chicken, potato, butter, onion]",[Gourmet]
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,"[olive oil, onion, wine, anchovy paste, tomato...","[oil, tomato, anchovy, fillet, bread, olive, o...","[oil, tomato, orange, wine, onion]","[Pescatarian, Dairy Free, Peanut Free, Tree Nu..."
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0,"[spinach, egg noodles, cream, pesto sauce, nut...","[cream, cheese, egg noodle, spinach, nutmeg]","[cream, cheese]",[Vegetarian]


### Save fully parsed dataframe

In [11]:
recipes_filename_parsed = "../../data/full_format_recipes_parsed.json"
df.to_json(recipes_filename_parsed)