In [120]:
from pathlib import Path
import shutil

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import locale
#from html.parser import HTMLParser
# import website_func.py to use its functions
from website_func import *
from scrap import *
import os #To read the file
#reload every module each time
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Some quick analysis of the data
We received a data set of .html files containing the web pages of recipes.
We wanted to sort them by website so that we could more easily find a common pattern among the pages of each site; this will enable us to scrape them. At first we thought about moving each file into a folder corresponding to its website, but that would waste time and require a lot of computation. Thus, we came up with a (probably) faster solution: we simply write the name of each file into the folder of its corresponding website. By inspecting the files, we saw that the first line always contains a comment with the name of the file and the complete URL. Using readlines and split, we can easily retrieve the name of the website.

We launched this process, but an antivirus alert reported a Trojan in the file "1c2cb6f0df04cf5a9d0baa116c6aa7bb.html". 
We then had to quarantine or remove that file, which is not a problem since we have more than enough data.
While doing so, we noticed the file "msg.log", which could help us speed up the process, as its content consists of the name of each file together with its website.

In [121]:
# Peek at a few raw entries of the download log (lines 46 to 49 of msg.log).
with open("recipePages/msg.log", 'r') as log_file:
    first_lines = log_file.readlines()[45:49]

# Every entry keeps its own trailing newline, so "--" marks the start of the next one.
print(*first_lines, sep="--")

59b9d3c43ee8df52fcd04128475059f5.html	http://allrecipes.com/Recipe/bacon-and-potato-soup/detail.aspx	2012-09-09 10:55:41 URL:http://allrecipes.com/Recipe/bacon-and-potato-soup/detail.aspx [225697/225697] -> "59b9d3c43ee8df52fcd04128475059f5.tmp" [1]
--129b4dad66e6d0d59bbc6f3200f9e927.html	http://familyoven.com/recipes/search?terms=Dairy%20Free%20Alfredo%20Sauce	2012-09-09 10:55:42 URL:http://familyoven.com/recipes/search?terms=Dairy%20Free%20Alfredo%20Sauce [77640] -> "129b4dad66e6d0d59bbc6f3200f9e927.tmp" [1]
--bafcd30bc137238af0cd9954f7d0701e.html	http://www.yummly.com/recipes/chicken-brunswick-stew-lima-beans	Syntax error in Set-Cookie: fbsr_54208124338=""; Expires=Thu, 01-Jan-1970 00:00:10 GMT; Path=/, JSESSIONID=27D73083292D7FE540727DC4AD6C50D2; Path=/; HttpOnly, yv="lJSpLE5cHmHHgrNrJgtM/w=="; Version=1; Max-Age=31536000; Expires=Mon, 09-Sep-2013 17:55:42 GMT; Path=/ at position 120.
--2012-09-09 10:55:43 URL:http://www.yummly.com/recipes/chicken-brunswick-stew-lima-beans [286650]

We observed that, occasionally, a line containing other, unnecessary information can appear (in the example above, this is the case for the last line). Those lines are ignored during the sorting process.
We also noticed that there are no file extensions other than .html and .log (see below). Thus, we know exactly what the contents of the folder recipePages are.

In [122]:
# Apart from the .html pages, the folder should only contain the log file.
# Go through every entry under recipePages (recursively) and print each one
# whose name does not end with ".html".
for path in Path("recipePages/").glob('**/*'):
    # `path` is a Path object, not a string
    path_in_str = str(path)
    if not path_in_str.endswith(".html"):
        print(path_in_str)

recipePages\msg.log


##### Sorting part of the content of the folder "recipePages" :

In [123]:
# Sort the content of recipePages by website.
# If override True -> delete SortedFiles and re-execute the function (long)
# NOTE(review): sort_website_from_log is not defined in this notebook; it presumably
# comes from one of the star imports (website_func / scrap) — confirm.
sort_website_from_log(override=False)

Finished sorting the files


After the sorting comes the scraping part. In order to get as much data as possible while saving some time, we'll first scrape the websites containing the highest number of files.

In [124]:
def count_files_per_website(sorted_dir="SortedFiles"):
    """Count the lines of ``filesName.txt`` in every website folder of ``sorted_dir``.

    Only the folders located directly under ``sorted_dir`` are considered, and
    ``.ipynb_checkpoints`` is skipped. Each remaining folder must contain a
    ``filesName.txt`` file (presumably one page name per line); its number of
    lines is the count reported for that website.

    Parameters
    ----------
    sorted_dir : str
        Folder holding one sub-folder per website.

    Returns
    -------
    pandas.Series
        int64 line counts indexed by folder (website) name, in alphabetical
        order. Empty when ``sorted_dir`` is missing or has no sub-folder.

    Raises
    ------
    OSError
        If a website folder has no readable ``filesName.txt``.
    """
    # os.walk yields nothing for a missing folder, hence the default triple.
    # Only the first level is needed: that is where the website folders live.
    _, websites, _ = next(os.walk(sorted_dir, topdown=True), (sorted_dir, [], []))

    counts = {}
    for website in sorted(websites):
        if website == ".ipynb_checkpoints":
            continue
        listing = os.path.join(sorted_dir, website, "filesName.txt")
        # `with` closes the file; counting while iterating avoids loading it whole
        with open(listing, "r") as f:
            counts[website] = sum(1 for _ in f)

    # Build the Series once instead of appending to it inside the loop
    # (Series.append is quadratic and no longer exists in pandas >= 2.0).
    return pd.Series(counts, dtype="int64")


website_count = count_files_per_website("SortedFiles")

In [125]:
# Split the websites around the 1000-file threshold.
# NOTE(review): a website with exactly 1000 files is counted in neither group.
above_threshold = website_count[website_count.values > 1000]
below_threshold = website_count[website_count.values < 1000]

print("Number of website with more than 1000 files:", len(above_threshold))
print("Number of website with less than 1000 files:", len(below_threshold))

Number of website with more than 1000 files: 19
Number of website with less than 1000 files: 85


As we can observe, 19 websites have a really high number of files, higher than a thousand. The files of those domains represent 53% of the total number of files. Those are the ones that we'll focus our attention on.

In [126]:
# Order the websites from the most to the fewest files, then keep those above 1000 files.
website_count = website_count.sort_values(ascending=False)
is_large = website_count.values > 1000
website_count_used = website_count[is_large]
print(website_count_used)

# The "total" below only covers the websites kept above, not the whole data set.
total_number_files = website_count_used.values.sum()
# Files of the three largest websites (the Series is sorted in descending order).
chosen_number_files = website_count_used.iloc[:3].values.sum()
# This is a fraction between 0 and 1, even though the label says "Percentage".
chosen_share = chosen_number_files / total_number_files

print("Number of files in the 3 first domains : ", chosen_number_files)
print("Percentage of chosen files among the total: ", chosen_share)

allrecipes.com             28355
food.com                   14661
foodnetwork.com            11997
yummly.com                  6590
cooks.com                   5547
epicurious.com              5060
tasteofhome.com             4820
myrecipes.com               3805
recipes.sparkpeople.com     3671
cdkitchen.com               2584
bettycrocker.com            2325
cookeatshare.com            2044
southernfood.about.com      2012
grouprecipes.com            1916
recipe.com                  1460
kraftrecipes.com            1420
eatingwell.com              1400
ifood.tv                    1295
delish.com                  1050
dtype: int64
Number of files in the 3 first domains :  55013
Percentage of chosen files among the total:  0.539279692585186


In [127]:
# Names of the websites kept above, in the same order as website_count_used.
website_list_used = list(website_count_used.index)

In [128]:
#Words that have to be removed from the string in the website to keep only the ingredient:
list_ingredient_to_remove = ["metrics","metric","of","teaspoon", "cup", "cups","teaspoons","ounce","ounces","pounds","pound","tablespoons","chopped","quarts","fresh",\
                            "light","plain","popped","medium","shredded","bunch","tablespoon","to","taste","pinch","freshly","ground",\
                            "canned","carcass","cubes","dried","frozen","all","purpose","grated","minced","degree","degrees","optional",\
                            "jar","quartered","marinated","strips","strip","asian","stalks","package","can","cans","box","container",\
                            "evaporated","bottle","cans","extract","squares","german","slices","crushed","uncooked","seasoning","small",\
                            "sweet","packed","sliced","heavy","condensed","finely","long","grain","sweetened","firmly","cooked","crusts","lean",\
                            "caps","mix","steaks","large","instant","crumbs","semisweet","distilled","packages","pint","miniature","preserves",\
                            "processed","chunky","prepared","seasoned","american","stock","top","sirloin","tip","peeled","spears","leaves",\
                            "belgian","bob","italian","bottles","boneless","roast","breast","meat","tenderloins","granules","chips","converted",\
                            "containers","coleslaw","florets","cube","cubed","diced","crumbled","head","thighs","halves","drumsticks","blend",\
                            "leaf","bow","tie","stuffing","dry","boiling","cover","coarse","kosher","extra","virgin","fat","free","fluid","hot",\
                            "chops","loin","shoulder","roasted","for","frying","diced","thawed","bone","hocks","shucked","leftover","corned",\
                            "weed","summer","wild","whole","very","ripe","unbaked","crust","unsalted","unsifted","unsweetened","drops","drop",\
                            "paste","thickly","swiss","skinless","flavored","quart","puff","kernels","kernel","cracked","as","tortillas","chunk",\
                            "dash","dark","hash","brown","hashbrown","hungarian","rinsed","thin","thinly","thick","stewed","cleaned","gelatin",\
                            "sifted","skim","slivered","sprigs","sage","roma","romano","pasta","reduced","sodium","refried","refrigerated",\
                            "portobello","nuggets","tater","pinto","pudding","pitted","pinches","relish","penne","deep","mild","melted",\
                            "maraschino","loaf","link","jumbo","inch","jars","jasmine","generous","gold","graham","granulated","fried","skins",\
                            "fire","firm","flaked","flank","family","size","eagle","curd","half","russet","round","vidalia","unbleached","toll",\
                            "morsels","toasted","square","soft","sharp","serrano","seedless","old","provolone","process","premium","pouch","betty",\
                            "piri","chunks","smoked","other","original","breaded","tender","tenders","flowerets","nonfat","no","calorie","tail",\
                            "greek","great","northern","southern","envelope","dill","cold","round","steak","brisket","baby","assorted"]
###################PASTA ??
list_ingredient_to_remove = sorted(list_ingredient_to_remove)
#print(list_ingredient_to_remove)

Creation of the recipe data:
We have to be aware of certain things:
- we might get a website containing only a list of recipes
- we might find no rating -> we'll have to discard it
- we might not be able to open and read the file
- 

In [129]:
#Create the data from the information found on the HTML pages

recipe_data = pd.DataFrame(columns = ['Website','Recipe','Prepare time', 'Ranking', 'Reviews', 'Ingredients'])  
list_unique_ingredients = []
unique_ingredients_data = pd.DataFrame(columns = ['Ingredient','Count'])

for (root,dirs,files) in os.walk('recipePages', topdown="True"):  
    for webpage in files:
        filename= 'recipePages/'+webpage
        
        #Try to open it, and if it doesn't, we don't analyse this webpage
        try:
            f = open(filename,'r')
            first_line=f.readline()
            f.close()
        except:
            print("We can't read the page: ",filename)
            continue

        website = str(first_line).split("/")[2].strip("www.")
        
        #Determine if we have to spare the data
        if website in website_list_used :
            
            #Read the data from the first website (allrecipes.com)
            if website in website_list_used[0]:
                recipe_data, list_unique_ingredients, unique_ingredients_data = scrap_allrecipes(website, filename,list_ingredient_to_remove, \
                                            list_unique_ingredients, recipe_data,website_list_used,unique_ingredients_data)
            
            # Food.com
            if website in website_list_used[1]:
                #print(" --- ", filename)
                recipe_data, list_unique_ingredients, unique_ingredients_data = scrap_food(website, filename,list_ingredient_to_remove, \
                                            list_unique_ingredients, recipe_data,website_list_used,unique_ingredients_data)
                #print('\n')
            if website in website_list_used[2]:
                #recipe_data, list_unique_ingredients1, unique_ingredients_data1
                recipe_data, list_unique_ingredients, unique_ingredients_data = scrap_foodnetwork(website, filename,list_ingredient_to_remove,list_unique_ingredients, recipe_data,website_list_used,unique_ingredients_data)
                #print('\n')
print(list_unique_ingredients)

We can't read the page:  recipePages/0000758edcc24c88341ca0cd779f69b0.html
We don't care about this page (foodnetwork):  recipePages/000255456bafed9c5783bff87cc0b27c.html
We don't care about this page:  recipePages/00159b3caa005f435b2f2f6cb4e1c2b5.html
We don't care about this page (foodnetwork):  recipePages/001ec670080a960a6fbcc5e4dd20b8cc.html
We don't care about this page (foodnetwork):  recipePages/001f044d051cf4f44ec53524a1a45bd8.html
We don't care about this page (foodnetwork):  recipePages/0032c0e5c5097f07698665c4b92c31fe.html
We can't read the page:  recipePages/0033dbc11a8c89f825af6ab3317d108e.html
We don't care about this page (foodnetwork):  recipePages/00341ca30fd1c405aac95cd1bcf13a6d.html
We don't care about this page (foodnetwork):  recipePages/0037a640c8110d2ec41248836acd1052.html
We don't care about this page:  recipePages/0037be341418ef23ae83615ae8325b14.html
We don't care about this page (foodnetwork):  recipePages/003b70ef8429eeca518d19ce9345c292.html
Beautifulsoup 

KeyboardInterrupt: 

In [130]:
# Display the recipe table filled in by the scraping loop (last expression -> rich output)
recipe_data

Unnamed: 0,Website,Recipe,Prepare time,Ranking,Reviews,Ingredients
0,foodnetwork.com,French Dip Sandwiches,15,5,127,"[butter, shallot, flour, jigger sherry, beef c..."
1,food.com,Halloween Party Treat (Candy Corn and Peanut Mix),10,4.93,100,[salted peanuts]
2,allrecipes.com,Baked Buffalo Wings,120,4.4,519,"[flour, cayenne pepper, garlic powder, salt, c..."
3,food.com,Homemade Grand Marnier,15,5,2,"[orange zest, sugar, cognac, brandy]"
4,allrecipes.com,Fast Rice Pudding,15,4.1,75,"[white rice, cinnamon, milk, white sugar]"
...,...,...,...,...,...,...
3024,allrecipes.com,Fresh Apple Cake,,4.5,33,"[vegetable oil, white sugar, egg, vanilla, flo..."
3025,allrecipes.com,CheckerBoard Cookies II,,3.8,11,"[flour, sugar, baking powder, butter, egg yolk..."
3026,allrecipes.com,Cinnamon Rolls II,210,4.5,286,"[milk, water, butter, salt, flour, active yeas..."
3027,allrecipes.com,Shirley Temple I,1,4.2,28,"[carbonated beverage, grenadine syrup, cherry]"


In [30]:


# Hard-coded snapshot of ingredient names.
# NOTE(review): presumably pasted from an earlier list_unique_ingredients printout to compare
# runs by hand — confirm it is still needed. The entry 'apple pie spice' used to be broken
# across two lines (a syntax error); it is now a single string.
copy_check = ['wheat', 'white flour', 'salt', 'white pepper', 'black pepper', 'chicken', 'canola oil', 'olive oil', 'chicken broth', 'lemon juice', 'artichoke hearts', 'capers', 'butter', 'parsley', 'corn syrup', 'margarine', 'water', 'sugar', 'marshmallows', 'popcorn', 'cream cheese', 'orange zest', 'orange juice', 'walnuts', 'baking potato', 'cheddar cheese', 'rolled oats', 'milk', 'pumpkin puree', 'pumpkin pie spice', 'cinnamon sugar', 'asparagus', 'cooking spray', 'pepper', 'soy sauce', 'balsamic vinegar', 'bay', 'onion', 'shallots', 'chicken bouillon', 'potatoes', 'corn', 'cream', 'baking flour', 'garbanzo bean flour', 'cornstarch', 'tapioca starch', 'baking powder', 'xanthan gum', 'oregano', 'parmesan cheese', 'white sugar', 'lukewarm water', 'active yeast', 'egg', 'apple cider vinegar', 'garlic', 'flour', 'warm water', 'shortening', 'cajun', 'monkfish fillets', 'cloves garlic', 'tomato', 'mushrooms', 'white wine', 'beef', 'white cooking wine', 'pear juice', 'sesame oil', 'green onion', 'sesame seeds', 'broccoli', 'cream of mushroom soup', 'rice', 'cheese sauce', 'tomatoes', 'tomato sauce', 'beef bouillon', 'carrots', 'white vinegar', 'cabbage', 'bread', 'cayenne pepper', 'oil', 'mayonnaise', 'horseradish', 'sour cream', 'worcestershire sauce', 'mustard', 'cottage cheese', 'clove garlic', 'cider vinegar', 'walnut oil', 'endives', 'apple', 'stemmed watercress', 'blue cheese', 'zucchini', 'pineapple juice', 'vanilla', 'pineapple', 'chicken liver', 'cooking sherry', 'mace', 'pumpkin', 'cinnamon', 'ginger', 'nutmeg', 'pie', 'cranberries', 'white wine vinegar', 'wasabi', 'cilantro', 'tuna', 'sausage', 'green pepper', 'coconut', 'baking soda', 'ramen noodle soup', 'almonds', 'salad oil', 'mozzarella cheese', 'basil', 'onion powder', 'garlic powder', 'teriyaki sauce', 'liquid smoke flavoring', 'red pepper flakes', 'pastry shells', 'green beans', 'garlic chili sauce', 'honey', 'vegetable cooking spray', 'or chicken', 'cream of celery soup', 'molasses', 'corn oil', 
'cloves', 'allspice', 'nonstick cooking spray', 'ziti', 'vegetable oil', 'yellow squash', 'spinach', 'vegetable sauce', 'alfredo sauce', 'acorn squash', 'celery', 'oysters', 'bread flour', 'cornmeal', 'egg white', 'poppy seeds', 'juice', 'lemon', 'ginger root', 'cumin', 'coriander', 'pork', 'tea bags', 'peaches', 'stevia powder', 'red bell pepper', 'green bell pepper', 'red pepper', 'chile', 'apple juice', 'oats', 'raisins', 'red lentils', 'poultry', 'green lentils', 'jimmy hearty pork sausage crumbles', 'cheese', 'ricotta cheese', 'mushroom', 'strawberry', 'strawberries', 'banana', 'pie shells', 'cream of tartar', 'paprika', 'thyme', 'celery seed', 'tilapia fillets', 'white bread', 'biscuit baking', 'pecans', 'halibut', 'feta cheese', 'bacon', 'red onion', 'white rice', 'black drain', 'reserve liquid', 'chives', 'avocados', 'lime juice', 'pizza', 'cherry tomatoes', 'monterey jack cheese', 'brownie', 'peanut butter', 'chili powder', 'kidney beans', 'cannellini beans', 'chile sauce', 'beef broth', 'jalapeno pepper', 'fillets tilapia', 'bay tm', 'garlic salt', 'cauliflower', 'mashed banana', 'white cake', 'pistachio', 'carbonated beverage', 'whipped topping', 'cracker', 'chocolate', 'cream of chicken soup', 'cheese food', 'snow peas', 'salad dressing', 'angel food cake', 'mahi mahi fillets', 'coconut milk', 'mango', 'pie shell', 'bok choy', 'oyster sauce', 'pepper sauce', 'barbeque sauce', 'peach', 'and tomatoes', 'whipping cream', 'vodka', 'beans', 'taco', 'vegetable broth', 'turnips', 'potato', 'garbanzo beans', 'turmeric', 'saffron', 'curry powder', 'couscous', 'and rice', 'pine nuts', 'and sour', 'triple sec', 'tequila', 'orange liqueur grand', 'limes', 'egg yolk', 'cornflakes cereal', 'eggplant', 'french sandwich rolls', 'buttermilk', 'dressing', 'limeade concentrate', 'cherries', 'lime', 'white onion', 'basil sauce', 'manicotti shells', 'rhubarb', 'wheat berries', 'tomatillos', 'enchilada sauce', 'green chile peppers', 'sandwich bread', 'vanilla vodka', 
'amaretto liqueur', 'recipe pastry a single pie', 'ham', 'adobo', 'packet sazon', 'red potatoes', 'red wine', 'or bay', 'elbow macaroni', 'minestrone soup', 'mixed vegetables', 'meatloaf', 'cucumbers', 'lemon pepper', 'sauerkraut', 'kielbasa', 'navy beans', 'anise', 'brussels sprouts', 'dijon mustard', 'bulk pork breakfast sausage', 'pepperjack cheese', 'white hominy', 'black beans', 'tortilla', 'avocado', 'eggnog', 'rum', 'amaranth flour', 'tapioca flour', 'arrowroot flour', 'milk chocolate candy kisses', 'yellow onion', 'spaghetti sauce', 'tomato puree', 'recipe pastry a double pie', 'dates', 'linguini', 'clams', 'espresso coffee', 'gingerbread syrup', 'whipped cream', 'vanilla powder', 'wax beans', 'nuts', 'almond', 'rotini', 'black olives', 'green tomatoes', 'jicama', 'ketchup', 'rice vinegar', 'salsa', 'salt pork', 'egg noodles', 'iceberg lettuce', 'green peas', 'coconut cream', 'tamarind pulp', 'chickpea flour', 'coriander seed', 'mustard seed', 'black peppercorns', 'greens', 'yogurt', 'ice', 'green salsa', 'chow mein noodles', 'mint', 'spaghetti', 'croutons', 'crabmeat', 'lobster', 'garlic pepper', 'lemon grass', 'bulb garlic', 'red chile pepper', 'bunches cilantro', 'wonton wrappers', 'plump lemongrass stalk', 'curry', 'thai basil', 'pickle', 'lamb', 'chili sauce', 'wax pepper', 'garam masala', 'yellow food coloring', 'red food coloring', 'pattypan squash', 'rosemary', 'lamb rump', 'red wine vinegar', 'yellow bell pepper', 'and turkey', 'lasagna noodles', 'collard greens', 'acini di pepe', 'mandarin oranges', 'green olives', 'salami', 'green cardamom pods', 'gallon milk', 'mixed greens', 'nectarines', 'grapefruit', 'honey balsamic', 'strawberries in syrup', 'water chestnuts', 'vegetable soup', 'sauce', 'canning', 'lids', 'rings', 'carrot', 'habanero peppers', 'pouches liquid pectin', 'cake flour', 'whipped cream cheese', 'beer', 'and', 'tofu', 'rice noodles', 'butter shortening', 'turkey', 'kale', 'hubbard squash cut into', 'seeds removed', 
'apple pie spice', 'salted butter', 'onion soup', 'or ginger ale', 'moist white cake', 'coffee powder', 'coffee', 'coffee liqueur', 'mascarpone cheese', 'cocoa powder', 'thousand island dressing', 'breakfast sausage', 'liquid', 'shrimp', 'blue cheese salad dressing', 'cayenne pepper sauce', 'buffalo wing sauce', 'chicken in water', 'vegetables', 'and potatoes', 'into', 'gravy', 'tips', 'burgundy wine', 'shallot', 'buttery crackers', 'grape jelly', 'rigatoni', 'cream of tomato soup', 'pizza sauce', 'pepperoni sausage', 'lime sherbet', 'ginger ale', 'marshmallow creme', 'tahini', 'sourdough bread', 'imitation crabmeat', 'blackberries', 'raspberries', 'blueberries', 'new mexico chile pods', 'cotija cheese', 'green grapes', 'red grapes', 'flat beer', 'yeast', 'and mashed potatoes', 'rice wine vinegar', 'creamy peanut butter', 'green cabbage', 'red cabbage', 'napa cabbage', 'spaghetti noodles', 'red pepper sauce', 'individually wrapped caramels', 'pecan', 'cheddar', 'sour cream mashed potatoes', 'fully ham', 'bacon flavor bits', 'crescent dinner rolls', 'broken pieces vermicelli', 'leeks', 'tomato juice', 'clam juice', 'deveined', 'mussels', 'clams in shell', 'sea scallops', 'beef stew', 'barley', 'caraway seeds', 'orecchiette', 'plum tomatoes', 'yellow cake', 'granny smith apples', 'sugar dusting', 'peppermint oil', 'any color food coloring', 'cooking oil', 'chile peppers', 'dinner rolls', 'butterscotch', 'tomato soup', 'less', 'vinegar', 'or broccoli', 'chile pepper', 'fillets tilapia fillets', 'seafood', 'wheat bread', 'white corn', 'yellow corn', 'pepper salmon', 'sweetener', 'style', 'cling peaches', 'stone cornmeal', 'pancakes', 'pancake', 'nutmeg whipped cream', 'corn flour', 'water needed', 'lukewarm milk', 'rye flour', 'phyllo dough', 'white chocolate', 'chocolate cake']

In [20]:

# Ingredient/count table ordered alphabetically by ingredient name.
ingredients_ordered_alphabet = unique_ingredients_data.sort_values(by=['Ingredient'])

# Lift pandas' row/column display limits for this print only, so that the
# whole table is shown instead of a truncated view.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(ingredients_ordered_alphabet)



                      Ingredient Count
120                 active yeast     1
173                      almonds     1
71                       annatto     1
151                        apple     1
122          apple cider vinegar     2
20              artichoke hearts     1
110                    asparagus     2
114                 baking flour     1
104                baking potato     1
50                 baking powder     4
171                  baking soda     1
32              balsamic vinegar     4
57                         basil     2
33                           bay     3
64                          beef     6
137                beef bouillon     1
83                    beef broth     1
61                   black drain     1
14                  black pepper    13
153                  blue cheese     2
140                        bread     1
134                     broccoli     2
5                         butter    18
78                  butterscotch     1
139                      

In [17]:
# Flatten the per-recipe ingredient collections into one list (duplicates kept).
list_all_ingredients = [
    ingredient
    for recipe_ingredients in recipe_data['Ingredients']
    for ingredient in recipe_ingredients
]
print(len(list_all_ingredients))

# Collapse to the distinct names.
# NOTE(review): this rebinds list_unique_ingredients to a set, while the scraping
# cell creates it as a list.
list_unique_ingredients = set(list_all_ingredients)
print(len(list_unique_ingredients))

# Last expression of the cell: display the distinct ingredients
list_unique_ingredients





2605
510


{'acini di pepe',
 'acorn squash',
 'active yeast',
 'adobo',
 'alfredo sauce',
 'allspice',
 'almond',
 'almonds',
 'amaranth flour',
 'amaretto liqueur',
 'and',
 'and mashed potatoes',
 'and potatoes',
 'and rice',
 'and sour',
 'and tomatoes',
 'and turkey',
 'angel food cake',
 'anise',
 'any color food coloring',
 'apple',
 'apple cider vinegar',
 'apple juice',
 'apple pie spice',
 'arrowroot flour',
 'artichoke hearts',
 'asparagus',
 'avocado',
 'avocados',
 'bacon',
 'bacon flavor bits',
 'baking flour',
 'baking potato',
 'baking powder',
 'baking soda',
 'balsamic vinegar',
 'banana',
 'barbeque sauce',
 'barley',
 'basil',
 'basil sauce',
 'bay',
 'bay tm',
 'beans',
 'beef',
 'beef bouillon',
 'beef broth',
 'beef stew',
 'beer',
 'biscuit baking',
 'black beans',
 'black drain',
 'black olives',
 'black pepper',
 'black peppercorns',
 'blackberries',
 'blue cheese',
 'blue cheese salad dressing',
 'blueberries',
 'bok choy',
 'bread',
 'bread flour',
 'breakfast sausage'

In [18]:
# Display the current recipe table (last expression -> rich output)
recipe_data

Unnamed: 0,Website,Recipe,Ranking,Reviews,Ingredients
0,allrecipes.com,Chicken Breast Cutlets with Artichokes and Capers,4.4,80,"[wheat, white flour, salt, white pepper, black..."
1,allrecipes.com,Best Ever Popcorn Balls,4.4,322,"[corn syrup, margarine, water, sugar, marshmal..."
2,allrecipes.com,Orange Cream Cheese Frosting,4.6,39,"[cream cheese, butter, orange zest, orange jui..."
3,allrecipes.com,Perfect Baked Potato,4.7,410,"[baking potato, olive oil, salt, butter, black..."
4,allrecipes.com,Pumpkin Oatmeal,3.9,88,"[rolled oats, milk, pumpkin puree, pumpkin pie..."
...,...,...,...,...,...
288,allrecipes.com,Peanut Butter Bars I,4.6,1378,"[butter, margarine, cracker, sugar, peanut but..."
289,allrecipes.com,German Rye Bread,3.8,27,"[active yeast, warm water, lukewarm milk, whit..."
290,allrecipes.com,Spanakopita II,4.7,201,"[vegetable oil, onion, spinach, flour, feta ch..."
291,allrecipes.com,White Chocolate Blondie Brownies,4.3,23,"[butter, white chocolate, egg, white sugar, va..."


wtf
carrot


'    \nlist_unique_ingredients.sort()\nlist_unique_ingredients\n'