# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_json("train.json")
train_df['ingredients_clean'] = [', '.join(item).strip() for item in train_df['ingredients']]  

In [3]:
train_df[:5]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","romaine lettuce, black olives, grape tomatoes,..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","plain flour, ground pepper, salt, tomatoes, gr..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","eggs, pepper, salt, mayonaise, cooking oil, gr..."
3,indian,22213,"[water, vegetable oil, wheat, salt]","water, vegetable oil, wheat, salt"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","black pepper, shallots, cornflour, cayenne pep..."


In [4]:
import unicodedata
import re

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii

In [5]:
train_df['ingredients_clean'] = [', '.join([item.lower() for item in line]).strip() for line in train_df['ingredients']]  
train_df[280:286]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
280,mexican,4114,"[tostada shells, shredded lettuce, avocado, ja...","tostada shells, shredded lettuce, avocado, jal..."
281,chinese,3605,"[honey, ginger, whiskey, chicken wings, green ...","honey, ginger, whiskey, chicken wings, green o..."
282,cajun_creole,41363,"[chicken stock, kosher salt, unsalted butter, ...","chicken stock, kosher salt, unsalted butter, s..."
283,italian,23437,"[Bertolli® Classico Olive Oil, boneless skinle...","bertolli® classico olive oil, boneless skinles..."
284,mexican,9291,"[lime wedges, powdered sugar, orange liqueur, ...","lime wedges, powdered sugar, orange liqueur, t..."
285,italian,45423,"[( oz.) tomato sauce, ground veal, sliced m...","( oz.) tomato sauce, ground veal, sliced mu..."


In [6]:
train_df['ingredients_clean'] = [[remove_accents(item) for item in line] for line in train_df['ingredients']]  
train_df[280:286]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
280,mexican,4114,"[tostada shells, shredded lettuce, avocado, ja...","[tostada shells, shredded lettuce, avocado, ja..."
281,chinese,3605,"[honey, ginger, whiskey, chicken wings, green ...","[honey, ginger, whiskey, chicken wings, green ..."
282,cajun_creole,41363,"[chicken stock, kosher salt, unsalted butter, ...","[chicken stock, kosher salt, unsalted butter, ..."
283,italian,23437,"[Bertolli® Classico Olive Oil, boneless skinle...","[Bertolli Classico Olive Oil, boneless skinles..."
284,mexican,9291,"[lime wedges, powdered sugar, orange liqueur, ...","[lime wedges, powdered sugar, orange liqueur, ..."
285,italian,45423,"[( oz.) tomato sauce, ground veal, sliced m...","[( oz.) tomato sauce, ground veal, sliced m..."


In [7]:
train_df['ingredients_clean'] = [[re.sub(r'\(.*\)\s', '', item).strip() for item in line] for line in train_df['ingredients_clean']]  
train_df[280:286]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
280,mexican,4114,"[tostada shells, shredded lettuce, avocado, ja...","[tostada shells, shredded lettuce, avocado, ja..."
281,chinese,3605,"[honey, ginger, whiskey, chicken wings, green ...","[honey, ginger, whiskey, chicken wings, green ..."
282,cajun_creole,41363,"[chicken stock, kosher salt, unsalted butter, ...","[chicken stock, kosher salt, unsalted butter, ..."
283,italian,23437,"[Bertolli® Classico Olive Oil, boneless skinle...","[Bertolli Classico Olive Oil, boneless skinles..."
284,mexican,9291,"[lime wedges, powdered sugar, orange liqueur, ...","[lime wedges, powdered sugar, orange liqueur, ..."
285,italian,45423,"[( oz.) tomato sauce, ground veal, sliced m...","[tomato sauce, ground veal, sliced mushrooms, ..."


In [8]:
train_df['ingredients_clean'] = [[re.sub(r'[!@#$%^&*.,\'\"]', '', item).strip() for item in line] for line in train_df['ingredients_clean']] 
train_df[3275:3281]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
3275,chinese,44886,"[cream cheese, white sugar, garlic powder, sou...","[cream cheese, white sugar, garlic powder, sou..."
3276,southern_us,40197,"[cornmeal, green tomatoes, sugar, vegetable oil]","[cornmeal, green tomatoes, sugar, vegetable oil]"
3277,thai,7998,"[fish sauce, lemongrass, salmon fillets, Thai ...","[fish sauce, lemongrass, salmon fillets, Thai ..."
3278,italian,21305,"[I Can't Believe It's Not Butter!® Spread, lin...","[I Cant Believe Its Not Butter Spread, linguin..."
3279,mexican,48611,"[minced onion, fat skimmed chicken broth, dice...","[minced onion, fat skimmed chicken broth, dice..."
3280,mexican,4275,"[fresh cilantro, chicken breasts, cream cheese...","[fresh cilantro, chicken breasts, cream cheese..."


In [9]:
train_df['ingredients_clean'] = [[item.split(',')[0].strip() for item in line] for line in train_df['ingredients_clean']] 
train_df[1200:1206]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
1200,southern_us,42945,"[cream cheese, soften, toasted pecans, dried c...","[cream cheese soften, toasted pecans, dried cr..."
1201,korean,23436,"[soy sauce, all-purpose flour, red pepper, lar...","[soy sauce, all-purpose flour, red pepper, lar..."
1202,irish,10150,"[ground cinnamon, baking soda, all-purpose flo...","[ground cinnamon, baking soda, all-purpose flo..."
1203,indian,14509,"[tomato sauce, chicken, brown rice, plain yogu...","[tomato sauce, chicken, brown rice, plain yogu..."
1204,indian,2487,"[boneless chicken thighs, unsalted roasted pea...","[boneless chicken thighs, unsalted roasted pea..."
1205,southern_us,43627,"[fresh ginger, dark sesame oil, turnip greens,...","[fresh ginger, dark sesame oil, turnip greens,..."


In [10]:
train_df['ingredients_clean'] = [[re.sub(r'low.*fat', 'lowfat', item).strip() for item in line] for line in train_df['ingredients_clean']]  
train_df['ingredients_clean'] = [[re.sub(r'non.*fat', 'nonfat', item).strip() for item in line] for line in train_df['ingredients_clean']]  
train_df['ingredients_clean'] = [[re.sub(r'reduced.*fat', 'reducedfat', item).strip() for item in line] for line in train_df['ingredients_clean']]
train_df[20:26]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
20,thai,13121,"[pork loin, roasted peanuts, chopped cilantro ...","[pork loin, roasted peanuts, chopped cilantro ..."
21,mexican,40523,"[roma tomatoes, kosher salt, purple onion, jal...","[roma tomatoes, kosher salt, purple onion, jal..."
22,southern_us,40989,"[low-fat mayonnaise, pepper, salt, baking pota...","[lowfat mayonnaise, pepper, salt, baking potat..."
23,chinese,29630,"[sesame seeds, red pepper, yellow peppers, wat...","[sesame seeds, red pepper, yellow peppers, wat..."
24,italian,49136,"[marinara sauce, flat leaf parsley, olive oil,...","[marinara sauce, flat leaf parsley, olive oil,..."
25,chinese,26705,"[sugar, lo mein noodles, salt, chicken broth, ...","[sugar, lo mein noodles, salt, chicken broth, ..."


In [11]:
train_df['ingredients_clean'] = [[re.sub(r'(lowfat)|(reducedfat)|(nonfat)', '', item).strip() for item in line] for line in train_df['ingredients_clean']]  
train_df[20:26]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
20,thai,13121,"[pork loin, roasted peanuts, chopped cilantro ...","[pork loin, roasted peanuts, chopped cilantro ..."
21,mexican,40523,"[roma tomatoes, kosher salt, purple onion, jal...","[roma tomatoes, kosher salt, purple onion, jal..."
22,southern_us,40989,"[low-fat mayonnaise, pepper, salt, baking pota...","[mayonnaise, pepper, salt, baking potatoes, eg..."
23,chinese,29630,"[sesame seeds, red pepper, yellow peppers, wat...","[sesame seeds, red pepper, yellow peppers, wat..."
24,italian,49136,"[marinara sauce, flat leaf parsley, olive oil,...","[marinara sauce, flat leaf parsley, olive oil,..."
25,chinese,26705,"[sugar, lo mein noodles, salt, chicken broth, ...","[sugar, lo mein noodles, salt, chicken broth, ..."


In [12]:
train_df['ingredients_clean'] = [[re.sub(r'^all.*purpose ', '', item).strip() for item in line] for line in train_df['ingredients_clean']] 
train_df[940:946]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
940,japanese,4734,[sushi rice],[sushi rice]
941,mexican,1972,"[pepper, dried pinto beans, chopped cilantro, ...","[pepper, dried pinto beans, chopped cilantro, ..."
942,mexican,27995,"[ground chipotle chile pepper, salsa verde, bu...","[ground chipotle chile pepper, salsa verde, bu..."
943,mexican,40555,"[all-purpose flour, jalapeno chilies, chopped ...","[flour, jalapeno chilies, chopped cilantro fre..."
944,spanish,25506,"[olive oil, garlic, onions, pepper, diced toma...","[olive oil, garlic, onions, pepper, diced toma..."
945,southern_us,45401,"[granulated sugar, butter, cornmeal, firmly pa...","[granulated sugar, butter, cornmeal, firmly pa..."


In [13]:
train_df['ingredients_clean'] = [[re.sub(r'[0-9]+ ounc\s', '', item.strip()) for item in line] for line in train_df['ingredients_clean']]  
train_df['ingredients_clean'] = [[re.sub(r'.*lb. ', '', item.strip()) for item in line] for line in train_df['ingredients_clean']]  
train_df['ingredients_clean'] = [[re.sub(r'.*% ', '', item.strip()) for item in line] for line in train_df['ingredients_clean']]  

train_df[285:291]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean
285,italian,45423,"[( oz.) tomato sauce, ground veal, sliced m...","[tomato sauce, ground veal, sliced mushrooms, ..."
286,italian,31462,"[solid pack pumpkin, dry white wine, corn star...","[solid pack pumpkin, dry white wine, corn star..."
287,italian,45412,"[fresh rosemary, fresh thyme leaves, flat leaf...","[fresh rosemary, fresh thyme leaves, flat leaf..."
288,chinese,33048,"[black pepper, whole grain thin spaghetti, sca...","[black pepper, whole grain thin spaghetti, sca..."
289,italian,42879,"[pistachios, salt, dried cherry, butter, large...","[pistachios, salt, dried cherry, butter, large..."
290,thai,6164,"[water, green onions, galangal, fish sauce, pe...","[water, green onions, galangal, fish sauce, pe..."


In [None]:
#http://www.nltk.org
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [14]:
#http://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import grid_search

In [15]:
train_df['ingredients_clean_lem'] = [[WordNetLemmatizer().lemmatize(item) for item in line] for line in train_df['ingredients']]
train_df[:5]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean,ingredients_clean_lem
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomato, gro..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, salt, mayonaise, cooking oil, gr..."
3,indian,22213,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cornflour, cayenne pep..."


In [16]:
train_df['ingredients_clean_stem'] = [[PorterStemmer().stem(item) for item in line] for line in train_df['ingredients']]   
train_df[:5]

Unnamed: 0,cuisine,id,ingredients,ingredients_clean,ingredients_clean_lem,ingredients_clean_stem
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes...","[romaine lettuc, black ol, grape tomato, garli..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomato, gro...","[plain flour, ground pepp, salt, tomato, groun..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, salt, mayonaise, cooking oil, gr...","[egg, pepper, salt, mayonais, cooking oil, gre..."
3,indian,22213,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cornflour, cayenne pep...","[black pepp, shallot, cornflour, cayenne pepp,..."


In [29]:
from collections import Counter
import json

def load_data_as_lists(filename):
    with open(filename) as data_file:    
        data = json.load(data_file)
        
    ids, cuisines, ingredients = [], [], []
    for i in range(len(data)):
        ids.append(data[i]['id'])
        cuisines.append(data[i]['cuisine'])
        for j in range(len(data[i]['ingredients'])):
            ingredients.append(data[i]['ingredients'][j])
                
    return ids, cuisines, ingredients

In [81]:
ids, cuisines, ingredients = load_data_as_lists('train.json')

In [105]:
ingredients_series = pd.Series(ingredients)
ingredients_series_u = ingredients_series.str.encode('utf-8', errors='strict')
ingredients_vc = ingredients_series_u.value_counts()
ingredients_vc[:5]

salt         18049
onions        7972
olive oil     7972
water         7457
garlic        7380
dtype: int64

In [106]:
threshold = 10000 # Anything greater than the threshold will be removed

to_remove = ingredients_vc[ingredients_vc >= threshold].index
ingredients_series_u.replace(to_remove, np.nan, inplace=True)

In [107]:
ingredients_vc[:5]

salt         18049
onions        7972
olive oil     7972
water         7457
garlic        7380
dtype: int64

In [108]:
ingredients_series_u.replace(to_remove, 0, inplace=True)

In [110]:
ingredients_vc = ingredients_vc.drop(to_remove[0])

In [111]:
ingredients_vc[:5]

onions       7972
olive oil    7972
water        7457
garlic       7380
sugar        6434
dtype: int64