# Part 2: Further Cleaning and Preprocessing

In [2]:
import spacy

import pandas as pd

from spacy import displacy

from collections import Counter


In [3]:
# Load the recipe data set that this notebook cleans further.
RECIPES_CSV = './reduced_by_half_recipe_data2.csv'
df = pd.read_csv(RECIPES_CSV)

In [4]:
# Discard every row that has a missing value, then renumber the rows.
# reset_index() keeps the previous row labels in a new 'index' column.
df = df.dropna().reset_index()

In [5]:
# Remove the bookkeeping columns: 'index' is the one added by reset_index;
# 'Unnamed: 0' is presumably an index column saved with the CSV (TODO confirm).
df = df.drop(columns=['index', 'Unnamed: 0'])

In [6]:
# Load the spaCy English pipeline used for part-of-speech tagging in the cells below.
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x1116c37f0>

In [9]:
# Adjectives
# Survey the adjectives spaCy tags in the first 1,000 ingredient strings so
# they can be reviewed by hand as candidates for removal.
adj = [
    token.text
    for ingredients in df['raw_ingredients'][:1000]
    for token in nlp(ingredients)
    if token.pos_ == 'ADJ'
]

# Keep the first occurrence of each adjective, in order of appearance.
# dict.fromkeys de-duplicates in one pass instead of rescanning a list per word.
result_adj = list(dict.fromkeys(adj))
result_adj

['brown',
 'sour',
 'frozen',
 'garlic',
 'powdered',
 'extra',
 'lean',
 'green',
 'whole',
 'dark',
 'sweet',
 'vegetable',
 'italian',
 'black',
 'yellow',
 'parmesan',
 'warm',
 'multi',
 '-',
 'colored',
 'mixed',
 'salad',
 'dry',
 'crumb',
 'coconut',
 'cold',
 'thin',
 'olive',
 'walnut',
 'junior',
 'spanish',
 'orange',
 'moist',
 'instant',
 'american',
 'triple',
 'chinese',
 'purple',
 'chili',
 'unflavored',
 'rosemary',
 'wild',
 'lasagne',
 'hot',
 'light',
 'plain',
 'heavy',
 'mayonnaise',
 'unsalted',
 'canadian',
 'swiss',
 'crisp',
 'sesame',
 'dripping',
 'regular',
 'commerical',
 'golden',
 'currant',
 'oatmeal',
 'chunky',
 'irish',
 'pink',
 'breadcrumb',
 'faux',
 'crabmeat',
 'ready',
 'crab',
 'chive',
 'baked',
 'stew',
 'cheez',
 'crunchy',
 'pet',
 'coarse',
 'raw',
 'northern',
 'solid',
 'pumpkin',
 'borden',
 'liquid',
 'delicious',
 'stale',
 'patty',
 'cinnamon',
 'blanched',
 'hearty',
 'wide',
 'clear',
 'firm',
 'nondairy',
 'caraway',
 'lite',
 

In [10]:
# Verbs
# Survey the verbs spaCy tags in the first 1,000 ingredient strings so they
# can be reviewed by hand as candidates for removal.
verbs = [
    token.text
    for ingredients in df['raw_ingredients'][:1000]
    for token in nlp(ingredients)
    if token.pos_ == 'VERB'
]

# Keep the first occurrence of each verb, in order of appearance.
# dict.fromkeys de-duplicates in one pass instead of rescanning a list per word.
result_verbs = list(dict.fromkeys(verbs))
result_verbs

['baking',
 'condensed',
 'pitted',
 'cleaned',
 'pear',
 'curry',
 'drop',
 'margarine',
 'corned',
 'granulated',
 'whipping',
 'frozen',
 'dried',
 'steak',
 'bite',
 'artichoke',
 'coloring',
 'flavored',
 'slivered',
 'topping',
 'flaked',
 'candied',
 'floured',
 'salad',
 'paraffin',
 'whipped',
 'romaine',
 'crushed',
 'roll',
 'butterscotch',
 'sausage',
 'molasses',
 'buttered',
 'pickling',
 'great',
 'flavoring',
 'shell',
 'chop',
 'dressing',
 'stove',
 'stuffing',
 'cooking',
 'toasted',
 'powdered',
 'rolled',
 'quartered',
 'orange',
 'lettuce',
 'peppercorn',
 'concentrate',
 'sparkling',
 'seasoned',
 'bisquick',
 'creamed',
 'playing',
 'kiss',
 'reduced',
 'recipe',
 'squeezed']

##### Annotation
Using spaCy, I identified verbs and adjectives still present in the ingredient column that were unnecessary. Once the verbs and adjectives were identified, they were collected into lists containing only their unique values. I then manually explored how each word was used in the ingredient list. If I deemed a word unnecessary, I added it to the second word_remover list.

In [11]:
# Words to strip from the ingredient strings, chosen by hand after reviewing the
# spaCy adjective/verb scans. The entries are later joined with '|' into one
# regular expression, so they must not contain unescaped regex metacharacters.
word_remover = ['freshly','unsliced','peel','cooking','melt','broiler-fryers','unprocessed',
                'confectionery',"'s",'homestyle','instant','packet','squeezed','frozen','extra','whole',
                'warm', 'cold','thin','junior baby', 'baby', 'moist', 'spanish', 'american', 'chinese','regular',
                'light', 'canadian','crisp','commercial',
                'commerical',  # the data spells it this way (see the adjective scan output)
                'ready', 'delicious','stale', 'wide','clear',
                 'firm','liter', 'unbaked', 'smooth', 'generous','cleaned','granulated']

In [12]:
# Build one alternation that matches any word_remover entry as a whole word
# (\b on both sides) and blank those words out of every ingredient string.
# regex=True must be explicit: without it pandas warns on older versions and,
# from pandas 2.0 on, treats the pattern as a literal string and removes nothing.
pat = r'\b(?:{})\b'.format('|'.join(word_remover))
df['raw_ingredients'] = df['raw_ingredients'].str.replace(pat, '', regex=True)

  df['raw_ingredients'] = df['raw_ingredients'].str.replace(pat, '')


In [None]:
# Optional checkpoint: write the word-filtered frame to CSV (left disabled).
#df.to_csv('recipe_cleaned_spacy.csv')

##### Annotation:
The cleaned data set was exported and EDA was conducted. While conducting EDA, I found that the ingredient count was unevenly distributed and that certain recipes exceeded the 20-ingredient mark. I decided to drop all recipes with more than 20 ingredients and exported the data set to its final CSV file.

In [13]:
# Count the ingredients of each recipe: split raw_ingredients on ', ' and
# store how many pieces result.
df['ingredient_count'] = df['raw_ingredients'].map(lambda entry: len(entry.split(', ')))

In [15]:
# Keep only the recipes that list at most 20 ingredients.
df = df.loc[df['ingredient_count'].le(20)]

In [17]:
# The helper column has served its purpose as a filter key; remove it.
df = df.drop(columns=['ingredient_count'])

In [19]:
# Renumber the rows after filtering.
# NOTE(review): reset_index() keeps the pre-filter row labels as a new 'index'
# column, which would be included in any export; pass drop=True if unwanted.
df = df.reset_index()

In [20]:
# Optional final export of the filtered frame to CSV (left disabled).
#df.to_csv('recipe_cleaned_spacy2.csv')