In [57]:
import glob

import pandas as pd
import spacy
from spacy import displacy

In [58]:
# Load the spaCy pipeline registered under the name 'en'; later cells call
# `nlp(text)` to parse recipe instructions.
# NOTE(review): 'en' looks like a shortcut link that must exist in this
# environment; newer spaCy releases expect a full package name such as
# 'en_core_web_sm' -- confirm against the installed version.
nlp = spacy.load('en')

In [6]:
# Collect the pickled shards to load. `[0]` is a glob character class, so the
# pattern currently matches only 'layer1.json.0.valid.gz'.
files = sorted(glob.glob('layer1.json.[0].valid.gz'))
if not files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(
        "no files match 'layer1.json.[0].valid.gz' relative to the working directory"
    )

# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
# Stack all shards into one frame with a fresh 0..n-1 index.
df = pd.concat([pd.read_pickle(fp) for fp in files], ignore_index=True)
df.shape

(99999, 9)

In [7]:
# Keep only the Epicurious recipes that have at most 10 instruction steps,
# ordered from the longest recipe to the shortest.
keep_columns = [
    'id', 'partition', 'collection', 'title',
    'ingr_count', 'instr_count', 'ingredients', 'instructions',
]

is_epicurious = df['collection'] == 'www.epicurious.com'
is_short = df['instr_count'] <= 10
epicurious = (
    df.loc[is_epicurious & is_short, keep_columns]
      .sort_values(by='instr_count', ascending=False)
)

# Number of selected recipes in each partition (train / val / test)
epicurious.groupby('partition').size()

partition
test      316
train    1403
val       297
dtype: int64

In [43]:
# Peek at the selected recipes that list exactly one ingredient
epicurious.loc[epicurious['ingr_count'] == 1].head()

Unnamed: 0,id,partition,collection,title,ingr_count,instr_count,ingredients,instructions
37223,093de97d8c,train,www.epicurious.com,Poached Eggs over Vegetables,1,8,[{'text': '4 eggs'}],[{'text': 'Use a nonstick frying pan that will...
21658,055df8d631,test,www.epicurious.com,Garlic Broth with Sage and Parsley,1,7,[{'text': '2 to 3 teaspoons sliced green garli...,[{'text': 'Peel and slice fresh garlic: 2 to 3...
68730,1116160ea1,train,www.epicurious.com,Clarified Butter,1,6,"[{'text': '2 cups (4 sticks) unsalted butter, ...",[{'text': 'Place butter pieces in 4-cup glass ...
99103,188fbfe409,test,www.epicurious.com,Corned Beef,1,6,[{'text': '3- to 4-pound corned beef brisket'}],[{'text': 'In a kettle combine beef with cold ...
96647,17f419f514,train,www.epicurious.com,Grilled Corn on the Cob,1,5,"[{'text': '8 ears corn in husks, outer layer o...","[{'text': 'Prepare grill.'}, {'text': 'Peel ba..."


In [67]:
# Show all instruction steps of one recipe, looked up by its index label
print(epicurious.loc[21658, 'instructions'])

[{'text': 'Peel and slice fresh garlic: 2 to 3 teaspoons sliced green garlic, or 1 or 2 sliced cloves per cup of broth.'}, {'text': 'Bring some light chicken broth to a boil with a few leaves of fresh sage.'}, {'text': 'Once it comes to a boil use a skimmer to remove the sage leaves (if the sage cooks too long, the broth will become bitter and dark).'}, {'text': 'Add the garlic to the broth along with salt to taste.'}, {'text': 'Cook for about 5 minutes.'}, {'text': 'Drizzle olive oil over a toasted slice of day-old bread, put it in a bowl, ladle soup over the bread, add a pinch of coarsely chopped parsley, and serve.'}, {'text': 'For a heartier soup, poach an egg in the broth and serve it on top of the bread.'}]


In [68]:
# Attributes printed for every token, in this order:
#   text     - the original word text
#   lemma_   - the base form of the word
#   pos_     - the simple part-of-speech tag
#   tag_     - the detailed part-of-speech tag
#   dep_     - syntactic dependency, i.e. the relation between tokens
#   shape_   - the word shape: capitalisation, punctuation, digits
#   is_alpha - does the token consist of alphabetic characters?
#   is_stop  - is the token part of a stop list (most common words of the language)?

for each in epicurious['instructions'][21658]:
    # Each step is a dict that stores its sentence under the 'text' key.
    instruction = nlp(each['text'])

    # Iterate the doc parsed just above (the loop previously referenced an
    # unrelated name, `ingredient`, that this cell never defines).
    for token in instruction:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)

    # Display the dependency tree inline. `displacy.serve` would start a
    # blocking web server on the first step and never reach the next one.
    # NOTE(review): the recorded "buffer source array is read-only" error
    # appears to originate inside displacy/spaCy rather than in this cell --
    # confirm by re-running against an upgraded spaCy install.
    displacy.render(instruction, style='dep', jupyter=True)

Peel peel NOUN NN ROOT Xxxx True False
and and CCONJ CC cc xxx True True
slice slice VERB VB conj xxxx True False
fresh fresh ADJ JJ amod xxxx True False
garlic garlic NOUN NN dobj xxxx True False
: : PUNCT : punct : False False
2 2 NUM CD quantmod d False False
to to ADP IN quantmod xx True True
3 3 NUM CD nummod d False False
teaspoons teaspoon NOUN NNS appos xxxx True False
sliced slice VERB VBN acl xxxx True False
green green ADJ JJ amod xxxx True False
garlic garlic NOUN NN dobj xxxx True False
, , PUNCT , punct , False False
or or CCONJ CC cc xx True True
1 1 NUM CD conj d False False
or or CCONJ CC cc xx True True
2 2 NUM CD conj d False False
sliced sliced ADJ JJ amod xxxx True False
cloves clove NOUN NNS conj xxxx True False
per per ADP IN prep xxx True True
cup cup NOUN NN pobj xxx True False
of of ADP IN prep xx True True
broth broth NOUN NN pobj xxxx True False
. . PUNCT . punct . False False


ValueError: buffer source array is read-only

In [69]:
# Another simple example: parse one short sentence and draw its dependency tree.
doc = nlp(u'This is a sentence.')

# Render inline in the notebook; `displacy.serve` would block the kernel by
# starting a web server. `displacy` comes from `from spacy import displacy`.
displacy.render(doc, style='dep', jupyter=True)

ValueError: buffer source array is read-only