In [1]:
import spacy
import explacy # simple parser visualization
import pandas as pd
import glob
from importlib import reload

In [2]:
# Load the spaCy pipeline registered under the 'en' shortcut name; the resulting
# `nlp` object is passed to explacy.print_parse_info in the cells below.
# NOTE(review): shortcut names such as 'en' are not available in spaCy v3+ —
# confirm the installed spaCy version, or load the full package name instead.
nlp = spacy.load('en')

In [3]:
# Re-execute the explacy module that was imported in the first cell.
# NOTE(review): presumably here so edits to the local explacy.py are picked up
# without restarting the kernel — confirm it is still needed.
reload(explacy)

<module 'explacy' from '/root/w266_final/explacy.py'>

In [4]:
# 'surprisingly' is attached as an advmod child of 'was' rather than of 'tasty';
# unexpected, but maybe this is the more correct analysis
text = 'The salad was surprisingly tasty.'
explacy.print_parse_info(nlp, text)

Dep tree Token        Dep    Tag Pos  
──────── ──────────── ────── ─── ─────
   ┌─►   The          det    DT  DET  
┌─►└──   salad        nsubj  NN  NOUN 
└┬┬┬──   was          ROOT   VBD VERB 
 ││└─►   surprisingly advmod RB  ADV  
 │└──►   tasty        acomp  JJ  ADJ  
 └───►   .            punct  .   PUNCT


In [5]:
# this looks right: 'unexpectedly' is attached as an advmod child of 'tasty'
text = 'The salad was unexpectedly tasty.'
explacy.print_parse_info(nlp, text)

Dep tree Token        Dep    Tag Pos  
──────── ──────────── ────── ─── ─────
    ┌─►  The          det    DT  DET  
 ┌─►└──  salad        nsubj  NN  NOUN 
┌┼─────  was          ROOT   VBD VERB 
││  ┌─►  unexpectedly advmod RB  ADV  
│└─►└──  tasty        acomp  JJ  ADJ  
└─────►  .            punct  .   PUNCT


In [6]:
# Load every pickled DataFrame whose file name matches the pattern and stack
# them into a single frame with a fresh 0..n-1 index.
# In glob syntax '[0]' is a character class that matches only the character '0',
# so this pattern selects 'layer1.json.0.valid.gz' (a class such as '[0-9]'
# would match any single digit instead). The pattern is relative, so it is
# resolved against the current working directory.
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling —
# only load files from a trusted source.
shard_pattern = 'layer1.json.[0].valid.gz'
files = sorted(glob.glob(shard_pattern))
if not files:
    # Without this guard pd.concat([]) fails with the unhelpful
    # "No objects to concatenate" and hides the real cause (a missing file).
    raise FileNotFoundError(
        'no files match {!r} in the current working directory'.format(shard_pattern)
    )
df = pd.concat([pd.read_pickle(fp) for fp in files], ignore_index=True)
df.shape

(99999, 9)

In [7]:
# Keep only the www.epicurious.com recipes that have at most 10 instruction
# steps, restricted to the columns listed below and ordered from the most
# steps to the fewest.
keep_columns = [
    'id', 'partition', 'collection', 'title',
    'ingr_count', 'instr_count', 'ingredients', 'instructions',
]

epicurious = df.loc[
    (df['collection'] == 'www.epicurious.com') & (df['instr_count'] <= 10),
    keep_columns,
].sort_values(by=['instr_count'], ascending=False)

# number of selected recipes per partition (train, val, test)
epicurious.groupby('partition').size()

partition
test      316
train    1403
val       297
dtype: int64

In [8]:
# preview the first rows among the recipes that list exactly one ingredient
epicurious.loc[epicurious['ingr_count'] == 1].head()

Unnamed: 0,id,partition,collection,title,ingr_count,instr_count,ingredients,instructions
37223,093de97d8c,train,www.epicurious.com,Poached Eggs over Vegetables,1,8,[{'text': '4 eggs'}],[{'text': 'Use a nonstick frying pan that will...
21658,055df8d631,test,www.epicurious.com,Garlic Broth with Sage and Parsley,1,7,[{'text': '2 to 3 teaspoons sliced green garli...,[{'text': 'Peel and slice fresh garlic: 2 to 3...
68730,1116160ea1,train,www.epicurious.com,Clarified Butter,1,6,"[{'text': '2 cups (4 sticks) unsalted butter, ...",[{'text': 'Place butter pieces in 4-cup glass ...
99103,188fbfe409,test,www.epicurious.com,Corned Beef,1,6,[{'text': '3- to 4-pound corned beef brisket'}],[{'text': 'In a kettle combine beef with cold ...
96647,17f419f514,train,www.epicurious.com,Grilled Corn on the Cob,1,5,"[{'text': '8 ears corn in husks, outer layer o...","[{'text': 'Prepare grill.'}, {'text': 'Peel ba..."


In [9]:
# Print the instruction steps of the recipe stored under index label 21658.
for each in epicurious.loc[21658, 'instructions']:
    # each step is a dict; show the first value it holds
    print(list(each.values())[0])

Peel and slice fresh garlic: 2 to 3 teaspoons sliced green garlic, or 1 or 2 sliced cloves per cup of broth.
Bring some light chicken broth to a boil with a few leaves of fresh sage.
Once it comes to a boil use a skimmer to remove the sage leaves (if the sage cooks too long, the broth will become bitter and dark).
Add the garlic to the broth along with salt to taste.
Cook for about 5 minutes.
Drizzle olive oil over a toasted slice of day-old bread, put it in a bowl, ladle soup over the bread, add a pinch of coarsely chopped parsley, and serve.
For a heartier soup, poach an egg in the broth and serve it on top of the bread.


In [10]:
# Reference: spaCy parameters under Token
#   Text     - the original word text
#   Lemma    - the base form of the word
#   POS      - the simple part-of-speech tag
#   Tag      - the detailed part-of-speech tag
#   Dep      - syntactic dependency, i.e. the relation between tokens
#   Shape    - the word shape: capitalisation, punctuation, digits
#   is alpha - is the token an alpha character?
#   is stop  - is the token part of a stop list, i.e. the most common words
#              of the language?

# Walk through the instruction steps of the recipe at index label 21658.
for each in epicurious.loc[21658, 'instructions']:
    # each step is a dict; its first value is taken as the instruction text
    instruction = list(each.values())[0]
    # print the parser tree for this step
    explacy.print_parse_info(nlp, instruction)

Dep tree                  Token     Dep      Tag Pos  
───────────────────────── ───────── ──────── ─── ─────
┌┬─────────────┬┬─────┬── Peel      ROOT     NN  NOUN 
││             ││     └─► and       cc       CC  CCONJ
││             │└─►┌───── slice     conj     VB  VERB 
││             │   │  ┌─► fresh     amod     JJ  ADJ  
││             │   └─►└── garlic    dobj     NN  NOUN 
││             └────────► :         punct    :   PUNCT
││                   ┌──► 2         quantmod CD  NUM  
││                   │┌─► to        quantmod IN  ADP  
││                ┌─►└┴── 3         nummod   CD  NUM  
│└─►┌─────────────┴────── teaspoons appos    NNS NOUN 
│   └─►┌───────────────── sliced    acl      VBN VERB 
│      │              ┌─► green     amod     JJ  ADJ  
│      └─►┌───────┬──┬┼── garlic    dobj     NN  NOUN 
│         │       │  │└─► ,         punct    ,   PUNCT
│         │       │  └──► or        cc       CC  CCONJ
│         │       └─►┌┬── 1         conj     CD  NUM  
│         