# Difficulty Exploration

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy

In [2]:
# read data/dirty_recipes.csv into a DataFrame; later cells use its 'method' column
dirty = pd.read_csv('data/dirty_recipes.csv')

In [3]:
# preview the first five rows
dirty.head()

Unnamed: 0,title,method,ingredients,url
0,"Lentil, Apple, and Turkey Wrap","1. Place the stock, lentils, celery, carrot, t...",['4 cups low-sodium vegetable or chicken stock...,
1,Boudin Blanc Terrine with Red Onion Confit,Combine first 9 ingredients in heavy medium sa...,"['1 1/2 cups whipping cream', '2 medium onions...",
2,Potato and Fennel Soup Hodge,In a large heavy saucepan cook diced fennel an...,"['1 fennel bulb (sometimes called anise), stal...",
3,Mahi-Mahi in Tomato Olive Sauce,Heat oil in heavy large skillet over medium-hi...,"['2 tablespoons extra-virgin olive oil', '1 cu...",
4,Spinach Noodle Casserole,Preheat oven to 350°F. Lightly grease 8x8x2-in...,"['1 12-ounce package frozen spinach soufflé, t...",


In [4]:
# check a method example
# (assuming the default integer index, 20_000 is the first row past the [:20000]
# slice that the verb extraction below starts with)
dirty['method'][20_000]

'Peel a wide strip from around the middle of each potato. In a kettle combine potatoes and garlic with enough salted water to cover by 2 inches and boil until potatoes are just tender, about 15 minutes. Drain mixture. Transfer potatoes to a bowl and transfer garlic to a blender. Add to blender lemon juice, oil, and salt and pepper to taste and purée dressing. In the bowl toss potatoes with dressing and parsley. Potatoes may be prepared 4 hours ahead and kept covered. Serve potatoes warm or at room temperature.'

## Extract Verbs

In [4]:
# load the spaCy pipeline once; the extraction loops and every function below
# call this module-level `nlp`
nlp = spacy.load('en_core_web_lg')

In [6]:
# take verbs from first 20,000 recipes
#
# a token is kept when spaCy tags it as a VERB and its dependency label is one of:
# ROOT, conj, xcomp (open clausal complement, i.e. "boil" in "bring to boil"), nsubj, det
keep_deps = {'ROOT', 'conj', 'xcomp', 'nsubj', 'det'}

methods_head = dirty['method'].values[:20000]

# nlp.pipe streams the texts through the pipeline in batches instead of one
# nlp(...) call per recipe; tqdm needs `total` because the stream has no len()
verbs = {
    tok.lemma_.lower()
    for doc in tqdm(nlp.pipe(methods_head), total=len(methods_head))
    for tok in doc
    if tok.pos_ == 'VERB' and tok.dep_ in keep_deps
}

100%|████████████████████████████████████████████████████████████| 20000/20000 [14:40<00:00, 22.72it/s]


In [16]:
# left with 1,233 unique verbs
# take a random sample of another 20,000
#
# NOTE: np.random.choice draws with replacement by default and no seed is set here,
# so the sample can repeat recipes (including ones from the first 20,000) and
# differs on every run
methods_sample = np.random.choice(dirty['method'].values, 20_000)

# keep VERB tokens whose dependency label is one of these
keep_deps = {'ROOT', 'conj', 'xcomp', 'nsubj', 'det'}

# nlp.pipe streams the texts through the pipeline in batches instead of one
# nlp(...) call per recipe; tqdm needs `total` because the stream has no len()
verbs_rand = {
    tok.lemma_.lower()
    for doc in tqdm(nlp.pipe(methods_sample), total=len(methods_sample))
    for tok in doc
    if tok.pos_ == 'VERB' and tok.dep_ in keep_deps
}

100%|████████████████████████████████████████████████████████████| 20000/20000 [10:30<00:00, 31.73it/s]


In [22]:
# again: a second random sample of 20,000 recipes
#
# NOTE: np.random.choice draws with replacement by default and no seed is set here,
# so this sample can overlap the earlier ones and differs on every run
methods_sample_2 = np.random.choice(dirty['method'].values, 20_000)

# keep VERB tokens whose dependency label is one of these
keep_deps = {'ROOT', 'conj', 'xcomp', 'nsubj', 'det'}

# nlp.pipe streams the texts through the pipeline in batches instead of one
# nlp(...) call per recipe; tqdm needs `total` because the stream has no len()
verbs_rand_2 = {
    tok.lemma_.lower()
    for doc in tqdm(nlp.pipe(methods_sample_2), total=len(methods_sample_2))
    for tok in doc
    if tok.pos_ == 'VERB' and tok.dep_ in keep_deps
}

100%|████████████████████████████████████████████████████████████| 20000/20000 [10:20<00:00, 32.23it/s]


In [25]:
# check length of joined verb sets
# (`|` is set union, so a verb found in more than one pass is counted once)
len(verbs | verbs_rand | verbs_rand_2)

2344

In [35]:
# join verbs
# (built from a set union, so the order of verb_list is arbitrary)
verb_list = list(verbs | verbs_rand | verbs_rand_2)

# check for 'flambe'
('flambe' in verb_list)

True

In [46]:
# quick save: pulling these verbs took 30+ minutes,
# the random samples were drawn without a random state (so they cannot be regenerated),
# and the list still has to be annotated by hand in a spreadsheet

pd.Series(verb_list).to_csv('data/verb_list.csv', index=False)

## Difficulty Metric plus Manual Annotation

Manual annotation was done in a separate spreadsheet.

In [5]:
# read in annotated verbs, then clean in a single chain so the cell is idempotent
# and never mutates a column selection in place (a chained
# `frame[col].replace(..., inplace=True)` does not update the parent frame under
# pandas Copy-on-Write)
verb_annot = (
    pd.read_csv('data/verb_list_annot.csv')
    # a skill_level of 0 is turned into a null ...
    .replace({'skill_level': {0: np.nan}})
    # ... and every row containing a null (in any column) is dropped
    .dropna()
    # double the remaining skill levels
    .assign(skill_level=lambda frame: frame['skill_level'] * 2)
)
verb_annot['skill_level'].value_counts()

2.0    123
4.0     99
6.0     47
8.0     19
Name: skill_level, dtype: int64

In [143]:
def _root_verb_lemmas(doc):
    '''
    returns the lower-cased lemma of every token in a parsed doc that is
    tagged VERB and has the ROOT dependency label, in document order
    '''
    # lower-case so the lemmas match the verb vocabulary, which was collected
    # with tok.lemma_.lower()
    return [tok.lemma_.lower() for tok in doc if tok.pos_ == 'VERB' and tok.dep_ == 'ROOT']

def _skill_weight(lemmas):
    '''
    sums the skill_level annotated in verb_annot for each lemma;
    lemmas without an annotation contribute nothing
    '''
    # keep='first': if a verb was annotated more than once, the first row wins
    annotated = verb_annot.drop_duplicates(subset='verb', keep='first')
    # one dict build per call replaces a linear scan plus a boolean-mask lookup per token
    skill_by_verb = dict(zip(annotated['verb'], annotated['skill_level']))
    return sum(skill_by_verb.get(lemma, 0) for lemma in lemmas)

def manual_difficulty_weighting(method_string):
    '''
    matches the root verbs of method string to weighted actions in verb_annot,
    returns the summed weight value (0 when no root verb is annotated)
    requires spacy
    '''
    return _skill_weight(_root_verb_lemmas(nlp(method_string)))

def action_counter(method_string):
    '''
    hunts for root verbs, returns the number of actions/moves
    (tokens tagged VERB with the ROOT dependency label)
    requires spacy
    '''
    return len(_root_verb_lemmas(nlp(method_string)))

def difficulty(method_string):
    '''
    returns actions plus weights for a difficulty rating, i.e. the same value as
    manual_difficulty_weighting(method_string) + action_counter(method_string)
    requires spacy
    '''
    # parse once and reuse the root verbs for both parts; the spaCy parse is
    # the expensive step
    lemmas = _root_verb_lemmas(nlp(method_string))
    return _skill_weight(lemmas) + len(lemmas)

## Readability Score

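$Readability = 100 \times RLW + ASL$, where $RLW = \frac{n_{lw}}{n_w}$ is the ratio of long words, $ASL = \frac{n_w}{n_s}$ is the average sentence length, $n_{lw}$ is the number of words longer than 6 characters, $n_w$ is the number of words, and $n_s$ is the number of sentences. This is the LIX readability score.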

In [178]:
def readability_score(method_string):
    '''
    returns LIX readability score, rounded to the nearest integer:
    100 * (long words / words) + (words / sentences)

    words are the alphabetic tokens (tok.is_alpha) of the spaCy parse,
    long words are those longer than 6 characters,
    sentences are counted from doc.sents
    returns 0 when the text has no alphabetic words or no sentences
    (e.g. an empty string) instead of raising ZeroDivisionError
    requires spacy
    '''
    doc = nlp(method_string)
    words = [tok for tok in doc if tok.is_alpha]
    n_words = len(words)
    n_sents = sum(1 for _ in doc.sents)
    # nothing to score: both ratios below would divide by zero
    if n_words == 0 or n_sents == 0:
        return 0
    n_long_words = sum(1 for word in words if len(word) > 6)
    rlw = n_long_words / n_words
    asl = n_words / n_sents
    return round(100 * rlw + asl)

## Total Effort Metric

In [6]:
def effort(method_string, difficulty_weight = 1):
    '''
    combines the two metrics into one engineered effort score:
    readability score plus the difficulty rating scaled by difficulty_weight
    (user-supplied, defaults to 1)
    '''
    readability_part = readability_score(method_string)
    difficulty_part = difficulty_weight * difficulty(method_string)
    return readability_part + difficulty_part