In [481]:
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training.cli import utils as ingred_utils

import pandas as pd
import numpy as np
import os
import re
import pickle
import random
import string
from nltk.stem.wordnet import WordNetLemmatizer

from keras.preprocessing.text import text_to_word_sequence

# Model libraries
from tagger_model import *

# Recommendation Model
import gensim
from IPython.core.debugger import set_trace

In [2]:
# Objects for later use
dataPath = '../data/'
ingred_mod_save_name = 'ingredient_model_clean_tags_crf_wordOnly'
ingred_crf_mod = True

In [3]:
# Read in raw data
json_files = [os.path.join(dataPath, file) for file in os.listdir(dataPath) if
              file.endswith('.json')]

raw = pd.concat([pd.read_json(file) for file in json_files])
raw.reset_index(inplace=True)

## Pre-process Ingredients

Ingredient Model to Apply Named-Entity-Recognition to Ingredients to be able to pull out the actual ingredients

In [7]:
def parse_ingredients(recipes_ingredients):
    return [[ingred_utils.tokenize(ingredient) for ingredient in recipe] 
                    for recipe in recipes_ingredients]

def reshape_ingredients(row):
    """Reformat so that instead of each row being one recipe with several 
       ingredients, each row will be one ingredient"""
    index = [row.name] * len(row['token_ingred'])
    return pd.Series(row['token_ingred'], index = index)

def predict_ingred_ner(raw):
    """Predict NER ingredients"""
    
    # Tokenize the ingredients
    raw['token_ingred'] = parse_ingredients(raw.ingredients)
    
    # Reshape ingredients for tagging
    ingreds = []
    for i in range(raw.shape[0]):
        ingreds.append(reshape_ingredients(raw.iloc[i]))
    ingred_data = pd.concat(ingreds)

    # Load ingredient tagger lexicon
    ingred_lexicon = lexiconTransformer(words_min_freq=2, unknown_tag_token='OTHER', saveNamePrefix='Ingred_mod')
    ingred_lexicon.load_lexicon()

    # Convert Ingredients from words to tokens for modeling
    indx_ingred, _ = ingred_lexicon.transform(ingred_data, [])
    indx_ingred = pd.Series(indx_ingred, index=ingred_data.index)

    # Combine sentences and tokens into a DataFrame
    ingred_final = pd.concat([ingred_data, indx_ingred], axis=1)
    ingred_final.columns = ['sents', 'sent_indx']

    # Ingredient parameters
    n_word_embedding_nodes=300
    n_tag_embedding_nodes=150
    n_RNN_nodes=400
    n_dense_nodes=200

    ingred_mod = create_test_model(ingred_mod_save_name, ingred_lexicon, crf=ingred_crf_mod, 
                                   n_word_embedding_nodes=n_word_embedding_nodes,
                                   n_tag_embedding_nodes=n_tag_embedding_nodes,
                                   n_RNN_nodes=n_RNN_nodes, 
                                   n_dense_nodes=n_dense_nodes)

    ingred_preds = predict_new_tag(ingred_mod, ingred_final, ingred_lexicon)
    
    ingred_final['tags'] = pd.Series(ingred_preds, index=ingred_final.index)
    
    return ingred_final
#     ingred_res = pd.concat([ingred_preds, ingred_preds], axis=1)
#     ingred_res.columns = ['sents', 'sent_indx', 'predictions']
#     return ingred_res

In [8]:
# Predict tags of ingredients
# ingred_preds = predict_ingred_ner(raw)

# Save model output so don't need to re-run each time
# ingred_preds.to_pickle(os.path.join(dataPath, 'ingred_predictions.pkl'))

# Load model output
ingred_preds = pd.read_pickle(os.path.join(dataPath, 'ingred_predictions.pkl'))

  return _compile(pattern, flags).split(string, maxsplit)


In [188]:
table = str.maketrans({key: None for key in string.punctuation})

def get_ingred(row, table=table):
    """Find the ingredients tagged by the model.
    
       If no ingredients are tagged, randomly select
       one as long as it isn't a number.
    """
    tagList = [ingred for ingred, tag in zip(row['sents'], row['tags']) if tag == 'NAME']
    
    if tagList == []:
        noNums = [token for token in row['sents'] if not re.search(r'\d', token)]
        if noNums == []:
            return ''
        asSent = random.choice(noNums)
    else:
        asSent = ' '.join(tagList)
    
    removeNums = re.sub(r'\d+', '', asSent)
    removePunct = removeNums.translate(table)
#     removePunct = re.sub(r'{}'.format(string.punctuation), '', removeNums)
    removeExtraSpaces = re.sub(r'\s+', ' ', removePunct)
    removeBegSpace = re.sub(r'^\s', '', removeExtraSpaces)
    return removeBegSpace

In [214]:
# Pull out the ingredients and then recombine all ingredients for 
# one recipe back into a list on one row
ingredients = ingred_preds.apply(get_ingred, axis=1)
ingredients = ingredients.groupby(ingredients.index).apply(lambda x: [y for y in set(x.tolist()) if y != ''])
ingredients.name = 'clean_ingredients'

In [215]:
with_ingreds = raw.join(ingredients)

# Remove those recipes that don't have ingredients
with_ingreds = with_ingreds[~with_ingreds.ingredients.apply(lambda x: x == [] or x is None)]

In [216]:
max_ingred_len = get_max_seq_len(with_ingreds['clean_ingredients'])

In [270]:
ingred_w2v = gensim.models.Word2Vec(with_ingreds['clean_ingredients'],
                                   size=50, min_count=1, workers=-1,
                                   window=max_ingred_len)

In [303]:
def convert_word_mat_to_mean_embed(word_mat, w2v):
    """Finds the average embedding for a list of words"""
    dim = ingred_w2v.layer1_size
    return [np.mean([w2v.wv.word_vec(w) for w in words if w in w2v.wv.vocab.keys()]
                             or [np.zeros(dim)], axis=0)
                     for words in word_mat]

In [311]:
with_ingreds['avg_ingred_embedding'] = convert_word_mat_to_mean_embed(with_ingreds.clean_ingredients, ingred_w2v)

## Pre-process directions

In [403]:
wordnet = WordNetLemmatizer()

In [404]:
def clean_and_tokenize_directions(directions, wordnet=wordnet):
    """Clean up directions for a recipe by:
    
       1. Removing 'Photograph by... statements since these wasted text
       2. Joining all steps into one string
       3. Removing numbers since only interested in cooking verbs
       4. Remove C. and F. which are Celsius and Farenheit indicators
       5. Removing extra white space.
    """
    directions = [wordnet.lemmatize(x.lower()) for x in directions if not re.search(r'^Photograph', x, re.IGNORECASE)]
    oneText = ' '.join(directions)
    noNumbers = re.sub(r'(\d+)\s?x\s?\d+', '', oneText)
    noNumbers = re.sub(r'\d+', '', noNumbers)
    noDegrees = re.sub(r' (f|c)\.?\b', '', noNumbers)
    clean_directions = re.sub(r'\s+', ' ', noDegrees)
    tokenized_directions = text_to_word_sequence(clean_directions)
    return tokenized_directions

In [405]:
with_ingreds['clean_directions'] = with_ingreds['directions'].apply(clean_and_tokenize_directions)

In [406]:
dir_w2v = gensim.models.Word2Vec(with_ingreds['clean_directions'],
                                 size=250, min_count=4, workers=-1,
                                 window=3)

In [407]:
cooking_verbs = ['puree', 'cover', 'crumble', 'roll', 'layer', 'saute', 'rotat', 
                 'bak', 'heat', 'blend', 'dress', 'melt', 'stir', 'trim', 'soak', 
                 'microwave', 'cook', 'wrap', 'steam', 'scrape', 'gather', 
                 'quarter', 'spray', 'reduce', 'char', 'pour', 'juice', 'crush', 
                 'wash', 'sift', 'pound', 'marinat', 'spread', 'mix', 'shred', 
                 'dice', 'brush', 'stem', 'cut', 'boil', 'grate', 'slice', 'whisk', 
                 'heat', 'grill', 'fry', 'freeze', 'stuff', 'top', 'toss', 'stew', 
                 'beat', 'swirl', 'warm', 'garnish', 'grease', 'squeeze', 'flour']

In [408]:
def pull_out_cooking_verbs(directions, cooking_verbs=cooking_verbs):
    return [token for token in directions if token in cooking_verbs]

In [409]:
with_ingreds['direction_verbs'] = with_ingreds['clean_directions'].apply(pull_out_cooking_verbs)

In [533]:
with_ingreds['avg_directions_embedded'] = convert_word_mat_to_mean_embed(with_ingreds['direction_verbs'], dir_w2v)

## Clean other Columns

In [531]:
timeDict = {'hr': 60, 'min': 1, 'day': 1440}
def calc_time(timeText, timeDict=timeDict):
    """Calculate time in minutes based on text"""
    num, time = re.search(r'(\d+)\s+(\w+)', timeText).groups()
    return timeDict[time] * int(num)

def find_calc_and_sum_all_time(timeInfo):
    
    if isinstance(timeInfo, list):
        if timeInfo == []:
            return np.NaN
        timeInfo = timeInfo[0]
    
    if not timeInfo:
        return np.NaN

    matches = re.findall(r'(\d+\s+\w+)', timeInfo)
    if matches:
        return sum([calc_time(time) for time in matches])
    return 0

In [532]:
with_ingreds['cleaned_total_time'] = with_ingreds.totalTime.apply(find_calc_and_sum_all_time)

In [530]:
with_ingreds['cleaned_total_time'].isnull().sum()

8182

In [534]:
finalVars = ['avg_ingred_embedding', 'avg_directions_embedded', 'cleaned_total_time']

finalData = with_ingreds.loc[with_ingreds['cleaned_total_time'].notnull(), finalVars]

In [536]:
finalData

Unnamed: 0,avg_ingred_embedding,avg_directions_embedded,cleaned_total_time
0,"[-0.002554071, 0.00012274507, -0.00066108914, ...","[-0.0002142841, 0.000621173, 0.00038798864, 0....",100.0
1,"[-0.0017624553, -0.0010912433, -0.0037185396, ...","[0.00024043143, -0.00052727404, 0.0007577184, ...",90.0
2,"[-0.0029958806, 0.0005020515, 0.0008761308, 0....","[-0.00010260405, 7.114514e-05, 0.00014821571, ...",65.0
3,"[0.00034149885, 0.0010884015, -0.0020943417, 0...","[0.00038024978, -0.0010569141, 0.00025668694, ...",370.0
4,"[-0.0007655992, -0.00072226144, -0.0004589106,...","[0.0004267349, -0.00055982184, -1.9646184e-05,...",15.0
5,"[0.0019211074, -0.004860644, 0.0057362337, 0.0...","[0.0007993486, -0.0015581148, -2.8636268e-05, ...",490.0
6,"[-0.00049967814, 0.0020041328, -0.0014023451, ...","[0.00023668956, 0.00034705573, 0.0002668118, -...",180.0
7,"[-0.0020711154, -0.0010029364, -0.0017682001, ...","[0.0012741819, -0.00018735132, -0.0009197527, ...",120.0
8,"[-0.0017636544, 0.0025071239, 0.0011285231, 2....","[-9.8682074e-05, 0.00024463306, 5.8578444e-06,...",50.0
9,"[0.002081891, -0.002143404, 0.0034835201, 0.00...","[0.0002362327, -0.0013590717, 0.00031585593, -...",250.0
