In [1]:
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training.cli import utils as ingred_utils

import pandas as pd
import numpy as np
import os
import re
import pickle
import random
import string
import math
from nltk.stem.wordnet import WordNetLemmatizer

from keras.preprocessing.text import text_to_word_sequence

# Model libraries
from tagger_model import *

# Recommendation Model
import gensim
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from IPython.core.debugger import set_trace 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Objects for later use
dataPath = '../data/'
ingred_mod_save_name = 'ingredient_model_clean_tags_crf_wordOnly'
ingred_crf_mod = True

In [3]:
# Read in raw data
json_files = [os.path.join(dataPath, file) for file in os.listdir(dataPath) if
              file.endswith('.json')]

raw = pd.concat([pd.read_json(file) for file in json_files])
raw.reset_index(inplace=True)

## Pre-process Ingredients

Ingredient Model to Apply Named-Entity-Recognition to Ingredients to be able to pull out the actual ingredients

In [4]:
def parse_ingredients(recipes_ingredients):
    return [[ingred_utils.tokenize(ingredient) for ingredient in recipe] 
                    for recipe in recipes_ingredients]

def reshape_ingredients(row):
    """Reformat so that instead of each row being one recipe with several 
       ingredients, each row will be one ingredient"""
    index = [row.name] * len(row['token_ingred'])
    return pd.Series(row['token_ingred'], index = index)

def predict_ingred_ner(raw):
    """Predict NER ingredients"""
    
    # Tokenize the ingredients
    raw['token_ingred'] = parse_ingredients(raw.ingredients)
    
    # Reshape ingredients for tagging
    ingreds = []
    for i in range(raw.shape[0]):
        ingreds.append(reshape_ingredients(raw.iloc[i]))
    ingred_data = pd.concat(ingreds)

    # Load ingredient tagger lexicon
    ingred_lexicon = lexiconTransformer(words_min_freq=2, unknown_tag_token='OTHER', saveNamePrefix='Ingred_mod')
    ingred_lexicon.load_lexicon()

    # Convert Ingredients from words to tokens for modeling
    indx_ingred, _ = ingred_lexicon.transform(ingred_data, [])
    indx_ingred = pd.Series(indx_ingred, index=ingred_data.index)

    # Combine sentences and tokens into a DataFrame
    ingred_final = pd.concat([ingred_data, indx_ingred], axis=1)
    ingred_final.columns = ['sents', 'sent_indx']

    # Ingredient parameters
    n_word_embedding_nodes=300
    n_tag_embedding_nodes=150
    n_RNN_nodes=400
    n_dense_nodes=200

    ingred_mod = create_test_model(ingred_mod_save_name, ingred_lexicon, crf=ingred_crf_mod, 
                                   n_word_embedding_nodes=n_word_embedding_nodes,
                                   n_tag_embedding_nodes=n_tag_embedding_nodes,
                                   n_RNN_nodes=n_RNN_nodes, 
                                   n_dense_nodes=n_dense_nodes)

    ingred_preds = predict_new_tag(ingred_mod, ingred_final, ingred_lexicon)
    
    ingred_final['tags'] = pd.Series(ingred_preds, index=ingred_final.index)
    
    return ingred_final
#     ingred_res = pd.concat([ingred_preds, ingred_preds], axis=1)
#     ingred_res.columns = ['sents', 'sent_indx', 'predictions']
#     return ingred_res

In [5]:
# Predict tags of ingredients
# ingred_preds = predict_ingred_ner(raw)

# Save model output so don't need to re-run each time
# ingred_preds.to_pickle(os.path.join(dataPath, 'ingred_predictions.pkl'))

# Load model output
ingred_preds = pd.read_pickle(os.path.join(dataPath, 'ingred_predictions.pkl'))

In [6]:
table = str.maketrans({key: None for key in string.punctuation})

def get_ingred(row, table=table):
    """Find the ingredients tagged by the model.
    
       If no ingredients are tagged, randomly select
       one as long as it isn't a number.
    """
    tagList = [ingred for ingred, tag in zip(row['sents'], row['tags']) if tag == 'NAME']
    
    if tagList == []:
        noNums = [token for token in row['sents'] if not re.search(r'\d', token)]
        if noNums == []:
            return ''
        asSent = random.choice(noNums)
    else:
        asSent = ' '.join(tagList)
    
    removeNums = re.sub(r'\d+', '', asSent)
    removePunct = removeNums.translate(table)
#     removePunct = re.sub(r'{}'.format(string.punctuation), '', removeNums)
    removeExtraSpaces = re.sub(r'\s+', ' ', removePunct)
    removeBegSpace = re.sub(r'^\s', '', removeExtraSpaces)
    return removeBegSpace

In [7]:
# Pull out the ingredients and then recombine all ingredients for 
# one recipe back into a list on one row
ingredients = ingred_preds.apply(get_ingred, axis=1)
ingredients = ingredients.groupby(ingredients.index).apply(lambda x: [y for y in set(x.tolist()) if y != ''])
ingredients.name = 'clean_ingredients'

In [8]:
with_ingreds = raw.join(ingredients)

# Remove those recipes that don't have ingredients
with_ingreds = with_ingreds[~with_ingreds.ingredients.apply(lambda x: x == [] or x is None)]

In [9]:
max_ingred_len = get_max_seq_len(with_ingreds['clean_ingredients'])

In [10]:
ingred_w2v = gensim.models.Word2Vec(with_ingreds['clean_ingredients'],
                                   size=50, min_count=1, workers=-1,
                                   window=max_ingred_len)

In [11]:
def convert_word_mat_to_mean_embed(word_mat, w2v):
    """Finds the average embedding for a list of words"""
    dim = ingred_w2v.layer1_size
    return [np.mean([w2v.wv.word_vec(w) for w in words if w in w2v.wv.vocab.keys()]
                             or [np.zeros(dim)], axis=0)
                     for words in word_mat]

In [12]:
with_ingreds['avg_ingred_embedding'] = convert_word_mat_to_mean_embed(with_ingreds.clean_ingredients, ingred_w2v)

## Pre-process directions

In [13]:
wordnet = WordNetLemmatizer()

def clean_and_tokenize_directions(directions, wordnet=wordnet):
    """Clean up directions for a recipe by:
    
       1. Removing 'Photograph by... statements since these wasted text
       2. Joining all steps into one string
       3. Removing numbers since only interested in cooking verbs
       4. Remove C. and F. which are Celsius and Farenheit indicators
       5. Removing extra white space.
    """
    directions = [wordnet.lemmatize(x.lower()) for x in directions if not re.search(r'^Photograph', x, re.IGNORECASE)]
    oneText = ' '.join(directions)
    noNumbers = re.sub(r'(\d+)\s?x\s?\d+', '', oneText)
    noNumbers = re.sub(r'\d+', '', noNumbers)
    noDegrees = re.sub(r' (f|c)\.?\b', '', noNumbers)
    clean_directions = re.sub(r'\s+', ' ', noDegrees)
    tokenized_directions = text_to_word_sequence(clean_directions)
    return tokenized_directions

In [14]:
with_ingreds['clean_directions'] = with_ingreds['directions'].apply(clean_and_tokenize_directions)

In [15]:
dir_w2v = gensim.models.Word2Vec(with_ingreds['clean_directions'],
                                 size=250, min_count=4, workers=-1,
                                 window=3)

In [16]:
cooking_verbs = ['puree', 'cover', 'crumble', 'roll', 'layer', 'saute', 'rotat', 
                 'bak', 'heat', 'blend', 'dress', 'melt', 'stir', 'trim', 'soak', 
                 'microwave', 'cook', 'wrap', 'steam', 'scrape', 'gather', 
                 'quarter', 'spray', 'reduce', 'char', 'pour', 'juice', 'crush', 
                 'wash', 'sift', 'pound', 'marinat', 'spread', 'mix', 'shred', 
                 'dice', 'brush', 'stem', 'cut', 'boil', 'grate', 'slice', 'whisk', 
                 'heat', 'grill', 'fry', 'freeze', 'stuff', 'top', 'toss', 'stew', 
                 'beat', 'swirl', 'warm', 'garnish', 'grease', 'squeeze', 'flour',
                 'place', 'press', 'whip', 'chill', 'combine', 'add', 'use',
                 'thread', 'arrange', 'measure', 'select', 'grind']

In [17]:
def pull_out_cooking_verbs(directions, cooking_verbs=cooking_verbs):
    return re.findall(r'{}'.format('|'.join(cooking_verbs)), ' '.join(directions))
#     return [token for token in directions if token in cooking_verbs]

In [18]:
with_ingreds['direction_verbs'] = with_ingreds['clean_directions'].apply(pull_out_cooking_verbs)

# Filter out those without any directions
with_ingreds = with_ingreds.loc[~with_ingreds['direction_verbs'].apply(lambda x: x == [])]

with_ingreds['avg_directions_embedded'] = convert_word_mat_to_mean_embed(with_ingreds['direction_verbs'], dir_w2v)

## Clean other Columns

Make sure recipe names are unique so that each name is a key.

In [19]:
def clean_recipe_names(names):
    """Replace recipe names if the names already exist"""    
    counts = dict()
    newNames = []
    for name in names:
        counts[name] = counts.get(name, 0) + 1
        newNames.append('{} {}'.format(name, str(counts[name])))
        
    return pd.Series(newNames, index=names.index)

with_ingreds['unique_name'] = clean_recipe_names(with_ingreds.name)

Clean total time column for use in recommendation model

In [20]:
timeDict = {'hr': 60, 'min': 1, 'day': 1440}
def calc_time(timeText, timeDict=timeDict):
    """Calculate time in minutes based on text"""
    num, time = re.search(r'(\d+)\s+(\w+)', timeText).groups()
    return timeDict[time] * int(num)

def find_calc_and_sum_all_time(timeInfo):
    
    if isinstance(timeInfo, list):
        if timeInfo == []:
            return np.NaN
        timeInfo = timeInfo[0]
    
    if not timeInfo:
        return np.NaN

    matches = re.findall(r'(\d+\s+\w+)', timeInfo)
    if matches:
        return sum([calc_time(time) for time in matches])
    return 0

In [21]:
with_ingreds['cleaned_total_time'] = with_ingreds.totalTime.apply(find_calc_and_sum_all_time)

In [22]:
# Count the number of recipes that will be deleted if total Time is missing
with_ingreds['cleaned_total_time'].loc[(with_ingreds['cleaned_total_time'].isnull())].shape

(8154,)

In [23]:
# Count the number of recipes that will be deleted if total Time is missing
with_ingreds['cleaned_total_time'].loc[(with_ingreds['cleaned_total_time'].isnull()) |
                                       (with_ingreds['cleaned_total_time'] > 2880)].shape

(8391,)

In [24]:
# Scale time 
def scale_time_to_embeddings(totalTime):
    """Scale the Total Time column so that it has the 
       same Min/Max as the emeddings so that it doesn't 
       dominate the recommendations.
       
       Will make sure the time is logged since some recipes call
       for days which greatly skews the recipe.
       """
    minNum = min(min(with_ingreds.avg_ingred_embedding.apply(min)),
                 min(with_ingreds.avg_directions_embedded.apply(min)))
    maxNum = max(max(with_ingreds.avg_ingred_embedding.apply(max)),
                 max(with_ingreds.avg_directions_embedded.apply(max)))
    scaler = MinMaxScaler(feature_range=(minNum, maxNum))
    return scaler.fit_transform(totalTime)

In [25]:
# Delete off rows with missing data and recipes longer than 2 days
finalData = with_ingreds.loc[(with_ingreds['cleaned_total_time'].notnull()) |
                                       (with_ingreds['cleaned_total_time'] < 2880)]

In [26]:
finalData['logged_total_time'] = finalData['cleaned_total_time'].apply(math.log)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [27]:
finalData['scaled_total_time'] = scale_time_to_embeddings(finalData['logged_total_time'].reshape(-1, 1))

  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [3]:
# Save for later use
finalData.to_pickle(os.path.join(dataPath, 'final_data.pkl'))

In [28]:
# Create final data for model
def create_model_data(finalData):
    """Prepare an array to be used for recommendation model.
    
       Will pull out the correct columns, turn the embedding columns
       from one column to several columns, scale all columns to put 
       them in the same feature space.
    """
    
    modelVars = ['avg_ingred_embedding', 'avg_directions_embedded', 'scaled_total_time']
    tmpData = finalData.loc[:, modelVars]
    
    avgIngredCols = ['AvgIngredEmbed{}'.format(i) for i in range(len(tmpData['avg_ingred_embedding'][0]))]
    avgDirCols = ['AvgDirEmbed{}'.format(i) for i in range(len(tmpData['avg_directions_embedded'][0]))]
    modelData = pd.concat([pd.DataFrame.from_records(tmpData['avg_ingred_embedding'],
                                                     columns=avgIngredCols),
                           pd.DataFrame.from_records(tmpData['avg_directions_embedded'],
                                                     columns=avgDirCols),
                           pd.DataFrame(tmpData['scaled_total_time'].tolist())], axis=1, ignore_index=True)
    modelData.index = tmpData.index
    return modelData
    

In [22]:
modelData = create_model_data(finalData)
modelData = modelData.dropna()

In [4]:
# Save for later use
modelData.to_pickle(os.path.join(dataPath, 'model_data.pkl'))