In [1]:
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training.cli import utils as ingred_utils

import pandas as pd
import numpy as np
import os
import re
import pickle

# Model libraries
from tagger_model import *

  (fname, cnt))
  (fname, cnt))
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Objects for later use
dataPath = '../data/'
ingred_mod_save_name = 'ingredient_model_clean_tags_crf_wordOnly'
ingred_crf_mod = True

In [13]:
# Read in raw data
json_files = [os.path.join(dataPath, file) for file in os.listdir(dataPath) if
              file.endswith('.json')]

raw = pd.concat([pd.read_json(file) for file in json_files])
raw.reset_index(inplace=True)

In [12]:
raw.reset_index?


Ingredient Model to Apply Named-Entity-Recognition to Ingredients to be able to pull out the actual ingredients

In [4]:
def parse_ingredients(recipes_ingredients):
    return [[ingred_utils.tokenize(ingredient) for ingredient in recipe] 
                    for recipe in recipes_ingredients]

def reshape_ingredients(row):
    """Reformat so that instead of each row being one recipe with several 
       ingredients, each row will be one ingredient"""
    index = [row.name] * len(row['token_ingred'])
    return pd.Series(row['token_ingred'], index = index)

def predict_ingred_ner(raw):
    """Predict NER ingredients"""
    
    # Tokenize the ingredients
    raw['token_ingred'] = parse_ingredients(raw.ingredients)
    
    # Reshape ingredients for tagging
    ingreds = []
    for i in range(raw.shape[0]):
        ingreds.append(reshape_ingredients(raw.iloc[i]))
    ingred_data = pd.concat(ingreds)

    # Load ingredient tagger lexicon
    ingred_lexicon = lexiconTransformer(words_min_freq=2, unknown_tag_token='OTHER', saveNamePrefix='Ingred_mod')
    ingred_lexicon.load_lexicon()

    # Convert Ingredients from words to tokens for modeling
    indx_ingred, _ = ingred_lexicon.transform(ingred_data, [])
    indx_ingred = pd.Series(indx_ingred, index=ingred_data.index)

    # Combine sentences and tokens into a DataFrame
    ingred_final = pd.concat([ingred_data, indx_ingred], axis=1)
    ingred_final.columns = ['sents', 'sent_indx']

    # Ingredient parameters
    n_word_embedding_nodes=300
    n_tag_embedding_nodes=150
    n_RNN_nodes=400
    n_dense_nodes=200

    ingred_mod = create_test_model(ingred_mod_save_name, ingred_lexicon, crf=ingred_crf_mod, 
                                   n_word_embedding_nodes=n_word_embedding_nodes,
                                   n_tag_embedding_nodes=n_tag_embedding_nodes,
                                   n_RNN_nodes=n_RNN_nodes, 
                                   n_dense_nodes=n_dense_nodes)

    ingred_preds = predict_new_tag(ingred_mod, ingred_final, ingred_lexicon)
    
    return (ingred_final, ingred_preds)
#     ingred_res = pd.concat([ingred_preds, ingred_preds], axis=1)
#     ingred_res.columns = ['sents', 'sent_indx', 'predictions']
#     return ingred_res

In [6]:
# Tokenize the ingredients
raw['token_ingred'] = parse_ingredients(raw.ingredients)

# Reshape ingredients for tagging
ingreds = []
for i in range(raw.shape[0]):
    ingreds.append(reshape_ingredients(raw.iloc[i]))
ingred_data = pd.concat(ingreds)

# Load ingredient tagger lexicon
ingred_lexicon = lexiconTransformer(words_min_freq=2, unknown_tag_token='OTHER', saveNamePrefix='Ingred_mod')
ingred_lexicon.load_lexicon()

# Convert Ingredients from words to tokens for modeling
indx_ingred, _ = ingred_lexicon.transform(ingred_data, [])
indx_ingred = pd.Series(indx_ingred, index=ingred_data.index)

# Combine sentences and tokens into a DataFrame
ingred_final = pd.concat([ingred_data, indx_ingred], axis=1)
ingred_final.columns = ['sents', 'sent_indx']

# Get length of longest sequence
max_seq_len = get_max_seq_len(ingred_final['sent_indx'])

#Add one to max length for offsetting sequence by 1
padded_words = pad_idx_seqs(ingred_final['sent_indx'], 
                                  max_seq_len + 1) 

# Ingredient parameters
n_word_embedding_nodes=300
n_tag_embedding_nodes=150
n_RNN_nodes=400
n_dense_nodes=200

ingred_mod = create_test_model(ingred_mod_save_name, ingred_lexicon, crf=ingred_crf_mod, 
                               n_word_embedding_nodes=n_word_embedding_nodes,
                               n_tag_embedding_nodes=n_tag_embedding_nodes,
                               n_RNN_nodes=n_RNN_nodes, 
                               n_dense_nodes=n_dense_nodes,
                              seq_input_len=max_seq_len,
                              stateful=False)

def reshape_and_predict_row(x):
    preds = ingred_mod.predict(np.reshape(x, [1, max_seq_len]))
    pred_tags = np.apply_along_axis(lambda x: np.argmax(x, axis=-1), 2, preds)
    pred_tags = np.reshape(pred_tags, (max_seq_len))
    return [ingred_lexicon.indx_to_tags_dict[tag] if tag in ingred_lexicon.indx_to_tags_dict.keys() else 0 for tag in pred_tags]

preds = np.apply_along_axis(reshape_and_predict_row, 1, padded_words[:, 1:])



  return _compile(pattern, flags).split(string, maxsplit)


In [105]:
tmp = np.apply_along_axis(reshape_and_predict_row, 1, padded_words[:10, 1:])

In [106]:
tmp == 'NAME'

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False,  True,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, Fal

In [5]:
ingred_preds = predict_ingred_ner(raw)

  return _compile(pattern, flags).split(string, maxsplit)


TypeError: cannot concatenate object of type "<class 'list'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

In [None]:
ingred_preds.to_pickle(os.path.join(dataPath, 'ingred_predictions.pkl'))

In [5]:
aa = raw.directions

In [10]:
aa.iloc[0]

['Grease and flour two 8 x 4 inch pans.  Preheat oven to 325 degrees F (165 degrees C).',
 'Sift flour, salt, baking powder, soda, and cinnamon together in a bowl.',
 'Beat eggs, oil, vanilla, and sugar together in a large bowl.  Add sifted ingredients to the creamed mixture, and beat well.  Stir in zucchini and nuts until well combined.  Pour batter into prepared pans.',
 'Bake for 40 to 60 minutes, or until tester inserted in the center comes out clean.  Cool in pan on rack for 20 minutes.  Remove bread from pan, and completely cool.']

In [52]:
z = [list(gensim.utils.tokenize(x, lower=True)) for x in aa]

In [48]:
aa

['Grease and flour two 8 x 4 inch pans.  Preheat oven to 325 degrees F (165 degrees C).',
 'Sift flour, salt, baking powder, soda, and cinnamon together in a bowl.',
 'Beat eggs, oil, vanilla, and sugar together in a large bowl.  Add sifted ingredients to the creamed mixture, and beat well.  Stir in zucchini and nuts until well combined.  Pour batter into prepared pans.',
 'Bake for 40 to 60 minutes, or until tester inserted in the center comes out clean.  Cool in pan on rack for 20 minutes.  Remove bread from pan, and completely cool.']

In [55]:
import spacy

In [56]:
nlp = spacy.load('en_core_web_sm')

In [59]:
nlp.tagger(aa)

AttributeError: 'list' object has no attribute 'doc'

In [53]:
[nltk.pos_tag(x) for x in z]

[[('grease', 'NN'),
  ('and', 'CC'),
  ('flour', 'JJ'),
  ('two', 'CD'),
  ('x', 'JJ'),
  ('inch', 'NN'),
  ('pans', 'NNS'),
  ('preheat', 'VBP'),
  ('oven', 'RB'),
  ('to', 'TO'),
  ('degrees', 'NNS'),
  ('f', 'VB'),
  ('degrees', 'NNS'),
  ('c', 'VB')],
 [('sift', 'VB'),
  ('flour', 'JJ'),
  ('salt', 'NN'),
  ('baking', 'VBG'),
  ('powder', 'NN'),
  ('soda', 'NN'),
  ('and', 'CC'),
  ('cinnamon', 'NN'),
  ('together', 'RB'),
  ('in', 'IN'),
  ('a', 'DT'),
  ('bowl', 'NN')],
 [('beat', 'NN'),
  ('eggs', 'NNS'),
  ('oil', 'NN'),
  ('vanilla', 'NN'),
  ('and', 'CC'),
  ('sugar', 'NN'),
  ('together', 'RB'),
  ('in', 'IN'),
  ('a', 'DT'),
  ('large', 'JJ'),
  ('bowl', 'NN'),
  ('add', 'NN'),
  ('sifted', 'VBD'),
  ('ingredients', 'NNS'),
  ('to', 'TO'),
  ('the', 'DT'),
  ('creamed', 'JJ'),
  ('mixture', 'NN'),
  ('and', 'CC'),
  ('beat', 'NN'),
  ('well', 'RB'),
  ('stir', 'RB'),
  ('in', 'IN'),
  ('zucchini', 'NN'),
  ('and', 'CC'),
  ('nuts', 'NNS'),
  ('until', 'IN'),
  ('well', 'RB'