## Ingredient Phrase Model

This program will create a model that is designed to separately identify food name, quantity and other information as Named Entity Recognition tags from a word ingredient list.

In [1]:
%autosave 0

Autosave disabled


In [2]:
import pandas as pd
import numpy as np
import os
import pickle

# Generate training data using NY Times ingredient phrase tagger
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training import utils, reshape

from sklearn.model_selection import train_test_split

# Model libraries
from tagger_model import *

from IPython.core.debugger import set_trace

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Some default parameters
n_word_embedding_nodes=300
n_tag_embedding_nodes=150
n_RNN_nodes=400
n_dense_nodes=200

dataPath = '../data'

In [4]:
filename = 'cleaned_nyt_ingred_data.pkl'

# reshape.read_and_save_raw_data(dataPath, filename)
cleaned_dat = pd.read_pickle(os.path.join(dataPath, filename))

In [5]:
train, test = train_test_split(cleaned_dat, test_size = .2)

In [6]:
# Create lexicon
lexicon = lexiconTransformer(words_min_freq=2, unknown_tag_token='OTHER')

lexicon.fit(train.sents, train.tags)

train['sent_indx'], train['tag_indx'] = lexicon.transform(train.sents, train.tags)

# Get length of longest sequence
max_seq_len = get_max_seq_len(train['sent_indx'])

#Add one to max length for offsetting sequence by 1
train_padded_words = pad_idx_seqs(train['sent_indx'], 
                                  max_seq_len + 1) 

train_padded_tags = pad_idx_seqs(train['tag_indx'],
                                 max_seq_len + 1, value=1)

# Shift tags by 1 for training since we should use tag of previous 
# iteration in next iteration
shifted_train_padded_tags = np.insert(train_padded_tags, 0, 1, axis=1)[:, :-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# Convert to one-hot vector encoding for y
# train_y = [to_categorical(i, num_classes=len(lexicon.tags_lexicon) + 1) for i in train_padded_tags]

In [None]:
mod_save_name = 'ingredient_model_crf_tmp1'
crf_mod = True

In [None]:
ingredient_model = run_training_model(train_padded_words, shifted_train_padded_tags, 
                                      train_padded_tags, mod_save_name, lexicon, crf=crf_mod,
                                      print_summary=True, batch_size=256, epochs=200,
                                      n_word_embedding_nodes=n_word_embedding_nodes,
                                      n_tag_embedding_nodes=n_tag_embedding_nodes,
                                      n_RNN_nodes=n_RNN_nodes, 
                                      n_dense_nodes=n_dense_nodes)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input_layer (InputLayer)   (None, 56)           0                                            
__________________________________________________________________________________________________
tag_input_layer (InputLayer)    (None, 56)           0                                            
__________________________________________________________________________________________________
word_embedding_layer (Embedding (None, 56, 300)      1690800     word_input_layer[0][0]           
__________________________________________________________________________________________________
tag_embedding_layer (Embedding) (None, 56, 150)      1500        tag_input_layer[0][0]            
__________________________________________________________________________________________________
concat_emb

In [None]:
test['sent_indx'], test['tag_indx'] = lexicon.transform(test.sents, test.tags)

In [None]:
test_mod = create_test_model(mod_save_name, lexicon, crf=crf_mod, 
                             n_word_embedding_nodes=n_word_embedding_nodes,
                             n_tag_embedding_nodes=n_tag_embedding_nodes,
                             n_RNN_nodes=n_RNN_nodes, 
                             n_dense_nodes=n_dense_nodes)

In [None]:
preds = predict_new_tag(test_mod, test, lexicon)

In [None]:
evaluate_model(preds, test, print_sample=True)