## Ingredient Phrase Model

This notebook trains a model that tags each word of an ingredient phrase with a Named Entity Recognition (NER) label, so that the food name, the quantity, and other information are identified separately.

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

# Generate training data using NY Times ingredient phrase tagger
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training import utils, reshape

from sklearn.model_selection import train_test_split

# Model libraries
from tagger_model import *

from IPython.core.debugger import set_trace

  (fname, cnt))
  (fname, cnt))
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Location of the pickled data, relative to the working directory
dataPath = '../data'

# Default layer sizes. Each one is forwarded under the same keyword name to
# run_training_model / create_test_model in later cells.
n_word_embedding_nodes = 300
n_tag_embedding_nodes = 150
n_RNN_nodes = 400
n_dense_nodes = 200

In [3]:
# Name of the pickled, cleaned NYT ingredient data inside dataPath
filename = 'cleaned_nyt_ingred_data.pkl'

# The pickle is expected to exist already; the call below was left disabled.
# (presumably it regenerates the file from the raw data - confirm before use)
# reshape.read_and_save_raw_data(dataPath, filename)

# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
cleaned_data_path = os.path.join(dataPath, filename)
cleaned_dat = pd.read_pickle(cleaned_data_path)

In [4]:
# Hold out 20% of the rows for evaluation. The seed is pinned so that
# Restart & Run All reproduces the same split; the lexicon is fitted on
# `train`, so a different split would also yield a different lexicon.
RANDOM_SEED = 42
train, test = train_test_split(cleaned_dat, test_size=.2,
                               random_state=RANDOM_SEED)

In [5]:
# Create the lexicon, fitted on the training split only
lexicon = lexiconTransformer(words_min_freq=2, unknown_tag_token='OTHER')
lexicon.fit(train.sents, train.tags)

# `train` is a slice of `cleaned_dat` produced by train_test_split; work on an
# explicit copy so that adding columns does not raise SettingWithCopyWarning.
train = train.copy()
train['sent_indx'], train['tag_indx'] = lexicon.transform(train.sents, train.tags)

# Length of the longest index sequence in the training split
max_seq_len = get_max_seq_len(train['sent_indx'])

# Pad to one more than the longest sequence, to leave room for offsetting
# the sequence by 1
padded_len = max_seq_len + 1
train_padded_words = pad_idx_seqs(train['sent_indx'], padded_len)
train_padded_tags = pad_idx_seqs(train['tag_indx'], padded_len)

# Shift tags right by one step so each position carries the previous
# position's tag: insert the value 1 as a new first column, then drop the last
# column to keep the width unchanged.
# NOTE(review): the inserted 1 is presumably a start/placeholder tag index -
# confirm against the lexicon. This array is not used by any later visible cell.
shifted_train_padded_tags = np.insert(train_padded_tags, 0, 1, axis=1)[:, :-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
# Convert to one-hot vector encoding for y
# train_y = [to_categorical(i, num_classes=len(lexicon.tags_lexicon) + 1) for i in train_padded_tags]

In [7]:
# Whether the model helpers are called with crf=True
crf_mod = True

# Name passed to the training and test-model helpers to identify this model
mod_save_name = 'ingredient_model_crf_wordOnly'

In [None]:
# Layer sizes from the defaults cell, gathered once and expanded into the call
training_layer_sizes = dict(
    n_word_embedding_nodes=n_word_embedding_nodes,
    n_tag_embedding_nodes=n_tag_embedding_nodes,
    n_RNN_nodes=n_RNN_nodes,
    n_dense_nodes=n_dense_nodes,
)

# Train the model (256 samples per batch, 200 epochs) and print its summary.
# NOTE(review): train_padded_tags is passed twice and shifted_train_padded_tags
# is not used here - presumably intentional for the word-only model; confirm
# against run_training_model's signature.
ingredient_model = run_training_model(
    train_padded_words,
    train_padded_tags,
    train_padded_tags,
    mod_save_name,
    lexicon,
    crf=crf_mod,
    print_summary=True,
    batch_size=256,
    epochs=200,
    **training_layer_sizes
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_input_layer (InputLayer (None, 56)                0         
_________________________________________________________________
word_embedding_layer (Embedd (None, 56, 300)           1691100   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 56, 800)           1682400   
_________________________________________________________________
dense_layer (TimeDistributed (None, 56, 200)           160200    
_________________________________________________________________
output_layer (CRF)           (None, 56, 10)            2130      
Total params: 3,535,830
Trainable params: 3,535,830
Non-trainable params: 0
_________________________________________________________________
Train on 114600 samples, validate on 28650 samples
Epoch 1/200

Epoch 00001: saving model to models/ingredient_model_crf_wordOnly.hd

In [None]:
# `test` is a slice of `cleaned_dat` produced by train_test_split; work on an
# explicit copy so that adding columns does not raise SettingWithCopyWarning.
test = test.copy()

# Convert the test sentences and tags to indices with the lexicon fitted on `train`
test['sent_indx'], test['tag_indx'] = lexicon.transform(test.sents, test.tags)

In [None]:
# Same layer sizes as used for training, gathered once and expanded into the call
test_model_layer_sizes = dict(
    n_word_embedding_nodes=n_word_embedding_nodes,
    n_tag_embedding_nodes=n_tag_embedding_nodes,
    n_RNN_nodes=n_RNN_nodes,
    n_dense_nodes=n_dense_nodes,
)

# Build the test-time model for the model name and lexicon used in training
test_mod = create_test_model(
    mod_save_name,
    lexicon,
    crf=crf_mod,
    **test_model_layer_sizes
)

In [None]:
# Predict tags for the first 50 rows of the held-out split only
test_head = test[:50]
preds = predict_new_tag(test_mod, test_head, lexicon)

In [None]:
# Evaluate the predictions against the same first 50 test rows they were made
# for, with print_sample=True
eval_rows = test[:50]
evaluate_model(preds, eval_rows, print_sample=True)