## Ingredient Phrase Model

This program will create a model that is designed to separately identify food name, quantity and other information as Named Entity Recognition tags from a word ingredient list.

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

# Generate training data using NY Times ingredient phrase tagger
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training import utils, reshape
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Model libraries
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, concatenate, Concatenate, TimeDistributed, Dense, Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras_contrib.layers import CRF
from keras.optimizers import Adam
from keras import regularizers
from keras.callbacks import ModelCheckpoint

from IPython.core.debugger import set_trace

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Some default parameters
n_word_embedding_nodes=300
n_tag_embedding_nodes=150
n_RNN_nodes=400
n_dense_nodes=200

dataPath = '../data'

In [3]:
filename = 'cleaned_nyt_ingred_data.pkl'

reshape.read_and_save_raw_data(dataPath, filename)
cleaned_dat = pd.read_pickle(os.path.join(dataPath, filename))

NameError: name 'Cli' is not defined

In [6]:
train, test = train_test_split(cleaned_dat, test_size = .2)

In [32]:
class lexiconTransformer():
    """Create a lexicon and transform sentences and tags
       to indexes for use in the model."""
    
    def __init__(self, words_min_freq = 1, tags_min_freq = 1, 
                 savePath = 'models', unknown_word_token = u'<UNK>',
                 unknown_tag_token = u'<UNK>'):
        self.words_min_freq = words_min_freq
        self.tags_min_freq = tags_min_freq
        self.words_lexicon = None
        self.unknown_word_token = unknown_word_token
        self.indx_to_words_dict = None
        self.tags_lexicon = None
        self.unknown_tag_token = unknown_tag_token
        self.indx_to_tags_dict = None
        self.savePath = savePath
    
    def fit(self, sents, tags):
        """Create lexicon based on sentences and tags"""
        self.make_words_lexicon(sents)
        self.make_tags_lexicon(tags)
        
        self.make_lexicon_reverse()
        self.save_lexicon()
                
    def transform(self, sents, tags):
        sents_indxs = self.tokens_to_idxs(sents, self.words_lexicon)
        tags_indxs = self.tokens_to_idxs(tags, self.tags_lexicon)
        return (sents_indxs, tags_indxs)

    def fit_transform(self, sents, tags):
        self.fit(sents, tags)
        return self.transform(sents, tags)
        
    def make_words_lexicon(self, sents_token):
        """Wrapper for words lexicon"""
        self.words_lexicon = self.make_lexicon(sents_token, self.words_min_freq,
                                               self.unknown_word_token)

    def make_tags_lexicon(self, tags_token):
        """Wrapper for tags lexicon"""
        self.tags_lexicon = self.make_lexicon(tags_token, self.tags_min_freq,
                                              unknown = self.unknown_tag_token)

    def make_lexicon(self, token_seqs, min_freq=1, unknown = u'<UNK>'):
        """Create lexicon from input based on a frequency

            Parameters:
            
            token_seqs
            ----------
               A list of a list of input tokens that will be used to create the lexicon
            
            min_freq
            --------
               Number of times the token needs to be in the corpus to be included in the
               lexicon.  Otherwise, will be replaced with the "unknown" entry
            
            unknown
            -------
               The word in the lexicon that should be used for tokens not existing in lexicon.
               This can be a value that already exists in input list.  For instance, in 
               Named Entity Recognition, a value of "other" or "O" may already be a tag 
               and so having "other" and "unknown" are the same thing!
        """
        # Count how often each word appears in the text.
        token_counts = {}
        for seq in token_seqs:
            for token in seq:
                if token in token_counts:
                    token_counts[token] += 1
                else:
                    token_counts[token] = 1

        # Then, assign each word to a numerical index. 
        # Filter words that occur less than min_freq times.
        lexicon = [token for token, count in token_counts.items() if count >= min_freq]
        
        # Have to delete unknown value from token list so not a gap in lexicon values when
        # turning it into a lexicon (aka, if unknown == OTHER and that is the 7th value, 
        # then 7 won't exist in the lexicon which may cause issues)
        if unknown in lexicon:
            lexicon.remove(unknown)

        # Indices start at 1. 0 is reserved for padding, and 1 is reserved for unknown words.
        lexicon = {token:idx + 2 for idx,token in enumerate(lexicon)}
        
        lexicon[unknown] = 1 # Unknown words are those that occur fewer than min_freq times
        lexicon_size = len(lexicon)
        return lexicon
    
    def save_lexicon(self):
        "Save lexicons by pickling them"
        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)
        with open(os.path.join(self.savePath, 'words_lexicon.pkl'), 'wb') as f:
            pickle.dump(self.words_lexicon, f)
            
        with open(os.path.join(self.savePath, 'tags_lexicon.pkl'), 'wb') as f:
            pickle.dump(self.tags_lexicon, f)
            
    def load_lexicon(self):
        with open(os.path.join(self.savePath, 'words_lexicon.pkl'), 'rb') as f:
            self.words_lexicon = pickle.load(f)
            
        with open(os.path.join(self.savePath, 'tags_lexicon.pkl'), 'rb') as f:
            self.tags_lexicon = pickle.pickle.load(f)
        
        self.make_lexicon_reverse()
        
    def make_lexicon_reverse(self):
        self.indx_to_words_dict = self.get_lexicon_lookup(self.words_lexicon)
        self.indx_to_tags_dict = self.get_lexicon_lookup(self.tags_lexicon)
    
    def get_lexicon_lookup(self, lexicon):
        '''Make a dictionary where the string representation of 
           a lexicon item can be retrieved from its numerical index'''
        lexicon_lookup = {idx: lexicon_item for lexicon_item, idx in lexicon.items()}
        return lexicon_lookup
    
    def tokens_to_idxs(self, token_seqs, lexicon):
        """Transform tokens to numeric indexes or <UNK> if doesn't exist"""
        idx_seqs = [[lexicon[token] if token in lexicon else lexicon['<UNK>'] for 
                                 token in token_seq] for token_seq in token_seqs]
        return idx_seqs

def create_model(seq_input_len, n_word_input_nodes, n_tag_input_nodes, 
                 n_word_embedding_nodes, n_tag_embedding_nodes, n_RNN_nodes, 
                 n_dense_nodes, stateful=False, batch_size=None, 
                 recurrent_dropout=0.1, crf=False):
    """Create a model for both POS and NER tagging.  Can try CRF but if that doesn't work,
       can default back to softmax"""

    #Layers 1
    word_input = Input(batch_shape=(batch_size, seq_input_len), name='word_input_layer')
    tag_input = Input(batch_shape=(batch_size, seq_input_len), name='tag_input_layer')

    #Layers 2
    #mask_zero will ignore 0 padding
    word_embeddings = Embedding(input_dim=n_word_input_nodes,
                                output_dim=n_word_embedding_nodes, 
                                mask_zero=True, name='word_embedding_layer')(word_input) 
    #Output shape = (batch_size, seq_input_len, n_word_embedding_nodes)
    tag_embeddings = Embedding(input_dim=n_tag_input_nodes,
                               output_dim=n_tag_embedding_nodes,
                               mask_zero=True,
                               name='tag_embedding_layer')(tag_input) 
    #Output shape = (batch_size, seq_input_len, n_tag_embedding_nodes)

    #Layer 3
    merged_embeddings = concatenate([word_embeddings, tag_embeddings], name='concat_embedding_layer')
    #Output shape =  (batch_size, seq_input_len, n_word_embedding_nodes + n_tag_embedding_nodes)

    #Layer 4
    hidden_layer = Bidirectional(LSTM(units=n_RNN_nodes, return_sequences=True, 
                                      recurrent_dropout=recurrent_dropout,
                                     stateful=stateful, name='hidden_layer'))(merged_embeddings)
    #Output shape = (batch_size, seq_input_len, n_hidden_nodes)

    #Layer 5
    dense_layer = TimeDistributed(Dense(units=n_dense_nodes, activation='relu'), name='dense_layer')(hidden_layer)

    #Layer 6
    if crf:
        crf = CRF(units=n_tag_input_nodes, sparse_target=True, name='output_layer')
        output_layer = crf(dense_layer)
        loss = crf.loss_function
        acc = crf.accuracy
    else:
        output_layer = TimeDistributed(Dense(units=n_tag_input_nodes, activation='softmax'), 
                                       name='output_layer')(hidden_layer)
        loss = "sparse_categorical_crossentropy" 
        acc = 'acc'
    # Output shape = (batch_size, seq_input_len, n_tag_input_nodes)

    #Specify which layers are input and output, compile model with loss and optimization functions
    model = Model(inputs=[word_input, tag_input], outputs=output_layer)
#     model = Model(inputs=[word_input], outputs=output_layer)
    model.compile(loss=loss, optimizer="adam", metrics=[acc])

    return model 

In [33]:
def pad_idx_seqs(idx_seqs, max_seq_len, value=0.0):
    # Keras provides a convenient padding function; 
    padded_idxs = pad_sequences(sequences=idx_seqs, maxlen=max_seq_len, 
                                value=value)
    return padded_idxs

def get_max_seq_len(sents):
    return max([len(idx_seq) for idx_seq in sents])

In [34]:
def run_training_model(pad_words, pad_tags, y_dat, saveName, lexicon, 
                       batch_size=128, epochs=15, val_split=.1, 
                       print_summary=False, savePath='models', 
                       n_word_embedding_nodes=n_word_embedding_nodes,
                       n_tag_embedding_nodes=n_tag_embedding_nodes,
                       n_RNN_nodes=n_RNN_nodes, 
                       n_dense_nodes=n_dense_nodes,
                       crf=False):    
    """ Builds and fits a model"""
    
    model = create_model(seq_input_len=pad_words.shape[-1] - 1, #substract 1 from matrix length because of offset
                             n_word_input_nodes=len(lexicon.words_lexicon) + 1, #Add one for 0 padding
                             n_tag_input_nodes=len(lexicon.tags_lexicon) + 1, #Add one for 0 padding
                             n_word_embedding_nodes=n_word_embedding_nodes,
                             n_tag_embedding_nodes=n_tag_embedding_nodes,
                             n_RNN_nodes=n_RNN_nodes, 
                             n_dense_nodes=n_dense_nodes,
                             crf=crf)
        
    if print_summary:
        model.summary()

    filepath = os.path.join(savePath, saveName + '.hdf5')
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1)
    callbacks_list = [checkpoint]

    '''Train the model'''

    # output matrix (y) has extra 3rd dimension added because sparse cross-entropy 
    # function requires one label per row
    model.fit(x=[pad_words[:,1:], pad_tags[:,:-1]], 
#     model.fit(x=[pad_words[:,1:]], 
              y=pad_tags[:, 1:, None], batch_size=batch_size, 
              epochs=epochs, validation_split=val_split,
              callbacks=callbacks_list)

    return model

In [35]:
def create_test_model(loadName, lexicon, loadPath='models',
                      n_word_embedding_nodes=n_word_embedding_nodes,
                      n_tag_embedding_nodes=n_tag_embedding_nodes,
                      n_RNN_nodes=n_RNN_nodes, 
                      n_dense_nodes=n_dense_nodes, crf=False):
    """ Loads a model to predict new data"""
    
    model = create_model(seq_input_len=1,
                             n_word_input_nodes=len(lexicon.words_lexicon) + 1, #Add one for 0 padding
                             n_tag_input_nodes=len(lexicon.tags_lexicon) + 1, #Add one for 0 padding
                             n_word_embedding_nodes=n_word_embedding_nodes,
                             n_tag_embedding_nodes=n_tag_embedding_nodes,
                             n_RNN_nodes=n_RNN_nodes, 
                             n_dense_nodes=n_dense_nodes,
                             stateful=True, batch_size=1,
                             crf=crf)

    model.load_weights(os.path.join(loadPath, loadName + '.hdf5'))
    return model

In [36]:
def predict_new_tag(predictor_model, test, lexicon):
    """Predict tags for new data"""
    pred_tags = []
    for _, sent in test.iterrows():
        tok_sent = sent['sents']
        sent_idxs = sent['sent_indx']
        sent_pred_tags = []
#         prev_tag = 1  #initialize predicted tag sequence with padding
        prev_tag = 0  #initialize predicted tag sequence with padding
        for cur_word in sent_idxs:
            # cur_word and prev_tag are just integers, but the model expects an input array
            # with the shape (batch_size, seq_input_len), so prepend two dimensions to these values
#             p_next_tag = predictor_model.predict(x=[np.array(cur_word)[None, None]])[0]            
            p_next_tag = predictor_model.predict(x=[np.array(cur_word)[None, None],
                                                    np.array(prev_tag)[None, None]])[0]
            prev_tag = np.argmax(p_next_tag, axis=-1)[0]
            sent_pred_tags.append(prev_tag)
        predictor_model.reset_states()

        #Map tags back to string labels
        sent_pred_tags = [lexicon.indx_to_tags_dict[tag] for tag in sent_pred_tags]
        pred_tags.append(sent_pred_tags) #filter padding 

    return pred_tags

def evaluate_model(pred_tags, test, print_sample=False):
    """Evaluate predictions against a test set"""
    
    test = test.copy()
    test['predicted_tags'] = pred_tags
    
    if print_sample:
        for _, sent in test.sample(n=10).iterrows():
            print("SENTENCE:\t{}".format("\t".join(sent['sents'])))
            print("PREDICTED:\t{}".format("\t".join(sent['predicted_tags'])))
            print("GOLD:\t\t{}".format("\t".join(sent['tags'])))
            print("CORRECT:\t{}".format("\t".join([str(x) for x in np.array(sent['tags']) == np.array(sent['predicted_tags'])])), "\n\n")

    
    all_gold_tags = [tag for sent_tags in test['tags'] for tag in sent_tags]
    all_pred_tags = [tag for sent_tags in test['predicted_tags'] for tag in sent_tags]
    accuracy = accuracy_score(y_true=all_gold_tags, y_pred=all_pred_tags)
    precision = precision_score(y_true=all_gold_tags, y_pred=all_pred_tags, average='weighted')
    recall = recall_score(y_true=all_gold_tags, y_pred=all_pred_tags, average='weighted')
    f1 = f1_score(y_true=all_gold_tags, y_pred=all_pred_tags, average='weighted')

    print("ACCURACY: {:.3f}".format(accuracy))
    print("PRECISION: {:.3f}".format(precision))
    print("RECALL: {:.3f}".format(recall))
    print("F1: {:.3f}".format(f1))
    

In [37]:
# Create lexicon
lexicon = lexiconTransformer(words_min_freq=2, unknown_tag_token='OTHER')

lexicon.fit(train.sents, train.tags)

train['sent_indx'], train['tag_indx'] = lexicon.transform(train.sents, train.tags)

# Get length of longest sequence
max_seq_len = get_max_seq_len(train['sent_indx'])

#Add one to max length for offsetting sequence by 1
train_padded_words = pad_idx_seqs(train['sent_indx'], 
                                  max_seq_len + 1) 

train_padded_tags = pad_idx_seqs(train['tag_indx'],
                                 max_seq_len + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [38]:
# Convert to one-hot vector encoding for y
train_y = [to_categorical(i, num_classes=len(lexicon.tags_lexicon) + 1) for i in train_padded_tags]

In [41]:
mod_save_name = 'ingredient_model_softmax'
crf_mod = False
ingredient_model = run_training_model(train_padded_words, train_padded_tags, 
                                      train_y, mod_save_name, lexicon, crf=crf_mod,
                                      print_summary=True, batch_size=256, epochs=5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input_layer (InputLayer)   (None, 56)           0                                            
__________________________________________________________________________________________________
tag_input_layer (InputLayer)    (None, 56)           0                                            
__________________________________________________________________________________________________
word_embedding_layer (Embedding (None, 56, 300)      1687200     word_input_layer[0][0]           
__________________________________________________________________________________________________
tag_embedding_layer (Embedding) (None, 56, 150)      1500        tag_input_layer[0][0]            
__________________________________________________________________________________________________
concat_emb

KeyboardInterrupt: 

In [109]:
test['sent_indx'], test['tag_indx'] = lexicon.transform(test.sents, test.tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [104]:
test_mod = create_test_model(mod_save_name, lexicon, crf=crf_mod)

In [112]:
preds = predict_new_tag(test_mod, test, lexicon)

In [131]:
evaluate_model(preds, test, print_sample=True)

SENTENCE:	Salt	and	freshly	ground	pepper	to	taste
PREDICTED:	B-NAME	OTHER	B-COMMENT	B-COMMENT	B-NAME	OTHER	OTHER
GOLD:		B-NAME	I-NAME	B-COMMENT	I-COMMENT	B-NAME	B-COMMENT	I-COMMENT
CORRECT:	True	False	True	False	True	False	False 


SENTENCE:	6	plum	tomatoes	,	stems	removed	and	cut	in	quarters
PREDICTED:	B-QTY	B-NAME	B-NAME	OTHER	B-NAME	B-NAME	OTHER	OTHER	OTHER	B-COMMENT
GOLD:		B-QTY	B-NAME	I-NAME	OTHER	B-COMMENT	I-COMMENT	I-COMMENT	I-COMMENT	I-COMMENT	I-COMMENT
CORRECT:	True	True	False	True	False	False	False	False	False	False 


SENTENCE:	2	teaspoons	chopped	fresh	mint
PREDICTED:	B-QTY	B-UNIT	OTHER	B-NAME	B-NAME
GOLD:		B-QTY	B-UNIT	B-COMMENT	I-COMMENT	B-NAME
CORRECT:	True	True	False	False	True 


SENTENCE:	2	to	3	teaspoons	minced	fresh	ginger	(	to	taste	)
PREDICTED:	B-QTY	OTHER	B-QTY	B-UNIT	B-NAME	B-NAME	B-NAME	OTHER	OTHER	OTHER	OTHER
GOLD:		OTHER	OTHER	B-QTY	OTHER	B-NAME	I-NAME	I-NAME	OTHER	OTHER	OTHER	OTHER
CORRECT:	False	True	True	False	True	False	False	True	True	True	True 


SENTEN

  'precision', 'predicted', average, warn_for)


ACCURACY: 0.521
PRECISION: 0.539
RECALL: 0.521
F1: 0.421


  'precision', 'predicted', average, warn_for)
