## Ingredient Phrase Model

This notebook builds a model that tags each word of an ingredient phrase with a Named Entity Recognition label, so that the food name, the quantity and other information can be identified separately.

In [18]:
import pandas as pd
import numpy as np
import os
import pickle

# Generate training data using NY Times ingredient phrase tagger
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training.cli import utils
from sklearn.model_selection import train_test_split

# Model libraries
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, concatenate, Concatenate, TimeDistributed, Dense, Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras_contrib.layers import CRF
from keras.optimizers import Adam
from keras import regularizers
from keras.callbacks import ModelCheckpoint

In [2]:
# Location of the raw NY Times ingredient snapshot
dataPath = '../data'
datafileNm = os.path.join(dataPath, 'nyt-ingredients-snapshot-2015.csv')

# Load the CSV and discard its 'index' column in a single expression
nytData = pd.read_csv(datafileNm, index_col=None).drop(columns='index')

In [3]:
def reshape_data(tbl):
    """Collapse a token-level table into one row per ingredient phrase.

    Args:
        tbl: DataFrame with the columns 'index', 'token' and 'tag'. A row
            whose 'index' value is 'I1' marks the first token of a phrase.
            The frame is not modified.

    Returns:
        DataFrame with the columns 'sents' and 'tags', each cell holding the
        list of tokens / tags of one phrase. The index is the MultiIndex
        produced by ``groupby('sent').apply`` (phrase number, 0).
    """
    # Number the phrases: every 'I1' row opens a new one.
    starts = tbl['index'] == 'I1'
    counter = starts.cumsum()
    # Rows that precede the first 'I1' get NaN and are dropped by groupby.
    # The float cast keeps the group keys as floats (0.0, 1.0, ...).
    sent_ids = (counter - 1).where(counter > 0).astype(float)

    def reshape_recipe(recipe):
        # One single-row frame whose cells are the phrase's token/tag lists
        return pd.DataFrame({'sents': [list(recipe['token'])],
                             'tags': [list(recipe['tag'])]})

    # assign() works on a copy, so the caller's frame gains no 'sent' column
    work = tbl.assign(sent=sent_ids)
    # Selecting the payload columns keeps the grouping key out of apply()
    return work.groupby('sent')[['token', 'tag']].apply(reshape_recipe)
                               

In [4]:
# Generate training data from NY Times Ingredient Tagging Model.
# The tagger's Cli object is built from the raw table and its `.df` attribute
# is handed to reshape_data, which reads the 'index', 'token' and 'tag'
# columns (presumably one row per token -- TODO confirm against Cli).
cleaned_dat = reshape_data(Cli(nytData).df)

  return _compile(pattern, flags).split(string, maxsplit)


In [5]:
# Hold out 20% of the phrases for testing. No random_state is passed, so the
# split is not pinned to a seed.
train, test = train_test_split(cleaned_dat, test_size = .2)

In [83]:
class lexiconTransformer():
    """Build word/tag lexicons and map token sequences to integer indexes.

    Index 0 is left free for padding and index 1 is the ``<UNK>`` entry;
    real tokens are numbered from 2 upwards.

    Args:
        words_min_freq: minimum number of occurrences for a word to get its
            own index.
        tags_min_freq: minimum number of occurrences for a tag to get its
            own index.
        savePath: directory the lexicons are pickled to and loaded from.
    """

    def __init__(self, words_min_freq = 1, tags_min_freq = 1, savePath = 'models'):
        self.words_min_freq = words_min_freq
        self.tags_min_freq = tags_min_freq
        # token -> index dictionaries, filled by fit() or load_lexicon()
        self.words_lexicon = None
        self.tags_lexicon = None
        # index -> token dictionaries, filled by make_lexicon_reverse()
        self.indx_to_words_dict = None
        self.indx_to_tags_dict = None
        self.savePath = savePath

    def fit(self, sents, tags):
        """Create both lexicons, their reverse lookups, and pickle them.

        Note that fitting writes ``words_lexicon.pkl`` and
        ``tags_lexicon.pkl`` into ``savePath`` as a side effect.
        """
        self.make_words_lexicon(sents)
        self.make_tags_lexicon(tags)

        self.make_lexicon_reverse()
        self.save_lexicon()

    def transform(self, sents, tags):
        """Return ``(sentence_indexes, tag_indexes)`` for the given sequences."""
        sents_indxs = self.tokens_to_idxs(sents, self.words_lexicon)
        tags_indxs = self.tokens_to_idxs(tags, self.tags_lexicon)
        return (sents_indxs, tags_indxs)

    def fit_transform(self, sents, tags):
        """Fit on the sequences, then transform those same sequences."""
        self.fit(sents, tags)
        return self.transform(sents, tags)

    def make_words_lexicon(self, sents_token):
        """Wrapper for words lexicon"""
        self.words_lexicon = self.make_lexicon(sents_token, self.words_min_freq)

    def make_tags_lexicon(self, tags_token):
        """Wrapper for tags lexicon"""
        self.tags_lexicon = self.make_lexicon(tags_token, self.tags_min_freq)

    def make_lexicon(self, token_seqs, min_freq=1):
        """Create a token -> index lexicon from sequences of tokens.

        Tokens seen fewer than ``min_freq`` times get no entry of their own
        and are later mapped to ``<UNK>`` by tokens_to_idxs.
        """
        # Count how often each token appears in the text.
        token_counts = {}
        for seq in token_seqs:
            for token in seq:
                token_counts[token] = token_counts.get(token, 0) + 1

        # Keep the tokens that are frequent enough, in first-seen order.
        kept = [token for token, count in token_counts.items() if count >= min_freq]
        # Indices start at 2: 0 is reserved for padding, 1 for unknown words.
        lexicon = {token: idx + 2 for idx, token in enumerate(kept)}
        lexicon[u'<UNK>'] = 1
        return lexicon

    def save_lexicon(self):
        """Pickle both lexicons into ``savePath`` (created if missing)."""
        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)
        with open(os.path.join(self.savePath, 'words_lexicon.pkl'), 'wb') as f:
            pickle.dump(self.words_lexicon, f)

        with open(os.path.join(self.savePath, 'tags_lexicon.pkl'), 'wb') as f:
            pickle.dump(self.tags_lexicon, f)

    def load_lexicon(self):
        """Load both pickled lexicons from ``savePath`` and rebuild lookups."""
        # NOTE: unpickling can execute arbitrary code; only point savePath
        # at files this class wrote itself.
        with open(os.path.join(self.savePath, 'words_lexicon.pkl'), 'rb') as f:
            self.words_lexicon = pickle.load(f)

        with open(os.path.join(self.savePath, 'tags_lexicon.pkl'), 'rb') as f:
            self.tags_lexicon = pickle.load(f)

        self.make_lexicon_reverse()

    def make_lexicon_reverse(self):
        """Build the index -> token dictionaries for words and tags."""
        self.indx_to_words_dict = self.get_lexicon_lookup(self.words_lexicon)
        self.indx_to_tags_dict = self.get_lexicon_lookup(self.tags_lexicon)

    def get_lexicon_lookup(self, lexicon):
        '''Make a dictionary where the string representation of 
           a lexicon item can be retrieved from its numerical index'''
        return {idx: lexicon_item for lexicon_item, idx in lexicon.items()}

    def tokens_to_idxs(self, token_seqs, lexicon):
        """Transform tokens to numeric indexes or <UNK> if doesn't exist"""
        unk_idx = lexicon['<UNK>']
        return [[lexicon.get(token, unk_idx) for token in token_seq]
                for token_seq in token_seqs]

def create_crf_model(seq_input_len, n_word_input_nodes, n_tag_input_nodes, 
                     n_word_embedding_nodes, n_tag_embedding_nodes, n_RNN_nodes, 
                     n_dense_nodes, stateful=False, batch_size=None):
    """Assemble and compile the two-input BiLSTM + CRF tagging network.

    The network embeds a word-index sequence and a tag-index sequence,
    concatenates the two embeddings, runs them through a bidirectional LSTM
    and a per-timestep dense layer, and finishes with a CRF output layer.

    Args:
        seq_input_len: length of each input sequence.
        n_word_input_nodes: input_dim of the word embedding.
        n_tag_input_nodes: input_dim of the tag embedding and number of CRF
            units.
        n_word_embedding_nodes: output_dim of the word embedding.
        n_tag_embedding_nodes: output_dim of the tag embedding.
        n_RNN_nodes: units of the LSTM wrapped by Bidirectional.
        n_dense_nodes: units of the time-distributed dense layer.
        stateful: forwarded to the LSTM.
        batch_size: first entry of the inputs' batch_shape.

    Returns:
        The compiled Keras Model (CRF loss, "rmsprop", CRF accuracy metric).
    """
    # Both inputs share one (batch_size, seq_input_len) shape
    input_shape = (batch_size, seq_input_len)
    word_input = Input(batch_shape=input_shape, name='word_input_layer')
    tag_input = Input(batch_shape=input_shape, name='tag_input_layer')

    # Embeddings; mask_zero=True makes index 0 (padding) be ignored
    word_embedder = Embedding(input_dim=n_word_input_nodes,
                              output_dim=n_word_embedding_nodes,
                              mask_zero=True, name='word_embedding_layer')
    word_embeddings = word_embedder(word_input)
    tag_embedder = Embedding(input_dim=n_tag_input_nodes,
                             output_dim=n_tag_embedding_nodes,
                             mask_zero=True, name='tag_embedding_layer')
    tag_embeddings = tag_embedder(tag_input)

    # Join word and tag embeddings along the feature axis
    merged_embeddings = concatenate([word_embeddings, tag_embeddings],
                                    name='concat_embedding_layer')

    # Recurrent encoder that emits one vector per timestep
    recurrent = LSTM(units=n_RNN_nodes, return_sequences=True,
                     stateful=stateful, name='hidden_layer')
    hidden_layer = Bidirectional(recurrent)(merged_embeddings)

    # The same ReLU dense projection is applied at every timestep
    per_step_dense = TimeDistributed(Dense(units=n_dense_nodes, activation='relu'),
                                     name='dense_layer')
    dense_layer = per_step_dense(hidden_layer)

    # CRF output over the tag vocabulary
    crf = CRF(units=n_tag_input_nodes, sparse_target=True, name='output_layer')
    output_layer = crf(dense_layer)

    # Wire inputs to output and compile with the CRF's own loss and metric
    model = Model(inputs=[word_input, tag_input], outputs=output_layer)
    model.compile(loss=crf.loss_function, optimizer="rmsprop", metrics=[crf.accuracy])
    return model

In [30]:
def pad_idx_seqs(idx_seqs, max_seq_len, value=0.0):
    """Pad every index sequence to ``max_seq_len`` using ``value``.

    Thin wrapper around Keras' pad_sequences; its remaining options are left
    at their defaults.
    """
    return pad_sequences(sequences=idx_seqs, maxlen=max_seq_len, value=value)

def get_max_seq_len(sents):
    """Return the length of the longest sequence in ``sents``.

    An empty collection yields 0 instead of raising ValueError.
    """
    return max((len(idx_seq) for idx_seq in sents), default=0)

In [76]:
def run_training_model(pad_words, pad_tags, y_dat, saveName, 
                       batch_size=128, epochs=15, val_split=.1, 
                       print_summary=False, savePath='models'):
    """Build a CRF tagger with create_crf_model and fit it.

    The model sees the word at each position together with the tag of the
    previous position (inputs are offset by one), and is trained to output
    the tag at the current position.

    This function reads the module-level names ``lexicon``,
    ``n_word_embedding_nodes``, ``n_tag_embedding_nodes``, ``n_RNN_nodes``
    and ``n_dense_nodes``; they must be defined before it is called.

    Args:
        pad_words: 2-D array of padded word indexes.
        pad_tags: 2-D array of padded tag indexes, same shape as pad_words.
        y_dat: training targets aligned with pad_tags. Accepted forms are
            integer labels (samples, steps) or (samples, steps, 1), or a
            one-hot encoding (samples, steps, n_tags), which is converted to
            integer labels.
        saveName: checkpoint file name without extension.
        batch_size: batch size passed to fit.
        epochs: number of epochs passed to fit.
        val_split: validation_split passed to fit.
        print_summary: print the model summary before training.
        savePath: directory the '.hdf5' checkpoint is written to.

    Returns:
        The fitted Keras model.
    """
    model = create_crf_model(seq_input_len=pad_words.shape[-1] - 1, #substract 1 from matrix length because of offset
                             n_word_input_nodes=len(lexicon.words_lexicon) + 1, #Add one for 0 padding
                             n_tag_input_nodes=len(lexicon.tags_lexicon) + 1, #Add one for 0 padding
                             n_word_embedding_nodes=n_word_embedding_nodes,
                             n_tag_embedding_nodes=n_tag_embedding_nodes,
                             n_RNN_nodes=n_RNN_nodes, 
                             n_dense_nodes=n_dense_nodes)

    if print_summary:
        model.summary()

    # The checkpoint callback cannot write into a missing directory
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    filepath = os.path.join(savePath, saveName + '.hdf5')
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1)
    callbacks_list = [checkpoint]

    # create_crf_model builds its CRF with sparse_target=True, i.e. the loss
    # expects one integer label per timestep with a trailing axis of size 1.
    # One-hot targets are therefore collapsed back to label indexes here.
    targets = np.asarray(y_dat)
    if targets.ndim == 2:
        targets = targets[:, :, np.newaxis]
    elif targets.shape[-1] != 1:
        targets = targets.argmax(axis=-1)[:, :, np.newaxis]

    # Train the model: words/targets drop the first step and the tag input
    # drops the last one, so step t pairs word t with tag t-1.
    model.fit(x=[pad_words[:,1:], pad_tags[:,:-1]], 
              y=targets[:, 1:, :], batch_size=batch_size, 
              epochs=epochs, validation_split=val_split,
              callbacks=callbacks_list)

    return model

In [62]:
# Some default parameters
n_word_embedding_nodes=300
n_tag_embedding_nodes=150
n_RNN_nodes=400
n_dense_nodes=200

# Words seen fewer than 2 times in the training split are mapped to <UNK>
lexicon = lexiconTransformer(words_min_freq=2)

lexicon.fit(train.sents, train.tags)

# `train` comes out of train_test_split as a slice of cleaned_dat; take an
# explicit copy so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
train = train.copy()
train['sent_indx'], train['tag_indx'] = lexicon.transform(train.sents, train.tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [63]:
# Get length of longest sequence
max_seq_len = get_max_seq_len(train['sent_indx'])

# Add one to max length for offsetting sequence by 1: run_training_model
# slices one column off each matrix (words [:, 1:], tags [:, :-1]), which
# leaves max_seq_len columns for the model.
train_padded_words = pad_idx_seqs(train['sent_indx'], 
                                  max_seq_len + 1) 

# Tags are padded to the same width so both matrices line up
train_padded_tags = pad_idx_seqs(train['tag_indx'],
                                 max_seq_len + 1)

In [68]:
# Convert to one-hot vector encoding for y; the class count is the tag
# lexicon size plus one for the 0 padding index.
# NOTE(review): create_crf_model builds its CRF with sparse_target=True --
# confirm that one-hot targets are what that layer expects.
train_y = [to_categorical(i, num_classes=len(lexicon.tags_lexicon) + 1) for i in train_padded_tags]

In [84]:
# Train the ingredient tagger for 20 epochs with batches of 64, printing the
# layer summary first. With run_training_model's default savePath the
# checkpoint goes to models/ingredient_model.hdf5.
ingredient_model = run_training_model(train_padded_words, train_padded_tags, 
                                  train_y, 'ingredient_model', print_summary=True, 
                                  batch_size=64, epochs=20)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input_layer (InputLayer)   (None, 56)           0                                            
__________________________________________________________________________________________________
tag_input_layer (InputLayer)    (None, 56)           0                                            
__________________________________________________________________________________________________
word_embedding_layer (Embedding (None, 56, 300)      1694700     word_input_layer[0][0]           
__________________________________________________________________________________________________
tag_embedding_layer (Embedding) (None, 56, 150)      1650        tag_input_layer[0][0]            
__________________________________________________________________________________________________
concat_emb

KeyboardInterrupt: 

In [13]:
def create_model(seq_input_len, n_word_input_nodes, n_tag_input_nodes, n_word_embedding_nodes,
                 n_tag_embedding_nodes, n_hidden_nodes, n_dense_nodes, 
                 stateful=False, batch_size=None):
    """Assemble and compile the BiLSTM + CRF network with a marginal-mode CRF.

    Two index sequences (words and tags) are embedded, concatenated, encoded
    by a bidirectional LSTM, projected by a per-timestep dense layer and
    decoded by a CRF created with learn_mode='marginal'.

    Args:
        seq_input_len: length of each input sequence.
        n_word_input_nodes: input_dim of the word embedding.
        n_tag_input_nodes: input_dim of the tag embedding and number of CRF
            units.
        n_word_embedding_nodes: output_dim of the word embedding.
        n_tag_embedding_nodes: output_dim of the tag embedding.
        n_hidden_nodes: units of the LSTM wrapped by Bidirectional.
        n_dense_nodes: units of the time-distributed dense layer.
        stateful: forwarded to the LSTM.
        batch_size: first entry of the inputs' batch_shape.

    Returns:
        The compiled Keras Model (CRF loss, "rmsprop", CRF accuracy metric).
    """
    # Both inputs share one (batch_size, seq_input_len) shape
    input_shape = (batch_size, seq_input_len)
    word_input = Input(batch_shape=input_shape, name='word_input_layer')
    tag_input = Input(batch_shape=input_shape, name='tag_input_layer')

    # Embeddings; mask_zero=True makes index 0 (padding) be ignored
    word_embedder = Embedding(input_dim=n_word_input_nodes,
                              output_dim=n_word_embedding_nodes,
                              mask_zero=True, name='word_embedding_layer')
    word_embeddings = word_embedder(word_input)
    tag_embedder = Embedding(input_dim=n_tag_input_nodes,
                             output_dim=n_tag_embedding_nodes,
                             mask_zero=True, name='tag_embedding_layer')
    tag_embeddings = tag_embedder(tag_input)

    # Join word and tag embeddings along the feature axis.
    # (The functional concatenate() is used here; the Concatenate(axis=-1)
    # layer class was the alternative spelling the author left behind.)
    merged_embeddings = concatenate([word_embeddings, tag_embeddings],
                                    name='concat_embedding_layer')

    # Recurrent encoder that emits one vector per timestep.
    # (An L2-regularised GRU was the alternative the author left behind.)
    recurrent = LSTM(units=n_hidden_nodes, return_sequences=True,
                     stateful=stateful, name='hidden_layer')
    hidden_layer = Bidirectional(recurrent)(merged_embeddings)

    # The same ReLU dense projection is applied at every timestep
    per_step_dense = TimeDistributed(Dense(units=n_dense_nodes, activation='relu'),
                                     name='dense_layer')
    dense_layer = per_step_dense(hidden_layer)

    # CRF output fed from the dense layer (not directly from the LSTM)
    crf = CRF(units=n_tag_input_nodes, learn_mode='marginal', sparse_target=True, name='output_layer')
    output_layer = crf(dense_layer)

    # Wire inputs to output and compile with the CRF's own loss and metric.
    # (An Adam optimizer with clipvalue/clipnorm of 1 was the alternative the
    # author left behind; "rmsprop" is what is compiled.)
    model = Model(inputs=[word_input, tag_input], outputs=output_layer)
    model.compile(loss=crf.loss_function, optimizer="rmsprop", metrics=[crf.accuracy])
    return model



#     output_layer = TimeDistributed(Dense(units=n_tag_input_nodes, 
#                                          activation='softmax'), name='output_layer')(hidden_layer)
#     # Output shape = (batch_size, seq_input_len, n_tag_input_nodes)
    
#     #Specify which layers are input and output, compile model with loss and optimization functions
#     model = Model(inputs=[word_input, tag_input], outputs=output_layer)
#     model.compile(loss="sparse_categorical_crossentropy",
#                   optimizer='adam', metrics=['accuracy'])
#     return model


In [19]:
# Reload the pickled lexicons (the same paths lexiconTransformer.save_lexicon
# writes to with its default savePath).
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('models/words_lexicon.pkl', 'rb') as f:
    words_lexicon = pickle.load(f)
    
with open('models/tags_lexicon.pkl', 'rb') as f:
    tags_lexicon = pickle.load(f)

# index -> tag lookup. get_lexicon_lookup only exists as a method of
# lexiconTransformer, so the inversion is done inline here.
tags_lexicon_lookup = {idx: tag for tag, idx in tags_lexicon.items()}

# Stateful predictor that consumes one token at a time (sequence length 1,
# batch size 1). The LSTM width must match the trained model, so the
# n_RNN_nodes setting is reused; `n_hidden_nodes` is not defined anywhere in
# this notebook.
predictor_model = create_model(seq_input_len=1,
                               n_word_input_nodes=len(words_lexicon) + 1,
                               n_tag_input_nodes=len(tags_lexicon) + 1,
                               n_word_embedding_nodes=n_word_embedding_nodes,
                               n_tag_embedding_nodes=n_tag_embedding_nodes,
                               n_hidden_nodes=n_RNN_nodes, 
                               n_dense_nodes=n_dense_nodes,
                               stateful=True,
                               batch_size=1)

#Transfer the weights from the trained model
# NOTE(review): this file name differs from the 'ingredient_model' checkpoint
# trained earlier in the notebook -- confirm it is the intended weights file.
predictor_model.load_weights('./models/ner_temp_model_weights-15-0.0000.hdf5')

LEXICON LOOKUP SAMPLE:
{2: 'B-per', 3: 'I-per', 4: 'O', 5: 'B-gpe', 6: 'B-tim', 7: 'I-tim', 8: 'B-org', 9: 'B-geo', 10: 'I-org', 11: 'B-art', 12: 'I-geo', 13: 'B-eve', 14: 'I-eve', 15: 'I-gpe', 16: 'I-art', 17: 'B-nat', 18: 'I-nat', 1: '<UNK>'}
