In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import re

# Generate training data using NY Times ingredient phrase tagger
from ingredient_phrase_tagger.training.cli import Cli
from ingredient_phrase_tagger.training import utils, reshape

from sklearn.model_selection import train_test_split

# Model libraries
from tagger_model import *

from IPython.core.debugger import set_trace

  (fname, cnt))
  (fname, cnt))
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Default layer sizes, passed to both the training and the test model builders below.
# Embedding widths for words and tags
n_word_embedding_nodes, n_tag_embedding_nodes = 300, 150
# Widths of the recurrent layer and of the dense layer
n_RNN_nodes, n_dense_nodes = 400, 200

# Relative path of the directory that holds the input CSV
dataPath = '../data'

In [3]:
filename = 'ner_dataset.csv'
# Read the corpus; each row carries (at least) 'Sentence #', 'Word' and 'POS' columns.
data = pd.read_csv(os.path.join(dataPath, filename), encoding="latin1")
# Forward-fill missing cells in every column. DataFrame.ffill() is the
# non-deprecated spelling of fillna(method="ffill") and gives the same result.
# NOTE(review): presumably 'Sentence #' is only populated on the first token of
# each sentence, which is why the fill is needed -- confirm against the CSV.
data = data.ffill()


def _sentence_to_row(group):
    """Collapse the token rows of one sentence into a single-row DataFrame.

    The returned frame has a 'sents' cell holding the list of words and a
    'tags' cell holding the list of POS tags for that sentence.
    """
    return pd.DataFrame(data={'sents': [group.Word.tolist()],
                              'tags': [group.POS.tolist()]})


# Reformat data so that each sentence becomes one row holding a word list and a
# tag list; sort=False keeps the groups in order of first appearance.
cleaned_dat = data.groupby('Sentence #', sort=False).apply(_sentence_to_row)

In [4]:
# Hold out 20% of the sentences; random_state pins the shuffle so the split is repeatable.
# Each part is copied so that adding columns to `train` / `test` in later cells
# operates on an independent frame instead of triggering pandas' SettingWithCopyWarning.
train, test = (
    part.copy()
    for part in train_test_split(cleaned_dat, test_size=.2, random_state=10)
)

In [5]:
# Fit the lexicon on the training split only.
# NOTE(review): the *_min_freq=2 arguments presumably send rarer items to '<UNK>' -- confirm in lexiconTransformer.
lexicon = lexiconTransformer(words_min_freq=2, tags_min_freq=2)
lexicon.fit(train['sents'], train['tags'])

# Store the transformed (index) form of each sentence and of its tags alongside the raw lists
train['sent_indx'], train['tag_indx'] = lexicon.transform(train['sents'], train['tags'])

# Length of the longest training sequence
max_seq_len = get_max_seq_len(train['sent_indx'])

# Pad to one position more than the longest sequence (room for offsetting a sequence by 1)
train_padded_words = pad_idx_seqs(train['sent_indx'], max_seq_len + 1)
train_padded_tags = pad_idx_seqs(train['tag_indx'], max_seq_len + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
# Display the tag -> integer index mapping held by the fitted lexicon
lexicon.tags_lexicon

{'$': 31,
 ',': 22,
 '.': 10,
 ':': 41,
 ';': 40,
 '<UNK>': 1,
 'CC': 4,
 'CD': 17,
 'DT': 8,
 'EX': 34,
 'IN': 7,
 'JJ': 5,
 'JJR': 35,
 'JJS': 27,
 'LRB': 38,
 'MD': 29,
 'NN': 9,
 'NNP': 2,
 'NNPS': 28,
 'NNS': 3,
 'PDT': 36,
 'POS': 18,
 'PRP': 23,
 'PRP$': 21,
 'RB': 13,
 'RBR': 30,
 'RBS': 33,
 'RP': 11,
 'RRB': 39,
 'TO': 20,
 'UH': 42,
 'VB': 14,
 'VBD': 6,
 'VBG': 12,
 'VBN': 16,
 'VBP': 24,
 'VBZ': 19,
 'WDT': 32,
 'WP': 25,
 'WP$': 37,
 'WRB': 15,
 '``': 26}

In [7]:
# Convert to one-hot vector encoding for y
# train_y = [to_categorical(i, num_classes=len(lexicon.tags_lexicon) + 1) for i in train_padded_tags]

In [8]:
# Passed as crf= to both the training and the test model builders
crf_mod = True
# Base name under which the model is saved
mod_save_name = 'pos_model_crf_wordOnly_upd1'

In [9]:
# Train the tagger with the layer sizes from the defaults cell.
# NOTE(review): train_padded_tags is passed as both the second and the third
# positional argument -- presumably once as an input and once as the target;
# confirm against run_training_model's signature.
pos_model = run_training_model(
    train_padded_words,
    train_padded_tags,
    train_padded_tags,
    mod_save_name,
    lexicon,
    crf=crf_mod,
    print_summary=True,
    batch_size=256,
    epochs=100,
    n_word_embedding_nodes=n_word_embedding_nodes,
    n_tag_embedding_nodes=n_tag_embedding_nodes,
    n_RNN_nodes=n_RNN_nodes,
    n_dense_nodes=n_dense_nodes,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_input_layer (InputLayer (None, 81)                0         
_________________________________________________________________
word_embedding_layer (Embedd (None, 81, 300)           5481900   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 81, 800)           1682400   
_________________________________________________________________
dense_layer (TimeDistributed (None, 81, 200)           160200    
_________________________________________________________________
output_layer (CRF)           (None, 81, 43)            10578     
Total params: 7,335,078
Trainable params: 7,335,078
Non-trainable params: 0
_________________________________________________________________
Train on 30693 samples, validate on 7674 samples
Epoch 1/100

Epoch 00001: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
E

Epoch 35/100

Epoch 00035: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 36/100

Epoch 00036: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 37/100

Epoch 00037: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 38/100

Epoch 00038: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 39/100

Epoch 00039: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 40/100

Epoch 00040: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 41/100

Epoch 00041: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 42/100

Epoch 00042: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 43/100

Epoch 00043: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 44/100

Epoch 00044: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 45/100

Epoch 00045: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoch 46/100

Epoch 00046: saving model to models/pos_model_crf_wordOnly_upd1.hdf5
Epoc

Epoch 74/100
 2560/30693 [=>............................] - ETA: 42s - loss: 12.5664 - acc: 0.9995

KeyboardInterrupt: 

In [10]:
# Transform the held-out split with the lexicon that was fitted on the training data
test['sent_indx'], test['tag_indx'] = lexicon.transform(test['sents'], test['tags'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
# Build the test-time model with the same layer sizes and CRF flag used for training.
# NOTE(review): mod_save_name presumably tells create_test_model which saved model to load -- confirm.
test_mod = create_test_model(
    mod_save_name,
    lexicon,
    crf=crf_mod,
    n_word_embedding_nodes=n_word_embedding_nodes,
    n_tag_embedding_nodes=n_tag_embedding_nodes,
    n_RNN_nodes=n_RNN_nodes,
    n_dense_nodes=n_dense_nodes,
)

In [None]:
# Predict tags for the held-out split with the test-time model; `preds` is passed to evaluate_model below
preds = predict_new_tag(test_mod, test, lexicon)

In [None]:
# Evaluate the predictions against the held-out split.
# NOTE(review): print_sample=True presumably also prints example predictions -- confirm in evaluate_model.
evaluate_model(preds, test, print_sample=True)