In [1]:
# enviro: macos, docker
envir = "macos"

In [2]:
import tensorflow as tf
print("Tensorflow version: {}".format(tf.__version__))

import os
import numpy as np
import re

  from ._conv import register_converters as _register_converters


Tensorflow version: 1.9.0


In [3]:
model = None
db = None

tags_to_index = {}   # {word1: 1, word2: 2, ...}
index_to_tags = {}   # {1: word1, 2: word2, ...}
train_dataset_raw = {}
train_dataset = []
test_dataset_raw = {}
test_dataset = []
dataset_vocab = {}

# Prepare

Here we use a word2vec data structure with 300 dimensions from google news.

In [2]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/62/19/8ecba86351de0eacb9baf1cc49ba86315cd91bc672acd74d6e4e709eb482/gensim-3.6.0-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.0MB)
[K    100% |████████████████████████████████| 24.0MB 818kB/s eta 0:00:01
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/af/ff24b42daacdc929629f4f85ce8a54ee1c6591475b5067d180028feffb57/boto3-1.9.28-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133

In [4]:
from gensim.models import KeyedVectors

In [6]:
if envir == "docker":
    datastore = os.path.join("/","notebooks","data","googlenews","GoogleNews-vectors-negative300.bin")
elif envir == "macos":
    datastore = os.path.join("/","Volumes","Data","googlenews","GoogleNews-vectors-negative300.bin")

if not os.path.exists(datastore):
    raise IOError("Not such file {}.".format(datastore))

# Preprocessing

In [7]:
if envir == "docker":
    leveldb_path = os.path.join("/","notebooks","devops","tmp","word2vecdb")
elif envir == "macos":
    leveldb_path = "/Users/jiankaiwang/devops/tmp/word2vecdb"

if envir == "docker":
    pre_data_path = os.path.join("/","notebooks","data","CoNLL-2000")
elif envir == "macos":
    pre_data_path = os.path.join("/","Volumes","Data","CoNLL-2000")   

origin_train = os.path.join(pre_data_path, "train.txt")
origin_test = os.path.join(pre_data_path, "test.txt")

preprocess_train = os.path.join(pre_data_path, "pp_train.txt")
preprocess_test = os.path.join(pre_data_path, "pp_test.txt")

In [8]:
def preprocessWord(dataList, training_data=True):
    """
    parse train and test origin data
    """
    global train_dataset, test_dataset
    
    ref_dataset = dataList
    tmp_dataset = train_dataset if training_data else test_dataset
    
    count = 0
    while count < len(dataList):
        pair = ref_dataset[count]
        if count < len(ref_dataset) - 1:
            next_pair = ref_dataset[count + 1]
            
            # solve New NN
            #       York NN
            # pair[1] == next_pair[1] : the same property
            if (pair[0] + "_" + next_pair[0]) in model and pair[1] == next_pair[1]:
                tmp_dataset.append([pair[0] + "_" + next_pair[0], pair[1]])
                count += 2
                continue
                
        # number replacement
        word = re.sub("\d", "#", pair[0])
        
        # replace '-' with _
        word = re.sub("-", "_", word)
        
        # only preserve the word we needed
        if word in model:
            tmp_dataset.append([word, pair[1]])
            count += 1
            continue
        
        # handle both "_word" or "word_"
        if "_" in word:
            subwords = word.split("_")
            for sw in subwords:
                if not (sw.isspace() or len(sw) == 0):
                    tmp_dataset.append([sw, pair[1]])
            count += 1
            continue
            
        train_dataset.append([word, pair[1]])
        count += 1

In [9]:
# load the whole google news words
model = KeyedVectors.load_word2vec_format(datastore, binary=True)

In [10]:
# a word2vec example from google news
print(model['record'])

[ 5.39550781e-02  1.58203125e-01  1.04003906e-01 -2.89916992e-04
  5.32226562e-02 -8.64257812e-02 -1.98242188e-01 -7.47070312e-02
  2.89062500e-01  1.11816406e-01 -1.86523438e-01 -9.52148438e-02
  1.77734375e-01 -1.62109375e-01 -3.56445312e-02  3.68652344e-02
  3.49426270e-03 -5.15136719e-02 -1.55273438e-01 -3.18359375e-01
 -8.97216797e-03  1.89453125e-01 -6.78710938e-02 -9.47265625e-02
  1.12304688e-01 -1.09375000e-01 -7.87353516e-03  2.32421875e-01
 -2.38281250e-01  1.64062500e-01  1.62109375e-01 -7.32421875e-02
 -1.41601562e-01 -2.29492188e-01 -1.68945312e-01  1.73828125e-01
 -2.27539062e-01 -1.25000000e-01  2.11914062e-01  2.63671875e-01
  3.18359375e-01  1.13769531e-01 -1.28906250e-01  2.59765625e-01
  1.91406250e-01  6.68945312e-02  1.03027344e-01  1.75781250e-01
 -4.12597656e-02  4.54101562e-02  1.80664062e-01 -3.49121094e-02
  1.93359375e-01  1.00708008e-02  5.32226562e-02  1.19140625e-01
 -1.21093750e-01 -8.10546875e-02  1.33789062e-01 -1.74804688e-01
  4.07714844e-02  5.32226

In [11]:
if os.path.exists(preprocess_train): 
    os.remove(preprocess_train)
    print("Remove preprocessed train data.")

if os.path.exists(preprocess_test):
    os.remove(preprocess_test)
    print("Remove preprocessed test data.")

Remove preprocessed train data.
Remove preprocessed test data.


In [12]:
# read origin train data
with open(origin_train, "r") as f:
    train_dataset_raw = f.readlines()
    train_dataset_raw = [element.split() for element in train_dataset_raw if len(element.split()) > 0]
print("Total Train size: {}".format(len(train_dataset_raw)))

# show data content
print(train_dataset_raw[0])

# preprocessing origin train data
preprocessWord(dataList=train_dataset_raw, training_data=True)

# write out the prepcrossed train data
with open(preprocess_train, "w") as fout:
    for pair in train_dataset:
        fout.write("{} {}\n".format(pair[0], pair[1]))

Total Train size: 211727
['Confidence', 'NN', 'B-NP']


In [13]:
# read origin test data
with open(origin_test, "r") as f:
    test_dataset_raw = f.readlines()
    test_dataset_raw = [element.split() for element in test_dataset_raw if len(element.split()) > 0]
print("Total Test size: {}".format(len(test_dataset_raw)))

# show data content
print(test_dataset_raw[0])

# preprocessing origin train data
preprocessWord(dataList=test_dataset_raw, training_data=False)

# write out the prepcrossed train data
with open(preprocess_test, "w") as fout:
    for pair in test_dataset:
        fout.write("{} {}\n".format(pair[0], pair[1]))

Total Test size: 47377
['Rockwell', 'NNP', 'B-NP']


## create two mapping files

In [14]:
count = 0
for pair in train_dataset + test_dataset:
    dataset_vocab[pair[0]] = 1
    
    if pair[1] not in tags_to_index:
        tags_to_index[pair[1]] = count
        index_to_tags[count] = pair[1]
        count += 1

In [15]:
print(len(dataset_vocab))

18485


In [16]:
print(tags_to_index)

{'NN': 0, 'IN': 1, 'DT': 2, 'VBZ': 3, 'RB': 4, 'VBN': 5, 'TO': 6, 'VB': 7, 'JJ': 8, 'NNS': 9, 'NNP': 10, ',': 11, 'CC': 12, 'POS': 13, '.': 14, 'VBP': 15, 'VBG': 16, 'PRP$': 17, 'CD': 18, '``': 19, "''": 20, 'VBD': 21, 'EX': 22, 'MD': 23, '#': 24, '(': 25, '$': 26, ')': 27, 'NNPS': 28, 'PRP': 29, 'JJS': 30, 'WP': 31, 'RBR': 32, 'JJR': 33, 'WDT': 34, 'WRB': 35, 'RBS': 36, 'PDT': 37, 'RP': 38, ':': 39, 'FW': 40, 'WP$': 41, 'SYM': 42, 'UH': 43}


In [17]:
print(index_to_tags)

{0: 'NN', 1: 'IN', 2: 'DT', 3: 'VBZ', 4: 'RB', 5: 'VBN', 6: 'TO', 7: 'VB', 8: 'JJ', 9: 'NNS', 10: 'NNP', 11: ',', 12: 'CC', 13: 'POS', 14: '.', 15: 'VBP', 16: 'VBG', 17: 'PRP$', 18: 'CD', 19: '``', 20: "''", 21: 'VBD', 22: 'EX', 23: 'MD', 24: '#', 25: '(', 26: '$', 27: ')', 28: 'NNPS', 29: 'PRP', 30: 'JJS', 31: 'WP', 32: 'RBR', 33: 'JJR', 34: 'WDT', 35: 'WRB', 36: 'RBS', 37: 'PDT', 38: 'RP', 39: ':', 40: 'FW', 41: 'WP$', 42: 'SYM', 43: 'UH'}


## save into leveldb / pickle file

In [18]:
nonmodel_cache = {}
word2vec_data = {}

count = 1
ttl_vocab = len(dataset_vocab.keys())

try:
    for word, _ in dataset_vocab.items():
        if count % 1000 == 0:
            print("Inserted {} words out of {} total.".format(count, ttl_vocab))

        if word in model:
            word2vec_data[word] = model[word]
        
        elif word in nonmodel_cache:
            word2vec_data[word] = nonmodel_cache[word]
        
        else:
            print("add into cache {}".format(word))
            nonmodel_cache[word] = np.random.uniform(-0.25, 0.25, 300).astype(np.float32)
            word2vec_data[word] = nonmodel_cache[word]

        count += 1
except Exception as e:
    print(e)


add into cache to
add into cache ,
add into cache a
add into cache and
add into cache 's
add into cache .
add into cache of
add into cache ``
add into cache ''
add into cache L.P
add into cache '
add into cache hotel\/casino
add into cache ;
add into cache :
add into cache Underseas
add into cache ?
Inserted 1000 words out of 18485 total.
add into cache Ohbayashi
add into cache establshed
add into cache B.A.T
add into cache Zoete
add into cache Noxell
add into cache P&G
add into cache Boelkow
add into cache G.m.b
add into cache Fleet\/Norstar
add into cache I.E.P.
Inserted 2000 words out of 18485 total.
add into cache SKr#.#
add into cache Bfree
add into cache Herslow
add into cache Kurtanjek
add into cache SE\/##
add into cache IIcx
add into cache ...
add into cache Polymerix
add into cache Polycast
add into cache Kushkin
add into cache R.I
add into cache Ebasco
add into cache Enserch
add into cache Asia\/Australia
add into cache Infotechnology
add into cache Webster\/Eagle
add into c

In [19]:
import pickle

In [20]:
with open(os.path.join(leveldb_path, "data.pkl"), "wb") as f:
    pickle.dump(word2vec_data, f)

In [54]:
word2vec_data['Confidence']

array([ 2.00195312e-01,  4.68750000e-02, -4.10156250e-02, -2.75390625e-01,
        8.15429688e-02,  1.90429688e-01, -2.63671875e-01, -7.91015625e-02,
        2.81250000e-01,  1.03027344e-01,  1.29882812e-01,  1.53320312e-01,
        1.63085938e-01,  3.69140625e-01, -2.61718750e-01,  2.48046875e-01,
        4.21875000e-01,  1.65039062e-01, -3.00292969e-02, -1.12792969e-01,
        9.96093750e-02,  4.43359375e-01,  2.66113281e-02,  2.81250000e-01,
        3.18908691e-03,  7.56835938e-02,  1.41601562e-02, -9.03320312e-02,
        2.83203125e-01,  2.77343750e-01, -2.96630859e-02, -7.86132812e-02,
        3.47656250e-01, -3.45703125e-01,  2.83203125e-01,  1.00097656e-01,
       -4.08203125e-01,  1.80664062e-01, -3.26171875e-01,  1.63574219e-02,
       -8.49609375e-02, -2.53906250e-01, -2.30468750e-01,  1.05468750e-01,
       -6.03027344e-02, -7.12890625e-02, -2.87109375e-01, -2.40478516e-02,
       -1.66992188e-01,  5.15625000e-01, -3.88671875e-01,  3.49609375e-01,
        8.25195312e-02,  

## dataset prepare

In [22]:
len(train_dataset)

221804

In [26]:
train_dataset[0]

['Confidence', 'NN']

In [116]:
class POSDataset():
    
    def __init__(self, pkl, dataset, tags_to_index, get_all=False):
        self.pkl = pkl
        self.inputs = []
        self.tags = []
        self.ptr = 0   # current index
        self.n = 0   # n-gram
        self.get_all = get_all
        
        for pair in dataset:
            self.inputs.append(pkl[pair[0]])   # get the embedding
            self.tags.append(tags_to_index[pair[1]])   # get the tag index
            
        self.inputs = np.array(self.inputs, dtype=np.float32)
        self.tags = np.eye(len(tags_to_index.keys()))[self.tags]
        
    def set_n_gram(self, n):
        self.n = n
        
    def minibatch(self, size):
        batch_inputs = []
        batch_tags = []
        
        if self.get_all:
            counter = 0
            while counter < len(self.inputs) - self.n + 1:
                batch_inputs.append(self.inputs[counter:counter+self.n].flatten())
                batch_tags.append(self.tags[counter + self.n - 1])
                counter += 1
                
        elif self.ptr + size < len(self.inputs) - self.n:
            counter = self.ptr
            while counter < self.ptr + size:
                batch_inputs.append(self.inputs[counter:counter+self.n].flatten())
                batch_tags.append(self.tags[counter + self.n - 1])
                counter += 1
        else:
            # TODO: may exist bugs
            counter = self.ptr
            while counter < len(self.inputs) - self.n + 1:
                batch_inputs.append(self.inputs[counter:counter+self.n].flatten())
                batch_tags.append(self.tags[counter + self.n - 1])
                counter += 1

            counter2 = 0
            while counter2 < size - counter + self.ptr:
                batch_inputs.append(self.inputs[counter2:counter2+self.n].flatten())
                batch_tags.append(self.tags[counter2 + self.n - 1])
                counter2 += 1

        self.ptr = (self.ptr + size) % (len(self.inputs) - self.n)
        return np.array(batch_inputs, dtype=np.float32), np.array(batch_tags)

In [117]:
train = POSDataset(word2vec_data, train_dataset, tags_to_index)
test = POSDataset(word2vec_data, test_dataset, tags_to_index, get_all=True)

In [118]:
train.inputs[0]

array([ 2.00195312e-01,  4.68750000e-02, -4.10156250e-02, -2.75390625e-01,
        8.15429688e-02,  1.90429688e-01, -2.63671875e-01, -7.91015625e-02,
        2.81250000e-01,  1.03027344e-01,  1.29882812e-01,  1.53320312e-01,
        1.63085938e-01,  3.69140625e-01, -2.61718750e-01,  2.48046875e-01,
        4.21875000e-01,  1.65039062e-01, -3.00292969e-02, -1.12792969e-01,
        9.96093750e-02,  4.43359375e-01,  2.66113281e-02,  2.81250000e-01,
        3.18908691e-03,  7.56835938e-02,  1.41601562e-02, -9.03320312e-02,
        2.83203125e-01,  2.77343750e-01, -2.96630859e-02, -7.86132812e-02,
        3.47656250e-01, -3.45703125e-01,  2.83203125e-01,  1.00097656e-01,
       -4.08203125e-01,  1.80664062e-01, -3.26171875e-01,  1.63574219e-02,
       -8.49609375e-02, -2.53906250e-01, -2.30468750e-01,  1.05468750e-01,
       -6.03027344e-02, -7.12890625e-02, -2.87109375e-01, -2.40478516e-02,
       -1.66992188e-01,  5.15625000e-01, -3.88671875e-01,  3.49609375e-01,
        8.25195312e-02,  

In [119]:
train.tags[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

# Model

In [120]:
import time
from tensorflow.python import control_flow_ops

## Hyperparameters

In [121]:
n_gram = 3
embedding_size = 300   # the same with embedding vector for each words
n_hidden_1 = 512
n_hidden_2 = 256
n_output = len(tags_to_index.keys())

In [122]:
full_training = False
training_epoch = 100 if full_training else 2
batch_size = 100 if full_training else 32
display_step = 10 if full_training else 1

In [123]:
learning_rate = 1e-3
beta1 = 0.9
beta2 = 0.999
epsilon = 1e-8

## Network

In [124]:
def layer_batch_norm(x, n_out, phase_train):
    """
    batch normalization
    """
    beta_init = tf.constant_initializer(value=0., dtype=tf.float32)
    gamma_init = tf.constant_initializer(value=1., dtype=tf.float32)
    
    beta = tf.get_variable("beta", [n_out], initializer=beta_init)
    gamma = tf.get_variable("gamma", [n_out], initializer=gamma_init)
    
    batch_mean, batch_var = tf.nn.moments(x, [0], name="moments")
    
    ema = tf.train.ExponentialMovingAverage(decay=0.9)
    ema_apply_op = ema.apply([batch_mean, batch_var])
    ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var)
    
    def mean_var_with_update():
        with tf.control_dependencies([ema_apply_op]):
            return tf.identity(batch_mean), tf.identity(batch_var)
        
    mean, var = control_flow_ops.cond(phase_train, mean_var_with_update, lambda: (ema_mean, ema_var))
    
    x_r = tf.reshape(x, [-1, 1, 1, n_out])
    normed = tf.nn.batch_norm_with_global_normalization(x_r, mean, var, beta, gamma, \
                                                        variance_epsilon=1e-3, scale_after_normalization=True)
    
    return tf.reshape(normed, [-1, n_out])

In [125]:
def layer(input, weight_shape, bias_shape, phase_train):
    weight_init = tf.random_normal_initializer(stddev=(1.0 / weight_shape[0])**0.5)
    bias_init = tf.constant_initializer(value=0)
    weight = tf.get_variable("W", weight_shape, initializer=weight_init)
    bias = tf.get_variable("b", bias_shape, initializer=bias_init)
    logits = tf.matmul(input, weight) + bias
    return tf.nn.sigmoid(layer_batch_norm(logits, weight_shape[1], phase_train))

In [126]:
def network(x, phase_train):
    with tf.variable_scope("hidden_1"):
        hidden_1 = layer(x, [n_gram * embedding_size, n_hidden_1], [n_hidden_1], phase_train)
    
    with tf.variable_scope("hidden_2"):
        hidden_2 = layer(hidden_1, [n_hidden_1, n_hidden_2], [n_hidden_2], phase_train)
    
    with tf.variable_scope("output"):
        output = layer(hidden_2, [n_hidden_2, n_output], [n_output], phase_train)
    
    return output

## Target

In [127]:
def loss(output, y):
    xe = tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=y)
    loss = tf.reduce_mean(xe)
    train_summary_opt = tf.summary.scalar("train_loss", loss)
    return loss, train_summary_opt

In [128]:
def training(loss, global_step):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, \
                                       use_locking=False, name="Adam")
    train_op = optimizer.minimize(loss, global_step=global_step)
    return train_op

In [129]:
def evaluate(output, y):
    with tf.variable_scope("Validation"):
        compare = tf.equal(tf.argmax(output, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(compare, tf.float32))
        val_summary_op = tf.summary.scalar("validation", accuracy)
        return accuracy, val_summary_op

# Learning

In [130]:
print("{}-gram".format(n_gram))
train.set_n_gram(n_gram)
test.set_n_gram(n_gram)

3-gram


In [136]:
with tf.Graph().as_default():
    with tf.variable_scope("autoencoder_model"):
        x = tf.placeholder("float", [None, n_gram * embedding_size])
        y = tf.placeholder("float", [None, n_output])
        phase_train = tf.placeholder(tf.bool)
        
        output = network(x, phase_train)
        cost, cost_summary = loss(output, y)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        training_op = training(cost, global_step=global_step)
        eval_op, eval_summary = evaluate(output, y)
        
        summary_op = tf.summary.merge_all()
        saver = tf.train.Saver(max_to_keep=50)
        
        with tf.Session() as sess:
            train_writer = tf.summary.FileWriter("pos_" + str(n_gram) + "-gram_logs/", graph=sess.graph)
            val_writer = tf.summary.FileWriter("pos_" + str(n_gram) + "-gram_logs/", graph=sess.graph)
            
            sess.run(tf.global_variables_initializer())
            
            # valid data
            prefix = ["She", "decided", "that", "it", "was", "time", "to", "leave", "home", "."]
            sentence = ["Then", "the", "woman", ",", "after", "grabbing", "her", "umbrella", ",", "went", "to", "the", "bank", "to", "deposit", "her", "cash", "."]
            test_str = []
            if n_gram > 1:
                for word in prefix[1-n_gram:]:
                    test_str.append(word2vec_data[word])
            for word in sentence:
                test_str.append(word2vec_data[word])
            test_str = np.array(test_str, dtype=np.float32)
            
            test_input = []
            for i in range(0, len(test_str) - n_gram + 1):
                test_input.append(test_str[i:i+n_gram].flatten())
                
            test_input = np.array(test_input, dtype=np.float32)
            
            # training
            for epoch in range(training_epoch):
                avg_cost = 0.
                total_batch = int(len(train.inputs) / batch_size)
                
                # look over the whole batch
                for idx in range(total_batch):
                    b_x, b_y = train.minibatch(batch_size)
                    _, cal_cost, cost_sum = sess.run([training_op, cost, cost_summary], feed_dict={x: b_x, y: b_y, phase_train: True})
                    train_writer.add_summary(cost_sum, sess.run(global_step))
                    avg_cost += cal_cost / batch_size
                    
                # display the progress
                if epoch % display_step == 0:
                    # show validation/testing result
                    print("epoch: {}, cost: {}".format(epoch, avg_cost))
                    v_x, v_y = test.minibatch(0)   # due to get_all=True
                    train_writer.add_summary(cost_sum, sess.run(global_step))
                    
                    accuracy, val_sum = sess.run([eval_op, eval_summary], feed_dict={x: v_x, y: v_y, phase_train: False})
                    val_writer.add_summary(val_sum, sess.run(global_step))
                    print("epoch: {}, validation error: {}".format(epoch, (1-accuracy)))
                    
                    # manual test
                    test_output = sess.run(output, feed_dict={x: test_input, phase_train: False})
                    tags = []
                    for tag_vec in test_output:
                        index = np.argmax(tag_vec)
                        tags.append(index_to_tags[index])
                    
                    for word_idx in range(len(sentence)):
                        print("{},{}".format(sentence[word_idx], tags[word_idx]))
                    
                    saver.save(sess, "pos_" + str(n_gram) + "-gram_logs//model-checkpoint-" + str(epoch+1), global_step=global_step)
                    
        print("Training Finished.")

epoch: 0, cost: 656.6154117286205
epoch: 0, validation error: 0.25795120000839233
Then,RB
the,DT
woman,NN
,,,
after,IN
grabbing,VBG
her,NNP
umbrella,NN
,,,
went,VBD
to,TO
the,DT
bank,NN
to,NNS
deposit,NN
her,NNP
cash,NN
.,.
epoch: 1, cost: 626.1850555837154
epoch: 1, validation error: 0.3242754340171814
Then,IN
the,NNP
woman,NN
,,,
after,IN
grabbing,VBG
her,NNP
umbrella,NN
,,,
went,VBD
to,TO
the,DT
bank,NN
to,TO
deposit,NN
her,NNP
cash,NN
.,.
Training Finished.


pos can refer to the webpage https://www.ibm.com/support/knowledgecenter/zh/SS5RWK_3.5.0/com.ibm.discovery.es.ta.doc/iiysspostagset.htm.