In [11]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Dense, LSTM, TimeDistributed
from tensorflow.keras.models import Sequential
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import brown
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import brown, treebank
from tensorflow.keras.optimizers import Adam
from gensim.models import KeyedVectors
import gensim.downloader as api
import os
path = api.load("word2vec-google-news-300", return_path=True)
tf.config.run_functions_eagerly(True)

In [12]:
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('treebank')

[nltk_data] Downloading package brown to /Users/annielin/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/annielin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/annielin/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [13]:
txt = brown.tagged_words(tagset='universal')
txt2 = treebank.tagged_words(tagset='universal')
brown_sent = brown.tagged_sents(tagset='universal')
tree_sent = treebank.tagged_sents(tagset='universal')
# all_sent = brown_sent + tree_sent
all_sent = brown_sent

In [14]:
# Tokenize the words
# w_tokenizer = Tokenizer()
# pos_tokenizer = Tokenizer()
# tokens = [(tup[0].lower(), tup[1]) for tup in txt]
# tokens.extend([(tup[0].lower(), tup[1]) for tup in txt2])

word = [word[0].lower() for tup in all_sent for word in tup] # store the word
pos = [pos[1].lower() for tup in all_sent for pos in tup] # store the corresponding pos tag

word_tokenizer = Tokenizer()
pos_tokenizer = Tokenizer()

word_tokenizer.fit_on_texts(word) 
word_seqs = word_tokenizer.texts_to_sequences(word)  
pos_tokenizer.fit_on_texts(pos)
pos_seqs = pos_tokenizer.texts_to_sequences(pos)

# word_vectors = KeyedVectors.load_word2vec_format('path/to/word2vec.bin', binary=True)


In [15]:
# find the length of the training sets
max_len = 100 # what do we want as the max length of padding for sequence?
w_size = len(word_tokenizer.word_index) + 1
pos_size = len(pos_tokenizer.word_index) + 1


In [16]:
# code modified from https://towardsdatascience.com/pos-tagging-using-rnn-7f08a522f849
word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
embedding_size = 300  
embedding_weights = np.zeros((w_size, embedding_size))
word2id = word_tokenizer.word_index
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = word_vectors[word]
    except KeyError:
        pass

In [17]:
# the LSTM Model
model = Sequential()
optimizer = Adam(learning_rate=0.01)

model.add(Embedding(w_size, embedding_size, input_length=max_len, weights=[embedding_weights], trainable=False)) 
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(TimeDistributed(Dense(pos_size, activation='softmax')))
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 300)          13362600  
                                                                 
 lstm_1 (LSTM)               (None, 100, 64)           93440     
                                                                 
 time_distributed_1 (TimeDis  (None, 100, 12)          780       
 tributed)                                                       
                                                                 
Total params: 13,456,820
Trainable params: 94,220
Non-trainable params: 13,362,600
_________________________________________________________________
None


In [None]:
# training data

# split the data for better training
split_idx = int(0.8 * w_size)
word_train, pos_train = word_seqs[:split_idx], pos_seqs[:split_idx]
word_test, pos_test = word_seqs[split_idx:], pos_seqs[split_idx:]

# pad the data
word_train = pad_sequences(word_train, max_len, padding='post', truncating='post') 
pos_train = pad_sequences(pos_train, max_len, padding='post', truncating='post')
word_test = pad_sequences(word_test, maxlen=max_len, padding='post', truncating='post')
pos_test = pad_sequences(pos_test, maxlen=max_len, padding='post', truncating='post')

# convert pos data set into one-hot encoding
pos_train = to_categorical(pos_train, num_classes=pos_size)
pos_test = to_categorical(pos_test, num_classes=pos_size)

training = model.fit(word_train, pos_train, batch_size=32, epochs=1, validation_data=(word_test, pos_test))
loss, accuracy = model.evaluate(word_test, pos_test)
model.save('lstm_model.h5')
print('Model Done')
os._exit(0)



In [None]:
print(model.summary())

In [None]:
#  right now the model is trained with only brown data set for faster runtime
#  may increase the data with treebank for better training in future