In [40]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import brown
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

In [41]:
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
txt = brown.tagged_words()

[nltk_data] Downloading package brown to /Users/annielin/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/annielin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [42]:
# Tokenize the words
w_tokenizer = Tokenizer()
pos_tokenizer = Tokenizer()
tokens = [(tup[0].lower(), tup[1]) for tup in txt]

word_freq = {}
# Make a word-frequency dict
for token in tokens:
    if token[0] not in word_freq:
        word_freq[token[0]] = {}
        
    if token[1] not in word_freq[token[0]]:
        word_freq[token[0]][token[1]] = 1
    else:
        word_freq[token[0]][token[1]] += 1
        
w_tokenizer.fit_on_texts([t[0] for t in tokens])
pos_tokenizer.fit_on_texts([t[1] for t in tokens])

w_seqs = w_tokenizer.texts_to_sequences([t[0] for t in tokens])
pos_seqs = pos_tokenizer.texts_to_sequences([t[1] for t in tokens])

In [43]:
# pad the sentences
max_len = 50
pos_size = len(pos_tokenizer.word_index) + 1
w_seqs = pad_sequences(w_seqs, max_len, padding='post', truncating='post') # what do we want as the max length of padding for sequence?
pos_seqs = pad_sequences(pos_seqs, max_len, padding='post', truncating='post')
pos_encoded = to_categorical(pos_seqs, num_classes=pos_size)


In [45]:
# the LSTM layer
model = keras.Sequential()
data_size = len(w_tokenizer.word_index) + 1
model.add(layers.Embedding(data_size, 50, input_length=max_len)) # may need to change the output dim
model.add(layers.LSTM(10))
model.add(layers.Dense(pos_size, 'softmax'))
model.compile(optimizer='adam', loss='mean_squared_error')
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 50)            2227100   
                                                                 
 lstm_3 (LSTM)               (None, 10)                2440      
                                                                 
 dense_3 (Dense)             (None, 79)                869       
                                                                 
Total params: 2,230,409
Trainable params: 2,230,409
Non-trainable params: 0
_________________________________________________________________
None
