In [None]:
# install necessary packages using pip
# NOTE(review): versions are not pinned, and `wget` is not referenced by any
# cell visible in this notebook — confirm it is still needed.
!pip install keras numpy wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from keras.utils import set_random_seed
# Fix the random seed (42) before any model is built so repeated runs are comparable.
set_random_seed(42)

In [None]:
from google.colab import drive
import os
# Mount Google Drive under /content/drive; the corpus directory used below lives there.
# `google.colab` is only importable inside Colab, so this cell is Colab-specific.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
def load_corpus(path):
    """Read every corpus file directly under `path` into tagged sentences.

    Each non-blank line of a file is treated as one sentence made of
    whitespace-separated ``word/TAG`` tokens.

    Args:
        path: Directory that contains the corpus files.

    Returns:
        A list of sentences; each sentence is a list of ``(word, tag)``
        tuples with the word lower-cased and the tag kept as written.
        Files are visited in sorted name order.

    Raises:
        ValueError: If a token contains no '/' separator.
    """
    sentences = []
    # os.listdir() returns names in arbitrary order; sorting keeps the sentence
    # order (and therefore any later train/validation split) reproducible.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        # Skip sub-directories: open() would fail on them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as f:
            for line in f:
                tokens = line.split()
                if not tokens:
                    continue  # blank line, no sentence
                sentence = []
                for word_tag in tokens:
                    # Split on the LAST slash so a word that itself contains
                    # '/' (e.g. "1/2/NUM") is kept whole.
                    word, tag = word_tag.rsplit('/', 1)
                    sentence.append((word.lower(), tag))
                sentences.append(sentence)
    return sentences

# test the function here:
# The corpus path is built relative to the current working directory
# (presumably /content on Colab, given the mount point above — confirm).
path = os.path.join(os.getcwd(), 'drive', 'My Drive', 'cs6320', 'modified_brown') 
data = load_corpus(path)

In [None]:
# Peek at the first parsed sentence as a list of (word, tag) pairs.
data[0]

[('in', 'PREPOSITION'),
 ('sentences', 'NOUN'),
 (',', 'PUNCT'),
 ('patterns', 'NOUN'),
 ('of', 'PREPOSITION'),
 ('stress', 'NOUN'),
 ('are', 'VERB'),
 ('determined', 'VERB'),
 ('by', 'PREPOSITION'),
 ('complex', 'ADJECTIVE'),
 ('combinations', 'NOUN'),
 ('of', 'PREPOSITION'),
 ('influences', 'NOUN'),
 ('that', 'PRONOUN'),
 ('can', 'VERB'),
 ('only', 'ADVERB'),
 ('be', 'VERB'),
 ('suggested', 'VERB'),
 ('here', 'ADVERB'),
 ('.', 'PUNCT')]

In [None]:
import numpy as np # you may need this to convert lists to np arrays before returning them

# Builds the index-encoded dataset: train_x (word ids) and train_y (tag ids).
def create_dataset(sentences):
    """Map words and tags to integer ids and encode every sentence.

    Word ids start at 2 in sorted word order; 0 is reserved for '[PAD]' and
    1 for '[OOV]'. Tag ids start at 1 in sorted tag order; 0 is '[PAD]'.

    Args:
        sentences: List of sentences, each a list of (word, tag) pairs.

    Returns:
        (train_x, train_y, word_to_idx, tag_to_idx) where train_x / train_y
        are lists of id lists (one per sentence, unpadded) and the two dicts
        are the lookup tables used to build them.
    """
    # Collect the distinct words and tags in one pass over the corpus.
    vocabulary = set()
    tagset = set()
    for tagged_sentence in sentences:
        for pair in tagged_sentence:
            vocabulary.add(pair[0])
            tagset.add(pair[1])

    # Real entries first, then the reserved special symbols.
    word_to_idx = dict(zip(sorted(vocabulary), range(2, len(vocabulary) + 2)))
    word_to_idx['[PAD]'] = 0
    word_to_idx['[OOV]'] = 1
    tag_to_idx = dict(zip(sorted(tagset), range(1, len(tagset) + 1)))
    tag_to_idx['[PAD]'] = 0

    train_x = [[word_to_idx[word] for word, _ in tagged_sentence]
               for tagged_sentence in sentences]
    train_y = [[tag_to_idx[tag] for _, tag in tagged_sentence]
               for tagged_sentence in sentences]

    return train_x, train_y, word_to_idx, tag_to_idx
# Test the function here
# Encode the whole corpus and show the first sentence as word ids and tag ids.
train_x, train_y, word_to_idx, tag_to_idx = create_dataset(data)
print(train_x[0], train_y[0])

[23075, 39676, 393, 32734, 31195, 42784, 4349, 13468, 8023, 10581, 10316, 31195, 23507, 44517, 8284, 31446, 5722, 43196, 21611, 405] [7, 5, 9, 5, 7, 5, 10, 10, 7, 1, 5, 7, 5, 8, 10, 2, 10, 10, 2, 9]


In [None]:
from keras.utils import pad_sequences as pad
# Right-pad every sequence with 0s up to the length of the longest sentence.
def pad_sequences(train_x, train_y):
    """Pad word-id and tag-id sequences to a common length.

    The target length is the length of the longest sequence in `train_x`;
    zeros are appended at the end of shorter sequences (padding='post').

    Args:
        train_x: List of word-id sequences.
        train_y: List of tag-id sequences, parallel to `train_x`.

    Returns:
        (padded_x, padded_y, max_length): the two padded arrays and the
        length every sequence was padded to.
    """
    longest = max(map(len, train_x))
    padded_x = pad(train_x, maxlen=longest, padding='post')
    padded_y = pad(train_y, maxlen=longest, padding='post')
    return padded_x, padded_y, longest
    

# Test the function
# Note: this rebinds train_x / train_y from the unpadded lists to the padded arrays.
train_x, train_y, MAX_LENGTH = pad_sequences(train_x, train_y)

In [None]:
# Show the first padded tag sequence: real tag ids followed by trailing 0 ([PAD]) entries.
train_y[0]

array([ 7,  5,  9,  5,  7,  5, 10, 10,  7,  1,  5,  7,  5,  8, 10,  2, 10,
       10,  2,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [None]:
from keras.models import Sequential
from keras.layers import InputLayer, Activation
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding
from keras.optimizers import Adam

# Define the Keras model.
def define_model(MAX_LENGTH, vocab_size=None, num_tags=None):
    """Build and compile the bidirectional-LSTM sequence tagger.

    Architecture: Embedding(128) -> BiLSTM(256 units per direction,
    per-timestep outputs) -> per-timestep Dense over the tag set -> softmax.
    Compiled with categorical cross-entropy and Adam (learning rate 0.001),
    so targets must be one-hot encoded.

    Args:
        MAX_LENGTH: Fixed (padded) length of every input sequence.
        vocab_size: Number of rows in the embedding table. Defaults to
            ``len(word_to_idx)`` from the notebook's global mapping.
        num_tags: Number of output classes. Defaults to ``len(tag_to_idx)``
            from the notebook's global mapping.

    Returns:
        The compiled Keras model.

    NOTE(review): the Embedding layer is created without masking, so padded
    positions are trained on and counted in the reported accuracy like any
    other position — confirm this is intended.
    """
    # Fall back to the notebook-level lookup tables when sizes are not given.
    if vocab_size is None:
        vocab_size = len(word_to_idx)
    if num_tags is None:
        num_tags = len(tag_to_idx)

    model = Sequential()
    model.add(InputLayer(input_shape=(MAX_LENGTH, )))
    model.add(Embedding(input_dim=vocab_size, output_dim=128))
    model.add(Bidirectional(LSTM(units=256, return_sequences=True)))
    model.add(TimeDistributed(Dense(units=num_tags)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(0.001),
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()
    return model

# Call the function here
# Builds the model for the padded sequence length computed by pad_sequences above.
model = define_model(MAX_LENGTH)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 180, 128)          6367104   
                                                                 
 bidirectional (Bidirectiona  (None, 180, 512)         788480    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 180, 12)          6156      
 ibuted)                                                         
                                                                 
 activation (Activation)     (None, 180, 12)           0         
                                                                 
Total params: 7,161,740
Trainable params: 7,161,740
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Returns the one-hot encoding of the sequence.
def to_categorical(train_y, num_tags):
    """One-hot encode integer tag ids along a new trailing axis.

    Args:
        train_y: Rectangular array-like of integer tag ids, e.g. shape
            (num_sentences, sequence_length).
        num_tags: Number of classes, i.e. the size of the one-hot axis.

    Returns:
        A float64 NumPy array of shape ``train_y.shape + (num_tags,)`` with
        1.0 at each tag id and 0.0 elsewhere.

    Raises:
        TypeError: If `train_y` does not convert to an integer array.
        IndexError: If a tag id is outside the range of `num_tags`.
    """
    indices = np.asarray(train_y)
    if indices.size == 0:
        # Nothing to encode; an empty list has no integer dtype to index with.
        return np.zeros(indices.shape + (num_tags,))
    if not np.issubdtype(indices.dtype, np.integer):
        raise TypeError(
            'train_y must be a rectangular array of integer tag ids, got dtype {}'.format(indices.dtype))
    # Row i of the identity matrix is the one-hot vector for id i, so indexing
    # it with the whole id array encodes everything in a single step instead
    # of allocating one small array per token.
    return np.eye(num_tags)[indices]
# Call the function as to_categorical(train_y, num_tags=len(tag_to_idx))

In [None]:
import tensorflow as tf
# Trains the model.
def train(model, train_x, train_y, epochs=40, batch_size=128, validation_split=0.2):
    """Fit the model on the padded, index-encoded corpus.

    The tag ids are one-hot encoded here with ``to_categorical`` using the
    size of the notebook's global ``tag_to_idx`` mapping.

    Args:
        model: Compiled Keras model to fit.
        train_x: Padded word-id array.
        train_y: Padded tag-id array, parallel to `train_x`.
        epochs: Number of passes over the training data (default 40).
        batch_size: Samples per gradient update (default 128).
        validation_split: Fraction of the data passed to ``model.fit`` as
            its ``validation_split`` (default 0.2).

    Returns:
        The same model object, after fitting.
    """
    targets = to_categorical(train_y, len(tag_to_idx))
    model.fit(train_x, targets,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=validation_split)
    # Return the model.
    return model

# call function here
# Note: `model` is fitted in place and rebound, so re-running this cell keeps
# fitting the same model object rather than a freshly built one.
model = train(model, train_x, train_y)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# Test a sentence using the given model.
def test(model, sentence):
    """Predict one tag per whitespace-separated token of `sentence`.

    Tokens are lower-cased before lookup, matching how the corpus words were
    lower-cased when loaded, and unknown words map to '[OOV]'. Uses the
    notebook globals ``MAX_LENGTH``, ``word_to_idx`` and ``tag_to_idx``.

    Args:
        model: Trained Keras tagger.
        sentence: Raw sentence; tokens are separated by whitespace.

    Returns:
        A list of tag names with exactly one entry per token, in order.
        An empty or all-whitespace sentence gives an empty list.

    Raises:
        ValueError: If the sentence has more than MAX_LENGTH tokens.
    """
    tokens = sentence.split()
    if not tokens:
        return []
    if len(tokens) > MAX_LENGTH:
        raise ValueError(
            'sentence has {} tokens but the model accepts at most {}'.format(len(tokens), MAX_LENGTH))

    oov_idx = word_to_idx['[OOV]']
    processed_sentence = np.zeros(MAX_LENGTH, dtype='int')
    for i, word in enumerate(tokens):
        processed_sentence[i] = word_to_idx.get(word.lower(), oov_idx)

    batch = np.expand_dims(processed_sentence, axis=0)
    # Keep only the rows for real tokens. Scanning every padded position and
    # dropping PAD predictions could misalign tags with words.
    scores = np.array(model.predict(batch)[0][:len(tokens)], dtype=float)
    # Exclude the PAD class so every real token receives an actual tag.
    scores[:, tag_to_idx['[PAD]']] = -np.inf
    best = scores.argmax(axis=-1)

    idx_to_tag = {i: tag for tag, i in tag_to_idx.items()}
    return [idx_to_tag[int(i)] for i in best]


# Two demo sentences, already lower-cased and tokenised by spaces (the final
# '.' is its own token), tagged with the trained model.
s1 = 'the planet jupiter and its moons are in effect a mini solar system .'
s2 = 'computers process programs accurately .'
print('The most likely tag sequence for the sentence \"{}\" is:\n {}'.format(s1, test(model, s1)))
print('The most likely tag sequence for the sentence \"{}\" is:\n {}'.format(s2, test(model, s2)))

The most likely tag sequence for the sentence "the planet jupiter and its moons are in effect a mini solar system ." is:
 ['DETERMINER', 'NOUN', 'NOUN', 'CONJUNCTION', 'PRONOUN', 'NOUN', 'VERB', 'PREPOSITION', 'VERB', 'DETERMINER', 'ADJECTIVE', 'ADJECTIVE', 'NOUN', 'PUNCT']
The most likely tag sequence for the sentence "computers process programs accurately ." is:
 ['NOUN', 'NOUN', 'NOUN', 'ADVERB', 'PUNCT']
