<a href="https://colab.research.google.com/github/kimgeonhee317/nlpdemystifed-notes/blob/main/notebook/13_Recurrent_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 13- Recurrent Neural Networks

### Import Library

In [1]:
import nltk
import numpy as np
import requests
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import treebank, brown, conll2000
from sklearn.model_selection import train_test_split
from tensorflow import keras

## Part-of-Speech Tagger with Bidirectional LSTM

In [2]:
# PoS tagging with LSTM is multiclass classification task for sequence.

# nltk offers free sets for labelled corpora.
# look at https://www.nltk.org/nltk_data
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.


True

In [3]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [4]:
# Download all PoS-tagged sentences and place them in one List.
tagged_sentences = treebank.tagged_sents(tagset='universal')+\
                   brown.tagged_sents(tagset='universal')+\
                   conll2000.tagged_sents(tagset='universal')
print(tagged_sentences[0])
print(f"Dataset size: {len(tagged_sentences)}")

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]
Dataset size: 72202


In [10]:
sentences, sentence_tags = [], []

for s in tagged_sentences:
  sentence, tags = zip(*s) # multiple numbers of tuple according to sentences
  #print(sentence)
  sentences.append(list(sentence))
  sentence_tags.append(list(tags))


('Rhode', "Island's", 'rate', 'of', '$.07', 'per', 'mile', 'is', 'considerably', 'lower', 'than', 'reimburseable', 'rates', 'in', 'the', 'federal', 'government', 'and', 'in', 'industry', 'nationally', 'which', 'approximate', 'a', '$.09', 'per', 'mile', 'average', '.')
('Actual', 'mileage', 'allowances', 'are', 'well-administered', 'and', 'not', 'unduly', 'expensive', 'for', 'the', 'state', '.')
('The', 'travel', 'regulations', ',', 'requirements', 'and', 'procedures', 'governing', 'reimbursement', 'are', 'controlled', 'properly', 'and', 'not', 'overly', 'restrictive', '.')
('Fixed', 'monthly', 'allowances', 'are', 'a', 'controversial', 'subject', '.')
('They', 'have', 'a', 'great', 'advantage', 'in', 'ease', 'of', 'audit', 'time', 'and', 'payment', '.')
('However', ',', 'they', 'lend', 'themselves', 'to', 'abuse', 'and', 'inadequate', 'control', 'measures', '.')
('Flat', 'payments', 'over', '$50', 'per', 'month', 'are', 'more', 'expensive', 'to', 'the', 'state', 'than', 'the', 'assign

Exception ignored in: <function _xla_gc_callback at 0x7b5c4acad750>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 103, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('One', 'major', 'factor', 'was', 'the', 'decline', 'of', 'the', 'dollar', 'against', 'the', 'mark', ',', 'which', 'began', 'less', 'than', 'a', 'year', 'after', 'Merkur', "'s", '1985', 'launch', '.')
('As', 'the', 'West', 'German', 'currency', 'rose', ',', 'so', 'did', 'Merkur', 'prices', '.')
('The', 'Merkur', 'cars', 'also', 'suffered', 'from', 'spotty', 'quality', ',', 'some', 'dealers', 'say', '.')
('``', 'It', 'was', 'like', 'a', 'comedy', 'of', 'errors', ',', "''", 'says', 'Martin', 'J.', '``', 'Hoot', "''", 'McInerney', ',', 'a', 'big', 'dealer', 'whose', 'Star', 'Lincoln-Mercury-Merkur', 'operation', 'in', 'Southfield', ',', 'Mich.', ',', 'sold', 'more', 'XR4Ti', "'s", 'than', 'any', 'other', 'dealership', '.')
('But', 'by', 'the', 'third', 'quarter', 'of', '1988', ',', 'Scorpios', 'had', 'a', 'high', 'satisfaction', 'rating', 'in', 'internal', 'Ford', 'studies', ',', 'a', 'spokesman', 'said', '.')
('Apparently',

In [11]:
print(sentences[0])
print(sentence_tags[0])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


In [12]:
print(len(sentences), len(sentence_tags)) # number of sentences

72202 72202


In [13]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio=0.10

# train:test = 0.75:0.25
x_train, x_test, y_train, y_test = train_test_split(sentences, sentence_tags,
                                                     test_size= 1-train_ratio,
                                                     random_state = 317)
# train:val:test = 0.75:0.15:0.10
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test,
                                                test_size = test_ratio/(test_ratio + validation_ratio),
                                                random_state = 317)

In [14]:
print(len(x_train), len(y_train))
print(len(x_val), len(y_val))
print(len(x_test), len(y_test))

54151 54151
10830 10830
7221 7221


In [15]:
# Generate wordvectors for our sentenses
# default tokenizer, out-ov-vocabulary token as <OOV>
sentence_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<OOV')
sentence_tokenizer.fit_on_texts(x_train)
print(f"Vocabulary size: {len(sentence_tokenizer.word_index)}"

Vocabulary size: 52183


In [16]:
# we need another tokenizer for the tags are also sequences.
tag_tokenizer = keras.preprocessing.text.Tokenizer()
tag_tokenizer.fit_on_texts(y_train)

In [18]:
print(f"Number of PoS tags: {len(tag_tokenizer.word_index)}\n")
tag_tokenizer.get_config()

Number of PoS tags: 12



{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 54151,
 'word_counts': '{"adv": 51392, "verb": 175631, "adp": 137365, "det": 127600, ".": 143593, "adj": 81110, "noun": 288313, "conj": 35420, "num": 21374, "prt": 31340, "pron": 44737, "x": 6109}',
 'word_docs': '{"det": 44815, "num": 11905, "adj": 36440, ".": 53331, "conj": 24581, "adv": 29599, "verb": 50880, "noun": 51202, "adp": 43937, "pron": 26974, "prt": 21888, "x": 2668}',
 'index_docs': '{"5": 44815, "11": 11905, "6": 36440, "3": 53331, "9": 24581, "7": 29599, "2": 50880, "1": 51202, "4": 43937, "8": 26974, "10": 21888, "12": 2668}',
 'index_word': '{"1": "noun", "2": "verb", "3": ".", "4": "adp", "5": "det", "6": "adj", "7": "adv", "8": "pron", "9": "conj", "10": "prt", "11": "num", "12": "x"}',
 'word_index': '{"noun": 1, "verb": 2, ".": 3, "adp": 4, "det": 5, "adj": 6, "adv": 7, "pron": 8, "conj": 9, "prt": 10, "

In [19]:
tag_tokenizer.word_index

{'noun': 1,
 'verb': 2,
 '.': 3,
 'adp': 4,
 'det': 5,
 'adj': 6,
 'adv': 7,
 'pron': 8,
 'conj': 9,
 'prt': 10,
 'num': 11,
 'x': 12}

In [21]:
x_train_seqs = sentence_tokenizer.texts_to_sequences(x_train)

In [23]:
print(x_train_seqs[0])
print(x_train[0])

[133, 1921, 19, 8, 13, 1606, 461, 15, 4344, 318, 12, 8, 1922, 28157, 1858, 25, 12287, 28158, 926, 926, 20609, 7, 6603, 6, 20610, 1568, 9, 3198, 39, 13, 145, 6, 1742, 23, 834, 295, 3038, 311, 28159, 32, 28160, 1284, 3, 11, 207, 20609, 1742, 429, 2072, 12288, 15, 4]
['Still', 'existing', 'on', 'a', '``', 'Northern', 'Union', "''", 'telegraph', 'form', 'is', 'a', 'typical', 'peremptory', 'message', 'from', 'Peru', 'grocer', 'J.', 'J.', 'Hapgood', 'to', 'Burton', 'and', "Graves'", 'store', 'in', 'Manchester', '--', '``', 'Get', 'and', 'send', 'by', 'stage', 'four', 'pounds', 'best', 'Porterhouse', 'or', 'serloin', 'stake', ',', 'for', 'Mrs.', 'Hapgood', 'send', 'six', 'sweet', 'oranges', "''", '.']


In [25]:
y_train_seqs = tag_tokenizer.texts_to_sequences(y_train)

In [28]:
print(tag_tokenizer.sequences_to_texts([y_train_seqs[0]]))
print(y_train_seqs[0])

['adv verb adp det . adj noun . noun noun verb det adj adj noun adp noun noun noun noun noun adp noun conj noun noun adp noun . . verb conj verb adp noun num noun adj noun conj noun noun . adp noun noun verb num adj noun . .']
[7, 2, 4, 5, 3, 6, 1, 3, 1, 1, 2, 5, 6, 6, 1, 4, 1, 1, 1, 1, 1, 4, 1, 9, 1, 1, 4, 1, 3, 3, 2, 9, 2, 4, 1, 11, 1, 6, 1, 9, 1, 1, 3, 4, 1, 1, 2, 11, 6, 1, 3, 3]


In [30]:
# Do the same things to valid dataset
x_val_seqs = sentence_tokenizer.texts_to_sequences(x_val)
y_val_seqs = tag_tokenizer.texts_to_sequences(y_val)

In [46]:
# Even if RNN can handle variable lengthes of sequences, it is musch better for performance to univy the lengthes of each sequences
print(len(max(x_train_seqs, key=len))) # return the length of the longest sequence
MAX_LENGTH = len(max(x_train_seqs, key=len))
print(f"Length of longest input sequence: {MAX_LENGTH}")

271
Length of longest input sequence: 271


In [47]:
# we can pad every sentences with method "pad_sequences" from keras
x_train_padded = keras.preprocessing.sequence.pad_sequences(x_train_seqs, padding='post',
                                                            maxlen=MAX_LENGTH)


In [49]:
print(x_train_padded[0])
print(len(x_train_padded[0]))

[  133  1921    19     8    13  1606   461    15  4344   318    12     8
  1922 28157  1858    25 12287 28158   926   926 20609     7  6603     6
 20610  1568     9  3198    39    13   145     6  1742    23   834   295
  3038   311 28159    32 28160  1284     3    11   207 20609  1742   429
  2072 12288    15     4     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [51]:
# do the same things to training label
y_train_padded = keras.preprocessing.sequence.pad_sequences(y_train_seqs, padding='post',
                                                           maxlen=MAX_LENGTH)

In [52]:
x_val_padded = keras.preprocessing.sequence.pad_sequences(x_val_seqs, padding='post', maxlen=MAX_LENGTH)
y_val_padded = keras.preprocessing.sequence.pad_sequences(y_val_seqs, padding='post', maxlen=MAX_LENGTH)

In [56]:
# As PoS tagging is a multiclass classification task done at each timestep,
# we need to convert everty tag for every sentence into one-hot encoding.
y_train_categoricals = keras.utils.to_categorical(y_train_padded)
print(y_train_categoricals[0]) # sequence is now composed of one-hot encodings

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [57]:
# one-hot encoding for a single tag in a sequence
print(y_train_categoricals[0][0])

[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]


In [65]:
# we can determind PoS tag from oh-encoding by "look-up" in index_word dictionary from tag_tokenizer
idx = np.argmax(y_train_categoricals[0][0]) # argmax return the index of elememt having maximum value in OHencoding array
print(f"Index: {idx}")
print(f"Tag: {tag_tokenizer.index_word[idx]}")

Index: 7
Tag: adv


In [66]:
# one hot encoding to val_labels
y_val_categoricals = keras.utils.to_categorical(y_val_padded)

In [None]:
# we'll train word embedding concurrently with our model(we can pretrained word vectors as well)

[notes]
1. Ignore padding values :
The embedding layers has *mask_zero* parameter. we added padding in order to make our batches the same size, but we don't want to makd PoS predictions on padding. Setting *mask_zero* to *True* makes the layers following the embedding layer ignore padding values.

2. Return sequences not only one output :
we're using *bidriectional LSTM*. The Bidrectional layer is a wrapper to which we pass an LSTM layer. The first parameter to the LSTM layer is the number of units in the cell. The second parameter, return_sequences, control whether the RNN returns an output for each timestep or only the last output. Since we're doing PoS-tagging, we want an aoutput for each timestep and so *return_sequences* is set to *True*.

In [68]:
# For the embedding layer. "+1" to account for the padding token.
num_tokens = len(sentence_tokenizer.word_index) + 1 # +1 for padding token
embedding_dim = 128

# For the output layer, The number of classes corresponds to the number of possible tags
num_classes = len(tag_tokenizer.word_index) + 1 # also +1 for padding token

In [72]:
# we set random_set_seed and kerner_initializer parameter to get same result.
tf.random.set_seed(317)

model = keras.Sequential()

# input layer(embedding layer : each tokens -> embedding_dim )
model.add(layers.Embedding(input_dim = num_tokens,
                           output_dim = embedding_dim,
                           input_length = MAX_LENGTH,
                           mask_zero=True))

# hidden layer (bidrectional)
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True,
                                           kernel_initializer=tf.keras.initializers.random_normal(seed=317))))

# output layer for each timestep with softmax activation fucntion
model.add(layers.Dense(num_classes, activation='softmax',
                       kernel_initializer=tf.keras.initializers.random_normal(seed=317)))


model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])



In [73]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 271, 128)          6679552   
                                                                 
 bidirectional_1 (Bidirectio  (None, 271, 256)         263168    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 271, 13)           3341      
                                                                 
Total params: 6,946,061
Trainable params: 6,946,061
Non-trainable params: 0
_________________________________________________________________


[notes] \
1. The embedding layer output has three dimensions
- Batch size : None => we haven't specified it yet
- Sequence length : 217
- Embedding dimension : 128

2. Bidirectional LSTM outputs a vector twice the size of what we specified because its bidirectional. Remember two LSTM output will be concatenated before going to output layer.

3. Output layer also has three dimensions
- Batch size
- Sequence length
- Output dimension : 13 as number of tag classes


In [74]:
# we put early-stopping for trainig to be stopped when validation loss stops improving.
es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(x_train_padded, y_train_categoricals, epochs=20,
                    batch_size=256, validation_data=(x_val_padded, y_val_categoricals),
                    callbacks=[es_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [75]:
# After our model is trained, go to test data set
# preprocessing(tokenize, pad) it and oh encoding

x_test_seqs = sentence_tokenizer.texts_to_sequences(x_test)
x_test_padded = keras.preprocessing.sequence.pad_sequences(x_test_seqs, padding='post', maxlen=MAX_LENGTH)

y_test_seqs = tag_tokenizer.texts_to_sequences(y_test)
y_test_padded = keras.preprocessing.sequence.pad_sequences(y_test_seqs, padding='post', maxlen=MAX_LENGTH)
y_test_categoricals = keras.utils.to_categorical(y_test_padded)

In [76]:
model.evaluate(x_test_padded, y_test_categoricals)



[0.10725592076778412, 0.9684592485427856]

In [80]:
# now we can use our models to tag sentences.

samples = [
    "Brown refused to testify.",
    "Brown sofas are on sale",
]

In [97]:
# develop simple function for doing this task.

def tag_sentences(sentences):
  sentences_seqs = sentence_tokenizer.texts_to_sequences(sentences)
  sentences_padded = keras.preprocessing.sequence.pad_sequences(sentences_seqs,
                                                               maxlen=MAX_LENGTH,
                                                               padding='post')
  # tag_preds is each list of probabilty distribution (softmax)
  tag_preds = model.predict(sentences_padded)

  sentence_tags = []

  # each iteration is one sequence
  for i, preds in enumerate(tag_preds):

    print(preds)
    # seq of most probable ones in sequence
    tags_seq = [np.argmax(p) for p in preds[:len(sentences_seqs[i])]]
    words = [sentence_tokenizer.index_word[w] for w in sentences_seqs[i]]
    tags = [tag_tokenizer.index_word[t] for t in tags_seq]
    sentence_tags.append(list(zip(words, tags)))

  return sentence_tags

In [96]:
tagged_sample_sentences = tag_sentences(samples)

[[9.4178740e-06 9.7496110e-01 2.2204881e-04 ... 9.1031229e-04
  4.2912136e-05 7.7011995e-05]
 [1.5102107e-06 2.7067616e-04 9.9829799e-01 ... 2.1098483e-04
  3.2934058e-09 3.2704625e-06]
 [5.2168394e-07 1.3455164e-05 1.6992788e-05 ... 8.8361514e-01
  2.0139216e-05 2.6381040e-05]
 ...
 [7.4889779e-02 8.0382518e-02 7.9107232e-02 ... 7.5719878e-02
  7.5777747e-02 7.5990155e-02]
 [7.4889779e-02 8.0382518e-02 7.9107232e-02 ... 7.5719878e-02
  7.5777747e-02 7.5990155e-02]
 [7.4889779e-02 8.0382518e-02 7.9107232e-02 ... 7.5719878e-02
  7.5777747e-02 7.5990155e-02]]
[[4.6470045e-06 1.5848954e-01 8.4175648e-05 ... 5.5472274e-06
  1.8318392e-04 1.1667493e-05]
 [9.9270876e-09 9.9998748e-01 7.0956122e-07 ... 1.8804568e-07
  1.2746519e-07 4.2475389e-07]
 [2.2053248e-09 3.2962133e-05 9.9992669e-01 ... 3.1247776e-08
  7.4231480e-12 3.5437630e-08]
 ...
 [7.4889779e-02 8.0382518e-02 7.9107232e-02 ... 7.5719878e-02
  7.5777747e-02 7.5990155e-02]
 [7.4889779e-02 8.0382518e-02 7.9107232e-02 ... 7.5719878e-

In [98]:
print(tagged_sample_sentences[0])
print(tagged_sample_sentences[1])

[('brown', 'noun'), ('refused', 'verb'), ('to', 'prt'), ('testify', 'verb')]
[('brown', 'adj'), ('sofas', 'noun'), ('are', 'verb'), ('on', 'adp'), ('sale', 'noun')]


It's just one way of buidling a PoS tagger, these days' PoS tagger is much more sophisticated models in which transfomer is used.