<a href="https://colab.research.google.com/github/jacksonliang35/Polish-POS-Tagging/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This notebook predicts the ctag of a word from character-level information using LSTM (Long Short-Term Memory) units.

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from
# this runtime. `google.colab` is Colab-specific, so this cell only runs there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the project's Code folder: the relative '../Data/...' paths used when
# loading the corpus are resolved against this directory.
# NOTE(review): the path reflects one particular Drive layout; adjust it to run elsewhere.
%cd /content/drive/MyDrive/UIUC/pos_tagging/Code

/content/drive/MyDrive/UIUC/pos_tagging/Code


In [None]:
## Imports
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

The following function parses an XML file into lists of lists (one inner list per chunk).

In [None]:
def read_data(path):
    """Parse a corpus XML file into three parallel lists of lists.

    Every child of the XML root is treated as one chunk. Inside a chunk, each
    child element whose tag is not 'ns' contributes one entry, read from its
    'orth' child and from the 'base' and 'ctag' children of its first 'lex'
    child.

    Args:
        path: Location of the XML file to parse.

    Returns:
        A tuple ``(data, label, lexicon)`` with one inner list per chunk:
        lower-cased orth forms, ctag strings (case untouched) and lower-cased
        base forms, in that order.
    """
    data, label, lexicon = [], [], []
    for chunk in ET.parse(path).getroot():
        words, tags, bases = [], [], []
        for tok in chunk:
            # 'ns' elements are skipped entirely.
            if tok.tag == 'ns':
                continue
            words.append(tok.find('orth').text.lower())
            # find() returns the first 'lex' child; both fields are read from it.
            lex = tok.find('lex')
            bases.append(lex.find('base').text.lower())
            tags.append(lex.find('ctag').text)
        data.append(words)
        label.append(tags)
        lexicon.append(bases)
    return data, label, lexicon

In [None]:
# Input data
# Each call returns (orth forms, ctags, lemmas) as parallel lists with one inner
# list per chunk. The paths are relative to the current working directory.
trdata, trlabel, trlemma = read_data('../Data/train.xml')
valdata, vallabel, vallemma = read_data('../Data/validate.xml')
tedata, telabel, telemma = read_data('../Data/test.xml')

In [None]:
# Combine training and validation data.
# No separate validation split is kept: from here on the tr* lists hold train + validation.
# NOTE: `+=` extends the lists in place, so running this cell a second time appends the
# validation set again. Re-run the data-loading cell first if this cell must be re-executed.
trdata += valdata
trlabel += vallabel
trlemma += vallemma

In [None]:
# Flatten the per-sentence lists into one word-level dataset, discarding each
# word's position in its sentence. Every word becomes a list of its characters;
# every label stays a single ctag string.
Xtrain = [list(word) for sentence in trdata for word in sentence]
ytrain = [tag for sentence in trlabel for tag in sentence]
Xtest = [list(word) for sentence in tedata for word in sentence]
ytest = [tag for sentence in telabel for tag in sentence]

In [None]:
# All characters that occur in the training words
all_chars = [c for w in Xtrain for c in w]
# Length of the longest training word (0 if there are no training words).
max_len = max((len(w) for w in Xtrain), default=0)
# Sort the distinct characters: iterating a set of strings gives a different order
# on every kernel start, which would silently change the character -> index mapping
# (and therefore the meaning of any saved embedding weights) between runs.
vocab = sorted(set(all_chars))

In [None]:
# Transform characters into indices
PAD_TOKEN, UNK_TOKEN = '<PAD>', '<UNK>'
# Strip the special tokens before rebuilding the vocabulary so that re-running
# this cell never appends them a second time; sort for a reproducible mapping.
known_chars = sorted(set(vocab) - {PAD_TOKEN, UNK_TOKEN})
vocab_set = set(known_chars)
# pad_sequences fills with 0, so index 0 must not belong to a real character:
# reserve it for padding and put the unknown-character token last.
vocab = [PAD_TOKEN] + known_chars + [UNK_TOKEN]
vocab_dict = {c: i for i, c in enumerate(vocab)}
vocab_dict_inv = {i: c for i, c in enumerate(vocab)}
unk_idx = vocab_dict[UNK_TOKEN]
# Every training character is in the vocabulary by construction.
Xtrain_enc = [[vocab_dict[c] for c in w] for w in Xtrain]
# Test words may contain characters never seen in training; map those to <UNK>.
Xtest_enc = [[vocab_dict.get(c, unk_idx) for c in w] for w in Xtest]

In [None]:
# Pad the sequences
# Training words are padded to the length of the longest training word; with the
# Keras defaults the padding value is 0 and it is added at the front of each sequence.
Xtrain_pad = keras.preprocessing.sequence.pad_sequences(Xtrain_enc)
# Test words are padded (or truncated) to the training width so that both arrays
# share the same second dimension.
Xtest_pad = keras.preprocessing.sequence.pad_sequences(Xtest_enc, maxlen=Xtrain_pad.shape[1])

In [None]:
# The lemmas go through the same process as the orth forms:
# split into characters, index, then pad.
Xtrain_lem = [list(w) for sent in trlemma for w in sent]
Xtest_lem = [list(w) for sent in telemma for w in sent]
# Orth forms and lemmas are fed to the models as parallel inputs, so they must align.
assert len(Xtrain) == len(Xtrain_lem)
assert len(Xtest) == len(Xtest_lem)

PAD_TOKEN, UNK_TOKEN = '<PAD>', '<UNK>'
all_chars_lem = [c for w in Xtrain_lem for c in w]
# Sorted so the character -> index mapping is identical on every run
# (set iteration order for strings changes between kernel starts).
lem_chars = sorted(set(all_chars_lem))
vocab_set_lem = set(lem_chars)
# pad_sequences fills with 0, so index 0 is reserved for padding instead of
# being shared with a real character; the unknown-character token goes last.
vocab_lem = [PAD_TOKEN] + lem_chars + [UNK_TOKEN]
vocab_dict_lem = {c: i for i, c in enumerate(vocab_lem)}
unk_idx_lem = vocab_dict_lem[UNK_TOKEN]

# Every training character is in the vocabulary by construction.
Xtrain_enc_lem = [[vocab_dict_lem[c] for c in w] for w in Xtrain_lem]
# Characters that only occur in test lemmas are mapped to <UNK>.
Xtest_enc_lem = [[vocab_dict_lem.get(c, unk_idx_lem) for c in w] for w in Xtest_lem]

Xtrain_pad_lem = keras.preprocessing.sequence.pad_sequences(Xtrain_enc_lem)
# Pad (or truncate) test lemmas to the training width.
Xtest_pad_lem = keras.preprocessing.sequence.pad_sequences(Xtest_enc_lem, maxlen=Xtrain_pad_lem.shape[1])

In [None]:
# Encode ctags as integer ids, then as one-hot vectors
# The encoder is fitted on the union of train and test tags so that a tag which
# only occurs in the test set can still be transformed.
ctags = list(set(ytrain + ytest))
label_enc = LabelEncoder().fit(ctags)
# Store the ids under new names and leave the raw tag lists in ytrain / ytest:
# overwriting them with arrays made `ytrain + ytest` above fail when this cell
# was run a second time.
ytrain_enc = label_enc.transform(ytrain)
ytest_enc = label_enc.transform(ytest)
num_classes = len(label_enc.classes_)
# NOTE(review): these are dense (n_words, num_classes) tensors; if memory becomes a
# problem, integer labels with a sparse categorical loss would avoid materialising them.
ytrain_oh = tf.one_hot(ytrain_enc, num_classes)
ytest_oh = tf.one_hot(ytest_enc, num_classes)

#### Building the baseline LSTM model without lemma...

In [None]:
# Baseline: character indices -> embedding -> single LSTM -> softmax over ctags.
# shape=(None,) lets the model accept index sequences of any length.
inputs = keras.Input(shape=(None,), dtype='int32')
# One trainable 128-dimensional vector per vocabulary index.
x = layers.Embedding(len(vocab), 128)(inputs)
# Without return_sequences the LSTM emits a single 64-dimensional vector per word.
x = layers.LSTM(64)(x)
# One output probability per ctag class known to the label encoder.
outputs = layers.Dense(len(label_enc.classes_), activation='softmax')(x)
model = keras.Model(inputs, outputs)
# CategoricalCrossentropy expects one-hot targets, which is what ytrain_oh / ytest_oh provide.
model.compile(optimizer="Adam", loss=keras.losses.CategoricalCrossentropy(), metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         15616     
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 926)               60190     
                                                                 
Total params: 125,214
Trainable params: 125,214
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the baseline on the padded orth-form indices for 10 epochs.
# No validation data is passed; the held-out test set is only used in the evaluate cell.
model.fit(Xtrain_pad, ytrain_oh, epochs=10)

  "Even though the `tf.config.experimental_run_functions_eagerly` "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb28d458390>

In [None]:
# Score the baseline on the test set; the result is [loss, accuracy] because
# compile() was given metrics=['accuracy'].
model.evaluate(Xtest_pad, ytest_oh)

  "Even though the `tf.config.experimental_run_functions_eagerly` "




[0.7732653021812439, 0.7509972453117371]

#### Now, try incorporating lemma into LSTM...

In [None]:
# Orth form
orth_input = keras.Input(shape=(None,), dtype='int32')
xo = layers.Embedding(len(vocab), 128)(orth_input)
xo = layers.LSTM(64)(xo)

# Word lemma
lem_input = keras.Input(shape=(None,), dtype='int32')
xl = layers.Embedding(len(vocab_lem), 128)(lem_input)
xl = layers.LSTM(64)(xl)

# Final classification
concat = layers.Concatenate()([xo, xl])
outputs = layers.Dense(len(label_enc.classes_), activation='softmax')(concat)
model2 = keras.Model([orth_input, lem_input], outputs)
model2.compile(optimizer="Adam", loss=keras.losses.CategoricalCrossentropy(), metrics=['accuracy'])
model2.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, None, 128)    15616       ['input_4[0][0]']                
                                                                                                  
 embedding_4 (Embedding)        (None, None, 128)    15488       ['input_5[0][0]']                
                                                                                            

In [None]:
# Train the two-branch model; inputs are given in the [orth, lemma] order the model was built with.
model2.fit([Xtrain_pad, Xtrain_pad_lem], ytrain_oh, epochs=10)

Epoch 1/10
    3/30394 [..............................] - ETA: 16:47 - loss: 6.8248 - accuracy: 0.0729    

  "Even though the `tf.config.experimental_run_functions_eagerly` "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd20ac3cf90>

In [None]:
# Score the two-branch model on the test set; returns [loss, accuracy].
model2.evaluate([Xtest_pad, Xtest_pad_lem], ytest_oh)

   7/7592 [..............................] - ETA: 2:10 - loss: 0.4120 - accuracy: 0.8259

  "Even though the `tf.config.experimental_run_functions_eagerly` "




[0.560661256313324, 0.8018055558204651]

Adding the lemma improves test accuracy by about 0.05 over the baseline (0.802 vs. 0.751)!

#### Using Bi-LSTM with Attention

In [None]:
# Define the Attention Network
from keras.models import *
from keras import backend as K
class Attention(keras.layers.Layer):
    """Attention pooling over the timestep axis of a (batch, timesteps, features) input.

    Each timestep t receives a scalar score ``tanh(x_t . W + b_t)``. The scores are
    softmax-normalised across timesteps and used to re-weight the input.

    Notes:
        * The bias holds one value per timestep, so the layer can only be built on
          an input whose timestep dimension is known (not ``None``).
        * No mask is applied: padded timesteps take part in the softmax like any other.

    Args:
        return_sequences: If True, return the re-weighted sequence with shape
            (batch, timesteps, features). If False (default), sum it over the
            timestep axis and return shape (batch, features).
        **kwargs: Standard ``Layer`` keyword arguments such as ``name`` or ``dtype``.
    """

    def __init__(self, return_sequences=False, **kwargs):
        # Initialise the base Layer before setting any state on the instance, and
        # forward the standard Layer arguments instead of rejecting them.
        super(Attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the score weight (features, 1) and the per-timestep bias (timesteps, 1)."""
        if input_shape[-2] is None:
            raise ValueError(
                "Attention needs a fixed number of timesteps because its bias has "
                "one entry per timestep; got an input with an unknown timestep dimension."
            )
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[-2], 1), initializer="zeros")
        super(Attention, self).build(input_shape)

    def call(self, x):
        """Apply attention to ``x`` and return the weighted sequence or its sum over time."""
        # Contract the feature axis of x with W: (batch, T, F) . (F, 1) -> (batch, T, 1).
        # tf ops are used directly so the layer does not depend on the standalone
        # `keras` backend while subclassing a tf.keras layer.
        e = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        # Normalise across timesteps (axis 1) so the weights of one word sum to 1.
        a = tf.nn.softmax(e, axis=1)
        output = x * a
        if self.return_sequences:
            return output
        return tf.reduce_sum(output, axis=1)

    def get_config(self):
        """Return the constructor arguments so a model using this layer can be serialised."""
        config = super(Attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

In [None]:
# Orth form
orth_input = keras.Input(shape=(Xtrain_pad.shape[1],), dtype='int32')
xo = layers.Embedding(len(vocab), 128)(orth_input)
xo = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(xo)
xo = Attention()(xo)

# Word lemma
lem_input = keras.Input(shape=(Xtrain_pad_lem.shape[1],), dtype='int32')
xl = layers.Embedding(len(vocab_lem), 128)(lem_input)
xl = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(xl)
xl = Attention()(xl)

# Final classification
concat = layers.Concatenate()([xo, xl])
outputs = layers.Dense(len(label_enc.classes_), activation='softmax')(concat)
model4 = keras.Model([orth_input, lem_input], outputs)
model4.compile(optimizer="Adam", loss=keras.losses.CategoricalCrossentropy(), metrics=['accuracy'])
model4.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 54)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 54)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 54, 128)      15616       ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 54, 128)      15488       ['input_2[0][0]']                
                                                                                              

In [None]:
# Train the Bi-LSTM + attention model with the same data and epoch count as the previous model.
model4.fit([Xtrain_pad, Xtrain_pad_lem], ytrain_oh, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0e3f79df10>

In [None]:
# Score the Bi-LSTM + attention model on the test set; returns [loss, accuracy].
model4.evaluate([Xtest_pad, Xtest_pad_lem], ytest_oh)



[0.5255862474441528, 0.8068938255310059]

Adding the attention mechanism gives almost the same accuracy (+0.005, i.e. 0.807 vs. 0.802). This is probably because words are much shorter than sentences, where attention is more powerful.