<a href="https://colab.research.google.com/github/hbasafa/POS_tagging/blob/master/pos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from time import time

import pandas as pd
import numpy as np
import hazm
from keras import Sequential
from keras.callbacks import TensorBoard
from keras.engine import InputLayer
from keras.layers import Embedding, Dense, Bidirectional, LSTM, TimeDistributed, Activation, K
from keras.optimizers import Adam
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
import zipfile


# path_to_zip_file = "/content/CA4_data.zip"
# zip_ref = zipfile.ZipFile(path_to_zip_file, 'r')
# directory_to_extract_to = "/content"
# zip_ref.extractall(directory_to_extract_to)
# zip_ref.close()

train_path = "/content/train.txt"


def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)


def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy

    return ignore_accuracy


def generate_sample(X_data, y_data, batch_size):

  samples_per_epoch = X_data.shape[0]
  number_of_batches = samples_per_epoch/batch_size
  counter=0

  while 1:

    X_batch = np.array(X_data[batch_size*counter:batch_size*(counter+1)]).astype('float32')
    y_batch = np.array(y_data[batch_size*counter:batch_size*(counter+1)]).astype('float32')
    counter += 1
    yield X_batch,y_batch

    #restart counter to yeild data in the next epoch as well
    if counter >= number_of_batches:
        counter = 0


def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])

        token_sequences.append(token_sequence)

    return token_sequences


if __name__ == '__main__':
    # preparing data
    train_data = pd.read_csv(train_path, '\s{2,}')
    words = train_data["#"]
    tags = train_data["DELM"]

    # cropping data
    r = -1
    words = words[:r]
    tags = tags[:r]

    dummy_word = "Jafar"
    text = (' ' + dummy_word + ' ').join(words)
    sentences = hazm.sent_tokenize(text)
    X, y = [], []
    k = 0
    for sent in sentences:
        s_words = sent.split(dummy_word)
        n = len(s_words)
        X.append(words[k:k + n])
        y.append(tags[k:k + n])
        k += n

    X = np.array(X)
    y = np.array(y)

    words, tags = set(words), set(tags)

    n_split = 5
    kf = KFold(n_splits=n_split, shuffle=True)
    # (train_sentences,
    #  test_sentences,
    #  train_tags,
    #  test_tags) = train_test_split(X, y, test_size=0.2)
    total_scores = []
    
    for train_index, test_index in kf.split(X=X, y=y):
        train_sentences, test_sentences = X[train_index], X[test_index]
        train_tags, test_tags = y[train_index], y[test_index]
        # assigning number
        word2index = {w: i + 2 for i, w in enumerate(list(words))}
        word2index['-PAD-'] = 0  # The special value used for padding
        word2index['-OOV-'] = 1  # The special value used for OOVs

        tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
        tag2index['-PAD-'] = 0  # The special value used to padding

        train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

        # preparing inputs
        for s in train_sentences:
            s_int = []
            for w in s:
                try:
                    s_int.append(word2index[w])
                except KeyError:
                    s_int.append(word2index['-OOV-'])

            train_sentences_X.append(s_int)

        for s in test_sentences:
            s_int = []
            for w in s:
                try:
                    s_int.append(word2index[w])
                except KeyError:
                    s_int.append(word2index['-OOV-'])

            test_sentences_X.append(s_int)

        for s in train_tags:
            train_tags_y.append([tag2index[t] for t in s])

        for s in test_tags:
            test_tags_y.append([tag2index[t] for t in s])

        MAX_LENGTH = len(max(train_sentences_X, key=len))
        print(MAX_LENGTH)

        train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
        test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
        train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
        test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

        model = Sequential()
        model.add(InputLayer(input_shape=(MAX_LENGTH,)))
        model.add(Embedding(len(word2index), 64))
        model.add(Bidirectional(LSTM(128, return_sequences=True)))
        model.add(TimeDistributed(Dense(len(tag2index))))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(0.01),
                      metrics=['accuracy', ignore_class_accuracy(0)])

        model.summary()

        cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))

        # tensorboard = TensorBoard(log_dir='./../../logs/POS-RNN-demo-2-' + '-{}'.format(time()))

        model.fit(train_sentences_X, cat_train_tags_y,
                  batch_size=128, epochs=8,
                  validation_split=0.2)

        # model.save('model_-pos-rnn-demo-2-' + '.h5')

        # model.load_weights('model_-pos-rnn-demo-2-.h5')

        scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
        total_scores.append(scores)
        for i in range(len(scores)):
            print(f"{model.metrics_names[i]}: {scores[i] * 100}")

        # predictions = model.predict(test_sentences_X)
        # print(list(test_tags[1]))
        # print("")
        # # m = min(len(predictions), len(to_categorical(test_tags_y, len(tag2index))))-2
        # print(ignore_class_accuracy(0)(predictions, to_categorical(test_tags_y, len(tag2index))))
        # print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})[1])

    print(f"total accuracy over 5-fold: {np.mean(total_scores, axis=0)[1] * 100}")





733
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 733, 64)           3989376   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 733, 256)          197632    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 733, 40)           10280     
_________________________________________________________________
activation_1 (Activation)    (None, 733, 40)           0         
Total params: 4,197,288
Trainable params: 4,197,288
Non-trainable params: 0
_________________________________________________________________
