In [None]:

def logits_to_tokens(sequences, index):
    """Convert per-position score vectors into tokens.

    Args:
        sequences: iterable of sequences; each element of a sequence is a
            vector of scores (anything ``np.argmax`` accepts).
        index: lookup (indexable by integer position) that maps the position
            of the highest score to a token.

    Returns:
        A list with one list of tokens per input sequence.
    """
    return [
        [index[np.argmax(score_vector)] for score_vector in score_sequence]
        for score_sequence in sequences
    ]


def to_categorical(sequences, categories):
    """One-hot encode every integer label in every sequence.

    Args:
        sequences: iterable of sequences of integer labels.
        categories: length of each one-hot vector.

    Returns:
        ``np.array`` built from the nested list of one-hot vectors; each
        vector is all zeros except for a 1.0 at the label's position.
    """
    def _one_hot(label):
        # Start from an all-zero vector and switch on the label's slot.
        vector = np.zeros(categories)
        vector[label] = 1.0
        return vector

    return np.array([[_one_hot(label) for label in labels] for labels in sequences])


def get_words(sentences):
    """Return the set of distinct words found across all sentences.

    Args:
        sentences: iterable of sentences, each an iterable of words.

    Returns:
        A ``set`` containing every word that occurs at least once.
    """
    return {word for sentence in sentences for word in sentence}


def get_tags(sentences_tags):
    """Return the set of distinct tags found across all tag sequences.

    Args:
        sentences_tags: iterable of tag sequences (one sequence per sentence).

    Returns:
        A ``set`` containing every tag that occurs at least once.
    """
    unique_tags = set()
    for tag_sequence in sentences_tags:
        # update() adds each element of the sequence individually.
        unique_tags.update(tag_sequence)
    return unique_tags


def get_train_sentences_x(train_sentences, word2index):
    """Encode training sentences as lists of vocabulary indices.

    Args:
        train_sentences: iterable of sentences, each an iterable of words.
        word2index: mapping from word to integer index; the ``'-OOV-'`` entry
            is used for any word that raises ``KeyError`` on lookup.

    Returns:
        A list with one list of indices per sentence.
    """
    def _lookup(word):
        # Fall back to the out-of-vocabulary index only when the word is missing.
        try:
            return word2index[word]
        except KeyError:
            return word2index['-OOV-']

    return [[_lookup(word) for word in sentence] for sentence in train_sentences]


def get_test_sentences_x(test_sentences, word2index):
    """Encode test sentences as lists of vocabulary indices.

    Args:
        test_sentences: iterable of sentences, each an iterable of words.
        word2index: mapping from word to integer index; the ``'-OOV-'`` entry
            is used for any word that raises ``KeyError`` on lookup.

    Returns:
        A list with one list of indices per sentence.
    """
    def _encode(sentence):
        indices = []
        for token in sentence:
            try:
                token_index = word2index[token]
            except KeyError:
                # Unknown word: use the out-of-vocabulary index instead.
                token_index = word2index['-OOV-']
            indices.append(token_index)
        return indices

    return list(map(_encode, test_sentences))


def get_train_tags_y(train_tags, tag2index):
    """Encode training tag sequences as lists of tag indices.

    Args:
        train_tags: iterable of tag sequences (one per sentence).
        tag2index: mapping from tag to integer index.

    Returns:
        A list with one list of indices per tag sequence.

    Raises:
        KeyError: if a tag is not present in ``tag2index``.
    """
    return [[tag2index[tag] for tag in tag_sequence] for tag_sequence in train_tags]


def get_test_tags_y(test_tags, tag2index):
    """Encode test tag sequences as lists of tag indices.

    Args:
        test_tags: iterable of tag sequences (one per sentence).
        tag2index: mapping from tag to integer index.

    Returns:
        A list with one list of indices per tag sequence.

    Raises:
        KeyError: if a tag is not present in ``tag2index``.
    """
    def _encode(tag_sequence):
        return [tag2index[tag] for tag in tag_sequence]

    return list(map(_encode, test_tags))
import codecs
import numpy as np
from sklearn.model_selection import train_test_split



import ast

# Location of the tagged corpus. Each non-blank line is parsed below as the
# Python-literal form of one sentence: a list of (word, tag) pairs.
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/pos.txt"
# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 42

# The context manager closes the file handle as soon as the lines are read.
with open(DATA_PATH, encoding="utf-8") as corpus_file:
    # Skip blank lines: ast.literal_eval cannot parse an empty line.
    tagged_sentences = [line for line in corpus_file if line.strip()]

print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))

sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    # literal_eval only accepts Python literals, so corpus text is never executed.
    # zip(*pairs) separates the (word, tag) pairs into a word tuple and a tag tuple.
    sentence_words, sentence_pos_tags = zip(*ast.literal_eval(tagged_sentence))
    sentences.append(np.array(sentence_words))
    sentence_tags.append(np.array(sentence_pos_tags))

(train_sentences, test_sentences, train_tags, test_tags) = train_test_split(
    sentences, sentence_tags, test_size=0.1, random_state=SPLIT_SEED)

# The vocabulary and tag set are built from the training split only.
words = get_words(train_sentences)
tags = get_tags(train_tags)
print(tags)

[('اشتتیاق', 'NN'), ('اور', 'CC'), ('ملائکہ', 'NN'), ('ہی', 'I'), ('ببانگِ', 'NN'), ('دہل', 'PN'), ('موجود', 'ADJ'), ('ہیں', 'VB'), ('اس', 'PD'), ('وقت', 'NN'), ('تو', 'I'), ('۔', 'SM')]

Tagged sentences:  4
{'ADJ', 'TA', 'CC', 'AA', 'P', 'I', 'SM', 'PM', 'PN', 'NEG', 'AKP', 'CA', 'PD', 'VB', 'NN'}


In [None]:
# Index 0 is reserved for padding and index 1 for out-of-vocabulary words,
# so real words start at 2.
# Sorting makes the mapping deterministic: a set of strings has no stable
# iteration order across interpreter runs (string hashing is randomised per
# process), so indices built straight from the set would change on every
# kernel restart and would not match a previously trained model.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}
word2index['-PAD-'] = 0
word2index['-OOV-'] = 1

# Index 0 is reserved for padding, so real tags start at 1.
tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0

In [None]:
# Words -> vocabulary indices for both splits.
train_sentences_x, test_sentences_x = (
    get_train_sentences_x(train_sentences, word2index),
    get_test_sentences_x(test_sentences, word2index),
)

# Show the tag -> index mapping used for the label encoding below.
print(tag2index)

# Tags -> tag indices for both splits.
train_tags_y, test_tags_y = (
    get_train_tags_y(train_tags, tag2index),
    get_test_tags_y(test_tags, tag2index),
)

{'ADJ': 1, 'TA': 2, 'CC': 3, 'AA': 4, 'P': 5, 'I': 6, 'SM': 7, 'PM': 8, 'PN': 9, 'NEG': 10, 'AKP': 11, 'CA': 12, 'PD': 13, 'VB': 14, 'NN': 15, '-PAD-': 0}


In [None]:
from keras.preprocessing.sequence import pad_sequences

# Length of the longest encoded training sentence; every sequence is padded
# (or cut) to this length.
MAX_LENGTH = max(map(len, train_sentences_x))

# Apply the same post-padding to inputs and labels of both splits.
train_sentences_x, test_sentences_x, train_tags_y, test_tags_y = (
    pad_sequences(encoded, maxlen=MAX_LENGTH, padding='post')
    for encoded in (train_sentences_x, test_sentences_x, train_tags_y, test_tags_y)
)

In [None]:
import tensorflow as tf
# Take the layers and the optimizer from the Keras that ships with TensorFlow,
# so every part of the model comes from the same implementation as
# tf.keras.Sequential (mixing in the standalone `keras` package can fail when
# the two installations differ).
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam

# Per-token tagger: embed each word index, pass it through one hidden layer,
# then predict a probability distribution over the tag indices at each position.
model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(MAX_LENGTH,)))
model.add(tf.keras.layers.Embedding(len(word2index), 128))
# The hidden layer needs a non-linearity; without one, two stacked Dense
# layers are equivalent to a single linear layer.
model.add(Dense(128, activation='relu'))
model.add(Dense(len(tag2index)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])
# NOTE(review): no mask is applied, so padded positions appear to count
# towards the loss and the accuracy — confirm whether that is intended.
history = model.fit(train_sentences_x, to_categorical(train_tags_y, len(tag2index)), batch_size=32, epochs=10,
                    validation_split=0.2).history
# model.save("../models/mlp.h5")

# evaluate() returns [loss, accuracy] for the metrics configured in compile().
scores = model.evaluate(test_sentences_x, to_categorical(test_tags_y, len(tag2index)))
print("Test loss: {:.4f}  Test accuracy: {:.4f}".format(*scores))
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 128)           7552      
_________________________________________________________________
dense (Dense)                (None, 30, 128)           16512     
_________________________________________________________________
dense_1 (Dense)              (None, 30, 16)            2064      
_________________________________________________________________
activation (Activation)      (None, 30, 16)            0         
Total params: 26,128
Trainable params: 26,128
Non-trainable params: 0
_________________________________________________________________
