In [60]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from collections import Counter
import seaborn as sn
import matplotlib.pyplot as plt
from keras import backend as K


%matplotlib inline

In [12]:
fi = open('./Brown_train.txt')

sent_db = []
tag_db = []



for line in fi:
    line = line.rstrip()
    temp_line = line.split(' ')
    temp_sent_db = []
    temp_tag_db = []
    
    for word in temp_line:
        temp_word = word.split('/')    
        temp_sent_db.append(temp_word[0])
        temp_tag_db.append(temp_word[1])
    
    sent_db.append(temp_sent_db)
    tag_db.append(temp_tag_db)

In [16]:
print(len(sent_db), len(tag_db))
print(len(sent_db[0]), len(tag_db[0]))

27491 27491
15 15


In [26]:
def makeVocab(data):
    vocab = []
    for d in data:
        for w in d:
            vocab.append(w)
    
    return sorted(set(vocab))

In [27]:
def getIndexInVocab(vocab, word):
    if word not in vocab:
        return 0
    return vocab.index(word) + 2

In [28]:
def padding(sentence, padLen):
    pad_sent = []
    i = 0
    for s in sentence:
        pad_sent.append(s)
        i += 1
        if i == padLen:
            break
    
    rem_len = padLen - len(pad_sent)
    for i in range(rem_len):
        pad_sent.append(1)
    
    return pad_sent

In [29]:
def makeVectorized(list_sent, vocab, maxlen):
    ans = []

    for l in list_sent:
        sent = []
        for w in l:
            sent.append(getIndexInVocab(vocab, w))
        
        sent = padding(sent, maxlen)
        ans.append(sent)
    
    return ans

In [34]:
def to_categorical(tags, categories):
    cat_sequences = []
    for s in tags:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [61]:
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [38]:
MAX_LENGTH = 60

In [30]:
X_train, X_test, y_train, y_test = train_test_split(sent_db, tag_db, test_size=0.2)

In [33]:
sent_vocab = makeVocab(X_train)
tag_vocab = makeVocab(y_train)
x_train_vectorised = makeVectorized(X_train, sent_vocab, MAX_LENGTH)
x_test_vectorised = makeVectorized(X_test,sent_vocab, MAX_LENGTH)

y_train_vectorised = makeVectorized(y_train, tag_vocab, MAX_LENGTH)
y_test_vectorised = makeVectorized(y_test,tag_vocab, MAX_LENGTH)

In [46]:
x_train_vectorised = np.array(x_train_vectorised)
x_test_vectorised = np.array(x_test_vectorised)

In [44]:
y_train_ohe = to_categorical(y_train_vectorised, len(tag_vocab) + 2)
y_test_ohe = to_categorical(y_test_vectorised, len(tag_vocab) + 2)
y_train_ohe.shape

(21992, 60, 39)

In [70]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(sent_vocab) + 2, 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag_vocab) + 2)))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0), 'categorical_accuracy'])
 
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 60, 128)           3873408   
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 60, 512)           788480    
_________________________________________________________________
time_distributed_10 (TimeDis (None, 60, 39)            20007     
_________________________________________________________________
activation_10 (Activation)   (None, 60, 39)            0         
Total params: 4,681,895
Trainable params: 4,681,895
Non-trainable params: 0
_________________________________________________________________


In [71]:
model.fit(x_train_vectorised, y_train_ohe, batch_size=128, epochs=5, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 17593 samples, validate on 4399 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a6543ab50>

In [72]:
scores = model.evaluate(x_test_vectorised,y_test_ohe)



In [73]:
scores

[0.040078302439228365,
 0.9876765608787537,
 0.9876783490180969,
 0.9876765608787537]