In [35]:
from numpy import array
import tensorflow as tf
import glob
import numpy as np
import pickle
from datetime import datetime
import nltk 

from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from tensorflow.keras.utils import Progbar
from tensorflow.keras.models import load_model

from tensorflow.keras.initializers import RandomUniform


In [4]:
class LoadData():
    """Locate and parse the benchmarking data files.

    Each data file is read as one token per line, with space-separated fields
    (token first, POS tag second, label last). A blank line closes a sentence.
    """

    def __init__(self):
        # Both attributes stay None until get_data() is called.
        self.train_files = None
        self.validation_files = None

    def get_data(self):
        """Store the train and validation file paths on the instance."""
        # glob returns paths in arbitrary order; sorting keeps the order of the
        # parsed sentences stable from one run to the next.
        self.train_files = sorted(glob.glob("benchmarking_data/Train//*.txt"))
        self.validation_files = sorted(glob.glob("benchmarking_data/Validate//*.txt"))

    def sentence_from_file(self, filename, encoding="utf-8"):
        """Parse one data file into a list of sentences.

        Args:
            filename: Path of the file to read.
            encoding: Text encoding used to open the file.

        Returns:
            A list of sentences. Each sentence is a list of
            ``(token, pos_tag, label)`` tuples built from the first, second
            and last space-separated field of every non-blank line.

        Raises:
            IndexError: If a non-blank line has fewer than two fields.
        """
        single_data_list = []
        sentence_list = []
        with open(filename, encoding=encoding) as fp:
            for line in fp:
                if not line.strip():
                    # Blank line marks a sentence boundary. Nothing is emitted
                    # when no tokens are pending, so repeated blank lines do
                    # not create empty sentences.
                    if sentence_list:
                        single_data_list.append(sentence_list)
                        sentence_list = []
                    continue
                splits = line.rstrip('\n').split(' ')
                sentence_list.append((splits[0], splits[1], splits[-1]))
        # A file that does not end with a blank line still has a pending
        # sentence; flush it so the last sentence is not lost.
        if sentence_list:
            single_data_list.append(sentence_list)
        return single_data_list

    def addCharInformatioin(self, Sentences):
        """Expand every token entry in place to ``[token, chars, pos_tag, label]``.

        Args:
            Sentences: List of sentences whose items are indexable as
                ``(token, pos_tag, label)``. The inner lists are mutated.

        Returns:
            The same ``Sentences`` object that was passed in.
        """
        for sentence in Sentences:
            for j, data in enumerate(sentence):
                sentence[j] = [data[0], list(data[0]), data[1], data[2]]
        return Sentences

    def prepared_data(self, files):
        """Parse every file in ``files`` and return all sentences in one flat list."""
        list_sentences = []
        for each_file in files:
            list_sentences.extend(self.sentence_from_file(each_file))
        return list_sentences
    

In [5]:
# Discover the data files, then parse the train and validation sets into
# lists of sentences (each sentence is a list of (token, pos_tag, label) tuples).
load_data_obj = LoadData()
load_data_obj.get_data()
trained_sen_list = load_data_obj.prepared_data(load_data_obj.train_files)
validation_sen_list = load_data_obj.prepared_data(load_data_obj.validation_files)
# Preview the first five parsed training sentences.
print(trained_sen_list[:5])

[[('book', 'NN', 'O'), ('The', 'DT', 'B-restaurant_name'), ('Middle', 'NNP', 'I-restaurant_name'), ('East', 'NNP', 'I-restaurant_name'), ('restaurant', 'NN', 'B-restaurant_type'), ('in', 'IN', 'O'), ('IN', 'NNP', 'B-state'), ('for', 'IN', 'O'), ('noon', 'NN', 'B-timeRange')], [('Book', 'VB', 'O'), ('a', 'DT', 'O'), ('table', 'NN', 'O'), ('at', 'IN', 'O'), ('T-Rex', 'NNP', 'B-restaurant_name'), ('distant', 'NN', 'B-spatial_relation'), ('from', 'IN', 'O'), ('Halsey', 'NNP', 'B-poi'), ('St', 'NNP', 'I-poi'), ('.', '.', 'O')], [("I'd", 'NNP', 'O'), ('like', 'IN', 'O'), ('to', 'TO', 'O'), ('eat', 'VB', 'O'), ('at', 'IN', 'O'), ('a', 'DT', 'O'), ('taverna', 'NN', 'B-restaurant_type'), ('that', 'WDT', 'O'), ('serves', 'VBZ', 'O'), ('chili', 'JJ', 'B-served_dish'), ('con', 'NN', 'I-served_dish'), ('carne', 'NN', 'I-served_dish'), ('for', 'IN', 'O'), ('a', 'DT', 'O'), ('party', 'NN', 'O'), ('of', 'IN', 'O'), ('10', 'CD', 'B-party_size_number')], [('I', 'PRP', 'O'), ('have', 'VBP', 'O'), ('a', '

In [22]:
class Preprocessing():
    """Build vocabularies and padded index arrays for the tagging model.

    Sentences are lists of ``(token, pos_tag, label)`` tuples.
    """

    def __init__(self, sentences=None):
        """Initialise ``max_len`` with the token count of the longest sentence.

        Args:
            sentences: Optional list of sentences to measure. When omitted,
                the notebook-level ``trained_sen_list`` is used.
        """
        if sentences is None:
            sentences = trained_sen_list
        # Compare by length: max() on a list of lists compares the sentences
        # lexicographically, which does not find the longest one.
        self.max_len = max((len(sentence) for sentence in sentences), default=0)

    @staticmethod
    def _index_with_specials(items):
        """Return ``(index, size)`` for ``items`` plus the "PAD"/"UNK" entries."""
        # Sorting gives every item the same index on every run; iterating a
        # set of strings directly can change order between interpreter runs.
        index = {item: i for i, item in enumerate(sorted(set(items)))}
        size = len(index)
        index["PAD"] = size
        index["UNK"] = size + 1
        # size + 2 stays above the largest index even when the data itself
        # contains a literal "PAD" or "UNK" item.
        return index, size + 2

    def make_data(self, data_list):
        """Create the word, POS-tag and label vocabularies from ``data_list``.

        Sets ``word2idx``/``num_words``, ``pos_tag2idx``/``num_pos_tags`` and
        ``label2idx``/``num_lables`` (``num_labels`` is an alias).
        """
        words = [item[0] for sent in data_list for item in sent]
        pos_tags = [item[1] for sent in data_list for item in sent]
        labels = sorted({item[2] for sent in data_list for item in sent})

        self.word2idx, self.num_words = self._index_with_specials(words)
        self.pos_tag2idx, self.num_pos_tags = self._index_with_specials(pos_tags)

        self.label2idx = {t: i for i, t in enumerate(labels)}
        # The misspelled attribute name is kept because other cells read it.
        self.num_lables = len(self.label2idx)
        self.num_labels = self.num_lables

    def word2features(self, data, word_dict):
        """Return ``[word index, islower, isupper, istitle, isdigit, POS index]``.

        The four string predicates are encoded as 0 for True and 1 for False.
        """
        word = data[0]
        postag = data[1]
        binary_map = {True: 0, False: 1, None: 2}
        return [
            word_dict[word],
            binary_map[word.islower()],
            binary_map[word.isupper()],
            binary_map[word.istitle()],
            binary_map[word.isdigit()],
            self.pos_tag2idx[postag],
        ]

    def sent2features(self, sent, word_dict):
        """Return the ``word2features`` vector of every token in ``sent``."""
        return [self.word2features(item, word_dict) for item in sent]

    def sent2labels(self, sent):
        """Return the label of every token in ``sent``."""
        return [label for token, postag, label in sent]

    def sent2tokens(self, sent):
        """Return the token text of every entry in ``sent``."""
        return [token for token, postag, label in sent]

    def create_data(self, data_list):
        """Convert sentences to padded index sequences.

        Also stores ``data_list`` as ``self.sentences`` and overwrites
        ``self.max_len`` with the longest sentence length in ``data_list``.

        Returns:
            ``((words, pos), labels)``, each produced by ``pad_sequences`` with
            post-padding up to the longest sentence in ``data_list``.
        """
        self.sentences = data_list
        maxlen = max(len(item) for item in data_list)
        self.max_len = maxlen

        # Tokens and tags missing from the vocabularies map to "UNK", the same
        # fallback Prediction.predict applies.
        word_unk = self.word2idx["UNK"]
        pos_unk = self.pos_tag2idx["UNK"]

        wd = [[self.word2idx.get(w[0], word_unk) for w in s] for s in data_list]
        wd = pad_sequences(maxlen=maxlen, sequences=wd, padding="post", value=self.word2idx["PAD"])

        pos = [[self.pos_tag2idx.get(w[1], pos_unk) for w in s] for s in data_list]
        pos = pad_sequences(maxlen=maxlen, sequences=pos, padding="post", value=self.pos_tag2idx["PAD"])

        # Padding positions receive the index of the "O" label.
        y = [[self.label2idx[w[2]] for w in s] for s in data_list]
        y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=self.label2idx["O"])
        return (wd, pos), y

In [23]:
# Build the vocabularies from train + validation sentences, then convert only
# the training sentences into padded index arrays.
preprocess_obj = Preprocessing()
preprocess_obj.make_data(trained_sen_list+validation_sen_list)
# x_train is the (words, pos) pair returned by create_data; y_train holds the label indices.
x_train,y_train = preprocess_obj.create_data(trained_sen_list)

In [44]:
class MyCallback(tf.keras.callbacks.Callback):
    """Stop training once a monitored metric reaches a baseline value.

    Args:
        monitor: Key to read from the per-epoch ``logs`` dict.
        baseline: Training stops when the monitored value is >= this number.
    """

    def __init__(self, monitor='acc', baseline=0.95):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.monitor = monitor
        self.baseline = baseline
        self.training_stop = False
        self.history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': []}

    def on_train_begin(self, logs=None):
        """Reset the recorded history and the stop flag for a new fit."""
        self.history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': []}
        # Without this reset, a callback that already triggered once would
        # stop any later fit after its first epoch.
        self.training_stop = False

    def on_epoch_end(self, epoch, logs=None):
        """Record the epoch's metrics and request a stop at the baseline."""
        logs = logs or {}
        for name, values in self.history.items():
            if name in logs:
                values.append(logs[name])

        # The monitored key can be absent (for example a validation metric
        # when no validation data is used); skip the comparison in that case.
        current = logs.get(self.monitor)
        if current is not None and current >= self.baseline:
            print("\nReached %2.2f%% accuracy, so stopping training!!" %(self.baseline*100))
            self.training_stop = True

        if self.training_stop:
            self.model.stop_training = True


class CreateModel():
    """Build, fit and save the word + POS BiLSTM tagging model.

    The constructor reads the notebook-level ``x_train``, ``y_train`` and
    ``preprocess_obj``, so those must exist before an instance is created.
    """

    def __init__(self):
        self.model = None
        self.history = None
        self.x_train = x_train
        self.y_train = y_train
        self.max_len = preprocess_obj.max_len
        self.num_words = preprocess_obj.num_words
        self.num_labels = preprocess_obj.num_lables
        # Identity matrix: used as fixed embedding weights, so each POS index
        # is represented by a one-hot row.
        self.posEmbeddings = np.identity(len(preprocess_obj.pos_tag2idx), dtype='float32')

    def train(self):
        """Build and compile the model; fitting is done by ``run``.

        Word indices go through a 50-dimensional embedding and POS indices
        through the frozen one-hot embedding. The two are concatenated and fed
        to a bidirectional LSTM, followed by a per-timestep softmax over the
        labels.
        """
        word_input = Input(shape=(self.max_len,))
        word_model = Embedding(input_dim=self.num_words, output_dim=50, input_length=self.max_len)(word_input)

        pos_input = Input(shape=(None,), dtype='int32')
        pos_model = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)

        output = concatenate([word_model, pos_model])

        output = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(output)
        output = TimeDistributed(Dense(self.num_labels, activation="softmax"))(output)

        self.model = Model(inputs=[word_input, pos_input], outputs=[output])
        self.model.compile(loss="sparse_categorical_crossentropy", optimizer='nadam',metrics=["acc"])

    def run(self,batch_size=32,epoch=5):
        """Fit the compiled model and store the result in ``self.history``.

        Args:
            batch_size: Batch size passed to ``fit``.
            epoch: Maximum number of epochs passed to ``fit``.

        ``train()`` must have been called first so that ``self.model`` exists.
        """
        logdir = "logs_tensorboard"
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')

        self.history = self.model.fit(self.x_train, self.y_train,
                                     batch_size=batch_size, epochs=epoch,
                                     validation_split=0.1,callbacks=[early_stopping,tensorboard_callback],
                                     verbose=1)

    def save_model(self,model_file):
        """Save the model to ``model_file``, creating its parent directory if needed."""
        # Local import keeps this block self-contained.
        import os

        # Create the target directory up front so a missing folder cannot
        # make the save fail after training has already finished.
        target_dir = os.path.dirname(model_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        self.model.save(model_file)

In [45]:
# Build and compile the network (train() only constructs it), then fit it.
model_obj = CreateModel()
model_obj.train()
# epoch=100 is an upper bound: run() attaches an EarlyStopping callback on val_loss.
model_obj.run(batch_size=32,epoch=100)
model_obj.save_model("models/simple_ner_model_v2.h5")

Train on 12405 samples, validate on 1379 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [40]:
class Prediction():
    """Tag whitespace-tokenised texts with the trained model.

    The constructor reads the notebook-level ``preprocess_obj`` and
    ``model_obj``, so both must exist before an instance is created.
    """

    def __init__(self):
        self.word2idx = preprocess_obj.word2idx
        self.pos_tag2idx = preprocess_obj.pos_tag2idx
        # Inverse of label2idx, used to turn predicted indices back into labels.
        self.idx2label = {v: k for k,v in preprocess_obj.label2idx.items()}
        self.model = model_obj.model
        self.max_len = preprocess_obj.max_len

    def predict(self,texts):
        """Predict one label per token for every text in ``texts``.

        Each text is split on whitespace and POS-tagged with ``nltk.pos_tag``.
        Tokens beyond ``self.max_len`` are dropped. The ``(token, label)``
        pairs of every text are printed.

        Args:
            texts: Iterable of strings.

        Returns:
            A list with one list of label strings per input text, aligned with
            that text's (possibly truncated) tokens.
        """
        word_pad, word_unk = self.word2idx["PAD"], self.word2idx["UNK"]
        pos_pad, pos_unk = self.pos_tag2idx["PAD"], self.pos_tag2idx["UNK"]

        label_lists = list()
        for text in texts:
            words = text.split()
            # nltk.pos_tag yields (token, tag) pairs; pos_tag2idx is keyed by
            # the tag string alone, so only the tag is looked up.
            tags = [tag for _, tag in nltk.pos_tag(words)] if words else []

            # truncating="post" keeps the first max_len tokens, so position i
            # of the model input always corresponds to words[i].
            wd = [[self.word2idx.get(word, word_unk) for word in words]]
            wd = pad_sequences(maxlen=self.max_len, sequences=wd,
                          padding="post", truncating="post", value=word_pad)

            pos = [[self.pos_tag2idx.get(tag, pos_unk) for tag in tags]]
            pos = pad_sequences(maxlen=self.max_len, sequences=pos,
                          padding="post", truncating="post", value=pos_pad)

            y_pred = self.model.predict([wd,pos])
            pred_index = np.argmax(y_pred, axis=-1)
            preds = pred_index.flatten().tolist()

            # Keep only the predictions that belong to real tokens; the rest
            # are outputs for padding positions.
            kept_words = words[:self.max_len]
            labels = [self.idx2label[ind] for ind in preds[:len(kept_words)]]
            label_lists.append(labels)

            print(list(zip(kept_words, labels)))
        return label_lists
    

In [43]:
# Tag a sample utterance; predict() prints the (token, label) pairs it finds.
pred_obj = Prediction()
text = "Play the last track from Beyonce off Spotify"
y_pred = pred_obj.predict([text])

[('Play', 'O'), ('the', 'O'), ('last', 'B-sort'), ('track', 'B-music_item'), ('from', 'O'), ('Beyonce', 'O'), ('off', 'O'), ('Spotify', 'B-service')]
