For the data, I used https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines
After downloading the data, run data_making.py.

In [None]:
from numpy import array
import tensorflow as tf
import glob
import numpy as np
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from tensorflow.keras.utils import Progbar
from tensorflow.keras.models import load_model

from tensorflow.keras.initializers import RandomUniform


In [None]:
class LoadData():
    """Locate the training and validation text files on disk.

    Both attributes are ``None`` until :meth:`get_data` has been called.
    """

    def __init__(self):
        # Lists of file paths, filled in by get_data().
        self.train_files = None
        self.validation_files = None

    def get_data(self):
        """Collect the ``*.txt`` files of the train and validation folders.

        The patterns are relative to the current working directory.
        ``glob.glob`` yields matches in arbitrary order, so the results are
        sorted to make the order of files (and therefore of sentences)
        reproducible between runs and machines.
        """
        self.train_files = sorted(glob.glob("benchmarking_data/Train//*.txt"))
        self.validation_files = sorted(glob.glob("benchmarking_data/Validate//*.txt"))

In [None]:
# Create the file locator and resolve the train / validation file lists
# (stored on load_data_obj.train_files and load_data_obj.validation_files).
load_data_obj = LoadData()
load_data_obj.get_data()

In [None]:
class Preprocessing():
    """Convert token files into index lists that the tagger can consume.

    Input files hold one token per line with space-separated fields and a
    blank line between sentences.  The methods here read those files, attach
    per-character information, build the lookup dictionaries and translate
    every sentence into lists of integer ids.
    """

    def __init__(self, embeddings_path="embeddings/glove.6B.100d.txt"):
        """Open the pre-trained word-embedding text file.

        Args:
            embeddings_path: Path of a text file with one token per line
                followed by its space-separated vector components.

        Raises:
            OSError: If the file cannot be opened.
        """
        # The handle stays open for the lifetime of the object; it is read
        # (and rewound) by get_word_embeddings().
        self.word_embediings_model = open(embeddings_path, encoding="utf-8")

    def sentence_from_file(self,filename):
        """Read one data file and return its sentences.

        Args:
            filename: Path of the file to read.

        Returns:
            A list of sentences; each sentence is a list of
            ``[first_field, second_field, last_field]`` entries.  The last
            field always ends with ``"\\n"`` because the rest of the pipeline
            uses the newline-terminated text as the label key.

        Raises:
            IndexError: If a non-blank line has fewer than two fields.
        """
        single_file_sentences = []
        sentence_list = []
        # NOTE(review): the platform default encoding is used, as before —
        # confirm it matches how the data files were written.
        with open(filename) as f:
            for line in f:
                # Whitespace-only lines separate sentences; treating them as
                # tokens would yield an empty word.
                if not line.strip():
                    if sentence_list:
                        single_file_sentences.append(sentence_list)
                        sentence_list = []
                    continue
                splits = line.split(' ')
                label = splits[-1]
                # A final line without a trailing newline would otherwise
                # produce a label key different from all the others.
                if not label.endswith("\n"):
                    label += "\n"
                sentence_list.append([splits[0], splits[1], label])

        if sentence_list:
            single_file_sentences.append(sentence_list)
        return single_file_sentences

    def get_case_value(self,word, case_dict):
        """Return the casing-feature id of ``word``.

        Args:
            word: The token text.
            case_dict: Mapping from casing name (``'number'``, ``'fraction'``,
                ``'lower'``, ``'upper'``, ``'title'``, ``'leters_digit'``,
                ``'other'``) to id.

        Returns:
            ``case_dict`` value of the first matching category; ``'other'``
            when nothing matches or the word is empty.
        """
        if not word:
            # Guard against dividing by len(word) == 0 below.
            return case_dict['other']

        count_digits = sum(1 for char in word if char.isdigit())

        if word.isdigit():
            case_value = 'number'
        elif count_digits / float(len(word)) > 0.5:
            case_value = 'fraction'
        elif word.islower():
            case_value = 'lower'
        elif word.isupper():
            case_value = 'upper'
        elif word[0].isupper():
            case_value = 'title'
        elif count_digits > 0:
            case_value = 'leters_digit'
        else:
            case_value = 'other'

        return case_dict[case_value]

    def createBatches(self,data):
        """Group samples by sentence length.

        Args:
            data: Samples whose first element is the list of word ids.

        Returns:
            ``(batches, batch_len)`` where ``batches`` holds all samples
            ordered by ascending sentence length (original order is kept
            inside each length) and ``batch_len`` holds the cumulative end
            offset of every length group inside ``batches``.
        """
        buckets = {}
        for sample in data:
            buckets.setdefault(len(sample[0]), []).append(sample)

        batches = []
        batch_len = []
        # Sorted lengths give a deterministic order; a single pass over the
        # data replaces one scan per distinct length.
        for length in sorted(buckets):
            batches.extend(buckets[length])
            batch_len.append(len(batches))
        return batches,batch_len

    def create_tensors(self,sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id):
        """Translate sentences into lists of integer ids.

        Args:
            sentences: Sentences made of ``(word, chars, pos, label)`` entries.
            word_to_id: Word lookup; must contain ``'UNK_TKN'``.
            case_to_id: Casing lookup passed to :meth:`get_case_value`.
            pos_to_id: Part-of-speech lookup (newlines are stripped from the
                tag before the lookup).
            char_to_id: Character lookup; characters it does not know are
                mapped to its ``'UNKNOWN'`` entry.
            label_to_id: Label lookup.

        Returns:
            One ``[word_ids, case_ids, char_ids, pos_ids, label_ids]`` list
            per sentence.

        Raises:
            KeyError: If a POS tag or label is missing from its dictionary.
        """
        unknownIdx = word_to_id['UNK_TKN']

        dataset = []
        for sentence in sentences:
            word_indices = []
            char_indices = []
            case_indices = []
            label_indices = []
            pos_indices = []

            for word,char,pos,label in sentence:
                # Exact match first, then the lower-cased form, then UNK.
                if word in word_to_id:
                    word_index = word_to_id[word]
                elif word.lower() in word_to_id:
                    word_index = word_to_id[word.lower()]
                else:
                    word_index = unknownIdx

                char_index = []
                for x in char:
                    try:
                        char_index.append(char_to_id[x])
                    except KeyError:
                        char_index.append(char_to_id["UNKNOWN"])

                word_indices.append(word_index)
                case_indices.append(self.get_case_value(word, case_to_id))
                pos_indices.append(pos_to_id[pos.replace('\n','')])
                char_indices.append(char_index)
                label_indices.append(label_to_id[label])
            dataset.append([word_indices, case_indices, char_indices, pos_indices, label_indices])
        return dataset

    def addCharInformatioin(self,Sentences):
        """Insert the list of characters of each word into its entry.

        Every ``[word, pos, label]`` entry is replaced in place by
        ``[word, chars, pos, label]``.

        Args:
            Sentences: Output of :meth:`sentence_from_file`.

        Returns:
            The same (mutated) ``Sentences`` object.
        """
        for sentence in Sentences:
            for j,data in enumerate(sentence):
                sentence[j] = [data[0], list(data[0]), data[1], data[2]]
        return Sentences

    def padding(self,Sentences,maxlen=52):
        """Pad the character ids of every word to a fixed width.

        Args:
            Sentences: Output of :meth:`create_tensors`; element 2 of each
                sample (the per-word character ids) is replaced in place.
            maxlen: Number of character positions per word.  The default of
                52 matches the character input width used by the model in
                this file; longer words are cut by ``pad_sequences``.

        Returns:
            The same (mutated) ``Sentences`` object.
        """
        for sentence in Sentences:
            sentence[2] = pad_sequences(sentence[2], maxlen, padding='post')
        return Sentences

    def get_word_embeddings(self,list_sentences):
        """Load the embedding vectors of the words that occur in the data.

        Args:
            list_sentences: Sentences made of ``(token, chars, pos, label)``
                entries.

        Returns:
            ``(wd_em, wd_to_id)``: the embedding matrix and the token-to-row
            mapping.  Row 0 (``'PAD_TKN'``) is all zeros and row 1
            (``'UNK_TKN'``) is drawn uniformly from [-0.25, 0.25).
        """
        wd_to_id = {}
        wd_em = []

        words = {
            token.lower()
            for sentence in list_sentences
            for token, _char, _pos, _label in sentence
        }

        # Rewind so that the method can be called more than once on the
        # handle opened in __init__.
        self.word_embediings_model.seek(0)
        for line in self.word_embediings_model:
            if not line.strip():
                continue
            split = line.strip().split(" ")

            if not wd_to_id:
                # The vector size is only known once the first line is read.
                wd_to_id["PAD_TKN"] = len(wd_em)
                wd_em.append(np.zeros(len(split)-1))

                wd_to_id["UNK_TKN"] = len(wd_em)
                wd_em.append(np.random.uniform(-0.25, 0.25, len(split)-1))

            token = split[0]
            # Skipping tokens already present keeps ids and rows aligned.
            if token.lower() in words and token not in wd_to_id:
                wd_to_id[token] = len(wd_em)
                wd_em.append(np.array([float(num) for num in split[1:]]))

        wd_em = np.array(wd_em)
        return wd_em,wd_to_id

    def get_feature_dict(self,sentences):
        """Build the lookup dictionaries for the non-word features.

        Args:
            sentences: Sentences made of ``(token, chars, pos, label)``
                entries; only the labels are read.

        Returns:
            ``(cs_to_id, pos_to_id, ch_to_id, lb_to_id, id_to_lb)``.  Label
            ids follow the sorted order of the labels so that they are the
            same on every run.
        """
        labelSet = set()
        for sentence in sentences:
            for token,char,pos,label in sentence:
                labelSet.add(label)

        lb_to_id = {label: index for index, label in enumerate(sorted(labelSet))}
        id_to_lb = {v: k for k, v in lb_to_id.items()}

        ch_to_id = {"PADDING":0, "UNKNOWN":1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|øæðş":
            ch_to_id[c] = len(ch_to_id)

        cs_to_id = {
                'number': 0, 'lower':1, 'upper':2, 'title':3,
                'other':4, 'fraction':5, 'leters_digit': 6,
                'PAD_TKN':7
                }

        pos_to_id = {"$":0, "''":1, "(":2, ")":3, ",":4, "--":5, ".":6, ":":7, "CC":8, "CD":9, "DT":10,
                     "EX":11, "FW":12, "IN":13, "JJ":14, "JJR":15, "JJS":16, "LS":17, "MD":18, "NN":19,
                     "NNP":20, "NNPS":21, "NNS":22, "PDT":23, "POS":24, "PRP":25, "PRP$":26, "RB":27,
                     "RBR":28, "RBS":29, "RP":30, "SYM":31, "TO":32, "UH":33, "VB":34, "VBD":35, "VBG":36,
                     "VBN":37, "VBP":38, "VBZ":39, "WDT":40, "WP":41, "WP$":42, "WRB":43, "``":44}

        return cs_to_id,pos_to_id,ch_to_id,lb_to_id,id_to_lb

    def make_batch(self,dataset):
        """Run :meth:`createBatches` and remember the result on the instance.

        Returns:
            ``(self.batch, self.batch_len)``.
        """
        self.batch,self.batch_len = self.createBatches(dataset)
        return self.batch,self.batch_len

    def make_dataset(self,file_name):
        """Read one file and attach the character lists to its entries."""
        sentences = self.sentence_from_file(file_name)
        sentences = self.addCharInformatioin(sentences)
        return sentences

    def get_sentences(self,file_list):
        """Return the sentences of all files in ``file_list``, concatenated."""
        list_sentences = []
        for file_name in file_list:
            list_sentences.extend(self.make_dataset(file_name))
        return list_sentences
        

In [None]:
# Read the training files and load the embedding rows of the words they contain.
preprocess_obj = Preprocessing()
train_sentences = preprocess_obj.get_sentences(load_data_obj.train_files)
word_emb,word_to_id = preprocess_obj.get_word_embeddings(train_sentences)

'''the below function is not requred for validation data, we will load the dictionaries for validation'''
# Build the feature dictionaries from the training sentences, convert the
# sentences to padded index lists and group them by sentence length.
case_to_id,pos_to_id,char_to_id,label_to_id,id_to_label = preprocess_obj.get_feature_dict(train_sentences)
train_data_set = preprocess_obj.padding(preprocess_obj.create_tensors(train_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))
train_batch,train_batch_len = preprocess_obj.make_batch(train_data_set)

In [None]:
class DesignModel():
    """Build and train the character-CNN + bidirectional-LSTM tagger.

    The constructor reads the module-level objects ``word_emb``,
    ``case_to_id``, ``pos_to_id``, ``char_to_id``, ``label_to_id``,
    ``train_batch`` and ``train_batch_len``; they must exist before an
    instance is created.
    """

    def __init__(self,params):
        """Store the hyper-parameters and snapshot the prepared training data.

        Args:
            params: Dict of hyper-parameters.  The keys read by this class
                are ``dropout_rate``, ``kernel_sizes_cnn``, ``cnn_activation``
                (optional), ``rnn_activation``, ``units_lstm``, ``loss``,
                ``optimizer`` and ``epochs``.
        """
        self.model = None
        self.wd_em = word_emb
        # Identity matrices act as one-hot lookup tables for casing and POS.
        self.caseEmbeddings = np.identity(len(case_to_id), dtype='float32')
        self.posEmbeddings = np.identity(len(pos_to_id), dtype='float32')
        self.ch_to_id = char_to_id
        self.lb_to_id = label_to_id
        self.params = params
        self.train_batch = train_batch
        self.train_batch_len = train_batch_len

    def iterate_minibatches(self,dataset,batch_len):
        """Yield one mini-batch per entry of ``batch_len``.

        Args:
            dataset: Samples of the form
                ``[tokens, casing, chars, pos, labels]``.
            batch_len: Cumulative end offsets into ``dataset``; the slice
                between two consecutive offsets forms one batch.

        Yields:
            ``(labels, tokens, casing, chars, pos)`` as NumPy arrays; every
            label sequence gets a trailing axis of size 1.
        """
        start = 0
        for end in batch_len:
            tokens = []
            char = []
            labels = []
            casing = []
            pos_tags = []
            for t,c,ch,pos,l in dataset[start:end]:
                tokens.append(t)
                casing.append(c)
                char.append(ch)
                pos_tags.append(pos)
                labels.append(np.expand_dims(l,-1))
            start = end
            yield np.asarray(labels),np.asarray(tokens),np.asarray(casing), np.asarray(char), np.asarray(pos_tags)

    def BiRNN_model(self):
        """Create and compile the network and store it in ``self.model``.

        Four inputs (word ids, casing ids, character ids, POS ids) are
        embedded, concatenated and fed to a bidirectional LSTM followed by a
        per-token dense layer with one unit per label.
        """
        # Number of character positions per word; must match the width the
        # character ids were padded to.
        char_len = 52

        # Frozen pre-trained word vectors.
        words_input = Input(shape=(None,),dtype='int32')
        words = Embedding(input_dim=self.wd_em.shape[0], output_dim=self.wd_em.shape[1],  weights=[self.wd_em], trainable=False)(words_input)

        csng_input = Input(shape=(None,), dtype='int32')
        csng = Embedding(output_dim = self.caseEmbeddings.shape[1], input_dim = self.caseEmbeddings.shape[0], weights = [self.caseEmbeddings], trainable=False)(csng_input)

        # The convolution uses its own activation setting; when the key is
        # absent the RNN activation is used so older parameter dicts work.
        conv_activation = self.params.get('cnn_activation', self.params['rnn_activation'])

        char_input=Input(shape=(None,char_len,))
        embed_char_out=TimeDistributed(Embedding(len(self.ch_to_id),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)))(char_input)
        dropout= Dropout(self.params['dropout_rate'])(embed_char_out)
        conv1d_out = TimeDistributed(Conv1D(kernel_size=self.params['kernel_sizes_cnn'], filters=30, padding='same',activation=conv_activation, strides=1))(dropout)
        # Pooling over the whole word leaves one value per filter.
        maxpool_out=TimeDistributed(MaxPooling1D(char_len))(conv1d_out)
        char = TimeDistributed(Flatten())(maxpool_out)
        char = Dropout(self.params['dropout_rate'])(char)

        pos_input = Input(shape=(None,), dtype='int32')
        pos = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)

        output = concatenate([words, csng, char, pos])
        output = Bidirectional(LSTM(self.params['units_lstm'], return_sequences=True, dropout=self.params['dropout_rate'], recurrent_dropout=0.25))(output)
        output = TimeDistributed(Dense(len(self.lb_to_id), activation=self.params['rnn_activation']))(output)
        self.model = Model(inputs=[words_input, csng_input, char_input, pos_input], outputs=[output])
        self.model.compile(loss=self.params['loss'], optimizer=self.params['optimizer'],metrics=["accuracy"])

    def train_model(self):
        """Train ``self.model`` for ``params['epochs']`` epochs.

        After each epoch the metrics returned for the last batch are printed.
        :meth:`BiRNN_model` must have been called first.
        """
        # Use the batches held by this instance rather than a global object.
        n_batches = len(self.train_batch_len)

        for epoch in range(self.params['epochs']):

            print("Epoch %d/%d"%(epoch+1, self.params['epochs']))
            a = Progbar(n_batches)
            res = None
            for i,batch in enumerate(self.iterate_minibatches(self.train_batch,self.train_batch_len)):
                labels, tkns, csng, char, pos = batch
                res = self.model.train_on_batch([tkns, csng, char, pos], labels)
                # i + 1 batches are finished, so the bar reaches its target.
                a.update(i+1)
            print("\n")
            if res is not None:  # nothing to report when there were no batches
                print(self.model.metrics_names[0],":",res[0],self.model.metrics_names[1],":",res[1])
            print(' ')

In [None]:
# Hyper-parameters handed to DesignModel.  Several keys (text_size,
# model_name, batch_size, verbose, metrics) are not read by DesignModel in
# this file.
params = {
            "kernel_sizes_cnn": 3,
            "optimizer": "nadam",
            "cnn_activation":"tanh",
            "rnn_activation":"softmax",
            "units_lstm" : 100,
            "loss": "sparse_categorical_crossentropy",
            "text_size": 50,
            "dropout_rate": 0.5,
            "epochs": 100,
            "model_name": "cnn_model",
            "batch_size": 32,
            "verbose": True,
            "metrics":["accuracy"]
        }
# Build the network, then train it on the batches prepared above.
model_obj = DesignModel(params)
model_obj.BiRNN_model()
model_obj.train_model()

In [None]:
class LoadAndSaveModels():
    """Persist and restore the trained model and its lookup dictionaries."""

    def save_model(self,model,model_name):
        """Save ``model`` as ``Model_Data/entity_models/<model_name>.h5``.

        Args:
            model: Object exposing a ``save(path)`` method.
            model_name: File stem of the saved model.
        """
        import os

        target_dir = "Model_Data/entity_models/"
        # Create the folder on first use instead of failing inside save().
        os.makedirs(target_dir, exist_ok=True)
        model.save(target_dir+model_name+".h5")
        print("Model saved to Model folder.")

    def save_dict(self, save_path,dictionaries):
        """Pickle every object to ``<save_path>/<name>.txt``.

        Args:
            save_path: Target directory; created when it does not exist.
            dictionaries: Iterable of ``(object, name)`` pairs.
        """
        import os

        os.makedirs(save_path, exist_ok=True)
        for item in dictionaries:
            with open(save_path+"/"+item[1]+".txt", "wb") as myFile:
                pickle.dump(item[0], myFile)

        print("Files saved.")

    def load_dict(self,file):
        """Return the object pickled in ``file``.

        Unpickling can execute arbitrary code, so only load files that were
        written by a trusted source.
        """
        with open(file,"rb") as fp:
            loaded = pickle.load(fp)
        return loaded

    def load_model(self,model_name):
        """Load and return the model stored at path ``model_name``.

        The call inside resolves to the module-level ``load_model`` function,
        not to this method.
        """
        return load_model(model_name)


In [None]:
# Persist the trained model and every lookup dictionary needed at inference
# time; each dictionary is written to Model_Data/dict/<name>.txt.
load_save = LoadAndSaveModels()
load_save.save_model(model_obj.model,"birnn")
# (object, file stem) pairs.  The list is deliberately not called ``dict`` so
# the built-in type stays usable in the rest of the session.
dictionaries_to_save = [(word_to_id,"word_to_id"),(label_to_id,"label_to_id"),(char_to_id,"char_to_id"),
                        (id_to_label,"id_to_label"),(case_to_id,"case_to_id"),(pos_to_id,"pos_to_id")]
load_save.save_dict("Model_Data/dict",dictionaries_to_save)

In [None]:
# Reload the saved model and the pickled lookup dictionaries from disk; the
# loaded dictionaries replace the module-level names of the same name.
load_save = LoadAndSaveModels()
model = load_save.load_model("Model_Data/entity_models/birnn.h5")
word_to_id = load_save.load_dict("Model_Data/dict/word_to_id.txt")
case_to_id = load_save.load_dict("Model_Data/dict/case_to_id.txt")
pos_to_id = load_save.load_dict("Model_Data/dict/pos_to_id.txt")
char_to_id = load_save.load_dict("Model_Data/dict/char_to_id.txt")
label_to_id = load_save.load_dict("Model_Data/dict/label_to_id.txt")
id_to_label = load_save.load_dict("Model_Data/dict/id_to_label.txt")

In [None]:
# Convert the validation files with the dictionaries loaded above.
# Note: make_batch also overwrites preprocess_obj.batch and
# preprocess_obj.batch_len with the validation batches.
validation_sentences = preprocess_obj.get_sentences(load_data_obj.validation_files)
validation_set = preprocess_obj.padding(preprocess_obj.create_tensors(validation_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))
validation_batch,validation_batch_len = preprocess_obj.make_batch(validation_set)


In [None]:
class Prediction():
    """Run a trained tagger on prepared datasets or on raw sentences.

    The constructor snapshots the module-level lookup dictionaries
    (``case_to_id``, ``pos_to_id``, ``char_to_id``, ``label_to_id``,
    ``id_to_label``, ``word_to_id``), and :meth:`predict` uses the
    module-level ``preprocess_obj`` for preprocessing.
    """

    def __init__(self):
        self.case_to_id = case_to_id
        self.pos_to_id = pos_to_id
        self.char_to_id = char_to_id
        self.label_to_id = label_to_id
        self.id_to_label = id_to_label
        self.word_to_id = word_to_id

    def _predict_indices(self, model, tkns, csng, char, pos):
        """Return the arg-max label id of every token of one sentence."""
        # Each feature is wrapped in a list to form a batch of size one.
        inputs = [np.asarray([tkns]), np.asarray([csng]), np.asarray([char]), np.asarray([pos])]
        return model.predict(inputs, verbose=False)[0].argmax(axis=-1)

    def prediction(self,dataset,model):
        """Predict the labels of every sample of ``dataset``.

        Args:
            dataset: Samples of the form
                ``[tokens, casing, chars, pos, labels]``.
            model: Object exposing a Keras-style ``predict`` method.

        Returns:
            ``(predict_labels, correct_labels)``: per sample, the predicted
            label ids and the label ids stored in the sample.
        """
        correct_labels = []
        predict_labels = []
        b = Progbar(len(dataset))
        for i,data in enumerate(dataset):
            tkns, csng, char, pos, labels = data
            predict_labels.append(self._predict_indices(model, tkns, csng, char, pos))
            correct_labels.append(labels)
            # i + 1 samples are finished, so the bar reaches its target.
            b.update(i+1)
        return predict_labels, correct_labels

    def predict(self,sentence,model):
        """Tag a raw, whitespace-separated sentence.

        Args:
            sentence: Text to tag.
            model: Object exposing a Keras-style ``predict`` method.

        Returns:
            A list of ``(word, label)`` tuples with newlines removed from the
            label; an empty list when the sentence contains no words.
        """
        words_list = sentence.split()
        if not words_list:
            return []

        # 'POS' and 'O\n' are placeholders for the unknown tag and label;
        # create_tensors looks both up, so they must be keys of pos_to_id
        # and label_to_id respectively.
        sen_list = [[[word,'POS','O\n'] for word in words_list]]
        test_sent = preprocess_obj.addCharInformatioin(sen_list)

        test_set = preprocess_obj.padding(preprocess_obj.create_tensors(test_sent,self.word_to_id,
                                                                        self.case_to_id,self.pos_to_id,
                                                                        self.char_to_id,self.label_to_id))
        # There is exactly one sentence, hence exactly one sample.
        tokens, csng, char, pos, _labels = test_set[0]
        pred = self._predict_indices(model, tokens, csng, char, pos)

        return [(word, self.id_to_label[int(i)].replace("\n",""))
                for word, i in zip(words_list, pred)]

In [None]:
# Snapshot the currently loaded lookup dictionaries for inference.
pred_obj = Prediction()

In [None]:
# Tag one example sentence; the result is a list of (word, label) tuples.
sent = "Add Richard McNamara newest song to the Just Smile playlist"
entity_label = pred_obj.predict(sent,model)

In [None]:
# Bare expression: shows the predicted (word, label) pairs as the cell output.
entity_label

In [None]:
class Evaluate():
    """Chunk-level precision, recall and F1 for B-/I-prefixed label sequences."""

    def compute_precision(self,guessed_sentences, correct_sentences):
        """Return the share of guessed chunks that exactly match the reference.

        A chunk starts at a label whose first character is ``'B'`` and
        continues over the following labels whose first character is ``'I'``.
        It counts as correct when every label of the chunk equals the
        reference label and the reference chunk does not continue past it.

        Args:
            guessed_sentences: Label sequences whose chunks are counted.
            correct_sentences: Label sequences compared against; must have
                the same shape as ``guessed_sentences``.

        Returns:
            ``correct chunks / guessed chunks`` as a float, or ``0`` when
            there is no guessed chunk.
        """
        assert(len(guessed_sentences) == len(correct_sentences))
        chunk_total = 0
        chunk_hits = 0

        for guessed, correct in zip(guessed_sentences, correct_sentences):
            assert(len(guessed) == len(correct))
            size = len(guessed)
            pos = 0
            while pos < size:
                if guessed[pos][0] != 'B':
                    # Not the beginning of a chunk: move on.
                    pos += 1
                    continue

                chunk_total += 1
                first_matches = guessed[pos] == correct[pos]
                pos += 1
                if not first_matches:
                    continue

                # Walk over the continuation labels of the guessed chunk.
                matched = True
                while pos < size and guessed[pos][0] == 'I':
                    if guessed[pos] != correct[pos]:
                        matched = False
                    pos += 1

                # The reference chunk must end where the guessed one ends.
                if pos < size and correct[pos][0] == 'I':
                    matched = False

                if matched:
                    chunk_hits += 1

        if chunk_total > 0:
            return float(chunk_hits) / chunk_total
        return 0

    def get_metrics(self,predictions, correct, idx2Label):
        """Compute precision, recall and F1 from label-id sequences.

        Args:
            predictions: Predicted label ids, one sequence per sentence.
            correct: Reference label ids, same shape as ``predictions``.
            idx2Label: Mapping from label id to label text.

        Returns:
            ``(precision, recall, f1)``; recall is the precision computed
            with the two arguments swapped, and F1 is ``0`` when both are 0.
        """
        label_pred = [[idx2Label[element] for element in sentence]
                      for sentence in predictions]
        label_correct = [[idx2Label[element] for element in sentence]
                         for sentence in correct]

        prec = self.compute_precision(label_pred, label_correct)
        rec = self.compute_precision(label_correct, label_pred)

        f1 = 2.0 * prec * rec / (prec + rec) if (rec+prec) > 0 else 0

        return prec, rec, f1

In [None]:
# Score the loaded model on the training set and on the validation set.
eval_obj = Evaluate()

# Predict sentence by sentence, then compute chunk-level precision/recall/F1.
train_predict_labels, train_correct_labels = pred_obj.prediction(train_data_set,model)
pre_train, rec_train, f1_train= eval_obj.get_metrics(train_predict_labels, train_correct_labels, id_to_label)
print("Train-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f" % (pre_train, rec_train, f1_train))

# Same procedure for the validation data.
validation_predict_labels, validation_correct_labels = pred_obj.prediction(validation_set,model)
pre_test, rec_test, f1_test= eval_obj.get_metrics(validation_predict_labels, validation_correct_labels, id_to_label)
print("Validation-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f" % (pre_test, rec_test, f1_test))
