In [None]:
import json
import numpy as np 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Location of the generated intent-training file on the mounted Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/VICCI/data/generated_train_data.json'

# JSON is UTF-8 by specification; pin the encoding so the load does not depend
# on the platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file:
    training_data = json.load(file)

In [None]:
# Flatten the training file into two parallel lists: one entry per query text
# and, at the same position, the intent of the record that query came from.
queries, intents = [], []
for train_set in training_data:
    queries.extend(train_set['query'])
    intents.extend(train_set['intent'] for _ in train_set['query'])

In [None]:
# 70/30 train/validation split, stratified on the intent labels and seeded so
# the partition is reproducible.
queries_train, queries_cv, intents_train, intents_cv = train_test_split(
    queries,
    intents,
    train_size=0.7,
    random_state=123,
    stratify=intents,
)

In [None]:
# Learn the intent -> integer id mapping from the training labels only.
# `fit` returns the encoder itself, so the cell still displays it.
lbl_encoder = LabelEncoder()
lbl_encoder.fit(y=intents_train)

LabelEncoder()

In [None]:
# Number of distinct intents seen by the encoder; sizes the softmax output layer.
num_classes = len(lbl_encoder.classes_)

In [None]:
# Replace the intent labels with their integer ids for both partitions.
# NOTE: this rebinds `intents_train` / `intents_cv` to the encoded arrays, so
# re-running this cell without re-running the split cell feeds already-encoded
# labels back into the encoder.
intents_train, intents_cv = (
    lbl_encoder.transform(intents_train),
    lbl_encoder.transform(intents_cv),
)

In [None]:
# Text-pipeline hyperparameters shared by the tokenizer, padding and embedding cells.
vocab_size = 2500        # tokenizer `num_words` and number of embedding rows
embedding_dim = 200      # width of each embedding vector
max_len = 20             # every query is padded/truncated to this many tokens
oov_token = '<OOV>'      # placeholder token for out-of-vocabulary words

In [None]:
# Build the vocabulary from the training queries only, so the validation split
# stays unseen by the tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, lower=True, oov_token=oov_token)
tokenizer.fit_on_texts(queries_train)
word_index = tokenizer.word_index

# Training queries -> fixed-length, post-padded integer sequences.
sequences = tokenizer.texts_to_sequences(queries_train)
padded_train_sequences = pad_sequences(sequences, padding='post', truncating='post', maxlen=max_len)

# Validation queries must be padded exactly like the training (and inference)
# inputs. `pad_sequences` defaults to padding='pre', so the padding side is
# passed explicitly here as well.
sequences = tokenizer.texts_to_sequences(queries_cv)
padded_cv_sequences = pad_sequences(sequences, padding='post', truncating='post', maxlen=max_len)

In [None]:
# Pre-trained GloVe vectors file used to seed the embedding layer.
glove_path = "/content/drive/MyDrive/Colab Notebooks/models/glove.6B.200d.txt"

In [None]:
# Parse the GloVe text file into {word: float32 vector}.
# Each line is whitespace-separated: the first field is the word, the rest are
# the vector components.
embeddings_index = dict()
# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default locale encoding.
with open(glove_path, encoding='utf-8') as gfile:
    for line in gfile:
        values = line.split()
        if not values:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        word, vectors = values[0], np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vectors

In [None]:
# Row i holds the pre-trained vector for the word with tokenizer index i.
# Rows stay all-zero for index 0 (padding) and for words missing from GloVe.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    # `word_index` lists every word seen during fitting and is not capped by
    # `num_words`, so indices can exceed the matrix height. Such words are
    # mapped to the OOV token by `texts_to_sequences` and need no row.
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Bag-of-embeddings classifier: frozen pre-trained embeddings are averaged over
# the sequence, then passed through one hidden layer to a softmax over intents.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len,
              mask_zero=True, weights=[embedding_matrix], trainable=False),
    GlobalAveragePooling1D(),
    # Dense(32, activation='relu'),
    # Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax'),
])

# Labels are integer ids, hence the sparse variant of the cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 200)           500000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                12864     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 18)                1170      
Total params: 514,034
Trainable params: 14,034
Non-trainable params: 500,000
_________________________________________________________________


In [None]:
# Recurrent classifier: the pre-trained embeddings are fine-tuned and fed to a
# single LSTM layer, followed by a small hidden layer and a softmax over intents.
# NOTE: this rebinds `model`, replacing whatever model an earlier cell assigned.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len,
              mask_zero=True, weights=[embedding_matrix], trainable=True),
    LSTM(32, activation='relu', recurrent_dropout=0.2, dropout=0.2),
    # GlobalAveragePooling1D(),
    # Dense(32, activation='relu'),
    # Dropout(0.3),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

# Labels are integer ids, hence the sparse variant of the cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 200)           500000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                29824     
_________________________________________________________________
dense_6 (Dense)              (None, 16)                528       
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 18)                306       
Total params: 530,658
Trainable params: 530,658
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train whichever `model` is currently bound, evaluating on the validation
# split after each epoch; verbose=2 prints one summary line per epoch.
epochs = 20
history = model.fit(
    padded_train_sequences,
    intents_train,
    epochs=epochs,
    validation_data=(padded_cv_sequences, intents_cv),
    verbose=2,
)

Epoch 1/20
20/20 - 6s - loss: 2.8663 - accuracy: 0.0746 - val_loss: 2.7940 - val_accuracy: 0.2037
Epoch 2/20
20/20 - 3s - loss: 2.7073 - accuracy: 0.2190 - val_loss: 2.5634 - val_accuracy: 0.3333
Epoch 3/20
20/20 - 3s - loss: 2.4006 - accuracy: 0.3270 - val_loss: 2.1737 - val_accuracy: 0.4630
Epoch 4/20
20/20 - 3s - loss: 1.9807 - accuracy: 0.4127 - val_loss: 1.6839 - val_accuracy: 0.6185
Epoch 5/20
20/20 - 3s - loss: 1.6150 - accuracy: 0.5127 - val_loss: 1.2901 - val_accuracy: 0.7037
Epoch 6/20
20/20 - 3s - loss: 1.3225 - accuracy: 0.5984 - val_loss: 1.0380 - val_accuracy: 0.7778
Epoch 7/20
20/20 - 3s - loss: 1.0633 - accuracy: 0.7095 - val_loss: 0.8006 - val_accuracy: 0.8407
Epoch 8/20
20/20 - 3s - loss: 0.9236 - accuracy: 0.7317 - val_loss: 0.6442 - val_accuracy: 0.9259
Epoch 9/20
20/20 - 3s - loss: 0.7866 - accuracy: 0.7619 - val_loss: 0.4834 - val_accuracy: 0.9370
Epoch 10/20
20/20 - 3s - loss: 0.6953 - accuracy: 0.7937 - val_loss: 0.4148 - val_accuracy: 0.9481
Epoch 11/20
20/20 -

In [None]:
# Hand-written probe queries used to spot-check the trained classifier.
inputs = [
    "what are the tests available for covid?",
    "bye",
    "after how much time do I see the symptoms?",
    "That's great.",
    "how do i protect myself?",
    "what is covid-19?",
    "ok. what are the vaccines available?",
    "i am looking for vaccination. i need help",
    "how many people have suffered?",
]

In [None]:
# Classify each probe query and print: query - predicted intent - probability.
for inp in inputs:
    # Same preprocessing as the training data: tokenize, then post-pad/truncate.
    encoded = tokenizer.texts_to_sequences([inp])
    padded = pad_sequences(encoded, padding='post', truncating='post', maxlen=max_len)
    result = model.predict(padded)
    # `result` holds a single row, so the flat argmax is the class index.
    best = np.argmax(result)
    tag = lbl_encoder.inverse_transform([best])[0]
    print(inp," - ",tag," - ",result[0][best])

what are the tests available for covid?  -  covid_tests  -  0.9923597
bye  -  bye  -  1.0
after how much time do I see the symptoms?  -  development_period  -  0.9616318
That's great.  -  thanks  -  0.99854016
how do i protect myself?  -  protection  -  0.999724
what is covid-19?  -  definition_covid  -  0.7742823
ok. what are the vaccines available?  -  covid_vaccine  -  0.991459
i am looking for vaccination. i need help  -  vaccination_slot  -  0.9740002
how many people have suffered?  -  risk_people  -  0.9580922


In [None]:
# Re-run of the probe-query check against whichever `model` is currently bound.
# Prints: query - predicted intent - probability.
for inp in inputs:
    # Same preprocessing as the training data: tokenize, then post-pad/truncate.
    encoded = tokenizer.texts_to_sequences([inp])
    padded = pad_sequences(encoded, padding='post', truncating='post', maxlen=max_len)
    result = model.predict(padded)
    # `result` holds a single row, so the flat argmax is the class index.
    best = np.argmax(result)
    tag = lbl_encoder.inverse_transform([best])[0]
    print(inp," - ",tag," - ",result[0][best])

what are the tests available for covid?  -  covid_tests  -  0.98211676
bye  -  bye  -  0.9862748
after how much time do I see the symptoms?  -  development_period  -  0.99999905
That's great.  -  thanks  -  0.984144
how do i protect myself?  -  protection  -  0.9787177
what is covid-19?  -  definition_covid  -  0.6995017
ok. what are the vaccines available?  -  covid_vaccine  -  0.99892527
i am looking for vaccination. i need help  -  vaccination_slot  -  0.924329
how many people have suffered?  -  covid_numbers  -  0.38376588
