### Loading Datasets

In [1]:
import re
import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 700)
from sklearn.utils import shuffle

In [2]:
# Mount Google Drive at /content/drive (Colab only). The GloVe vectors loaded
# later in this notebook are read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# `is_*` files: the train and val rows are shuffled with a fixed seed so the row
# order is reproducible; the test split keeps its on-disk order.
train = shuffle(pd.read_json('is_train.json'), random_state=0)
val = shuffle(pd.read_json('is_val.json'), random_state=0)
test = pd.read_json('is_test.json')

# `oos_*` files are read as-is, in the same order as before.
oos_train, oos_val, oos_test = (
    pd.read_json(path)
    for path in ('oos_train.json', 'oos_val.json', 'oos_test.json')
)

# (frame, display name) pairs consumed by the reporting loop that follows.
files = list(zip(
    (train, val, test, oos_train, oos_val, oos_test),
    ('train', 'val', 'test', 'oos_train', 'oos_val', 'oos_test'),
))
         
# Give every split the same column names and report its shape and the number of
# missing cells. The null count is taken from the frame being reported (`file`),
# not from `train`, so each printed line describes its own split.
for file, name in files:
    file.columns = ['text', 'intent']
    print(f'{name} shape:{file.shape}, {name} has {file.isna().sum().sum()} null values')

# Independent copy of the training frame used by the cells below.
in_train = train.copy()

train shape:(15000, 2), train has 0 null values
val shape:(3000, 2), val has 0 null values
test shape:(4500, 2), test has 0 null values
oos_train shape:(100, 2), oos_train has 0 null values
oos_val shape:(100, 2), oos_val has 0 null values
oos_test shape:(1000, 2), oos_test has 0 null values


In [4]:
in_train.shape

(15000, 2)

In [5]:
in_train.intent.value_counts()

flight_status                100
text                         100
change_accent                100
card_declined                100
account_blocked              100
todo_list                    100
mpg                          100
make_call                    100
timezone                     100
uber                         100
application_status           100
pto_used                     100
min_payment                  100
change_language              100
time                         100
reminder_update              100
gas                          100
restaurant_reservation       100
pto_request                  100
nutrition_info               100
rewards_balance              100
change_volume                100
thank_you                    100
do_you_have_pets             100
expiration_date              100
next_holiday                 100
replacement_card_duration    100
where_are_you_from           100
oil_change_when              100
food_last                    100
play_music

In [6]:
in_train['intent'].nunique(), val['intent'].nunique(), test['intent'].nunique()

(150, 150, 150)

In [7]:
# oss_plus_train = pd.concat([in_train, oos_train],axis=0).reset_index(drop=True)
# oss_plus_val = pd.concat([val,oos_val],axis=0).reset_index(drop=True)
# oss_plus_test = pd.concat([test,oos_test],axis=0).reset_index(drop=True)

In [8]:
# oss_plus_train['intent'].nunique(), oss_plus_val['intent'].nunique(), oss_plus_test['intent'].nunique()

In [9]:
# def binarize(df):
#     df.intent = np.where(df.intent!='oos',0,1)
#     return df

In [10]:
# oos_plus_train = binarize(pd.concat([in_train,oos_train],axis=0).reset_index(drop=True))
# oos_plus_val = binarize(pd.concat([val,oos_val],axis=0).reset_index(drop=True))
# oos_plus_test = binarize(pd.concat([test,oos_test],axis=0).reset_index(drop=True))

In [11]:
# oos_count = oos_plus_train.intent.value_counts()
# oos_count

In [12]:
# oss_plus_train.head()

In [13]:
from sklearn.preprocessing import LabelBinarizer
# Fit the label encoder on the training intents and one-hot encode them.
# `labels` has one row per training utterance; its columns follow the order of
# `labelBinary.classes_`.
labelBinary = LabelBinarizer()
 
labels = labelBinary.fit_transform(in_train['intent'])
# Raw utterances, row-aligned with `labels` (both come from `in_train`).
text = in_train['text']
# import numpy as np
# val = [
#        [0, 0, 1],
#        [1, 0, 0]
# ]
 
# lb.inverse_transform(np.asarray(val))

In [14]:
labelBinary.classes_

array(['accept_reservations', 'account_blocked', 'alarm',
       'application_status', 'apr', 'are_you_a_bot', 'balance',
       'bill_balance', 'bill_due', 'book_flight', 'book_hotel',
       'calculator', 'calendar', 'calendar_update', 'calories', 'cancel',
       'cancel_reservation', 'car_rental', 'card_declined', 'carry_on',
       'change_accent', 'change_ai_name', 'change_language',
       'change_speed', 'change_user_name', 'change_volume',
       'confirm_reservation', 'cook_time', 'credit_limit',
       'credit_limit_change', 'credit_score', 'current_location',
       'damaged_card', 'date', 'definition', 'direct_deposit',
       'directions', 'distance', 'do_you_have_pets', 'exchange_rate',
       'expiration_date', 'find_phone', 'flight_status', 'flip_coin',
       'food_last', 'freeze_account', 'fun_fact', 'gas', 'gas_type',
       'goodbye', 'greeting', 'how_busy', 'how_old_are_you',
       'improve_credit_score', 'income', 'ingredient_substitution',
       'ingredients_l

In [15]:
import pickle

# Persist the fitted label encoder so the inference section can map model
# outputs back to intent names without refitting.
labelpath = 'label.pkl'
with open(labelpath, 'wb') as handle:
    pickle.dump(labelBinary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
print(labels.shape, text.shape)

(15000, 150) (15000,)


In [None]:
labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### Transform validation dataset

In [17]:
test.head()

Unnamed: 0,text,intent
0,how would you say fly in italian,translate
1,what's the spanish word for pasta,translate
2,how would they say butter in zambia,translate
3,how do you say fast in spanish,translate
4,what's the word for trees in norway,translate


In [18]:
# NOTE(review): the "validation" arrays are built from the `test` split here,
# while the Testing section further down scores the model on `val`. The two
# hold-out splits are therefore swapped relative to their names -- confirm this
# is intended before comparing the reported metrics with other work.
val_labels = labelBinary.transform(test['intent'])
val_text = test['text']

In [19]:
val_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### **Download & Load Word Vectors for Embedding (Glove)**

* *Download and extract word vectors Glove*

In [None]:
# %%time

import zipfile
# Fetch the GloVe 840B/300d archive (about 2 GB) into the current working directory.
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
zip_file = zipfile.ZipFile('glove.840B.300d.zip')
# Extracts into the current working directory.
# NOTE(review): the loading cell below reads the vectors from a Google Drive
# path instead, so the extracted file presumably has to be copied there by
# hand -- confirm.
zip_file.extractall()

--2022-01-05 07:49:27--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2022-01-05 07:49:27--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2022-01-05 07:49:27--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

* *Load Word vectors in memory*

In [20]:
%%time

from tqdm.notebook import tqdm
import numpy as np 
 
def get_coefs(word, *arr):
    """Turn the tokens of one embedding-file line into a ``(word, vector)`` pair.

    Args:
        word: First token of the line, used as the vocabulary key.
        *arr: Remaining tokens, expected to be the vector components as strings.

    Returns:
        ``(word, vector)`` where ``vector`` is a float32 NumPy array, or
        ``(None, None)`` when the components cannot be converted to floats.
    """
    try:
        return word, np.asarray(arr, dtype='float32')
    except (ValueError, TypeError):
        # Only conversion failures mark a malformed line. Anything else
        # (KeyboardInterrupt, MemoryError, ...) must propagate to the caller.
        return None, None

embedding_path = '/content/drive/MyDrive/data_science/embeddings/glove.840B.300d.txt'
embed_size = 300

# Parse the file inside a context manager so the handle is closed as soon as
# every line has been consumed, and decode explicitly as UTF-8 instead of
# relying on the platform default encoding.
with open(embedding_path, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in tqdm(embedding_file)
    )

# Every line that `get_coefs` rejected collapsed onto the single key None.
# Remove it first so the shape check below only ever sees real arrays.
embeddings_index.pop(None, None)

# Discard vectors whose length is not `embed_size`. Iterate over a snapshot of
# the keys because the dict is modified inside the loop.
for k in tqdm(list(embeddings_index)):
    v = embeddings_index[k]
    if v.shape != (embed_size, ):
        del embeddings_index[k]

values = list(embeddings_index.values())
all_embs = np.stack(values)

# Global statistics over all kept components; used later to initialise the
# embedding matrix rows.
emb_mean, emb_std = all_embs.mean(), all_embs.std()

0it [00:00, ?it/s]

  0%|          | 0/2195885 [00:00<?, ?it/s]

CPU times: user 3min 19s, sys: 17.2 s, total: 3min 36s
Wall time: 3min 50s


### **Tokenizing the Dataset using keras**



* *Read the train and test file from directory and split the train dataset in train test split*

In [21]:
import tensorflow

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
 
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
 
from tensorflow.keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from tensorflow.keras.layers import Reshape, Flatten, Dropout, Concatenate
from tensorflow.keras.layers import SpatialDropout1D, concatenate
from tensorflow.keras.layers import GRU,LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import TimeDistributed
 
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers import Adam
 
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#### **Save Keras Tokenizer**

In [23]:
text.map(len).max()

136

In [24]:
%%time
# Vocabulary cap handed to the Keras Tokenizer. The same constant is reused
# later as the number of rows of the embedding matrix.
MAX_NB_WORDS = 10000
# Build the word index from the training utterances only.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text)

CPU times: user 226 ms, sys: 3.82 ms, total: 230 ms
Wall time: 236 ms


In [25]:
%%time
train_sequences = tokenizer.texts_to_sequences(text)

CPU times: user 163 ms, sys: 1.87 ms, total: 165 ms
Wall time: 166 ms


In [26]:
train_sequences[:3]

[[66, 795, 58, 83, 691, 10, 130, 73],
 [10, 55, 6, 155, 2, 20, 42, 5, 22, 401, 114, 48, 127],
 [8, 22, 4, 264, 15, 1426]]

In [27]:
print(text[0], train_sequences[0])

what expression would i use to say i love you if i were an italian [66, 795, 58, 83, 691, 10, 130, 73]


In [28]:
def FindMaxLength(lst):
    """Return the length of the longest element of ``lst``.

    Args:
        lst: Iterable of sized objects (e.g. token-id lists).

    Returns:
        The largest ``len(item)`` over all items, or ``0`` when ``lst`` is
        empty (previously an empty input raised ``ValueError``).
    """
    return max(map(len, lst), default=0)

MAX_LENGTH = FindMaxLength(train_sequences)

In [29]:
print(MAX_LENGTH)

28


* The sentences are now mapped to lists of integers. However, we still cannot stack them into a matrix because they have different lengths.
Fortunately, Keras can **pad** sequences with **0s** up to a maximum length. We'll set this length to `MAX_LENGTH` (28, the longest tokenized training sentence).

In [30]:
%%time
padded_train_sequences = pad_sequences(train_sequences, maxlen=MAX_LENGTH)

CPU times: user 79.6 ms, sys: 2 ms, total: 81.6 ms
Wall time: 94.3 ms


In [31]:
%%time
padded_train_sequences

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.44 µs


array([[   0,    0,    0, ...,   10,  130,   73],
       [   0,    0,    0, ...,  114,   48,  127],
       [   0,    0,    0, ...,  264,   15, 1426],
       ...,
       [   0,    0,    0, ...,   41,   85,   18],
       [   0,    0,    0, ...,   13,    4,  600],
       [   0,    0,    0, ...,  526,    1,   30]], dtype=int32)

In [32]:
%%time
padded_train_sequences.shape

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 10.5 µs


(15000, 28)

In [33]:
import pickle
 
# Persist the fitted tokenizer so inference uses exactly the same word index.
with open('token-28.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### **Use Saved keras tokenizer**

In [34]:
import pickle
# Reload the tokenizer written by the previous section.
# Caution: unpickling executes code from the file, so only load pickles that
# this notebook produced itself.
cat_token = 'token-28.pkl'
with open(cat_token, 'rb') as handle:
    load_token = pickle.load(handle)

In [35]:
text[2]

"what is the equivalent of, 'life is good' in french"

In [36]:
# Hard-coded to the value FindMaxLength returned for the training sequences
# above (28); update it if the training data or the tokenizer changes.
MAX_LENGTH = 28
# Re-encode the training utterances with the tokenizer reloaded from disk.
train_sequences = load_token.texts_to_sequences(text)
padded_train_sequences = pad_sequences(train_sequences, maxlen=MAX_LENGTH)

# Same encoding and padding for the hold-out utterances in `val_text`.
val_sequences = load_token.texts_to_sequences(val_text)
padded_val_sequences = pad_sequences(val_sequences, maxlen=MAX_LENGTH)

In [37]:
padded_train_sequences.shape

(15000, 28)

In [38]:
padded_val_sequences.shape

(4500, 28)

### **Training Model using Glove Embedding while using RNN & CNN**

In [39]:
%%time
# MAX_NB_WORDS = 2600
word_index = load_token.word_index
nb_words = MAX_NB_WORDS

# Start from random rows drawn with the mean/std of the pretrained vectors, so
# words without a pretrained vector keep a plausibly scaled initialisation.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# Overwrite each in-range row with its pretrained vector and count the words
# that have none.
oov = 0
for word, i in tqdm(word_index.items()):
    if i < MAX_NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            oov += 1
        else:
            embedding_matrix[i] = embedding_vector

print(oov)

  0%|          | 0/5229 [00:00<?, ?it/s]

332
CPU times: user 193 ms, sys: 2.08 ms, total: 195 ms
Wall time: 201 ms


* Loading the Layers and Embedding

In [40]:
len(labels[0])

150

In [41]:
labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [42]:
len(val_labels[0])

150

In [43]:
LABEL_VALUES = labels
LABEL_LEN = len(labels[0])
# MAX_LENGTH = 1155
# MAX_NB_WORDS = 1170

In [44]:
%%time
from tensorflow.keras.regularizers import l2

def get_rnn_cnn_model():
    """Build and compile the embedding -> BiGRU -> Conv1D -> pooled softmax classifier.

    Reads ``MAX_LENGTH``, ``MAX_NB_WORDS``, ``embedding_matrix`` and
    ``LABEL_LEN`` from the notebook's global scope.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy loss, Adam with
        learning rate 0.001, accuracy metric).
    """
    embedding_dim = 300

    token_ids = Input(shape=(MAX_LENGTH, ))
    embedded = Embedding(
        MAX_NB_WORDS,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=MAX_LENGTH,
        trainable=True,
    )(token_ids)
    embedded = SpatialDropout1D(0.3)(embedded)

    # Recurrent encoder followed by a width-2 convolution over its outputs.
    encoded = Bidirectional(GRU(100, return_sequences=True))(embedded)
    features = Conv1D(
        64,
        kernel_size=2,
        padding="valid",
        kernel_regularizer=l2(0.0005),
    )(encoded)

    # Summarise the sequence with both average and max pooling.
    pooled = concatenate([
        GlobalAveragePooling1D()(features),
        GlobalMaxPooling1D()(features),
    ])
    class_probs = Dense(LABEL_LEN, activation="softmax")(pooled)

    model = Model(inputs=token_ids, outputs=class_probs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return model

rnn_cnn_model = get_rnn_cnn_model()

CPU times: user 1.15 s, sys: 2.95 s, total: 4.1 s
Wall time: 6.83 s


* **Training and Saving the Model**

* *Change `mode` to `min` if the monitored quantity is a loss.*
* *If no validation data is used, the `ModelCheckpoint` has to monitor the training metric instead, i.e. `monitor='accuracy'`.*
* Validation data is passed to `fit` as `validation_data=(X_valid, y_valid)`.

In [45]:
# Keep only the weights of the epoch with the highest validation accuracy.
filepath = 'model.h5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True,
                             mode='max')
 
# Training hyper-parameters. `epochs` is an upper bound: the fit call below also
# installs an early-stopping callback.
BATCH_SIZE = 256
epochs = 100

In [46]:
# Checkpoint the best epoch and stop once validation accuracy has not improved
# for 3 consecutive epochs.
fit_callbacks = [
    checkpoint,
    tensorflow.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3),
]

history = rnn_cnn_model.fit(
    x=padded_train_sequences,
    y=LABEL_VALUES,
    validation_data=(padded_val_sequences, val_labels),
    batch_size=BATCH_SIZE,
    callbacks=fit_callbacks,
    epochs=epochs,
    verbose=1,
    shuffle=True,
)

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.23533, saving model to model.h5
Epoch 2/100
Epoch 00002: val_accuracy improved from 0.23533 to 0.67178, saving model to model.h5
Epoch 3/100
Epoch 00003: val_accuracy improved from 0.67178 to 0.81467, saving model to model.h5
Epoch 4/100
Epoch 00004: val_accuracy improved from 0.81467 to 0.86578, saving model to model.h5
Epoch 5/100
Epoch 00005: val_accuracy improved from 0.86578 to 0.88244, saving model to model.h5
Epoch 6/100
Epoch 00006: val_accuracy improved from 0.88244 to 0.89000, saving model to model.h5
Epoch 7/100
Epoch 00007: val_accuracy improved from 0.89000 to 0.90022, saving model to model.h5
Epoch 8/100
Epoch 00008: val_accuracy improved from 0.90022 to 0.90200, saving model to model.h5
Epoch 9/100
Epoch 00009: val_accuracy improved from 0.90200 to 0.90311, saving model to model.h5
Epoch 10/100
Epoch 00010: val_accuracy improved from 0.90311 to 0.90978, saving model to model.h5
Epoch 11/100
Epoch 00011: val_ac

### **Loading the Tokenizer and Model**

In [47]:
from keras.models import load_model
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import pickle

In [48]:
modelPath = 'model.h5'
tokenPath = 'token-28.pkl'
labelPath = 'label.pkl'

In [49]:
# Reload the three artefacts written during training: the checkpointed model,
# the tokenizer and the label encoder.
loadModel= load_model(modelPath)

# Caution: unpickling executes code from the file, so only load pickles that
# this notebook produced itself.
with open(tokenPath, 'rb') as handle:
    loadToken = pickle.load(handle)

with open(labelPath, 'rb') as handle:
    loadLabel = pickle.load(handle)


In [50]:
LABEL = loadLabel.classes_

In [51]:
LABEL

array(['accept_reservations', 'account_blocked', 'alarm',
       'application_status', 'apr', 'are_you_a_bot', 'balance',
       'bill_balance', 'bill_due', 'book_flight', 'book_hotel',
       'calculator', 'calendar', 'calendar_update', 'calories', 'cancel',
       'cancel_reservation', 'car_rental', 'card_declined', 'carry_on',
       'change_accent', 'change_ai_name', 'change_language',
       'change_speed', 'change_user_name', 'change_volume',
       'confirm_reservation', 'cook_time', 'credit_limit',
       'credit_limit_change', 'credit_score', 'current_location',
       'damaged_card', 'date', 'definition', 'direct_deposit',
       'directions', 'distance', 'do_you_have_pets', 'exchange_rate',
       'expiration_date', 'find_phone', 'flight_status', 'flip_coin',
       'food_last', 'freeze_account', 'fun_fact', 'gas', 'gas_type',
       'goodbye', 'greeting', 'how_busy', 'how_old_are_you',
       'improve_credit_score', 'income', 'ingredient_substitution',
       'ingredients_l

### **Testing**

In [None]:
# oss_plus_val.head()

In [52]:
val.head()

Unnamed: 0,text,intent
311,can you check on the status of my credit card ...,application_status
1025,can you slow down the rate at which you talk,change_speed
1587,my card is impaired and i can't use it,damaged_card
2941,what's the traffic like right now for my route...,traffic
2980,is there a reason my card was declined,card_declined


In [53]:
val.shape

(3000, 2)

In [54]:
# Evaluation inputs. Despite the `test_` prefix they come from the `val` split;
# the `test` split was used as validation data during training.
test_label = val['intent'].values
sentence = val['text'].values

In [55]:
# Tokenise and pad the evaluation sentences exactly as during training
# (same pickled tokenizer, same sequence length of 28).
maxLength = 28
test_sequences = loadToken.texts_to_sequences(sentence)
padded_test_sequences = pad_sequences(test_sequences, maxlen=maxLength)

In [56]:
# Prediction: one row of class scores per evaluation sentence.
pred_ls = loadModel.predict(padded_test_sequences)
# print(type(pred_ls))
# Map each score row back to an intent name with the fitted label encoder
# (presumably the highest-scoring class per row -- see the scikit-learn docs).
category = loadLabel.inverse_transform(pred_ls)
# score = ('{:.2f}'.format(round(np.max(pred_ls), 2)*100))

In [57]:
category

array(['application_status', 'change_speed', 'damaged_card', ...,
       'do_you_have_pets', 'redeem_rewards', 'update_playlist'],
      dtype='<U25')

In [58]:
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

# Reference scores for comparison -- DeepAligned: NMI: 93.86, ARI: 79.75, ACC: 86.49
# All three metrics are printed as percentages rounded to two decimals.
print('ACC : ', round(accuracy_score(test_label, category)*100, 2))
print('ARI : ', round(adjusted_rand_score(test_label, category)*100, 2))
print('NMI : ', round(normalized_mutual_info_score(test_label, category)*100, 2))

ACC :  91.53
ARI :  83.58
NMI :  93.85


In [None]:
# result = []
# for i in pred_ls:
#     if i > 0.5:
#         result.append(1)
#     else:
#         result.append(0)

### **Accuracy, ROC AUC, Confusion Matrix and Classification Report**

In [59]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import matthews_corrcoef
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
sns.set(rc={'figure.facecolor':'white'})

# Inputs for the confusion matrix / report cells.
TEST_LABEL = test_label
PREDICTED_LABEL = category
# NOTE(review): this rebinds `labels`, which earlier held the one-hot training
# matrix (the object LABEL_VALUES was taken from), to the class-name array.
# Re-running the training cells after this one would pick up the wrong object.
labels = LABEL

# accuracy = accuracy_score(TEST_LABEL, PREDICTED_LABEL)
# conf_mat = confusion_matrix(TEST_LABEL, PREDICTED_LABEL)
# fig, ax = plt.subplots(figsize=(30,27))
# sns.heatmap(conf_mat, annot=True, fmt='d',
#             xticklabels=labels, yticklabels=labels)
# plt.ylabel('Actual Label')
# # plt.xlabel('Predicted Label\n\nAccuracy={:0.4f}; ROC AUC={:0.4f}'.format(accuracy, roc_score))
# plt.xlabel('Predicted Label\n\nAccuracy={:0.4f}'.format(accuracy))
# # plt.savefig('/content/drive/My Drive/Lafarge/Model/Hardware/HardwareModel-Confusion-v1.5.jpg')
# # plt.savefig('HardwareModel-Confusion-v1.5.jpg')
# plt.xticks(rotation=90)
# plt.show()

In [60]:
# Pass `labels` explicitly so every report row is tied to the class it names.
# With `target_names` alone the names are matched positionally against whatever
# classes occur in the data, which misaligns (or fails) if a class is absent.
print(classification_report(test_label, category, labels=LABEL, target_names=LABEL))

                           precision    recall  f1-score   support

      accept_reservations       1.00      0.75      0.86        20
          account_blocked       0.89      0.80      0.84        20
                    alarm       0.91      1.00      0.95        20
       application_status       0.95      0.90      0.92        20
                      apr       1.00      1.00      1.00        20
            are_you_a_bot       0.95      0.95      0.95        20
                  balance       0.89      0.80      0.84        20
             bill_balance       0.88      0.75      0.81        20
                 bill_due       0.86      0.95      0.90        20
              book_flight       1.00      1.00      1.00        20
               book_hotel       1.00      0.90      0.95        20
               calculator       0.87      1.00      0.93        20
                 calendar       0.86      0.90      0.88        20
          calendar_update       1.00      0.90      0.95     