In [1]:
# install hugging face transformers and datasets library
!pip install -q transformers
!pip install -q datasets

[K     |████████████████████████████████| 3.4 MB 5.7 MB/s 
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
[K     |████████████████████████████████| 895 kB 46.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 37.1 MB/s 
[K     |████████████████████████████████| 596 kB 45.6 MB/s 
[K     |████████████████████████████████| 306 kB 5.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 43.7 MB/s 
[K     |████████████████████████████████| 243 kB 47.2 MB/s 
[K     |████████████████████████████████| 133 kB 46.1 MB/s 
[K     |████████████████████████████████| 160 kB 47.3 MB/s 
[K     |████████████████████████████████| 144 kB 44.4 MB/s 
[K     |████████████████████████████████| 271 kB 47.4 MB/s 
[?25h

### **Loading Datasets**

In [2]:
import re
import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 700)
from sklearn.utils import shuffle

In [3]:
# hugging face load_dataset module to import clinc_oos dataset
from datasets import load_dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = load_dataset('clinc_oos', 'plus')

Downloading:   0%|          | 0.00/2.75k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading and preparing dataset clinc_oos/plus (download: 2.39 MiB, generated: 1.18 MiB, post-processed: Unknown size, total: 3.57 MiB) to /root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1...


Downloading:   0%|          | 0.00/291k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset clinc_oos downloaded and prepared to /root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
df

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
})

In [17]:
train_df = shuffle(pd.DataFrame(df['train'])).reset_index(drop=True)
test_df = shuffle(pd.DataFrame(df['test'])).reset_index(drop=True)
val_df = shuffle(pd.DataFrame(df['validation'])).reset_index(drop=True)

In [18]:
train_df.shape, test_df.shape, val_df.shape

((15250, 2), (5500, 2), (3100, 2))

In [19]:
train_df.head()

Unnamed: 0,text,intent
0,"on a flight with jin air to jeju, how many car...",62
1,call mark,25
2,how much is the least i can pay for power bill,143
3,i'm out of bananas; add to shopping list,125
4,what's your designation,75


In [20]:
train_df['intent'].nunique(), val_df['intent'].nunique(), test_df['intent'].nunique()

(151, 151, 151)

#### Transform labels to one-hot encode format

In [21]:
from sklearn.preprocessing import LabelBinarizer
labelBinary = LabelBinarizer()

* Fit `labelBinary` to train and then transform test and validation dataset on that.

In [22]:
# train_df
train_labels = labelBinary.fit_transform(train_df['intent'])
train_text = train_df['text'].values

# test_df
test_labels = labelBinary.transform(test_df['intent'])
test_text = test_df['text'].values

* Save the `labelBinary` in pickle file. So, that we can use it later when we are going to test our model

In [23]:
import pickle

In [24]:
label_name = 'label-'+str(len(labelBinary.classes_))+'.pkl'
with open(label_name, 'wb') as handle:
    pickle.dump(labelBinary, handle, protocol=pickle.HIGHEST_PROTOCOL)

### **Download & Load Word Vectors for Embedding (FastText)**

* Download and extract word vectors FastText

In [25]:
# # %%time

# import zipfile
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
# zip_file = zipfile.ZipFile('crawl-300d-2M.vec.zip')
# zip_file.extractall()

* Load Word vectors in memory

In [26]:
from tqdm import tqdm
import codecs

print('loading word embeddings...')

embeddings_index = {}
f = codecs.open('/content/drive/MyDrive/data_science/embeddings/crawl-300d-2M.vec', encoding='utf-8')

for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('found %s word vectors' % len(embeddings_index))

loading word embeddings...


1999996it [04:24, 7565.53it/s]

found 1999996 word vectors





### **Preprocessing**



In [27]:
import tensorflow as tf

#### Keras `Tokenizer` to tokenize our sentences.

In [28]:
%%time
MAX_NB_WORDS = 100000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=MAX_NB_WORDS)

CPU times: user 23 µs, sys: 0 ns, total: 23 µs
Wall time: 26.7 µs


* Updating vocabulary on tokenizer using our `train_text` dataset

In [29]:
tokenizer.fit_on_texts(train_text)

* Transform our training set on sequences of integers

In [30]:
train_sequences = tokenizer.texts_to_sequences(train_text)

In [31]:
train_sequences[:3]

[[13, 7, 83, 43, 2952, 315, 2, 2953, 11, 51, 157, 435, 22, 698],
 [80, 1096],
 [11, 52, 10, 4, 1097, 3, 14, 79, 8, 1098, 67]]

In [32]:
print(train_text[0], train_sequences[0])

on a flight with jin air to jeju, how many carry ons are allowed [13, 7, 83, 43, 2952, 315, 2, 2953, 11, 51, 157, 435, 22, 698]


* Finding the length of token with maximum length. So that we can use that on padding. It will be dynamic padding.

In [33]:
def FindMaxLength(lst): 
    maxList = max(lst, key = lambda i: len(i)) 
    maxLength = len(maxList) 
    return maxLength

max_length = FindMaxLength(train_sequences)
print(max_length)

28


* Now the Sentences are mapped to lists of integers. However, we still cannot stack them together in a matrix since they have different lengths.
Hopefully Keras allows to **pad** sequences with **0s** to a maximum length.

In [34]:
padded_train_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences, maxlen=max_length)

* We can see that we have sentences with same length as we have given.

In [35]:
padded_train_sequences.shape

(15250, 28)

* Now, we will save `tokenizer` so, that we can use that on our testing.

In [36]:
token_name = 'token-'+str(max_length)+'.pkl'
with open(token_name, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

* Transform validation dataset same as training dataset for model training

In [37]:
test_sequences = tokenizer.texts_to_sequences(test_text)
padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=max_length)

In [38]:
padded_test_sequences.shape

(5500, 28)

### **Model Training**

* Building the [embedding matrix]( https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/) for the weights in Embedding Layer on training
* All words that aren't in the pre-trained model from fasttext would be changed to 0. the words are basically the ones with typos or names.

In [39]:
# save the dictionary in word_index
word_index = tokenizer.word_index

In [40]:
# embedding matrix
embed_dim = 300
print('preparing embedding matrix...')

words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index)+1)
embedding_matrix = np.zeros((nb_words, embed_dim))

for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 375


In [41]:
print("sample words not found: ", np.random.choice(words_not_found, 10))

sample words not found:  ["walgreen's" "sasha's" "that'd" "name's" "5th's" 'cancelt' "wife's"
 "lisa's" 'lisr' "couldn't"]


In [42]:
label_len = len(train_labels[0])
label_len

151

#### Model Architecture

In [43]:
def rnn_cnn_model():
    input = tf.keras.layers.Input(shape=(max_length, ))
    x = tf.keras.layers.Embedding(
        nb_words,
        embed_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=True)(input)
    x = tf.keras.layers.SpatialDropout1D(0.3)(x)
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(100, return_sequences=True))(x)
        # tf.keras.layers.GRU(100, return_sequences=True))(x)
    x = tf.keras.layers.Conv1D(
        64,
        kernel_size = 2,
        padding = "valid",
        kernel_regularizer=tf.keras.regularizers.l2(0.0005))(x)
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)
    concat_layer = tf.keras.layers.concatenate([avg_pool, max_pool])
    output = tf.keras.layers.Dense(
        label_len,
        activation="softmax")(concat_layer)
    model = tf.keras.models.Model(inputs=input, outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, 
                  metrics=['accuracy'])
    model.summary()
    return model

rnn_cnn_model = rnn_cnn_model()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 28)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 28, 300)      1656000     ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 28, 300)     0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 28, 200)      320800      ['spatial_dropout1d[0][0]']  

* Create `checkpoint` and `early_stopping`
    * change mode to min if we are going to monitor the loss


In [44]:
model_filepath = 'clinc_oos_model.h5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    model_filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max')

early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=4)

In [45]:
BATCH_SIZE = 264
epochs = 100

history = rnn_cnn_model.fit(
    padded_train_sequences, 
    train_labels, 
    # validation_split = 0.1, #0.1
    validation_data=(padded_test_sequences, test_labels), 
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, early_stopper],
    epochs=epochs, 
    verbose=1,
    shuffle=True)

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.36000, saving model to clinc_oos_model.h5
Epoch 2/100
Epoch 00002: val_accuracy improved from 0.36000 to 0.68236, saving model to clinc_oos_model.h5
Epoch 3/100
Epoch 00003: val_accuracy improved from 0.68236 to 0.75345, saving model to clinc_oos_model.h5
Epoch 4/100
Epoch 00004: val_accuracy improved from 0.75345 to 0.78727, saving model to clinc_oos_model.h5
Epoch 5/100
Epoch 00005: val_accuracy improved from 0.78727 to 0.79364, saving model to clinc_oos_model.h5
Epoch 6/100
Epoch 00006: val_accuracy improved from 0.79364 to 0.81273, saving model to clinc_oos_model.h5
Epoch 7/100
Epoch 00007: val_accuracy did not improve from 0.81273
Epoch 8/100
Epoch 00008: val_accuracy did not improve from 0.81273
Epoch 9/100
Epoch 00009: val_accuracy did not improve from 0.81273
Epoch 10/100
Epoch 00010: val_accuracy did not improve from 0.81273


### **Testing**

#### Evaluation

* Evaluate the model on validation dataset

In [46]:
val_text = val_df['text'].values
val_labels = labelBinary.transform(val_df['intent'].values)

In [48]:
val_sequences = tokenizer.texts_to_sequences(val_text)
padded_val_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    val_sequences, maxlen=max_length)

In [49]:
rnn_cnn_model.evaluate(padded_val_sequences, val_labels)



[0.5697658061981201, 0.8812903165817261]

#### Get predictions from saved model

In [50]:
import pandas as pd
import pickle

import tensorflow as tf

In [51]:
val_df.head()

Unnamed: 0,text,intent
0,my card isn't working because its destroyed,87
1,can you tell me when i should change my oil,123
2,can you find the exact address for where i am ...,136
3,please play the song that goes doo doo doo,106
4,how high is my spending limit on my wells farg...,41


In [52]:
test_text = val_df['text'].values
test_labels = val_df['intent'].values

* Load model, tokens and labels pickle file

In [53]:
model_path = '/content/clinc_oos_model.h5'
token_path = '/content/token-28.pkl'
label_path = '/content/label-151.pkl'

In [54]:
loaded_model = tf.keras.models.load_model(model_path)

with open(token_path, 'rb') as handle:
    loaded_token = pickle.load(handle)

with open(label_path, 'rb') as handle:
    loaded_label = pickle.load(handle)

* Tokenize our test sentences

In [55]:
max_length = 28
test_sequences = loaded_token.texts_to_sequences(test_text)
padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=max_length)

* Get predictions from our loaded model

In [56]:
predictions = loaded_model.predict(padded_test_sequences)
predicted_values = loaded_label.inverse_transform(predictions)

* Check `accuracy_score` and `classification_report`

In [57]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy: ', round(accuracy_score(test_labels, predicted_values)*100, 2))
print(classification_report(test_labels, predicted_values))

Accuracy:  89.19
              precision    recall  f1-score   support

           0       0.93      0.70      0.80        20
           1       1.00      0.60      0.75        20
           2       0.89      0.80      0.84        20
           3       0.95      0.95      0.95        20
           4       0.62      0.90      0.73        20
           5       0.90      0.90      0.90        20
           6       0.94      0.85      0.89        20
           7       0.94      0.85      0.89        20
           8       0.95      1.00      0.98        20
           9       0.89      0.80      0.84        20
          10       0.77      1.00      0.87        20
          11       0.82      0.90      0.86        20
          12       0.95      0.90      0.92        20
          13       0.86      0.90      0.88        20
          14       0.91      1.00      0.95        20
          15       1.00      0.90      0.95        20
          16       0.76      0.80      0.78        20
          