In [1]:
# install hugging face transformers and datasets library
!pip install -q transformers
!pip install -q datasets

[K     |████████████████████████████████| 3.4 MB 5.3 MB/s 
[K     |████████████████████████████████| 596 kB 45.4 MB/s 
[K     |████████████████████████████████| 67 kB 4.7 MB/s 
[K     |████████████████████████████████| 895 kB 42.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 36.0 MB/s 
[K     |████████████████████████████████| 306 kB 5.3 MB/s 
[K     |████████████████████████████████| 243 kB 45.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 42.3 MB/s 
[K     |████████████████████████████████| 133 kB 49.7 MB/s 
[K     |████████████████████████████████| 271 kB 24.6 MB/s 
[K     |████████████████████████████████| 144 kB 36.7 MB/s 
[K     |████████████████████████████████| 160 kB 43.7 MB/s 
[?25h

### **Loading Datasets**

In [2]:
import re
import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 700)
from sklearn.utils import shuffle

In [3]:
# hugging face load_dataset module to import clinc_oos dataset
from datasets import load_dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = load_dataset('banking77')

Downloading:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset banking77/default (download: 1.03 MiB, generated: 897.51 KiB, post-processed: Unknown size, total: 1.91 MiB) to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b...


Downloading:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset banking77 downloaded and prepared to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
df

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [7]:
train_df = shuffle(pd.DataFrame(df['train'])).reset_index(drop=True)
test_df = shuffle(pd.DataFrame(df['test'])).reset_index(drop=True)

In [8]:
train_df.shape, test_df.shape

((10003, 2), (3080, 2))

In [9]:
train_df.head()

Unnamed: 0,text,label
0,Can I order a new card to China?,9
1,Can I transfer my salary onto here?,50
2,Why is my purchase showing as pending?,45
3,How can I update my details since I moved?,30
4,Is there a charge for getting cash?,19


In [11]:
train_df['label'].nunique(), test_df['label'].nunique()

(77, 77)

#### Transform labels to one-hot encode format

In [12]:
from sklearn.preprocessing import LabelBinarizer
labelBinary = LabelBinarizer()

* Fit `labelBinary` to train dataset

In [13]:
# train_df
train_labels = labelBinary.fit_transform(train_df['label'])
train_text = train_df['text'].values

* Save the `labelBinary` in pickle file. So, that we can use it later when we are going to test our model

In [14]:
import pickle

In [15]:
label_name = 'label-'+str(len(labelBinary.classes_))+'.pkl'
with open(label_name, 'wb') as handle:
    pickle.dump(labelBinary, handle, protocol=pickle.HIGHEST_PROTOCOL)

### **Download & Load Word Vectors for Embedding (FastText)**

* Download and extract word vectors FastText

In [None]:
# # %%time

# import zipfile
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
# zip_file = zipfile.ZipFile('crawl-300d-2M.vec.zip')
# zip_file.extractall()

* Load Word vectors in memory

In [16]:
from tqdm import tqdm
import codecs

print('loading word embeddings...')

embeddings_index = {}
f = codecs.open('/content/drive/MyDrive/data_science/embeddings/crawl-300d-2M.vec', encoding='utf-8')

for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('found %s word vectors' % len(embeddings_index))

loading word embeddings...


1999996it [04:17, 7755.03it/s]

found 1999996 word vectors





### **Preprocessing**



In [17]:
import tensorflow as tf

#### Keras `Tokenizer` to tokenize our sentences.

In [18]:
MAX_NB_WORDS = 100000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=MAX_NB_WORDS)

* Updating vocabulary on tokenizer using our `train_text` dataset

In [19]:
tokenizer.fit_on_texts(train_text)

* Transform our training set on sequences of integers

In [20]:
train_sequences = tokenizer.texts_to_sequences(train_text)

In [21]:
train_sequences[:3]

[[10, 1, 155, 4, 66, 6, 3, 288],
 [10, 1, 21, 2, 349, 838, 209],
 [14, 7, 2, 143, 104, 75, 53]]

In [22]:
print(train_text[0], train_sequences[0])

Can I order a new card to China? [10, 1, 155, 4, 66, 6, 3, 288]


* Finding the length of token with maximum length. So that we can use that on padding. It will be dynamic padding.

In [23]:
def FindMaxLength(lst): 
    maxList = max(lst, key = lambda i: len(i)) 
    maxLength = len(maxList) 
    return maxLength

max_length = FindMaxLength(train_sequences)
print(max_length)

79


* Now the Sentences are mapped to lists of integers. However, we still cannot stack them together in a matrix since they have different lengths.
Hopefully Keras allows to **pad** sequences with **0s** to a maximum length.

In [24]:
padded_train_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences, maxlen=max_length)

* We can see that we have sentences with same length as we have given.

In [25]:
padded_train_sequences.shape

(10003, 79)

* Now, we will save `tokenizer` so, that we can use that on our testing.

In [26]:
token_name = 'token-'+str(max_length)+'.pkl'
with open(token_name, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### **Model Training**

* Building the [embedding matrix]( https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/) for the weights in Embedding Layer on training
* All words that aren't in the pre-trained model from fasttext would be changed to 0. the words are basically the ones with typos or names.

In [29]:
# save the dictionary in word_index
word_index = tokenizer.word_index

In [30]:
# embedding matrix
embed_dim = 300
print('preparing embedding matrix...')

words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index)+1)
embedding_matrix = np.zeros((nb_words, embed_dim))

for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 98


In [31]:
print("sample words not found: ", np.random.choice(words_not_found, 10))

sample words not found:  ["provider's" '€1' "beneficiary's" 'passocde' "where'd" "account's"
 "where'd" "wouldn't" "should've" 'interbanks']


In [32]:
label_len = len(train_labels[0])
label_len

77

#### Model Architecture

In [33]:
def rnn_cnn_model():
    input = tf.keras.layers.Input(shape=(max_length, ))
    x = tf.keras.layers.Embedding(
        nb_words,
        embed_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=True)(input)
    x = tf.keras.layers.SpatialDropout1D(0.3)(x)
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(100, return_sequences=True))(x)
    x = tf.keras.layers.Conv1D(
        64,
        kernel_size = 2,
        padding = "valid",
        kernel_regularizer=tf.keras.regularizers.l2(0.0005))(x)
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)
    concat_layer = tf.keras.layers.concatenate([avg_pool, max_pool])
    output = tf.keras.layers.Dense(
        label_len,
        activation="softmax")(concat_layer)
    model = tf.keras.models.Model(inputs=input, outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, 
                  metrics=['accuracy'])
    model.summary()
    return model

rnn_cnn_model = rnn_cnn_model()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 79)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 79, 300)      718800      ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 79, 300)     0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 79, 200)      241200      ['spatial_dropout1d[0][0]']  

* Create `checkpoint` and `early_stopping`
    * change mode to min if we are going to monitor the loss


In [34]:
model_filepath = 'banking77_model.h5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    model_filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max')

early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=4)

In [36]:
BATCH_SIZE = 264
epochs = 100

history = rnn_cnn_model.fit(
    padded_train_sequences, 
    train_labels, 
    validation_split = 0.2, #0.1
    # validation_data=(padded_test_sequences, test_labels), 
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, early_stopper],
    epochs=epochs, 
    verbose=1,
    shuffle=True)

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.25637, saving model to banking77_model.h5
Epoch 2/100
Epoch 00002: val_accuracy improved from 0.25637 to 0.58571, saving model to banking77_model.h5
Epoch 3/100
Epoch 00003: val_accuracy improved from 0.58571 to 0.73913, saving model to banking77_model.h5
Epoch 4/100
Epoch 00004: val_accuracy improved from 0.73913 to 0.80610, saving model to banking77_model.h5
Epoch 5/100
Epoch 00005: val_accuracy improved from 0.80610 to 0.83908, saving model to banking77_model.h5
Epoch 6/100
Epoch 00006: val_accuracy improved from 0.83908 to 0.85607, saving model to banking77_model.h5
Epoch 7/100
Epoch 00007: val_accuracy improved from 0.85607 to 0.86157, saving model to banking77_model.h5
Epoch 8/100
Epoch 00008: val_accuracy improved from 0.86157 to 0.86957, saving model to banking77_model.h5
Epoch 9/100
Epoch 00009: val_accuracy improved from 0.86957 to 0.87256, saving model to banking77_model.h5
Epoch 10/100
Epoch 00010: val_accuracy i

### **Testing**

#### Evaluation

* Evaluate the model on validation dataset

In [41]:
test_text = test_df['text'].values
test_labels = labelBinary.transform(test_df['label'].values)

In [42]:
test_sequences = tokenizer.texts_to_sequences(test_text)
padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=max_length)

In [43]:
rnn_cnn_model.evaluate(padded_test_sequences, test_labels)



[0.4947321116924286, 0.8827922344207764]

#### Get predictions from saved model

In [44]:
import pandas as pd
import pickle

import tensorflow as tf

In [45]:
test_df.head()

Unnamed: 0,text,label
0,I was looking to buy another card today.,39
1,When will my funds transfer?,66
2,I think someone got my card details and used i...,22
3,Do you happen to do exchanges of EUR?,36
4,how do i get a virtual card for one time use,37


In [46]:
test_text = test_df['text'].values
test_labels = test_df['label'].values

* Load model, tokens and labels pickle file

In [47]:
model_path = '/content/banking77_model.h5'
token_path = '/content/token-79.pkl'
label_path = '/content/label-77.pkl'

In [48]:
loaded_model = tf.keras.models.load_model(model_path)

with open(token_path, 'rb') as handle:
    loaded_token = pickle.load(handle)

with open(label_path, 'rb') as handle:
    loaded_label = pickle.load(handle)

* Tokenize our test sentences

In [49]:
max_length = 79
test_sequences = loaded_token.texts_to_sequences(test_text)
padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=max_length)

* Get predictions from our loaded model

In [50]:
predictions = loaded_model.predict(padded_test_sequences)
predicted_values = loaded_label.inverse_transform(predictions)

* Check `accuracy_score` and `classification_report`

In [51]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy: ', round(accuracy_score(test_labels, predicted_values)*100, 2))
print(classification_report(test_labels, predicted_values))

Accuracy:  88.28
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        40
           1       0.97      0.95      0.96        40
           2       0.98      1.00      0.99        40
           3       0.95      0.93      0.94        40
           4       0.95      0.90      0.92        40
           5       0.67      0.75      0.71        40
           6       0.97      0.93      0.95        40
           7       0.90      0.88      0.89        40
           8       1.00      0.93      0.96        40
           9       1.00      0.93      0.96        40
          10       0.89      0.80      0.84        40
          11       0.78      0.88      0.82        40
          12       0.85      0.88      0.86        40
          13       0.85      0.97      0.91        40
          14       0.71      0.85      0.77        40
          15       0.93      0.93      0.93        40
          16       0.78      0.90      0.84        40
          