### Loading Datasets

In [2]:
import re
import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 700)
from sklearn.utils import shuffle

In [3]:
import matplotlib.pyplot as plt

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# Load the `is_*` and `oos_*` JSON splits from data/.
# Train and val are shuffled with a fixed seed so the row order is reproducible.
train = shuffle(pd.read_json('data/is_train.json'), random_state=0)
val = shuffle(pd.read_json('data/is_val.json'), random_state=0)
test = pd.read_json('data/is_test.json')
oos_train = pd.read_json('data/oos_train.json')
oos_val = pd.read_json('data/oos_val.json')
oos_test = pd.read_json('data/oos_test.json')

# (frame, display name) pairs so every split is renamed and reported the same way.
files = [
    (train, 'train'),
    (val, 'val'),
    (test, 'test'),
    (oos_train, 'oos_train'),
    (oos_val, 'oos_val'),
    (oos_test, 'oos_test'),
]

for file, name in files:
    # Rename in place so every split exposes the same two columns.
    file.columns = ['text', 'intent']
    # Count nulls in the split being reported, not in `train` every time.
    print(f'{name} shape:{file.shape}, {name} has {file.isna().sum().sum()} null values')
# Work on a copy so the frame loaded into `train` stays untouched by later cells.
in_train = train.copy()

train shape:(15000, 2), train has 0 null values
val shape:(3000, 2), val has 0 null values
test shape:(4500, 2), test has 0 null values
oos_train shape:(100, 2), oos_train has 0 null values
oos_val shape:(100, 2), oos_val has 0 null values
oos_test shape:(1000, 2), oos_test has 0 null values


#### Merging the oos intent into train, val and test

In [6]:
# Append the oos rows to each split; ignore_index renumbers the rows 0..n-1.
in_train = pd.concat([in_train, oos_train], axis=0, ignore_index=True)
val = pd.concat([val, oos_val], axis=0, ignore_index=True)
test = pd.concat([test, oos_test], axis=0, ignore_index=True)

In [7]:
# Shuffle every split with the same fixed seed and renumber the rows,
# so the appended oos rows are not left grouped at the end.
in_train, val, test = (
    shuffle(split, random_state=0).reset_index(drop=True)
    for split in (in_train, val, test)
)

In [8]:
in_train.shape

(15100, 2)

In [9]:
in_train.intent.value_counts()

redeem_rewards               100
who_made_you                 100
spending_history             100
pay_bill                     100
time                         100
goodbye                      100
rollover_401k                100
order_status                 100
pto_request                  100
calculator                   100
tire_change                  100
measurement_conversion       100
pto_request_status           100
rewards_balance              100
vaccines                     100
calendar                     100
what_is_your_name            100
reminder                     100
bill_balance                 100
account_blocked              100
spelling                     100
timezone                     100
freeze_account               100
last_maintenance             100
do_you_have_pets             100
gas_type                     100
jump_start                   100
w2                           100
oil_change_how               100
calories                     100
thank_you 

In [10]:
in_train['intent'].nunique(), val['intent'].nunique(), test['intent'].nunique()

(151, 151, 151)

In [11]:
# oss_plus_train = pd.concat([in_train, oos_train],axis=0).reset_index(drop=True)
# oss_plus_val = pd.concat([val,oos_val],axis=0).reset_index(drop=True)
# oss_plus_test = pd.concat([test,oos_test],axis=0).reset_index(drop=True)

In [12]:
# oss_plus_train['intent'].nunique(), oss_plus_val['intent'].nunique(), oss_plus_test['intent'].nunique()

In [13]:
# def binarize(df):
#     df.intent = np.where(df.intent!='oos',0,1)
#     return df

In [14]:
# oos_plus_train = binarize(pd.concat([in_train,oos_train],axis=0).reset_index(drop=True))
# oos_plus_val = binarize(pd.concat([val,oos_val],axis=0).reset_index(drop=True))
# oos_plus_test = binarize(pd.concat([test,oos_test],axis=0).reset_index(drop=True))

In [15]:
# oos_count = oos_plus_train.intent.value_counts()
# oos_count

In [16]:
# oss_plus_train.head()

#### Loading LabelBinarizer for One Hot Encoding

In [17]:
from sklearn.preprocessing import LabelBinarizer
# Fitted on the training intents only; the same instance is reused for val and test.
labelBinary = LabelBinarizer()

# One indicator column per intent class (column order is labelBinary.classes_).
train_labels = labelBinary.fit_transform(in_train['intent'])
train_sentences = in_train['text'].values
# Scratch example (not run): mapping indicator rows back to labels with a
# fitted binarizer called `lb`.
# import numpy as np
# val = [
#        [0, 0, 1],
#        [1, 0, 0]
# ]

# lb.inverse_transform(np.asarray(val))

In [18]:
labelBinary.classes_

array(['accept_reservations', 'account_blocked', 'alarm',
       'application_status', 'apr', 'are_you_a_bot', 'balance',
       'bill_balance', 'bill_due', 'book_flight', 'book_hotel',
       'calculator', 'calendar', 'calendar_update', 'calories', 'cancel',
       'cancel_reservation', 'car_rental', 'card_declined', 'carry_on',
       'change_accent', 'change_ai_name', 'change_language',
       'change_speed', 'change_user_name', 'change_volume',
       'confirm_reservation', 'cook_time', 'credit_limit',
       'credit_limit_change', 'credit_score', 'current_location',
       'damaged_card', 'date', 'definition', 'direct_deposit',
       'directions', 'distance', 'do_you_have_pets', 'exchange_rate',
       'expiration_date', 'find_phone', 'flight_status', 'flip_coin',
       'food_last', 'freeze_account', 'fun_fact', 'gas', 'gas_type',
       'goodbye', 'greeting', 'how_busy', 'how_old_are_you',
       'improve_credit_score', 'income', 'ingredient_substitution',
       'ingredients_l

In [19]:
import os
import pickle

# Persist the fitted LabelBinarizer in the same directory as the model checkpoints.
labelpath = 'model/3_clinc_oss_BERT_with_oos/label.pkl'
# open(..., 'wb') does not create missing parent directories, so make them first.
os.makedirs(os.path.dirname(labelpath), exist_ok=True)
with open(labelpath, 'wb') as handle:
    pickle.dump(labelBinary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
print(train_labels.shape, train_sentences.shape)
# train_labels.shape

(15100, 151) (15100,)


In [21]:
train_sentences

array(['is it possible for you to use the whisper voice',
       'search for a macaroni and cheese recipe', 'correct', ...,
       'what kind of mpg does this car get in the city', 'roll dice',
       'the tasks for today, what are they'], dtype=object)

In [22]:
train_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### Transform validation dataset

In [23]:
val.head()

Unnamed: 0,text,intent
0,how much do i need to pay in tax,taxes
1,forget reservation at longhorn for 4,cancel_reservation
2,is rice bad after 3 days in fridge,food_last
3,i want to change your name to sarah,change_ai_name
4,what do i get paid,income


In [24]:
# transform (not fit_transform): reuse the class order learned from the training intents.
val_labels = labelBinary.transform(val['intent'])
val_sentences = val['text'].values

In [25]:
val_sentences

array(['how much do i need to pay in tax',
       'forget reservation at longhorn for 4',
       'is rice bad after 3 days in fridge', ...,
       'can i travel to france as far as safety goes',
       'read my reminder list to me please', 'create a reminder'],
      dtype=object)

In [26]:
val_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### BERT

In [27]:
!pip install -U tensorflow
!pip install -U tensorflow_hub
!pip -q install tensorflow_text



In [28]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

#### Loading BERT preprocessor and encoder

In [28]:
# Load the BERT text preprocessor and the uncased encoder
# (L-12, H-768, A-12 per the hub handle) as Keras layers from TF Hub.
BERT_PREPROCESSOR = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
BERT_ENCODER = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [106]:
# Sample sentences that all contain the word "bank"; they are embedded in the
# following cells and compared with cosine similarity.
text_1 = 'agra is a bank of yamuna'
text_2 = 'I went to the bank for withdrawal'
text_3 = 'the state bank of india is closed'
# Unused extra samples:
# text_4 = 'jupyter is a notebook'
# text_5 = 'anaconda is a big snake'
# text_6 = 'python is a oop language'

In [107]:
# Run the preprocessor on the raw strings, then feed its output to the encoder.
pre = BERT_PREPROCESSOR([text_1, text_2, text_3])
encoded = BERT_ENCODER(pre)

In [108]:
encoded.keys()

dict_keys(['sequence_output', 'encoder_outputs', 'default', 'pooled_output'])

In [110]:
# The two encoder outputs compared below: the per-position sequence output
# and the pooled output.
seq = encoded['sequence_output']
pool = encoded['pooled_output']

In [111]:
# Vector at position 0 of every sequence (presumably the [CLS] token — TODO confirm).
out = seq[:, 0, :]

In [112]:
out.shape

TensorShape([3, 768])

In [113]:
out[0].shape

TensorShape([768])

In [114]:
pool.shape

TensorShape([3, 768])

In [115]:
# Pairwise cosine similarity between the three sample sentences.
# For every pair the position-0 sequence vector (`out`) is printed first,
# followed by the pooled vector (`pool`).
from sklearn.metrics.pairwise import cosine_similarity

for i, j in ((0, 1), (1, 2), (0, 2)):
    for embeddings in (out, pool):
        print(cosine_similarity([embeddings[i]], [embeddings[j]]))


[[0.8796952]]
[[0.89130044]]
[[0.7762632]]
[[0.6262022]]
[[0.817188]]
[[0.86584616]]


#### Architecture

In [None]:

# def get_rnn_cnn_model():
#     embedding_dim = 300
#     inp = Input(shape=(MAX_LENGTH, ))
#     x = Embedding(MAX_NB_WORDS, embedding_dim, weights=[embedding_matrix], input_length=MAX_LENGTH, trainable=True)(inp)
#     x = SpatialDropout1D(0.3)(x)
#     x = Bidirectional(GRU(100, return_sequences=True))(x)
#     # x = Bidirectional(LSTM(100, return_sequences=True))(x)
#     # x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
#     x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_regularizer=l2(0.0005))(x)
#     avg_pool = GlobalAveragePooling1D()(x)
#     max_pool = GlobalMaxPooling1D()(x)
#     conc = concatenate([avg_pool, max_pool])
#     outp = Dense(LABEL_LEN, activation="softmax")(conc)
#     model = Model(inputs=inp, outputs=outp)
#     optimizer = tensorflow.keras.optimizers.Adam(learning_rate=0.001)
#     model.compile(loss='categorical_crossentropy',
#                   optimizer=optimizer,
#                   metrics=['accuracy'])
#     return model

In [29]:
def arch(num_classes):
    """Build and compile the BERT + BiGRU + Conv1D intent classifier.

    Raw strings go through the module-level BERT_PREPROCESSOR and
    BERT_ENCODER. The encoder's ``sequence_output`` is passed through spatial
    dropout, a bidirectional GRU and a 1-D convolution; the average- and
    max-pooled convolution features are concatenated and fed to a softmax
    layer.

    Args:
        num_classes: Number of units in the final softmax layer.

    Returns:
        The compiled ``tf.keras.Model`` (categorical cross-entropy loss,
        Adam optimizer, accuracy metric). The model summary is printed as a
        side effect.
    """
    layers = tf.keras.layers

    # BERT front end: scalar string input -> preprocessor -> encoder outputs.
    text_input = layers.Input(shape=(), dtype=tf.string, name='input_text')
    bert_outputs = BERT_ENCODER(BERT_PREPROCESSOR(text_input))

    # Pin the per-example shape to (128, 768) before the recurrent stack.
    token_vectors = layers.Reshape([128, 768])(bert_outputs['sequence_output'])
    features = layers.SpatialDropout1D(0.3)(token_vectors)

    recurrent_cell = layers.GRU(100, return_sequences=True)
    features = layers.Bidirectional(recurrent_cell)(features)

    features = layers.Conv1D(
        filters=64,
        kernel_size=3,
        padding="valid",
        kernel_regularizer=tf.keras.regularizers.l2(0.0005),
    )(features)

    # Summarise the convolution output over the sequence axis in two ways.
    averaged = layers.GlobalAveragePooling1D()(features)
    maxed = layers.GlobalMaxPooling1D()(features)
    merged = layers.concatenate([averaged, maxed])
    probabilities = layers.Dense(num_classes, activation="softmax")(merged)

    # Assemble and compile the end-to-end model.
    model = tf.keras.Model(inputs=[text_input], outputs=[probabilities])
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()
    return model

In [30]:
num_classes = len(labelBinary.classes_)
num_classes

151

In [31]:
bert_model = arch(num_classes)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_text (InputLayer)         [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_word_ids': ( 0           input_text[0][0]                 
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'sequence_output':  109482241   keras_layer[0][0]                
                                                                 keras_layer[0][1]                
                                                                 keras_layer[0][2]                
______________________________________________________________________________________________

In [32]:
# tf.keras.utils.plot_model(bert_model)

In [28]:
# Training hyper-parameters and checkpoint location.
batch_size = 264
epochs = 10
filepath = 'model/3_clinc_oss_BERT_with_oos/v2'

# Save to `filepath` only when val_accuracy improves on the best value seen so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train on the merged training set and validate on the explicit val split
# (validation_split is not used). Training stops early once val_accuracy has
# not improved for 3 epochs.
history = bert_model.fit(
    x=train_sentences,
    y=train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        checkpoint,
        tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3),
    ],
    verbose=1,
)

In [None]:
# save the model
# bert_model.save('model/3_clinc_oss_BERT_with_oos/v1')

### Retraining

In [None]:
# Reload the model from the path the ModelCheckpoint callback writes to.
v2_model = tf.keras.models.load_model('model/3_clinc_oss_BERT_with_oos/v2')

In [None]:
v2_model.summary()

In [None]:
# declare checkpoint for the retraining run (same path as the first run)
# NOTE(review): this new callback does not carry over the previous run's best
# val_accuracy, so the existing checkpoint may be overwritten by a weaker
# epoch — confirm this is intended.
filepath = 'model/3_clinc_oss_BERT_with_oos/v2'
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath,
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True,
                             mode='max')
batch_size = 264
epochs = 10

In [None]:
# Continue training the reloaded checkpoint. The model loaded in this section
# is `v2_model`; no `v1_model` is defined anywhere in the notebook, so calling
# it would raise NameError.
history = v2_model.fit(
    train_sentences,
    train_labels,
    #  validation_split = 0.1, #0.1
    validation_data=(val_sentences, val_labels),
    batch_size=batch_size,
    # Save on val_accuracy improvement; stop after 3 epochs without one.
    callbacks=[checkpoint,
               tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)],
    epochs=epochs,
    verbose=1)

### **Testing**

In [39]:
test.head()

Unnamed: 0,text,intent
0,i would like to do some things in phoenix,travel_suggestion
1,what are the steps to rollover my 401k,rollover_401k
2,what films are in the running for best song at...,oos
3,will you tell me how the ai is doing,greeting
4,how would i say what is your name if i were fr...,translate


In [40]:
test.shape

(5500, 2)

In [41]:
test_label = test['intent'].values

In [42]:
test_label

array(['travel_suggestion', 'rollover_401k', 'oos', ..., 'nutrition_info',
       'tire_change', 'credit_limit'], dtype=object)

In [45]:
# Score every test sentence; the result has one row per sentence and one column per class.
# NOTE(review): this uses `bert_model`, not the reloaded `v2_model` — confirm which model is meant.
prediction = bert_model.predict(test['text'].values)

In [46]:
# prediction.flatten()

In [47]:
prediction.shape

(5500, 151)

In [48]:
# Decode each score row back to an intent name
# (presumably the highest-scoring class — verify against the LabelBinarizer docs).
predicted_results = labelBinary.inverse_transform(prediction)

In [49]:
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

# Reference numbers for comparison (DeepAligned): NMI: 93.86, ARI: 79.75, ACC: 86.49
# Each score is reported as a percentage rounded to two decimals.
metric_table = (
    ('ACC : ', accuracy_score),
    ('ARI : ', adjusted_rand_score),
    ('NMI : ', normalized_mutual_info_score),
)
for caption, metric in metric_table:
    print(caption, round(metric(test_label, predicted_results)*100, 2))

ACC :  78.6
ARI :  23.94
NMI :  82.85


In [50]:
from sklearn.metrics import classification_report
# Per-intent precision / recall / F1 on the test set.
# NOTE(review): target_names assumes labelBinary.classes_ is in the same order
# as the sorted labels found in test_label/predicted_results — confirm.
print(classification_report(test_label, predicted_results, target_names=labelBinary.classes_))

                           precision    recall  f1-score   support

      accept_reservations       0.70      0.93      0.80        30
          account_blocked       0.84      0.90      0.87        30
                    alarm       0.97      0.97      0.97        30
       application_status       0.94      1.00      0.97        30
                      apr       0.85      0.97      0.91        30
            are_you_a_bot       1.00      1.00      1.00        30
                  balance       0.85      0.93      0.89        30
             bill_balance       0.67      0.80      0.73        30
                 bill_due       0.64      0.93      0.76        30
              book_flight       0.85      0.97      0.91        30
               book_hotel       0.94      1.00      0.97        30
               calculator       0.57      0.93      0.71        30
                 calendar       0.88      0.77      0.82        30
          calendar_update       0.87      0.87      0.87     