In [1]:
# install hugging face transformers and datasets library
!pip install -q transformers

### Loading Datasets

In [2]:
import re
import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 700)
from sklearn.utils import shuffle

Mount Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Set filepaths of the datasets

In [3]:
train_path = '/content/drive/MyDrive/data_science/projects/clinc_oos/dataset/is_train.json'
val_path = '/content/drive/MyDrive/data_science/projects/clinc_oos/dataset/is_val.json'
test_path = '/content/drive/MyDrive/data_science/projects/clinc_oos/dataset/is_test.json'
oos_train_path = '/content/drive/MyDrive/data_science/projects/clinc_oos/dataset/oos_train.json'
oos_val_path = '/content/drive/MyDrive/data_science/projects/clinc_oos/dataset/oos_val.json'
oos_test_path = '/content/drive/MyDrive/data_science/projects/clinc_oos/dataset/oos_test.json'

Read json objects and convert them to dataframe

In [4]:
train = pd.read_json(train_path)
val = pd.read_json(val_path)
test = pd.read_json(test_path)
oos_train = pd.read_json(oos_train_path)
oos_val = pd.read_json(oos_val_path)
oos_test = pd.read_json(oos_test_path)

files = [
         (train,'train'),
         (val,'val'),
         (test,'test'),
         (oos_train,'oos_train'),
         (oos_val,'oos_val'),
         (oos_test,'oos_test')
         ]
         
for file,name in files:
    file.columns = ['text','intent']
    print(f'{name} shape:{file.shape}, {name} has {train.isna().sum().sum()} null values')

train shape:(15000, 2), train has 0 null values
val shape:(3000, 2), val has 0 null values
test shape:(4500, 2), test has 0 null values
oos_train shape:(100, 2), oos_train has 0 null values
oos_val shape:(100, 2), oos_val has 0 null values
oos_test shape:(1000, 2), oos_test has 0 null values


Merge oos dataset with their counterparts

In [5]:
train = pd.concat([train, oos_train], axis=0).reset_index(drop=True)
val = pd.concat([val, oos_val], axis=0).reset_index(drop=True)
test = pd.concat([test, oos_test], axis=0).reset_index(drop=True)

Shuffle the datasets

In [6]:
train = shuffle(train , random_state=0).reset_index(drop=True)
val = shuffle(val, random_state=0).reset_index(drop=True)
test = shuffle(test, random_state=0).reset_index(drop=True)

In [7]:
train.shape

(15100, 2)

In [8]:
train.head()

Unnamed: 0,text,intent
0,how old are your pets,do_you_have_pets
1,please find the routing number for chase bank ...,routing
2,i would like to switch to a different insuranc...,insurance_change
3,are the items on my todo list listed alphabeti...,todo_list
4,has my day off request been looked at yet,pto_request_status


In [9]:
train['intent'].value_counts()

report_fraud                 100
flight_status                100
min_payment                  100
distance                     100
travel_notification          100
account_blocked              100
vaccines                     100
uber                         100
calendar_update              100
report_lost_card             100
nutrition_info               100
thank_you                    100
who_made_you                 100
bill_balance                 100
travel_suggestion            100
change_language              100
transactions                 100
cancel_reservation           100
what_are_your_hobbies        100
calendar                     100
pto_balance                  100
time                         100
yes                          100
w2                           100
balance                      100
maybe                        100
restaurant_reviews           100
weather                      100
pto_request                  100
ingredient_substitution      100
change_vol

In [10]:
train['intent'].nunique(), val['intent'].nunique(), test['intent'].nunique()

(151, 151, 151)

Use LabelEncoder to encode our labels to numeric values like `0,1,2,3,4,5, etc`.

In [11]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
# labelBinary = LabelBinarizer()
labelEncoder = LabelEncoder()
# import numpy as np
# val = [
#        [0, 0, 1],
#        [1, 0, 0]
# ]
 
# lb.inverse_transform(np.asarray(val))

In [12]:
# # Function to calculate VIF
# def calculate_vif(data):
#     vif_df = pd.DataFrame(columns = ['Var', 'Vif'])
#     x_var_names = data.columns
#     for i in range(0, x_var_names.shape[0]):
#         y = data[x_var_names[i]]
#         x = data[x_var_names.drop([x_var_names[i]])]
#         r_squared = sm.OLS(y,x).fit().rsquared
#         vif = round(1/(1-r_squared),2)
#         vif_df.loc[i] = [x_var_names[i], vif]
#     return vif_df.sort_values(by = 'Vif', axis = 0, ascending=False, inplace=False)

# X=df.drop(['Salary'],axis=1)
# calculate_vif(X)

Transform train and test label to numeric values. We will use test set from our model validation during training and val set for model final evaluation

In [13]:
train_labels = labelEncoder.fit_transform(train['intent'])
train_text = list(train['text'])

test_text = list(test['text'])
test_labels = labelEncoder.transform(test['intent'])

In [14]:
print(train_labels.shape, len(train_text))
print(test_labels.shape, len(test_text))

(15100,) 15100
(5500,) 5500


Save encoded labels to pickle file. So, that we can use it to transform our model output

In [15]:
import pickle

labelpath = 'distilBERT_label.pkl'
with open(labelpath, 'wb') as handle:
    pickle.dump(labelEncoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Tokenization annd Padding

In [16]:
from transformers import DistilBertTokenizer

Download DistilBERT tokenizer from transformers

In [17]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

We will set `truncation=True` and `padding='longest'` for dynamic padding

In [18]:
%%time

tokenized_train = tokenizer(
    train_text,
    truncation=True,
    padding='longest')

tokenized_test = tokenizer(
    test_text,
    truncation=True,
    padding='longest')

CPU times: user 6.52 s, sys: 49.1 ms, total: 6.57 s
Wall time: 6.57 s


### Fine-tune with TensorFlow

Next, convert our datasets to the `tf.data.Dataset` format 

In [19]:
import tensorflow as tf

In [20]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_train),
    train_labels
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_test),
    test_labels
))

#### Load our model

In [21]:
from transformers import TFDistilBertForSequenceClassification

We will use `distilbert-base-uncased` model for our classification and `num_labels=151` because we have `151 labels`.

In [22]:
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=151)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

#### Complile our model

In [23]:
learning_rate = 2e-05
batch_size = 264
batch_size = 264
seed = 42 # shuffling
num_epochs = 50

We will use `loss='categorical_crossentropy'` because we have categorical labels

In [24]:
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

We will also use `tf.keras.callbacks.EarlyStopping`, we will monitor `val_loss` with `mode='min'` and `patience=3`

In [25]:
early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='min',
    baseline=None,
    restore_best_weights=True
)

#### Finally, fine-tune the model by calling model.fit:

In [27]:
model.fit(
    train_dataset.shuffle(seed).batch(batch_size),
    epochs=num_epochs,
    callbacks=[early_stopper],
    # batch_size=256,
    validation_data=test_dataset.shuffle(seed).batch(batch_size))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


<keras.callbacks.History at 0x7fdfaf211610>

#### Evaluation on Testing set

In [None]:
# evaluation = model.evaluate(validation_dataset.batch(eval_batch_size), return_dict=True)
# print(evaluation)

{'loss': 0.3852382004261017, 'accuracy': 0.9396774172782898}


####Saving Model and Tokenizers 

In [28]:
save_model_path = '/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/v2'
model.save_pretrained(save_model_path)
tokenizer.save_pretrained(save_model_path)

('/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/v2/tokenizer_config.json',
 '/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/v2/special_tokens_map.json',
 '/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/v2/vocab.txt',
 '/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/v2/added_tokens.json')

#### Load model and tokenizer of transformersbook/distilbert-base-uncased-distilled-clinc
* This model is in pytorch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [None]:
loaded_tokenizer = AutoTokenizer.from_pretrained("transformersbook/distilbert-base-uncased-distilled-clinc")
loaded_model = AutoModelForSequenceClassification.from_pretrained("transformersbook/distilbert-base-uncased-distilled-clinc")

In [None]:
def predict(text, loaded_tokenizer, loaded_model):
    inputs = loaded_tokenizer(
        text,
        # truncation=True,
        # padding=True,
        return_tensors="pt"
        )
    outputs = loaded_model(**inputs)
    loss = outputs.loss
    logits = outputs.logits
    predicted_arr = logits.softmax(dim=-1).tolist()
    index_val = predicted_arr[0].index(max(predicted_arr[0]))
    return index_val

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [None]:
from tqdm.notebook import tqdm

predicted_values = []

for text in tqdm(validation_text, desc='Prediction Progress'):
    predicted_values.append(predict(text, loaded_tokenizer, loaded_model))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

# DeepAlingned : NMI: 93.86, ARI: 79.75, ACC: 86.49 
print('ACC : ', round(accuracy_score(predicted_values, validation_labels)*100, 2))
print('ARI : ', round(adjusted_rand_score(predicted_values, validation_labels)*100, 2))
print('NMI : ', round(normalized_mutual_info_score(predicted_values, validation_labels)*100, 2))

#### Testing using Saved model and tokenizer

In [29]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from tqdm.notebook import tqdm

Download the save model and tokenizer

In [30]:
loaded_tokenizer = DistilBertTokenizer.from_pretrained(save_model_path)
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_model_path)

Some layers from the model checkpoint at /content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/v2 were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/v2 and are newly initialized: ['dropout_39']
You should probably TRAIN t

Download the save labels encodings

In [36]:
labelPath = '/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/distilBERT_label.pkl'
with open(labelPath, 'rb') as handle:
    load_labels = pickle.load(handle)

In [33]:
val_text = list(val['text'])
val_labels = list(val['intent'])
len(val_text), len(val_labels)

(3100, 3100)

Create a function to get the predictions from validation set

In [47]:
def tf_predict(text, loaded_tokenizer, loaded_model):
    predict_input = loaded_tokenizer.encode(text,
                                    truncation=True,
                                    padding='longest',
                                    return_tensors="tf")
    output = loaded_model(predict_input)[0]
    prediction_value = tf.argmax(output, axis=1).numpy()
    predicted_intent = load_labels.inverse_transform(prediction_value)[0]
    return predicted_intent

In [50]:
predicted_intents = []
for text in tqdm(val_text, desc='Prediction Progress'):
    predicted_intents.append(tf_predict(text, loaded_tokenizer, loaded_model))

Prediction Progress:   0%|          | 0/3100 [00:00<?, ?it/s]

Testing Metrics are `accuracy_score`, `adjusted_rand_score`, `normalized_mutual_info_score` and `classification_report`

In [51]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

In [57]:
print('ACC : ', round(accuracy_score(predicted_intents, val_labels)*100, 2))
print('ARI : ', round(adjusted_rand_score(predicted_intents, val_labels)*100, 2))
print('NMI : ', round(normalized_mutual_info_score(predicted_intents, val_labels)*100, 2))

ACC :  94.0
ARI :  84.6
NMI :  95.51


In [58]:
print(classification_report(predicted_intents, val_labels))

                           precision    recall  f1-score   support

      accept_reservations       0.90      0.90      0.90        20
          account_blocked       0.90      0.90      0.90        20
                    alarm       1.00      1.00      1.00        20
       application_status       1.00      0.95      0.98        21
                      apr       1.00      1.00      1.00        20
            are_you_a_bot       0.90      0.95      0.92        19
                  balance       0.90      1.00      0.95        18
             bill_balance       0.95      0.95      0.95        20
                 bill_due       1.00      0.87      0.93        23
              book_flight       1.00      1.00      1.00        20
               book_hotel       1.00      1.00      1.00        20
               calculator       1.00      0.95      0.98        21
                 calendar       0.95      0.95      0.95        20
          calendar_update       1.00      0.95      0.98     