In [16]:
# install hugging face transformers and datasets library
!pip install -q transformers
!pip install -q datasets

### Load CLINC_OOS Dataset from datasets

In [17]:
from datasets import load_dataset

In [18]:
data = load_dataset('clinc_oos', 'plus')

Downloading:   0%|          | 0.00/2.75k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading and preparing dataset clinc_oos/plus (download: 2.39 MiB, generated: 1.18 MiB, post-processed: Unknown size, total: 3.57 MiB) to /root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1...


Downloading:   0%|          | 0.00/291k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset clinc_oos downloaded and prepared to /root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
data.keys()

dict_keys(['train', 'validation', 'test'])

In [None]:
shuffle_data = data.shuffle(seed=42)

In [None]:
# train dataset
train_text = shuffle_data['train']['text']
train_labels = shuffle_data['train']['intent']
# val dataset
validation_text = shuffle_data['validation']['text']
validation_labels = shuffle_data['validation']['intent']
# test dataset
test_text = shuffle_data['test']['text']
test_labels = shuffle_data['test']['intent']

In [None]:
print(len(train_text), len(train_labels))
print(len(validation_text), len(validation_labels))
print(len(test_text), len(test_labels))

15250 15250
3100 3100
5500 5500


In [None]:
# number of labels
print(len(set(train_labels)))
print(len(set(validation_labels)))
print(len(set(test_labels)))

151
151
151


Transforming the labels to one hot encoding

In [None]:
# from sklearn.preprocessing import LabelBinarizer
# labelBinary = LabelBinarizer()
 
# train_labels = labelBinary.fit_transform(shuffle_data['train']['intent'])
# val_labels = labelBinary.transform(shuffle_data['validation']['intent'])

### Tokenization annd Padding

In [None]:
from transformers import DistilBertTokenizer

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
tokenized_train = tokenizer(train_text, truncation=True, padding=True)
tokenized_validation = tokenizer(validation_text, truncation=True, padding=True)
tokenized_test = tokenizer(test_text, truncation=True, padding=True)

In [None]:
# def FindMaxLength(lst): 
#     maxList = max(lst, key = lambda i: len(i)) 
#     maxLength = len(maxList) 
#     return maxLength

# MAX_LENGTH = FindMaxLength(tokenized_train['input_ids'])
# print(MAX_LENGTH)

### Fine-tune with TensorFlow

Next, convert your datasets to the tf.data.Dataset format 

In [None]:
import tensorflow as tf

In [None]:
len(set(train_labels))

151

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_train),
    train_labels
))

validation_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_validation),
    validation_labels
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_test),
    test_labels
))

In [None]:
validation_dataset

<TensorSliceDataset shapes: ({input_ids: (29,), attention_mask: (29,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

In [None]:
train_dataset

<TensorSliceDataset shapes: ({input_ids: (33,), attention_mask: (33,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

In [None]:
test_dataset

<TensorSliceDataset shapes: ({input_ids: (30,), attention_mask: (30,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

Set up an optimizer function, learning rate schedule, and some training hyperparameters:

In [None]:
# from transformers import create_optimizer
# import tensorflow as tf

In [None]:
# batch_size = 16
# num_epochs = 5
# batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
# total_train_steps = int(batches_per_epoch * num_epochs)
# optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

#### Load our model

In [None]:
from transformers import TFDistilBertForSequenceClassification

In [None]:
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=151)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

#### Complile our model

In [None]:
learning_rate = 2e-05
train_batch_size = 264
eval_batch_size = 264
seed = 42
# optimizer:Adam with betas=(0.9,0.999) and epsilon=1e-08
# lr_scheduler_type: linear
num_epochs = 10

In [None]:
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

In [None]:
filepath = 'v1'
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath,
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True,
                             mode='max')

#### Finally, fine-tune the model by calling model.fit:

In [None]:
model.fit(train_dataset.shuffle(seed).batch(train_batch_size),
          epochs=num_epochs,
        #   batch_size=BATCH_SIZE,
          validation_data=test_dataset.shuffle(seed).batch(train_batch_size),
          callbacks=[checkpoint,
               tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)])

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.67164, saving model to v1




INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 2/10
Epoch 00002: val_accuracy improved from 0.67164 to 0.76018, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 3/10
Epoch 00003: val_accuracy improved from 0.76018 to 0.80218, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 4/10
Epoch 00004: val_accuracy improved from 0.80218 to 0.82982, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 5/10
Epoch 00005: val_accuracy improved from 0.82982 to 0.84618, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 6/10
Epoch 00006: val_accuracy improved from 0.84618 to 0.86236, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 7/10
Epoch 00007: val_accuracy improved from 0.86236 to 0.86636, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 8/10
Epoch 00008: val_accuracy did not improve from 0.86636
Epoch 9/10
Epoch 00009: val_accuracy improved from 0.86636 to 0.87018, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets


Epoch 10/10
Epoch 00010: val_accuracy improved from 0.87018 to 0.87545, saving model to v1
























INFO:tensorflow:Assets written to: v1/assets


INFO:tensorflow:Assets written to: v1/assets




<keras.callbacks.History at 0x7f2733d46d10>

#### Evaluation on Testing set

In [None]:
evaluation = model.evaluate(validation_dataset.batch(eval_batch_size), return_dict=True)
print(evaluation)

{'loss': 0.3852382004261017, 'accuracy': 0.9396774172782898}


#### Saving Model

In [None]:
# save_directory = "/saved_models" # change this to your preferred location

model.save_pretrained('model')
tokenizer.save_pretrained('model')

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.txt',
 'model/added_tokens.json')

#### Load model and tokenizer of transformersbook/distilbert-base-uncased-distilled-clinc
* This model is in pytorch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [None]:
loaded_tokenizer = AutoTokenizer.from_pretrained("transformersbook/distilbert-base-uncased-distilled-clinc")
loaded_model = AutoModelForSequenceClassification.from_pretrained("transformersbook/distilbert-base-uncased-distilled-clinc")

In [None]:
def predict(text, loaded_tokenizer, loaded_model):
    inputs = loaded_tokenizer(
        text,
        # truncation=True,
        # padding=True,
        return_tensors="pt"
        )
    outputs = loaded_model(**inputs)
    loss = outputs.loss
    logits = outputs.logits
    predicted_arr = logits.softmax(dim=-1).tolist()
    index_val = predicted_arr[0].index(max(predicted_arr[0]))
    return index_val

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [None]:
from tqdm.notebook import tqdm

predicted_values = []

for text in tqdm(validation_text, desc='Prediction Progress'):
    predicted_values.append(predict(text, loaded_tokenizer, loaded_model))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

# DeepAlingned : NMI: 93.86, ARI: 79.75, ACC: 86.49 
print('ACC : ', round(accuracy_score(predicted_values, validation_labels)*100, 2))
print('ARI : ', round(adjusted_rand_score(predicted_values, validation_labels)*100, 2))
print('NMI : ', round(normalized_mutual_info_score(predicted_values, validation_labels)*100, 2))

#### Testing using Saved model and tokenizer

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from tqdm.notebook import tqdm

In [None]:
loaded_tokenizer = DistilBertTokenizer.from_pretrained('/content/model')
loaded_model = TFDistilBertForSequenceClassification.from_pretrained('/content/model')

Some layers from the model checkpoint at /content/model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/model and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
len(validation_text), len(validation_labels)

(3100, 3100)

In [None]:
def tf_predict(text, loaded_tokenizer, loaded_model):
    predict_input = loaded_tokenizer.encode(text,
                                    truncation=True,
                                    padding=True,
                                    return_tensors="tf")
    output = loaded_model(predict_input)[0]
    prediction_value = tf.argmax(output, axis=1).numpy()[0]
    return prediction_value

In [None]:
predicted_values = []
for text in tqdm(validation_text, desc='Prediction Progress'):
    predicted_values.append(tf_predict(text, loaded_tokenizer, loaded_model))

Prediction Progress:   0%|          | 0/3100 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

# DeepAlingned : NMI: 93.86, ARI: 79.75, ACC: 86.49 
print('ACC : ', round(accuracy_score(predicted_values, validation_labels)*100, 2))
print('ARI : ', round(adjusted_rand_score(predicted_values, validation_labels)*100, 2))
print('NMI : ', round(normalized_mutual_info_score(predicted_values, validation_labels)*100, 2))

ACC :  93.97
ARI :  85.19
NMI :  95.39


In [None]:
from sklearn.metrics import classification_report
print(classification_report(predicted_values, validation_labels))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        21
           1       0.65      0.87      0.74        15
           2       0.85      0.89      0.87        19
           3       1.00      0.91      0.95        22
           4       0.95      0.95      0.95        20
           5       0.95      0.95      0.95        20
           6       0.85      0.94      0.89        18
           7       1.00      1.00      1.00        20
           8       1.00      0.95      0.98        21
           9       0.90      0.90      0.90        20
          10       1.00      0.80      0.89        25
          11       0.90      1.00      0.95        18
          12       0.95      0.95      0.95        20
          13       0.85      0.94      0.89        18
          14       1.00      1.00      1.00        20
          15       0.90      0.95      0.92        19
          16       0.90      0.75      0.82        24
          17       1.00    

### [Push our model to Hugging Face hub](https://huggingface.co/docs/transformers/model_sharing)

In [3]:
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/token.
        (Deprecated, will be removed in v0.3.0) To login with username and password instead, interrupt with Ctrl+C.
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on you

In [5]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
# from tqdm.notebook import tqdm

In [7]:
# load saved model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/model')
model = TFDistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/model')

Some layers from the model checkpoint at /content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/data_science/projects/clinc_oos/model/4_clinc_oss_distilBERT_with_oos/model and are newly initialized: ['dropout_39']
You should probably T

In [15]:
model.push_to_hub('distilbert-base-uncased-distilled-clinc', use_temp_dir=True)
tokenizer.push_to_hub('distilbert-base-uncased-distilled-clinc', use_temp_dir=True)

Cloning https://huggingface.co/ianuragbhatt/distilbert-base-uncased-distilled-clinc into local empty directory.


Upload file tf_model.h5:   0%|          | 3.36k/256M [00:00<?, ?B/s]

To https://huggingface.co/ianuragbhatt/distilbert-base-uncased-distilled-clinc
   64af3b6..3d6af4a  main -> main

Cloning https://huggingface.co/ianuragbhatt/distilbert-base-uncased-distilled-clinc into local empty directory.
To https://huggingface.co/ianuragbhatt/distilbert-base-uncased-distilled-clinc
   3d6af4a..d21a6fa  main -> main



'https://huggingface.co/ianuragbhatt/distilbert-base-uncased-distilled-clinc/commit/d21a6fab39733dc487ae857870b38b94b7c7411a'

In [12]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,204 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155225 files and directories c