In [1]:
# install hugging face transformers and datasets library
!pip install -q transformers
!pip install -q datasets

[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
[K     |████████████████████████████████| 895 kB 45.1 MB/s 
[K     |████████████████████████████████| 596 kB 44.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 34.9 MB/s 
[K     |████████████████████████████████| 61 kB 503 kB/s 
[K     |████████████████████████████████| 306 kB 4.9 MB/s 
[K     |████████████████████████████████| 243 kB 43.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 44.2 MB/s 
[K     |████████████████████████████████| 132 kB 48.1 MB/s 
[K     |████████████████████████████████| 160 kB 49.0 MB/s 
[K     |████████████████████████████████| 271 kB 49.2 MB/s 
[K     |████████████████████████████████| 192 kB 37.3 MB/s 
[?25h

### Load CLINC_OOS Dataset from datasets

In [1]:
from datasets import load_dataset

In [2]:
data = load_dataset('clinc_oos', 'plus')

Reusing dataset clinc_oos (/root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
data.keys()

dict_keys(['train', 'validation', 'test'])

In [4]:
shuffle_data = data.shuffle(seed=42)

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1/cache-64357f20924b27ca.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1/cache-ccbcee96db1db4b5.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1/cache-c20fc082d1656e9c.arrow


In [5]:
# train dataset
train_text = shuffle_data['train']['text']
train_labels = shuffle_data['train']['intent']
# val dataset
validation_text = shuffle_data['validation']['text']
validation_labels = shuffle_data['validation']['intent']
# test dataset
test_text = shuffle_data['test']['text']
test_labels = shuffle_data['test']['intent']

In [6]:
print(len(train_text), len(train_labels))
print(len(validation_text), len(validation_labels))
print(len(test_text), len(test_labels))

15250 15250
3100 3100
5500 5500


In [7]:
# number of labels
print(len(set(train_labels)))
print(len(set(validation_labels)))
print(len(set(test_labels)))

151
151
151


Transforming the labels to one hot encoding

In [8]:
# from sklearn.preprocessing import LabelBinarizer
# labelBinary = LabelBinarizer()
 
# train_labels = labelBinary.fit_transform(shuffle_data['train']['intent'])
# val_labels = labelBinary.transform(shuffle_data['validation']['intent'])

### Tokenization annd Padding

In [10]:
from transformers import DistilBertTokenizer

In [11]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
tokenized_train = tokenizer(train_text, truncation=True, padding=True)
tokenized_validation = tokenizer(validation_text, truncation=True, padding=True)
tokenized_test = tokenizer(test_text, truncation=True, padding=True)

In [13]:
# def FindMaxLength(lst): 
#     maxList = max(lst, key = lambda i: len(i)) 
#     maxLength = len(maxList) 
#     return maxLength

# MAX_LENGTH = FindMaxLength(tokenized_train['input_ids'])
# print(MAX_LENGTH)

### Fine-tune with TensorFlow

Next, convert your datasets to the tf.data.Dataset format 

In [14]:
import tensorflow as tf

In [15]:
len(set(train_labels))

151

In [16]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_train),
    train_labels
))

validation_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_validation),
    validation_labels
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_test),
    test_labels
))

In [17]:
validation_dataset

<TensorSliceDataset shapes: ({input_ids: (29,), attention_mask: (29,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

In [18]:
train_dataset

<TensorSliceDataset shapes: ({input_ids: (33,), attention_mask: (33,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

In [19]:
test_dataset

<TensorSliceDataset shapes: ({input_ids: (30,), attention_mask: (30,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

In [20]:
# import numpy as np
# np.array(list(dict(padded_train).values()))[0]

Set up an optimizer function, learning rate schedule, and some training hyperparameters:

In [21]:
# from transformers import create_optimizer
# import tensorflow as tf

In [22]:
# batch_size = 16
# num_epochs = 5
# batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
# total_train_steps = int(batches_per_epoch * num_epochs)
# optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

#### Load our model

In [23]:
from transformers import TFDistilBertForSequenceClassification

In [25]:
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=151)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

#### Complile our model

In [38]:
learning_rate = 2e-05
train_batch_size = 48
eval_batch_size = 48
seed = 42
# optimizer:Adam with betas=(0.9,0.999) and epsilon=1e-08
# lr_scheduler_type: linear
num_epochs = 10

In [41]:
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

#### Finally, fine-tune the model by calling model.fit:

In [42]:
model.fit(train_dataset.shuffle(seed).batch(train_batch_size),
          epochs=num_epochs,
        #   batch_size=BATCH_SIZE,
          validation_data=validation_dataset.shuffle(seed).batch(train_batch_size))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fae13924650>

#### Evaluation on Testing set

In [43]:
evaluation = model.evaluate(test_dataset.batch(BATCH_SIZE), return_dict=True)
print(evaluation)

{'loss': 0.6959190368652344, 'accuracy': 0.8667272925376892}


#### Saving Model

In [24]:
# save_directory = "/saved_models" # change this to your preferred location

model.save_pretrained('v1')
tokenizer.save_pretrained('v1')

('v1/tokenizer_config.json',
 'v1/special_tokens_map.json',
 'v1/vocab.txt',
 'v1/added_tokens.json')

#### Load model and tokenizer

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [10]:
loaded_tokenizer = AutoTokenizer.from_pretrained("transformersbook/distilbert-base-uncased-distilled-clinc")
loaded_model = AutoModelForSequenceClassification.from_pretrained("transformersbook/distilbert-base-uncased-distilled-clinc")

In [69]:
inputs = loaded_tokenizer(
    text,
    truncation=True,
    padding=True,
    return_tensors="pt"
    )
# labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
# outputs = model(**inputs, labels=labels)
outputs = model(**inputs)
loss = outputs.loss
logits = outputs.logits

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [76]:
loaded_model.num_labels

151

In [71]:
print(logits.softmax(dim=-1).tolist())

[[0.43872538208961487, 0.5612746477127075], [0.4248712956905365, 0.5751286745071411]]


#### Testing

In [36]:
len(validation_text), len(validation_labels)

(3100, 3100)

In [37]:
def predict(text, loaded_tokenizer, loaded_model):
    inputs = loaded_tokenizer(
        text,
        # truncation=True,
        # padding=True,
        return_tensors="pt"
        )
    outputs = loaded_model(**inputs)
    loss = outputs.loss
    logits = outputs.logits
    predicted_arr = logits.softmax(dim=-1).tolist()
    index_val = predicted_arr[0].index(max(predicted_arr[0]))
    return index_val

In [38]:
from tqdm.notebook import tqdm

predicted_values = []

for text in tqdm(validation_text, desc='Prediction Progress'):
    predicted_values.append(predict(text, loaded_tokenizer, loaded_model))

Prediction Progress:   0%|          | 0/3100 [00:00<?, ?it/s]

In [39]:
predicted_values[:4]

[143, 143, 105, 16]

In [40]:
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

# DeepAlingned : NMI: 93.86, ARI: 79.75, ACC: 86.49 
print('ACC : ', round(accuracy_score(predicted_values, validation_labels)*100, 2))
print('ARI : ', round(adjusted_rand_score(predicted_values, validation_labels)*100, 2))
print('NMI : ', round(normalized_mutual_info_score(predicted_values, validation_labels)*100, 2))

ACC :  93.94
ARI :  84.95
NMI :  95.35
