<a href="https://colab.research.google.com/github/larissapoghosyan/Capstone_Project/blob/main/Transformers_BERT_embeddings_both_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers import AlbertModel, AlbertTokenizer
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import  AutoModel, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import math
import h5py
import os
import copy
from tqdm import tqdm
import time

import warnings

torch.cuda.empty_cache()

In [None]:
# Print the CUDA compiler version, then the nvidia-smi report for the runtime.
!/usr/local/cuda/bin/nvcc --version
!nvidia-smi

In [None]:
# Check whether PyTorch can see a CUDA device (True in the recorded run).
torch.cuda.is_available()

True

In [None]:
# Load the IMDb reviews; malformed rows are skipped instead of aborting the read.
# `on_bad_lines='skip'` is the replacement for the deprecated
# `error_bad_lines=False` (deprecated in pandas 1.3, removed in pandas 2.0).
data = pd.read_csv('/content/IMDb_Reviews.csv', engine='python', on_bad_lines='skip')
# data = pd.read_csv('/content/outofscope-intent-classification-dataset.csv')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(50000, 2)

In [None]:
# Label Encoder:
# Replace the raw labels in the second column with integer class ids and keep
# them as an (n_rows, 1) integer array in `label_col`.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le_fitted = le.fit_transform(data.iloc[:, 1])
data.iloc[:, 1] = le_fitted

label_col = np.array(data.iloc[:, 1]).reshape(-1, 1).astype(int)

# Show the distinct class ids that were produced.
np.unique(label_col)

array([0, 1])

### Model Config

In [None]:

## Initializing models for BERT and other variants of BERT,
## Defining output files according to output type

# Model family -> (Hugging Face checkpoint, prefix of its output file names).
_MODEL_SPECS = {
    'BERT': ('bert-base-uncased', 'bert'),
    'RoBERT': ('roberta-base', 'robert'),
    'AlBERT': ('albert-base-v2', 'albert'),
    'DistilBERT': ('distilbert-base-uncased', 'distilbert'),
    'TinyBERT': ('sentence-transformers/paraphrase-TinyBERT-L6-v2', 'tinybert'),
    'Sentence-BERT': ('sentence-transformers/bert-base-nli-mean-tokens', 'sentence_bert'),
}

# (output_type, file-name suffix) pairs. The order is significant: the
# extraction cells address these lists by position (e.g. [1] and [3]).
_OUTPUT_SPECS = [
    ('pooler_output', 'pooler'),
    ('token_avg', 'token_avg'),
    ('cls_last_hid', 'cls_last_hid'),
    ('n_lyrs_cat', 'CLS_cat'),
    ('n_lyrs_cat_tkn_avg', 'tkn_cat'),
]

# NOTE(review): 'CSL' looks like a typo for 'CLS'. The original file name is
# preserved on purpose, because .h5 files may already exist under this name
# and other notebooks may read them; rename only together with those readers.
_IMDB_FILE_NAME_OVERRIDES = {
    'sentence_bert_imdb_CLS_cat': 'sentence_bert_imdb_CSL_cat',
}


def _build_models_config(dataset_tag, file_name_overrides=None):
    """Load every model family and describe its outputs.

    Args:
        dataset_tag: Short dataset name inserted into each output file name
            (``<prefix>_<dataset_tag>_<suffix>``).
        file_name_overrides: Optional mapping from a generated file name to the
            name that should be used instead.

    Returns:
        dict keyed by model family. Each value holds the loaded 'tokenizer'
        and 'model' plus the parallel lists 'output_type' and 'output_file'.
    """
    overrides = file_name_overrides or {}
    config = {}
    for family, (checkpoint, prefix) in _MODEL_SPECS.items():
        generated_names = [f'{prefix}_{dataset_tag}_{suffix}' for _, suffix in _OUTPUT_SPECS]
        config[family] = {
            'tokenizer': AutoTokenizer.from_pretrained(checkpoint),
            'model': AutoModel.from_pretrained(checkpoint,
                                               output_attentions=True,
                                               output_hidden_states=True),
            'output_type': [output_type for output_type, _ in _OUTPUT_SPECS],
            'output_file': [overrides.get(name, name) for name in generated_names],
        }
    return config


models_config = _build_models_config('imdb', _IMDB_FILE_NAME_OVERRIDES)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias',

In [None]:
# Configuration for the intent-classification dataset.
# The tokenizers and models are the very same objects already loaded in
# `models_config`: loading all six checkpoints a second time only doubles the
# download time and memory use. Only the output file names differ, with the
# '_imdb_' part of each name replaced by '_intent_'.
models_config_intent = {
    family: {
        'tokenizer': entry['tokenizer'],
        'model': entry['model'],
        'output_type': list(entry['output_type']),
        'output_file': [name.replace('_imdb_', '_intent_') for name in entry['output_file']],
    }
    for family, entry in models_config.items()
}

In [None]:
# Print the CUDA compiler version and the nvidia-smi report again
# (same commands as the diagnostics cell near the top of the notebook).
!/usr/local/cuda/bin/nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
Wed May 11 14:40:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    72W / 149W |   6238MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+------------

### Wrap into a Class

In [None]:
class BertEmbeddings:
    """Extract fixed-size text embeddings from a BERT-style encoder into HDF5 files.

    The wrapped model is switched to eval mode and moved to the GPU when one is
    available. ``derive_outputs`` tokenizes the first column of a DataFrame in
    batches, runs the model without gradients and writes one embedding row per
    input row to ``<output_file>.h5``.

    Supported ``output_type`` values:
        'pooler_output'      -- the model's ``pooler_output`` (only for models
                                that return one).
        'cls_last_hid'       -- first-token vector of the last hidden layer.
        'token_avg'          -- mean of the last hidden layer over the attended
                                tokens, first token excluded.
        'n_lyrs_cat'         -- first-token vectors of the last ``n_lyrs``
                                hidden layers, concatenated.
        'n_lyrs_cat_tkn_avg' -- per-layer token means of the last ``n_lyrs``
                                hidden layers, concatenated.

    The two ``n_lyrs_*`` types read ``hidden_states`` from the model output, so
    the model must have been created with ``output_hidden_states=True``.
    """

    # Output types that are one hidden layer wide, whatever n_lyrs is.
    _SINGLE_LAYER_OUTPUTS = ('pooler_output', 'cls_last_hid', 'token_avg')
    # Output types whose width grows with the number of concatenated layers.
    _MULTI_LAYER_OUTPUTS = ('n_lyrs_cat', 'n_lyrs_cat_tkn_avg')

    def __init__(self, model, tokenizer, batch_size, max_len):
        """Store the model/tokenizer pair and move the model to the compute device.

        Args:
            model: Encoder whose outputs expose ``last_hidden_state`` (and
                ``hidden_states`` for the multi-layer output types).
            tokenizer: Tokenizer matching ``model``.
            batch_size: Number of texts encoded per forward pass.
            max_len: Every text is padded/truncated to this many tokens.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_len = max_len
        # Width of one hidden layer. Taken from the model config when it is
        # available; 768 (the previous hard-coded value) is the fallback.
        self.embedding_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.eval()
        self.model = self.model.to(self.device)

    def open_h5py_file(self, output_file, dataset, n_lyrs):
        """Create ``<output_file>.h5`` with an empty float32 dataset.

        An existing file with the same name is truncated (mode 'w'). The
        dataset is named ``output_file`` and has shape
        ``(len(dataset), embedding_size * n_lyrs)``.
        """
        with h5py.File(output_file + '.h5', 'w') as hf_berts:
            hf_berts.create_dataset(
                output_file,
                shape=(len(dataset), self.embedding_size * n_lyrs),
                chunks=True,
                dtype='float32',
            )

    def derive_outputs(self, dataset, output_type, output_file, n_lyrs=1):
        """Embed the first column of ``dataset`` and store the vectors on disk.

        Args:
            dataset: DataFrame whose first column holds the texts.
            output_type: One of the output types listed in the class docstring.
            output_file: Base name of the HDF5 file (and of the dataset in it).
            n_lyrs: Number of trailing hidden layers to concatenate; only used
                by 'n_lyrs_cat' and 'n_lyrs_cat_tkn_avg'.

        Returns:
            None. The embeddings are written to ``<output_file>.h5``.

        Raises:
            ValueError: If ``output_type`` is unknown (raised before any file
                is created), or if 'pooler_output' is requested from a model
                that does not return one.
        """
        if output_type in self._SINGLE_LAYER_OUTPUTS:
            width_in_layers = 1
        elif output_type in self._MULTI_LAYER_OUTPUTS:
            width_in_layers = n_lyrs
        else:
            # Validate up front so that a typo cannot truncate an existing file.
            raise ValueError('Not a valid Output type, check models_config')

        self.open_h5py_file(output_file, dataset, width_in_layers)

        # The file is opened once for the whole run instead of once per batch.
        with h5py.File(output_file + '.h5', 'a') as hf_berts, torch.no_grad():
            target = hf_berts[output_file]
            for offset in tqdm(range(0, len(dataset), self.batch_size)):
                batch_text = dataset.iloc[offset: offset + self.batch_size, 0].tolist()
                self.batched_encoding = self.tokenizer.batch_encode_plus(
                    batch_text,
                    max_length=self.max_len,
                    padding='max_length',
                    return_tensors='pt',
                    truncation=True,
                ).to(self.device)
                # Every text is padded to max_len, so the attention mask must be
                # passed along; without it the model also attends to padding.
                outputs = self.model(
                    input_ids=self.batched_encoding['input_ids'],
                    attention_mask=self.batched_encoding['attention_mask'],
                )
                vectors = self._select_vectors(output_type, outputs, n_lyrs)
                target[offset: offset + len(batch_text), :] = vectors.cpu().numpy()

    def _select_vectors(self, output_type, outputs, n_lyrs):
        """Turn one batch of model outputs into the requested embedding matrix."""
        if output_type == 'token_avg':
            return self.last_hidden_state_token_avg(outputs)
        if output_type == 'n_lyrs_cat':
            return self.n_lyrs_concat(n_lyrs, outputs)
        if output_type == 'n_lyrs_cat_tkn_avg':
            return self.n_lyrs_concat_tkn_avg(n_lyrs, outputs)
        if output_type == 'cls_last_hid':
            return outputs.last_hidden_state[:, 0, :]
        # Remaining valid type: 'pooler_output'.
        pooled = getattr(outputs, 'pooler_output', None)
        if pooled is None:
            raise ValueError('This model does not provide a pooler_output')
        return pooled

    def _content_mask(self):
        """Return a (batch, seq, 1) mask of attended tokens without the first token.

        The mask is a copy: the attention mask stored in ``batched_encoding``
        is left untouched.
        """
        mask = self.batched_encoding['attention_mask'].unsqueeze(dim=-1).clone()
        # Exclude the leading (CLS-position) token from the averages.
        mask[:, 0] = 0
        return mask

    @staticmethod
    def _masked_mean(hidden_state, mask):
        """Average ``hidden_state`` over the sequence positions selected by ``mask``."""
        summed = (hidden_state * mask).sum(dim=1)
        # clamp avoids a division by zero when no token is selected.
        counts = mask.sum(dim=1).clamp(min=1)
        return summed / counts

    def last_hidden_state_token_avg(self, input_ids):
        """Mean of the last hidden layer over attended tokens (first token excluded).

        Args:
            input_ids: Despite the name, this is the model *output* object; its
                ``last_hidden_state`` is read. The mask comes from the most
                recent ``self.batched_encoding``.

        Returns:
            Tensor of shape (batch, hidden_size).
        """
        return self._masked_mean(input_ids.last_hidden_state, self._content_mask())

    def n_lyrs_concat(self, n_lyrs, input_ids):
        """Concatenate the first-token vectors of the last ``n_lyrs`` hidden layers.

        Args:
            n_lyrs: Number of trailing hidden layers to use.
            input_ids: Model output object exposing ``hidden_states``.

        Returns:
            Tensor of shape (batch, hidden_size * n_lyrs).
        """
        last_layers = input_ids.hidden_states[-n_lyrs:]
        return torch.cat([layer[:, 0, :] for layer in last_layers], dim=1)

    def n_lyrs_concat_tkn_avg(self, n_lyrs, input_ids):
        """Concatenate the token means of the last ``n_lyrs`` hidden layers.

        Each layer is averaged exactly like ``last_hidden_state_token_avg``.

        Args:
            n_lyrs: Number of trailing hidden layers to use.
            input_ids: Model output object exposing ``hidden_states``.

        Returns:
            Tensor of shape (batch, hidden_size * n_lyrs).
        """
        mask = self._content_mask()
        last_layers = input_ids.hidden_states[-n_lyrs:]
        return torch.cat([self._masked_mean(layer, mask) for layer in last_layers], dim=1)
    

### BERT Embedding objects -- feature extraction

Change `models_config` to `models_config_intent` in the cells below to get outputs for the intent-classification dataset.

In [None]:
# BERT

# Configuration entry (tokenizer, model, output types, file names) for BERT.
bert_cfg = models_config['BERT']

## creating object of BertEmbeddings class
bertembedding_object = BertEmbeddings(
    model=bert_cfg['model'],
    tokenizer=bert_cfg['tokenizer'],
    batch_size=32,
    max_len=256,
)

## Token Average outputs from the last hidden layer
## (position 1 of the config lists is the 'token_avg' entry)
bert_token_avg = bertembedding_object.derive_outputs(
    dataset=data,
    output_type=bert_cfg['output_type'][1],
    output_file=bert_cfg['output_file'][1],
)

## CLS concatenated outputs from last 4 layers
## (position 3 of the config lists is the 'n_lyrs_cat' entry)
bert_n_lyrs_cat = bertembedding_object.derive_outputs(
    dataset=data,
    output_type=bert_cfg['output_type'][3],
    output_file=bert_cfg['output_file'][3],
    n_lyrs=4,
)

In [None]:
# RoBERT

# Configuration entry (tokenizer, model, output types, file names) for RoBERTa.
robert_cfg = models_config['RoBERT']

## Creating object of class BertEmbeddings
robertembedding_object = BertEmbeddings(
    model=robert_cfg['model'],
    tokenizer=robert_cfg['tokenizer'],
    batch_size=32,
    max_len=256,
)

## Token Average outputs from the last hidden layer
## (position 1 of the config lists is the 'token_avg' entry)
robert_token_avg = robertembedding_object.derive_outputs(
    dataset=data,
    output_type=robert_cfg['output_type'][1],
    output_file=robert_cfg['output_file'][1],
)

## CLS concatenated outputs from last 4 layers
## (position 3 of the config lists is the 'n_lyrs_cat' entry)
robert_n_lyrs_cat = robertembedding_object.derive_outputs(
    dataset=data,
    output_type=robert_cfg['output_type'][3],
    output_file=robert_cfg['output_file'][3],
    n_lyrs=4,
)

In [None]:
# Albert

## Creating object of class BertEmbeddings
albertembedding_object = BertEmbeddings(
    model=models_config['AlBERT']['model'],
    tokenizer=models_config['AlBERT']['tokenizer'],
    batch_size=32,
    max_len=256,
    )

## Token Average outputs from the last hidden layer
## (stored under an albert_* name; the previous robert_* name was a
## copy-paste slip that overwrote the RoBERTa variable)
albert_token_avg = albertembedding_object.derive_outputs(
    dataset = data,
    output_type=models_config['AlBERT']['output_type'][1],
    output_file=models_config['AlBERT']['output_file'][1],
    )

## CLS concatenated outputs from last 4 layers
albert_n_lyrs_cat = albertembedding_object.derive_outputs(
    dataset = data,
    output_type=models_config['AlBERT']['output_type'][3],
    output_file=models_config['AlBERT']['output_file'][3],
    n_lyrs = 4,
    )

100%|██████████| 1161/1161 [24:45<00:00,  1.28s/it]
100%|██████████| 1161/1161 [24:57<00:00,  1.29s/it]


In [None]:
# DistilBERT

# Configuration entry (tokenizer, model, output types, file names) for DistilBERT.
distilbert_cfg = models_config['DistilBERT']

## Creating object of class BertEmbeddings
distilbertembedding_object = BertEmbeddings(
    model=distilbert_cfg['model'],
    tokenizer=distilbert_cfg['tokenizer'],
    batch_size=32,
    max_len=256,
)

## Token Average outputs from the last hidden layer
## (position 1 of the config lists is the 'token_avg' entry)
distilbert_token_avg = distilbertembedding_object.derive_outputs(
    dataset=data,
    output_type=distilbert_cfg['output_type'][1],
    output_file=distilbert_cfg['output_file'][1],
)

## CLS concatenated outputs from last 4 layers
## (position 3 of the config lists is the 'n_lyrs_cat' entry)
distilbert_n_lyrs_cat = distilbertembedding_object.derive_outputs(
    dataset=data,
    output_type=distilbert_cfg['output_type'][3],
    output_file=distilbert_cfg['output_file'][3],
    n_lyrs=4,
)


100%|██████████| 1161/1161 [11:21<00:00,  1.70it/s]
100%|██████████| 1161/1161 [11:23<00:00,  1.70it/s]


In [None]:
# TinyBERT

# Configuration entry (tokenizer, model, output types, file names) for TinyBERT.
tinybert_cfg = models_config['TinyBERT']

## Creating object of class BertEmbeddings
tinybertembedding_object = BertEmbeddings(
    model=tinybert_cfg['model'],
    tokenizer=tinybert_cfg['tokenizer'],
    batch_size=32,
    max_len=256,
)

## Token Average outputs from the last hidden layer
## (position 1 of the config lists is the 'token_avg' entry)
tinybert_token_avg = tinybertembedding_object.derive_outputs(
    dataset=data,
    output_type=tinybert_cfg['output_type'][1],
    output_file=tinybert_cfg['output_file'][1],
)

## CLS concatenated outputs from last 4 layers
## (position 3 of the config lists is the 'n_lyrs_cat' entry)
tinybert_n_lyrs_cat = tinybertembedding_object.derive_outputs(
    dataset=data,
    output_type=tinybert_cfg['output_type'][3],
    output_file=tinybert_cfg['output_file'][3],
    n_lyrs=4,
)

100%|██████████| 1563/1563 [06:50<00:00,  3.80it/s]
100%|██████████| 1563/1563 [06:53<00:00,  3.78it/s]


In [None]:
# SentenceBERT

# Configuration entry (tokenizer, model, output types, file names) for Sentence-BERT.
sentencebert_cfg = models_config['Sentence-BERT']

## Creating object of class BertEmbeddings
sentencebertembedding_object = BertEmbeddings(
    model=sentencebert_cfg['model'],
    tokenizer=sentencebert_cfg['tokenizer'],
    batch_size=32,
    max_len=256,
)

## Token Average outputs from the last hidden layer
## (position 1 of the config lists is the 'token_avg' entry)
sentencebert_token_avg = sentencebertembedding_object.derive_outputs(
    dataset=data,
    output_type=sentencebert_cfg['output_type'][1],
    output_file=sentencebert_cfg['output_file'][1],
)

## CLS concatenated outputs from last 4 layers
## (position 3 of the config lists is the 'n_lyrs_cat' entry)
sentencebert_n_lyrs_cat = sentencebertembedding_object.derive_outputs(
    dataset=data,
    output_type=sentencebert_cfg['output_type'][3],
    output_file=sentencebert_cfg['output_file'][3],
    n_lyrs=4,
)


100%|██████████| 1563/1563 [13:14<00:00,  1.97it/s]
100%|██████████| 1563/1563 [13:21<00:00,  1.95it/s]


## Checking the inference time of RoBERTa (the best-performing model according to the final results)


In [None]:
# We sample 1000 data points randomly and then run the algorithm on those points.
# A fixed random_state makes the sample, and every result derived from it,
# reproducible across kernel restarts.
data_1000_sample = data.sample(1000, random_state=0)
data_1000_sample.shape

(1000, 2)

In [None]:
# NOTE: this run writes to the same output file ('robert_imdb_token_avg') as
# the full-dataset RoBERTa extraction above. derive_outputs recreates that
# file, so afterwards it holds the embeddings of the 1000-row sample only.
n_timed_points = len(data_1000_sample)

# perf_counter is the clock intended for measuring elapsed time.
start_time = time.perf_counter()

## Creating object of class BertEmbeddings
robertembedding_object = BertEmbeddings(
    model=models_config['RoBERT']['model'],
    tokenizer=models_config['RoBERT']['tokenizer'],
    batch_size=1,
    max_len=256,
    )

## Token Average outputs from the last hidden layer
robert_token_avg = robertembedding_object.derive_outputs(
    dataset = data_1000_sample,
    output_type=models_config['RoBERT']['output_type'][1],
    output_file=models_config['RoBERT']['output_file'][1],
    )


end_time = time.perf_counter()
elapsed_seconds = end_time - start_time
# The per-datapoint figure is derived from the actual sample size.
print(f'\n Inference ran for {round(elapsed_seconds)} seconds for {n_timed_points} datapoints')
print(f' \n For 1 datapoint inference ran for {round(elapsed_seconds / n_timed_points, 2)} seconds')

100%|██████████| 1000/1000 [00:46<00:00, 21.32it/s]


 Inference ran for 47 seconds for 1000 datapoints
 
 For 1 datapoint inference ran for 0.05 seconds





In [None]:
# Encode the last column of the 1000-row sample as integer class ids and keep
# the 'sentiment' column as an (n_rows, 1) array in `label_col`.
le = LabelEncoder()
le_fitted = le.fit_transform(data_1000_sample.iloc[:, -1])
data_1000_sample.iloc[:, -1] = le_fitted.astype('int')

label_col = np.array(data_1000_sample['sentiment']).reshape(-1, 1)
label_col.shape

(1000, 1)

In [None]:
# Read the stored RoBERTa token-average embeddings. The context manager closes
# the file even when the read fails, and indexing by name raises a KeyError
# for a missing dataset instead of silently producing an empty result.
with h5py.File('/content/robert_imdb_token_avg.h5', 'r') as hf_hidden:
    rb_lst_hidden = hf_hidden['robert_imdb_token_avg'][:]
print(rb_lst_hidden.shape)
# Append the labels as one extra, final column.
rb_lst_hidden = np.append(rb_lst_hidden, label_col, axis=1)
rb_lst_hidden.shape

(1000, 768)


(1000, 769)

In [None]:
# RoBERTa
# The last column of rb_lst_hidden is the label appended in the previous cell.
# It must not be used as a feature, otherwise the classifier is handed the
# answer it is supposed to predict.
features = rb_lst_hidden[:, :-1]
# scikit-learn expects a 1-D target vector rather than an (n, 1) column.
targets = label_col.ravel()

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = LogisticRegression(solver = "lbfgs", random_state = 0)
    clf.fit(features, targets)

# Time prediction only: fitting is training, not inference.
start_time = time.perf_counter()
clf.predict(features)
end_time = time.perf_counter()

n_timed_points = len(features)
print(f'\n Inference ran for {round((end_time -  start_time))} seconds for {n_timed_points} datapoints')
print(f' \n For 1 datapoint inference ran for {round((end_time -  start_time)/n_timed_points, 2)} seconds')


 Inference ran for 0 seconds for 1000 datapoints
 
 For 1 datapoint inference ran for 0.0 seconds


Adding the embedding-extraction inference time to the logistic-regression inference time gives about 0.05 seconds per data point.