In [13]:
# by default a cell only displays the value of its last expression;
# setting this to "all" makes every top-level expression statement in a cell display its value
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [14]:
%%HTML
<style type="text/css">
/* solid 1px black border and black text on every cell of tables with class "dataframe" */
table.dataframe td,
table.dataframe th {
    border: 1px solid black !important;
    color: black !important;
}
</style>

In [15]:
import os
import torch

from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

# system call: runs an empty shell command; its exit status is the stray `0` shown as this cell's output
# NOTE(review): presumably the usual trick to switch on ANSI colour handling in Windows consoles
# for the utils.Style escape codes -- confirm it is still needed on this platform
os.system("")

# importing local modules
import utils

0

In [16]:
# assigning a device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# selecting a transformer model (the same checkpoint name is passed to both from_pretrained calls below)
model_ckpt = 'distilbert-base-uncased'

# initializing AutoTokenizer from that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# initializing transformer AutoModel from that checkpoint and moving it to the selected device
model = AutoModel.from_pretrained(model_ckpt).to(device)
print(f'the available device is: {device}')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


the available device is: cuda


In [17]:
# load emotion dataset (no config name is passed, so the library falls back to its default config)
emotions = load_dataset('emotion')

No config specified, defaulting to: emotion/split
Found cached dataset emotion (/home/karvsmech/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)


  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
# each heading is wrapped in utils.Style.RED / utils.Style.RESET (they show up as [31m ... [0m in the output)
print(utils.Style.RED + 'printing emotions' + utils.Style.RESET)
print(emotions)

# the object returned by load_dataset is a DatasetDict holding the train/validation/test splits
print(utils.Style.RED + '\nprinting the type of emotions dataset' + utils.Style.RESET)
print(type(emotions))

[31mprinting emotions[0m
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
[31m
printing the type of emotions dataset[0m
<class 'datasets.dataset_dict.DatasetDict'>


In [19]:
# indexing the DatasetDict by split name gives a single split
print(utils.Style.RED + 'printing emotions train' + utils.Style.RESET)
print(emotions['train'])

# a single split is a Dataset, not a DatasetDict
print(utils.Style.RED + '\nprinting the type of emotions train' + utils.Style.RESET)
print(type(emotions['train']))

[31mprinting emotions train[0m
Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})
[31m
printing the type of emotions train[0m
<class 'datasets.arrow_dataset.Dataset'>


In [20]:
# column inspection before any output format is set on the dataset;
# the bare expressions below are displayed because every top-level expression in a cell is shown

# column_names on the DatasetDict returns one list of column names per split
print(utils.Style.RED + 'column names of the emotion dataset' + utils.Style.RESET)
emotions.column_names

print(utils.Style.RED + '\naccessing the text column' + utils.Style.RESET)
emotions['train']['text'][:2]

print(utils.Style.RED + '\ndatatype of the text column' + utils.Style.RESET)
type(emotions['train']['text'][:2]) # it becomes a plain list

print(utils.Style.RED + '\naccessing the label column' + utils.Style.RESET)
emotions['train']['label'][:2]

print(utils.Style.RED + '\ndatatype of the label column' + utils.Style.RESET)
type(emotions['train']['label'][:2]) # it becomes a plain list

[31mcolumn names of the emotion dataset[0m


{'train': ['text', 'label'],
 'validation': ['text', 'label'],
 'test': ['text', 'label']}

[31m
accessing the text column[0m


['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake']

[31m
datatype of the text column[0m


list

[31m
accessing the label column[0m


[0, 0]

[31m
datatype of the label column[0m


list

In [21]:
# switch the output format of `emotions` to torch for the listed columns;
# this changes `emotions` itself (nothing is reassigned), so the same accesses as before now behave differently
emotions.set_format('torch', columns=['text', 'label'])

print(utils.Style.RED + 'accessing the text column' + utils.Style.RESET)
emotions['train']['text'][:2]

print(utils.Style.RED + '\ndatatype of the text column' + utils.Style.RESET)
type(emotions['train']['text'][:2]) # text can't be converted to a tensor. it stays as plain list

print(utils.Style.RED + '\naccessing the label column' + utils.Style.RESET)
emotions['train']['label'][:2]

print(utils.Style.RED + '\ndatatype of the label column' + utils.Style.RESET)
type(emotions['train']['label'][:2]) # list becomes a tensor

[31maccessing the text column[0m


['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake']

[31m
datatype of the text column[0m


list

[31m
accessing the label column[0m


tensor([0, 0])

[31m
datatype of the label column[0m


torch.Tensor

In [22]:
def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the notebook-level `tokenizer`.

    Args:
        batch: mapping with a 'text' key holding the texts to encode.

    Returns:
        Whatever `tokenizer` returns for those texts, called with
        padding=True and truncation=True.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [23]:
# tokenize every split; batch_size=None hands each split to tokenize() as one single batch,
# so padding=True pads every row of a split to that split's longest sequence
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

print(utils.Style.RED + 'printing emotions_encoded' + utils.Style.RESET)
emotions_encoded

# input_ids and attention_mask have been added next to text and label
print(utils.Style.RED + '\nprinting emotions_encoded column names' + utils.Style.RESET)
emotions_encoded.column_names

print(utils.Style.RED + 'accessing the text column' + utils.Style.RESET)
emotions_encoded['train']['text'][:2]

print(utils.Style.RED + '\ndatatype of the text column' + utils.Style.RESET)
type(emotions_encoded['train']['text'][:2]) # text can't be converted to a tensor. it stays as plain list

print(utils.Style.RED + '\naccessing the label column' + utils.Style.RESET)
emotions_encoded['train']['label'][:2]

print(utils.Style.RED + '\ndatatype of the label column' + utils.Style.RESET)
type(emotions_encoded['train']['label'][:2]) # still a tensor after map

print(utils.Style.RED + '\naccessing the input_ids column' + utils.Style.RESET)
emotions_encoded['train']['input_ids'][:2]

print(utils.Style.RED + '\ndatatype of the input_ids column' + utils.Style.RESET)
type(emotions_encoded['train']['input_ids'][:2]) # unlike text, input_ids come back as a torch.Tensor (zero-padded on the right)

print(utils.Style.RED + '\naccessing the attention_mask column' + utils.Style.RESET)
emotions_encoded['train']['attention_mask'][:2]

print(utils.Style.RED + '\ndatatype of the attention_mask column' + utils.Style.RESET)
type(emotions_encoded['train']['attention_mask'][:2]) # also a torch.Tensor: 1 for real tokens, 0 for padding

Loading cached processed dataset at /home/karvsmech/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-d385021a8c0bb3a0.arrow
Loading cached processed dataset at /home/karvsmech/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-83457ba7a585dbf4.arrow
Loading cached processed dataset at /home/karvsmech/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-1b486cc7f8695f03.arrow


[31mprinting emotions_encoded[0m


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

[31m
printing emotions_encoded column names[0m


{'train': ['text', 'label', 'input_ids', 'attention_mask'],
 'validation': ['text', 'label', 'input_ids', 'attention_mask'],
 'test': ['text', 'label', 'input_ids', 'attention_mask']}

[31maccessing the text column[0m


['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake']

[31m
datatype of the text column[0m


list

[31m
accessing the label column[0m


tensor([0, 0])

[31m
datatype of the label column[0m


torch.Tensor

[31m
accessing the input_ids column[0m


tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,  2061,
          9636, 17772,  2074,  2013,  2108,  2105,  2619,  2040, 14977,  1998,
          2003,  8300,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0, 

[31m
datatype of the input_ids column[0m


torch.Tensor

[31m
accessing the attention_mask column[0m


tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

[31m
datatype of the attention_mask column[0m


torch.Tensor

In [12]:
print(utils.Style.RED + 'the columns the model expects as an input' + utils.Style.RESET)
tokenizer.model_input_names # only input_ids and attention_mask are listed: text and label are not model inputs
# the checkpoint comes from masked-language-model pretraining, so the model itself has no use for the emotion labels.
# we only use the last hidden state that 'distilbert-base-uncased' produces for the input data, as features for the classification task.
# in other words we build a classification head on top of the 'distilbert-base-uncased' model body and leave its weights unchanged.

[31mthe columns the model expects as an input[0m


['input_ids', 'attention_mask']

In [28]:
# retrieving the hidden state for the entire dataset.
def extract_hidden_states(batch):
    """Run the notebook-level `model` on a batch and keep the hidden state at position 0.

    Only the batch entries whose key appears in `tokenizer.model_input_names`
    are moved to `device` and passed to the model; every other entry is ignored.

    Args:
        batch: mapping of column name to values; the model-input columns must
            support `.to(device)`.

    Returns:
        A dict with the single key 'hidden_state': the last hidden state taken at
        index 0 of the second axis (the CLS token vector, commonly used for text
        classification), moved to the CPU and converted to a NumPy array.
    """
    expected_names = tokenizer.model_input_names

    # place model inputs on the device (GPU if available, otherwise CPU)
    model_inputs = {}
    for name, values in batch.items():
        if name in expected_names:
            model_inputs[name] = values.to(device)

    # forward pass without gradient tracking: the outputs are only used as features
    with torch.no_grad():
        outputs = model(**model_inputs)

    cls_vectors = outputs.last_hidden_state[:, 0]
    return {'hidden_state': cls_vectors.cpu().numpy()}

In [29]:
# run extract_hidden_states over every split in batches; the returned 'hidden_state' values become a new column
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [30]:
# a column called hidden_state has been added to every split of the dataset dict
emotions_hidden.column_names
# NOTE(review): this reads the whole train hidden_state column just to preview it; a small slice would be lighter
emotions_hidden['train']['hidden_state']

{'train': ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
 'validation': ['text',
  'label',
  'input_ids',
  'attention_mask',
  'hidden_state'],
 'test': ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state']}

tensor([[-0.1168,  0.0986, -0.1296,  ...,  0.0587,  0.3543,  0.4042],
        [-0.0324, -0.0323, -0.1957,  ..., -0.1747,  0.3546,  0.3028],
        [ 0.0397,  0.2022,  0.1423,  ..., -0.1141,  0.3394,  0.3958],
        ...,
        [-0.0034, -0.0959,  0.0584,  ..., -0.0427,  0.2496,  0.3076],
        [ 0.0666,  0.1733,  0.1290,  ...,  0.0612,  0.2904,  0.4684],
        [ 0.0167,  0.1013, -0.0073,  ..., -0.0649,  0.3454,  0.2199]])

In [None]:
# creating a feature matrix