In [1]:
#!pip install datasets transformers pandas matplotlib torch

In [2]:
# Catch up...

In [3]:
!pip install huggingface_hub



In [4]:
from huggingface_hub import notebook_login

In [5]:
# Prompt for a Hugging Face Hub access token.
# NOTE(review): presumably required for the push_to_hub calls later in this notebook — confirm.
notebook_login()

ValueError: Invalid token passed.

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

In [7]:
# Fetch the "emotion" dataset; it is indexed by split name below.
emotions = load_dataset("emotion")

# Keep a handle on each of the three splits.
train_ds, valid_ds, test_ds = (
    emotions[split_name] for split_name in ("train", "validation", "test")
)

print("Example dataset object:", train_ds, sep="\n")

# Switch the output format to pandas so the training split can be viewed as a DataFrame.
emotions.set_format(type="pandas")
df = emotions["train"][:]
print(df.head())


# We can also obtain our string labels from the dataset
def label_int2str(row):
    """Translate an integer class id into its string label name.

    The lookup goes through the ``label`` feature of the training split.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)


# Attach the human-readable label next to each integer label, then preview the frame.
df["label_name"] = df["label"].map(label_int2str)
print(df.head())



Using custom data configuration default
Reusing dataset emotion (C:\Users\lewis\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

Example dataset object:
Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})
                                                text  label
0                            i didnt feel humiliated      0
1  i can go from feeling so hopeless to so damned...      0
2   im grabbing a minute to post i feel greedy wrong      3
3  i am ever feeling nostalgic about the fireplac...      2
4                               i am feeling grouchy      3
                                                text  label label_name
0                            i didnt feel humiliated      0    sadness
1  i can go from feeling so hopeless to so damned...      0    sadness
2   im grabbing a minute to post i feel greedy wrong      3      anger
3  i am ever feeling nostalgic about the fireplac...      2       love
4                               i am feeling grouchy      3      anger


In [8]:
# It is worth analysing the distribution of labels in the dataset
#df["label_name"].value_counts(ascending=True).plot.barh()
#plt.title("Frequency of Classes")
#plt.show()

# We can see the dataset is imbalanced; to address this we could:
# a) Randomly oversample the minority class
# b) Randomly undersample the majority class
# c) Gather more labeled data
# This is not covered in this chapter but more information can be found here: https://oreil.ly/5XBhb

In [9]:
## Maximum Context Size
# The maximum context size is the maximum input sequence length of the transformer model.
# In the case of DistilBERT, the maximum context size is 512.

# Let's have a look at the distribution of words per tweet in the emotions dataset.

#df["Words Per Tweet"] = df["text"].str.split().apply(len)
#df.boxplot("Words Per Tweet", by="label_name", grid=False,
#           showfliers=False, color="black")
#plt.suptitle("")
#plt.xlabel("")
# plt.show()

# We can see that the majority of tweets are less than 20 words long, and the longest are still under DistilBERT's maximum context size of 512.

# Reset the formatting of the dataset, as we don't need to visualise it any more.
emotions.reset_format()

# Character Tokenization
text = "Tokenizing text is a core task of NLP."
tokenized_text = [character for character in text]
print(tokenized_text)

# The model needs integers rather than characters, so give every distinct
# character its own id (ids follow the sorted order of the characters).
vocabulary = sorted(set(tokenized_text))
token2idx = dict(zip(vocabulary, range(len(vocabulary))))
print(token2idx)
# Look up the id of every character of the text.
input_ids = list(map(token2idx.__getitem__, tokenized_text))
print(input_ids)

# Turn the ids into a 2D tensor holding one one-hot row per character.
input_ids = torch.tensor(input_ids)
one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))
print(one_hot_encodings.shape)

# Inspect the first character and its encodings.
print("Token: {}".format(tokenized_text[0]))
print("Tensor index: {}".format(input_ids[0]))
print("One-hot: {}".format(one_hot_encodings[0]))

# This approach is not very good because it discards linguistic structure such as words. That structure could be learned
# from the characters, but doing so greatly increases the complexity of the training process. Word tokenization is used to solve this.


# Word Tokenization

# Break the text apart on whitespace to obtain word-level tokens.
tokenized_text = str.split(text)
print(tokenized_text)

# Following the previous example, we would now map each word to an integer. However, one problem with this is
# punctuation: "NLP." is treated as a single token. Since words often have variations like this (or
# misspellings), this would leave us with a wastefully large vocabulary.

# One option is to keep only the top N most frequent words
# and map all unknown words to the same "unk" token.
# However, another option is subword tokenization.

# Subword Tokenization - combining character and word tokenization.

# BERT uses 'WordPiece' tokenization. Let's see it in action.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Run the tokenizer on our "Tokenizing text is a core task of NLP." example text.
encoded_text = tokenizer(text)
print(encoded_text)  # Every token is now represented by an id.

# Map the ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"])
print(tokens)
# "Tokenizing" and "NLP" come back split into pieces, which is expected as they
# are not common words; the '##' prefix indicates that a token is a subword.

# Join the tokens back into a single string.
print(tokenizer.convert_tokens_to_string(tokens))


# Tokenizing the whole dataset

# Create a tokenizer function, with padding, and truncation to the max length.
def tokenize(batch):
    """Run the shared tokenizer over the ``text`` entry of a batch.

    Padding and truncation are both switched on.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding=True)


# Tokenize the dataset in batched mode (no explicit batch size is set).
emotions_encoded = emotions.map(tokenize, batch_size=None, batched=True)

# Training a text classification model

from transformers import AutoModel

model_ckpt = "distilbert-base-uncased"
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Load the pretrained checkpoint and place it on the chosen device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

# Extracting the last hidden states
text = "this is a test"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}")

# Move every input tensor to the same device as the model.
inputs = {k: v.to(device) for k, v in inputs.items()}
# Inference only, so skip gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

# Print the shape explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(outputs.last_hidden_state.size())


# Adding extracted hidden states to the dataset
def extract_hidden_states(batch):
    """Compute the hidden state at sequence position 0 for a batch.

    Only the entries of ``batch`` whose key is one of the tokenizer's model
    input names are forwarded to the model, after being moved to ``device``.
    Returns a dict with a single ``"hidden_state"`` key holding a NumPy array.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        model_output = model(**model_inputs)

    # Position 0 of every sequence holds the vector for the [CLS] token.
    cls_states = model_output.last_hidden_state[:, 0]
    return {"hidden_state": cls_states.cpu().numpy()}

['T', 'o', 'k', 'e', 'n', 'i', 'z', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'i', 's', ' ', 'a', ' ', 'c', 'o', 'r', 'e', ' ', 't', 'a', 's', 'k', ' ', 'o', 'f', ' ', 'N', 'L', 'P', '.']
{' ': 0, '.': 1, 'L': 2, 'N': 3, 'P': 4, 'T': 5, 'a': 6, 'c': 7, 'e': 8, 'f': 9, 'g': 10, 'i': 11, 'k': 12, 'n': 13, 'o': 14, 'r': 15, 's': 16, 't': 17, 'x': 18, 'z': 19}
[5, 14, 12, 8, 13, 11, 19, 11, 13, 10, 0, 17, 8, 18, 17, 0, 11, 16, 0, 6, 0, 7, 14, 15, 8, 0, 17, 6, 16, 12, 0, 14, 9, 0, 3, 2, 4, 1]
torch.Size([38, 20])
Token: T
Tensor index: 5
One-hot: tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
['Tokenizing', 'text', 'is', 'a', 'core', 'task', 'of', 'NLP.']


Loading cached processed dataset at C:\Users\lewis\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705\cache-391177785bf07cb8.arrow
Loading cached processed dataset at C:\Users\lewis\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705\cache-ec84319c9643be2f.arrow


{'input_ids': [101, 19204, 6026, 3793, 2003, 1037, 4563, 4708, 1997, 17953, 2361, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'token', '##izing', 'text', 'is', 'a', 'core', 'task', 'of', 'nl', '##p', '.', '[SEP]']
[CLS] tokenizing text is a core task of nlp. [SEP]


  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Input tensor shape: torch.Size([1, 6])
BaseModelOutput(last_hidden_state=tensor([[[-0.1565, -0.1862,  0.0528,  ..., -0.1188,  0.0662,  0.5470],
         [-0.3575, -0.6484, -0.0618,  ..., -0.3040,  0.3508,  0.5221],
         [-0.2772, -0.4459,  0.1818,  ..., -0.0948, -0.0076,  0.9958],
         [-0.2841, -0.3917,  0.3753,  ..., -0.2151, -0.1173,  1.0526],
         [ 0.2661, -0.5094, -0.3180,  ..., -0.4203,  0.0144, -0.2149],
         [ 0.9441,  0.0112, -0.4714,  ...,  0.1439, -0.7288, -0.1619]]]), hidden_states=None, attentions=None)


In [10]:
# Expose the model inputs and the label in the torch format
emotions_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [11]:
# Run the hidden-state extractor over the dataset in batches and keep the result
emotions_hidden = emotions_encoded.map(function=extract_hidden_states, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [12]:
# Show the column names of the training split after the hidden states were added
emotions_hidden["train"].column_names

['text', 'label', 'input_ids', 'attention_mask', 'hidden_state']

In [13]:
# Creating a feature matrix
import numpy as np

# Build the feature matrix and the label vector of each split as NumPy arrays.
X_train, y_train = (
    np.array(emotions_hidden["train"][column]) for column in ("hidden_state", "label")
)
X_valid, y_valid = (
    np.array(emotions_hidden["validation"][column]) for column in ("hidden_state", "label")
)
print(X_train.shape, X_valid.shape)

(16000, 768) (2000, 768)


In [14]:
!pip install umap sklearn



In [15]:
#from umap import UMAP

In [16]:
#from sklearn.linear_model import LinearRegression

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
#lr_clf = LogisticRegression(max_iter=3000)
#lr_clf.fit(X_train, y_train)
#lr_clf.score(X_valid, y_valid)

In [19]:
# 63% accuracy may not seem great; however, the dataset is imbalanced, so this is probably better than it looks. Let's try classifying using the dummy classifier for context...

In [20]:
#from sklearn.dummy import DummyClassifier

#dummy_clf = DummyClassifier(strategy="most_frequent")
#dummy_clf.fit(X_train, y_train)
#dummy_clf.score(X_valid, y_valid)

In [21]:
# We can see our simple classifier with DistilBERT is significantly better than our baseline, progress!

In [22]:
#Lets take a look at the confusion matrix for our classifier 

In [23]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [24]:
#def plot_confusion_matrix(y_preds, y_true, labels):
#    cm = confusion_matrix(y_true, y_preds, normalize="true")
#    fig, ax = plt.subplots(figsize=(6, 6))
#    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
#    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
#    plt.title("Normalized confusion matrix")
#    plt.show()

#y_preds = lr_clf.predict(X_valid)
#plot_confusion_matrix(y_preds, y_valid, labels)

In [25]:
# I am getting this error because the code that sets the `labels` variable is omitted from the notebook (it can be defined with `labels = emotions["train"].features["label"].names`). I can see the generated confusion matrix in the textbook.

In [26]:
# Lets finally fine tune DistilBERT!

In [27]:
# Note: We are using AutoModelForSequenceClassification as opposed to AutoModel as this will load the model with a classification head, which we can train easily. Also the error message is to be expected as we must train the layer.

In [28]:
from transformers import AutoModelForSequenceClassification

# Number of target classes passed to the sequence-classification model.
num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [29]:
# Lets define some performance metrics

In [30]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score a set of predictions with accuracy and weighted F1.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (whose argmax over the last axis is taken as the class).
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [31]:
#

In [32]:
#

In [33]:
#

In [34]:
# Lets define our training

In [35]:
!pip install absl-py



In [36]:
!pip install tensorflow



In [37]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Number of full batches that fit into the training split.
logging_steps = len(emotions_encoded["train"]) // batch_size
model_name = "{}-finetuned-emotion".format(model_ckpt)
training_args = TrainingArguments(
    output_dir=model_name,
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation, logging and progress display
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    # Publishing
    push_to_hub=True,
)

In [38]:
# There are a couple of errors above; I believe the Python installation on my laptop is not configured properly. I will fix this for future tutorials.

In [39]:
# Now lets define the trainer

In [40]:
# Wire the model, its training arguments, the two dataset splits, the tokenizer
# and the metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

C:\Users\lewis\OneDrive - Edinburgh Napier University\Research\NLP\EmotionAnalysis\distilbert-base-uncased-finetuned-emotion is already a clone of https://huggingface.co/lewiswatson/distilbert-base-uncased-finetuned-emotion. Make sure you pull the latest changes with `repo.git_pull()`.


In [42]:
# Run the fine-tuning; the trailing semicolon suppresses the display of the call's return value.
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2106,0.224209,0.927,0.927102


NotADirectoryError: [WinError 267] The directory name is invalid: 'C:\\Users\\lewis\\AppData\\Local\\Temp\\tmp8vivahcr\\lfs_progress'

In [43]:
# Push the trainer's output to the Hugging Face Hub with the given commit message.
# NOTE(review): the message says "1 Epoch" while num_train_epochs is set to 2 — confirm which is intended.
trainer.push_to_hub(commit_message="Re-ran Training! - Still 1 Epoch")

To https://huggingface.co/lewiswatson/distilbert-base-uncased-finetuned-emotion
   401eaeb..41e7630  main -> main



In [None]:
#notebook_login()