In [9]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset

In [10]:
model = TFAutoModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [11]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [12]:
inputs = tokenizer(['Hello world', 'Hi how are you'], padding=True, truncation=True,
                  return_tensors='tf')
inputs

{'input_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 101, 7592, 2088,  102,    0,    0],
       [ 101, 7632, 2129, 2024, 2017,  102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [13]:
output = model(inputs)
output

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 6, 768), dtype=float32, numpy=
array([[[-0.16888332,  0.13606355, -0.13940018, ..., -0.6251125 ,
          0.05217262,  0.36714536],
        [-0.3632745 ,  0.14121903,  0.8799885 , ...,  0.10433032,
          0.2887578 ,  0.37267894],
        [-0.69859415, -0.69879794,  0.06450251, ..., -0.22103661,
          0.00986893, -0.5939796 ],
        [ 0.83098257,  0.12366717, -0.15119013, ...,  0.10309545,
         -0.67792666, -0.26285172],
        [-0.40266633, -0.01928236,  0.5732502 , ..., -0.20656842,
          0.02338582,  0.20126349],
        [-0.6228408 , -0.27453488,  0.1811763 , ..., -0.12944865,
         -0.03839079, -0.05733156]],

       [[ 0.09286558, -0.02636361, -0.12239343, ..., -0.21063566,
          0.17386371,  0.17250973],
        [ 0.40742022, -0.05930945,  0.55234593, ..., -0.6790563 ,
          0.6555748 , -0.2945646 ],
        [-0.21155298, -0.6858643 , -0.46280792, ...,  0.15278494

In [14]:
emotions = load_dataset('SetFit/emotion')

  0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

In [16]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [17]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [19]:
# setting 'input_ids', 'attention_mask', 'token_type_ids', and 'label'
# to the tensorflow format. Now if you access this dataset you will get these
# columns in `tf.Tensor` format

emotions_encoded.set_format('tf', 
                            columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])

# setting BATCH_SIZE to 64.
BATCH_SIZE = 64

def order(inp):
    '''
    This function will group all the inputs of BERT
    into a single dictionary and then output it with
    labels.
    '''
    data = list(inp.values())
    return {
        'input_ids': data[1],
        'attention_mask': data[2],
        'token_type_ids': data[3]
    }, data[0]

# converting train split of `emotions_encoded` to tensorflow format
train_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['train'][:])
# set batch_size and shuffle
train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000)
# map the `order` function
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set ...
test_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [20]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(64, 87), dtype=int64, numpy=
array([[  101,  4921,  2063, ...,     0,     0,     0],
       [  101,  1045,  2318, ...,     0,     0,     0],
       [  101,  1045,  6181, ...,     0,     0,     0],
       ...,
       [  101,  1045,  2514, ...,     0,     0,     0],
       [  101,  1045,  2788, ...,     0,     0,     0],
       [  101, 10047,  3110, ...,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(64, 87), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'token_type_ids': <tf.Tensor: shape=(64, 87), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>} 

 tf.Tensor(
[1 0 2 0 1 3 4 0 2 2 1 0 1 

In [21]:
class BERTForClassification(tf.keras.Model):
    
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [22]:
classifier = BERTForClassification(model, num_classes=6)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [None]:
# Function to classify new text
def classify_text(text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='tf')
    logits = classifier(inputs)
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]
    return emotions['train'].features['label'].int2str(predicted_class)

# Classify a new example
new_text = "I feel great!"
predicted_emotion = classify_text(new_text)
print(f"The predicted emotion for the text '{new_text}' is: {predicted_emotion}")


In [23]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
classifier.evaluate(test_dataset)



[0.16878357529640198, 0.9235000014305115]

In [34]:
# Load the Emotion dataset
emotions = load_dataset('SetFit/emotion')

# Inspect the dataset structure and label features
print(emotions)
print("\nLabel feature details for the 'train' split:")
print(emotions['train'].features['label'])


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

Label feature details for the 'train' split:
Value(dtype='int64', id=None)


In [36]:
from datasets import load_dataset

# Load the Emotion dataset
emotions = load_dataset('SetFit/emotion')

# Access the 'train' split
train_data = emotions['train']

# Check the unique values in 'label'
unique_labels = set(train_data['label'])
print("Unique numerical labels:", unique_labels)

# Check the unique values in 'label_text'
# Ensure that 'label_text' exists in the dataset
if 'label_text' in train_data.features:
    unique_label_texts = set(train_data['label_text'])
    print("Unique label texts:", unique_label_texts)
else:
    print("No 'label_text' feature found in the dataset.")


  0%|          | 0/3 [00:00<?, ?it/s]

Unique numerical labels: {0, 1, 2, 3, 4, 5}
Unique label texts: {'anger', 'joy', 'surprise', 'fear', 'sadness', 'love'}


In [37]:
# Create a label map from numerical labels to text labels
label_map = {
    0: 'anger',
    1: 'joy',
    2: 'surprise',
    3: 'fear',
    4: 'sadness',
    5: 'love'
}


In [38]:

# Function to classify new text
def classify_text(text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='tf')
    logits = classifier(inputs)
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]
    label = label_map.get(predicted_class, "Unknown")
    return label

# Classify a new example
new_text = "I feel great!"
predicted_emotion = classify_text(new_text)
print(f"The predicted emotion for the text '{new_text}' is: {predicted_emotion}")


The predicted emotion for the text 'I feel great!' is: joy
