<a href="https://colab.research.google.com/github/ipavlopoulos/xlmr-emotion/blob/main/XLMR_emotion_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers[sentencepiece]

In [1]:
#@title Import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input,Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.random import set_seed
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Load the TensorBoard notebook extension.
%load_ext tensorboard
from datetime import datetime
from packaging import version
from tensorflow import keras

In [6]:
#@title Load the data (SemEval 2018 Task 1 / need to be downloaded first)
#SemEval_with_extra_neutral
data_train = pd.read_csv('semeval_train_with_extra_neutral.csv',sep='\t')
data_dev = pd.read_csv('semeval_dev_with_extra_neutral.csv',sep='\t')
data_test = pd.read_csv('semeval_test_with_extra_neutral.csv',sep='\t')

# Load the model and fine-tune it on the data

In [7]:
# Construct a XLM-R tokenizer based on SentencePiece
model_name="jplu/tf-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/512 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]



In [8]:
##Define sentences and labels
sentences_train = data_train.clean.values
labels_train = data_train.loc[:,'anger':'trust'].values

sentences_dev = data_dev.clean.values
labels_dev = data_dev.loc[:,'anger':'trust'].values

sentences_test = data_test.clean.values
labels_test = data_test.loc[:,'anger':'trust'].values

In [12]:
#Function for encoding according to XLM-R
def encode(texts, tokenizer, maxlen):
    # Convert NumPy array to list
    texts = texts.tolist()

    dic = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=False, #need only the input_ids
        truncation=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )

    return np.array(dic['input_ids'])

In [13]:
max_len=109
# Tokenize and encode sentences in each set
x_dev = encode(sentences_dev, tokenizer, maxlen=max_len)
x_test = encode(sentences_test, tokenizer, maxlen=max_len)
x_train = encode(sentences_train, tokenizer, maxlen=max_len)



In [14]:
#Prepare the inputs for the XLM-R
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 16

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, labels_train))
    .repeat()
    .shuffle(len(sentences_train),seed=1995)
    .batch(BATCH_SIZE)# Combines consecutive elements of this dataset into batches.
    .prefetch(AUTO)) #This allows later elements to be prepared while the current element is being processed.

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_dev, labels_dev))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO))

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_test,labels_test))
    .batch(BATCH_SIZE))

In [15]:
# Define the model architecture
def build_model(transformer, max_len):
    np.random.seed(2909)
    set_seed(1995)

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Define the Lambda layer with output_shape
    def transformer_wrapper(inputs):
        return transformer(inputs)[0]

    # Get the shape of the output tensor from the transformer model
    output_shape = (max_len, transformer.config.hidden_size)  # (sequence_length, hidden_size)

    sequence_output = Lambda(transformer_wrapper, output_shape=output_shape)(input_word_ids)
    cls_token = sequence_output[:, 0, :]  # Extract the [CLS] token representation
    out = Dense(8, activation='sigmoid')(cls_token)  # Output layer

    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Model configuration
model_name = "jplu/tf-xlm-roberta-base"

# Instantiate the transformer model
transformer_layer = TFAutoModel.from_pretrained(model_name)

# Build and summarize the model
model = build_model(transformer_layer, max_len=max_len)
model.summary()

tf_model.h5:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Some layers from the model checkpoint at jplu/tf-xlm-roberta-base were not used when initializing TFXLMRobertaModel: ['lm_head']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at jplu/tf-xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [16]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    verbose=1,
    restore_best_weights=True)

callbacks = [
    early_stopping
]

In [17]:
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=30,
    callbacks=callbacks
)

eval = model.evaluate(test_dataset, verbose=1)

Epoch 1/30
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 126ms/step - accuracy: 0.4504 - loss: 0.6489 - val_accuracy: 0.4859 - val_loss: 0.5982
Epoch 2/30
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 98ms/step - accuracy: 0.4694 - loss: 0.5911 - val_accuracy: 0.4859 - val_loss: 0.5491
Epoch 3/30
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 102ms/step - accuracy: 0.4808 - loss: 0.5482 - val_accuracy: 0.4859 - val_loss: 0.5144
Epoch 4/30
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 100ms/step - accuracy: 0.4725 - loss: 0.5215 - val_accuracy: 0.4849 - val_loss: 0.4899
Epoch 5/30
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 101ms/step - accuracy: 0.4700 - loss: 0.5000 - val_accuracy: 0.4821 - val_loss: 0.4732
Epoch 6/30
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 101ms/step - accuracy: 0.4674 - loss: 0.4905 - val_accuracy: 0.4802 - val_loss: 0.4619
Epoch 7/30


In [23]:
#@title Save the weights
model.save_weights('xlmr-e.weights.h5')
!zip 'xlmr-e.weights.zip' "xlmr-e.weights.h5"
# Copy to your drive folder (optional)
#!cp -r xlm_r_weights_.zip /content/drive/MyDrive/

  adding: xlmr-e.weights.h5 (deflated 20%)


In [24]:
#@title Load the saved model

# Define the model architecture (same as before)
def build_model(transformer, max_len):
    np.random.seed(2909)
    set_seed(1995)

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Define the Lambda layer with output_shape
    def transformer_wrapper(inputs):
        return transformer(inputs)[0]

    # Get the shape of the output tensor from the transformer model
    output_shape = (max_len, transformer.config.hidden_size)  # (sequence_length, hidden_size)

    sequence_output = Lambda(transformer_wrapper, output_shape=output_shape)(input_word_ids)
    cls_token = sequence_output[:, 0, :]  # Extract the [CLS] token representation
    out = Dense(8, activation='sigmoid')(cls_token)  # Output layer

    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Model configuration
model_name = "jplu/tf-xlm-roberta-base"

# Instantiate the transformer model
transformer_layer = TFAutoModel.from_pretrained(model_name)

# Build the new model
new_model = build_model(transformer_layer, max_len=max_len)

# Load the saved weights
new_model.load_weights('xlmr-e.weights.h5')

Some layers from the model checkpoint at jplu/tf-xlm-roberta-base were not used when initializing TFXLMRobertaModel: ['lm_head']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at jplu/tf-xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.
  saveable.load_own_variables(weights_store.get(inner_path))
