In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel

# Load the 20 Newsgroups dataset
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

# Create a DataFrame from the dataset
data_df = pd.DataFrame({'text': data.data, 'target': data.target})

# Split the data into training and testing sets
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

# Initialize the XLM-R tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base', trainable=False)

# Tokenize the text data
train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=512)

# Convert the tokenized data into tensors
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_df['target'].values
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_df['target'].values
)).batch(32)

# Fine-tune XLM-R on the text classification task
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
attention_mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')
outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask})[0]
outputs = tf.keras.layers.GlobalAveragePooling1D()(outputs)
outputs = tf.keras.layers.Dropout(0.1)(outputs)
outputs = tf.keras.layers.Dense(4, activation='softmax')(outputs)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# Compile the model
#model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(train_dataset, epochs=3)

# Evaluate the model on the test set
_, test_acc = model.evaluate(test_dataset)
print('Test Accuracy:', test_acc)

# Use the fine-tuned model for prediction
preds = model.predict(test_dataset)
pred_labels = np.argmax(preds, axis=1)

# Calculate accuracy on the test set
accuracy = accuracy_score(test_df['target'], pred_labels)
print('Ensemble Model Accuracy:', accuracy)


All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Epoch 1/3


Epoch 2/3
Epoch 3/3
Test Accuracy: 0.18436577916145325
Ensemble Model Accuracy: 0.18436578171091444


In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch

# Load the 20 Newsgroups dataset
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

# Create a DataFrame from the dataset
data_df = pd.DataFrame({'text': data.data, 'target': data.target})

# Split the data into training and testing sets
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

# Initialize the XLM-R tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize the text data
train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=512)

# Convert the tokenized data into tensors
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_df['target'].values
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_df['target'].values
)).batch(32)

# Define the HyperModel class
class MyHyperModel(HyperModel):
    def build(self, hp):
        model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base', trainable=False)
        
        input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
        attention_mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')
        outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask})[0]
        outputs = tf.keras.layers.GlobalAveragePooling1D()(outputs)
        outputs = tf.keras.layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.1, default=0.1))(outputs)
        outputs = tf.keras.layers.Dense(4, activation='softmax')(outputs)
        
        model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
        
        optimizer = tf.keras.optimizers.Adam(hp.Float('learning_rate', 1e-5, 1e-3, sampling='log'))
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        
        model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
        
        return model

# Create an instance of the HyperModel
hypermodel = MyHyperModel()

# Define the tuner and perform hyperparameter tuning
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
    directory='my_dir',
    project_name='text_classification'
)

tuner.search(train_dataset, validation_data=test_dataset, epochs=3)

# Retrieve the best model and save it
best_model = tuner.get_best_models(num_models=1)[0]
best_model.save('best_model.h5')


Trial 5 Complete [01h 04m 48s]
val_accuracy: 0.29646018147468567

Best val_accuracy So Far: 0.3303834795951843
Total elapsed time: 07h 49m 22s
INFO:tensorflow:Oracle triggered exit


All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [18]:
import tensorflow as tf
from transformers import TFXLMRobertaModel

# Define a dictionary of custom objects
custom_objects = {'TFXLMRobertaModel': TFXLMRobertaModel}

# Load the model with the custom object scope
loaded_model = tf.keras.models.load_model('best_model.h5', custom_objects=custom_objects)



# Fine tuning pipeline

A fine-tuning pipeline for an XLM-R (Cross-lingual Language Model - RoBERTa) model refers to the process of adapting a pre-trained XLM-R model to a specific task or domain by further training it on a task-specific dataset. Fine-tuning allows the model to learn task-specific patterns and improve its performance on the target task.

In the provided code snippet, the XLM-R model is loaded and then modified to suit a text classification task. The pre-trained model is used as a starting point, and the last layers are typically replaced or extended with task-specific layers. The model is then fine-tuned on a dataset specific to the text classification task.

The fine-tuning pipeline typically involves the following steps:

Loading the pre-trained XLM-R model: A pre-trained XLM-R model, which has been trained on a large corpus of text data, is loaded. This model has already learned general language representations.

Modifying the model architecture: The last layers of the pre-trained XLM-R model are replaced or extended to match the requirements of the specific task. This may involve adding additional layers, changing the output dimensions, or modifying the activation functions.

Providing task-specific data: A dataset specific to the text classification task is prepared. This dataset consists of labeled text examples, where each example is associated with a specific class or label.

Fine-tuning the model: The modified model is trained on the task-specific dataset using techniques such as backpropagation and gradient descent. The model's parameters are updated to minimize the loss function and improve its performance on the classification task.

Evaluating the fine-tuned model: The performance of the fine-tuned model is evaluated using appropriate evaluation metrics, such as accuracy or F1 score, on a separate validation or test dataset. This helps assess how well the model has adapted to the text classification task.

The goal of the fine-tuning pipeline is to leverage the knowledge encoded in the pre-trained XLM-R model and transfer it to the specific text classification task, resulting in improved performance and better representation of the task-specific data. Fine-tuning allows the model to capture domain-specific patterns and nuances, leading to more accurate predictions on the target task.

methods available to automate the process of selecting the best model architecture and modifications based on the performance on a validation set. Here are a few approaches:

Grid Search: Grid search is a technique where you define a set of hyperparameters and their possible values. It exhaustively searches all possible combinations of hyperparameters and evaluates the model performance using cross-validation. This allows you to automatically select the best combination of hyperparameters.

Random Search: Similar to grid search, random search involves defining a set of hyperparameters and their possible values. However, instead of exhaustively searching all combinations, random search randomly selects a subset of combinations to evaluate. This approach is beneficial when the search space is large, and evaluating all combinations is computationally expensive.

Bayesian Optimization: Bayesian optimization is a more advanced technique that uses a probabilistic model to estimate the performance of different hyperparameter combinations. It intelligently explores the hyperparameter space by selecting the next set of hyperparameters based on the results of previous evaluations. This approach can efficiently find good hyperparameter settings with fewer evaluations compared to grid search or random search.

Automated Hyperparameter Tuning Libraries: There are several libraries available that provide automated hyperparameter tuning capabilities, such as scikit-optimize, Optuna, or Hyperopt. These libraries implement various optimization algorithms and offer convenient interfaces to define search spaces, objective functions, and evaluation strategies.

By utilizing these automated techniques, you can systematically explore different model architectures and modifications while continuously monitoring the performance on a validation set. This helps in finding the optimal combination of hyperparameters and modifications without the need for manual experimentation.

Optuna, a popular library for hyperparameter optimization: