This .ipynb file helps us decide which tokenisation model is better . The code was found online but we applyed it to our own code and especially our own data

# STEP 3-Challenging tokenisation

We needed to prove which tokeniser gave us better results for most languages not only the ones we know . We decided to compare them using the codee below:

In [5]:
import numpy as np
import pandas as pd
import os, warnings
warnings.filterwarnings('ignore')


import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


from transformers import BertTokenizer, TFBertModel               #BERT
from transformers import DistilBertTokenizer, TFDistilBertModel    #DistilBERT
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel    #XLM-RoBERTa


from sklearn.model_selection import train_test_split


import gc

from tabulate import tabulate

os.environ["WANDB_API_KEY"] = "0"

In [6]:
def Init_TPU():

    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        REPLICAS = strategy.num_replicas_in_sync
        print("Connected to TPU Successfully:\n TPUs Initialised with Replicas:",REPLICAS)

        return strategy

    except ValueError:

        print("Connection to TPU Falied")
        print("Using default strategy for CPU and single GPU")
        strategy = tf.distribute.get_strategy()

        return strategy

strategy=Init_TPU()

Connection to TPU Falied
Using default strategy for CPU and single GPU


In [7]:

train_url = os.path.join('train_cleaned_numeric_labels.csv')
train_data = pd.read_csv(train_url, header='infer')
gc.collect()

4116

In [8]:
# Transformer Model Name
Bert_model = 'bert-base-multilingual-cased'
distilBert_model = 'distilbert-base-multilingual-cased'
xlmRoberta_model = 'jplu/tf-xlm-roberta-base'

# Define Tokenizer for each
Bert_toknzr = BertTokenizer.from_pretrained(Bert_model)
distilBert_toknzr = DistilBertTokenizer.from_pretrained(distilBert_model)
xlmRoberta_toknzr = XLMRobertaTokenizer.from_pretrained(xlmRoberta_model)


In [9]:
# Checking the output of tokenizer
sentence = 'Le temps est beau le ciel est bleu!'

print("BERT Model Tokenizer Output:",Bert_toknzr.convert_tokens_to_ids(list(Bert_toknzr.tokenize(sentence))))
print("DistilBERT Model Tokenizer Output:",distilBert_toknzr.convert_tokens_to_ids(list(distilBert_toknzr.tokenize(sentence))))
print("XLM-RoBERTa Model Tokenizer Output:",xlmRoberta_toknzr.convert_tokens_to_ids(list(xlmRoberta_toknzr.tokenize(sentence))))

BERT Model Tokenizer Output: [10281, 12358, 10176, 58817, 10141, 40229, 10176, 52816, 106]
DistilBERT Model Tokenizer Output: [10281, 12358, 10176, 58817, 10141, 40229, 10176, 52816, 106]
XLM-RoBERTa Model Tokenizer Output: [636, 4109, 437, 44551, 95, 6, 67708, 437, 123774, 38]


In [10]:
# Extraire uniquement la colonne 'Text' sous forme de liste simple de chaînes
train = train_data['Text'].values.tolist()

# Maximum length
max_len = 50

# Tokenisation avec BERT
train_encode_Bert = Bert_toknzr.batch_encode_plus(train, pad_to_max_length=True, max_length=max_len, truncation=True)

# Tokenisation avec DistilBERT
train_encode_DistilBert = distilBert_toknzr.batch_encode_plus(train, pad_to_max_length=True, max_length=max_len, truncation=True)

# Tokenisation avec XLM-RoBERTa
train_encode_XlmRoberta = xlmRoberta_toknzr.batch_encode_plus(train, pad_to_max_length=True, max_length=max_len, truncation=True)


In [11]:


test_size = 0.1

# BERT
x_tr_bert, x_val_bert, y_tr_bert, y_val_bert = train_test_split(train_encode_Bert['input_ids'], train_data.Label.values, test_size=test_size)

# DistilBERT
x_tr_Dbert, x_val_Dbert, y_tr_Dbert, y_val_Dbert = train_test_split(train_encode_DistilBert['input_ids'], train_data.Label.values, test_size=test_size)

# XLM-RoBERTa
x_tr_XR, x_val_XR, y_tr_XR, y_val_XR = train_test_split(train_encode_XlmRoberta['input_ids'], train_data.Label.values, test_size=test_size)

In [12]:
gc.collect()

0

In [13]:

AUTO = tf.data.experimental.AUTOTUNE
batch_size = 16 * strategy.num_replicas_in_sync

#BERT
tr_ds_bert = (tf.data.Dataset.from_tensor_slices((x_tr_bert, y_tr_bert)).repeat().shuffle(2048).batch(batch_size).prefetch(AUTO))
val_ds_bert = (tf.data.Dataset.from_tensor_slices((x_val_bert, y_val_bert)).batch(batch_size).prefetch(AUTO))

#DistilBERT
tr_ds_Dbert = (tf.data.Dataset.from_tensor_slices((x_tr_Dbert, y_tr_Dbert)).repeat().shuffle(2048).batch(batch_size).prefetch(AUTO))
val_ds_Dbert = (tf.data.Dataset.from_tensor_slices((x_val_Dbert, y_val_Dbert)).batch(batch_size).prefetch(AUTO))

#XLM-RoBERTa
tr_ds_XR = (tf.data.Dataset.from_tensor_slices((x_tr_XR, y_tr_XR)).repeat().shuffle(2048).batch(batch_size).prefetch(AUTO))
val_ds_XR = (tf.data.Dataset.from_tensor_slices((x_val_XR, y_val_XR)).batch(batch_size).prefetch(AUTO))

In [14]:
gc.collect()


0

In [7]:
import tensorflow as tf
from transformers import TFBertModel, TFDistilBertModel, TFXLMRobertaModel
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

import os

os.environ['TF_USE_LEGACY_KERAS'] = '2'

# Définir les modèles pré-entraînés
Bert_model = 'bert-base-uncased'
distilBert_model = 'distilbert-base-uncased'
xlmRoberta_model = 'xlm-roberta-base'

max_len = 50  # Longueur maximale des séquences
def build_model():
    # Modèle BERT
    input_ids_bert = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask_bert = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # Créer le modèle BERT
    bert = TFBertModel.from_pretrained(Bert_model)
    # Correct call method
    bert_outputs = bert(input_ids=input_ids_bert, attention_mask=attention_mask_bert)
    cls_token_bert = bert_outputs[0][:, 0, :]
    output_layer_bert = Dense(3, activation='softmax')(cls_token_bert)
    model_bert = Model(inputs=[input_ids_bert, attention_mask_bert], outputs=output_layer_bert)
    model_bert.compile(optimizer=Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Modèle DistilBERT
    input_ids_dbert = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask_dbert = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # Créer le modèle DistilBERT
    distilbert = TFDistilBertModel.from_pretrained(distilBert_model)
    # Correct call method
    distilbert_outputs = distilbert(input_ids=input_ids_dbert, attention_mask=attention_mask_dbert)
    cls_token_dbert = distilbert_outputs[0][:, 0, :]
    output_layer_dbert = Dense(3, activation='softmax')(cls_token_dbert)
    model_dbert = Model(inputs=[input_ids_dbert, attention_mask_dbert], outputs=output_layer_dbert)
    model_dbert.compile(optimizer=Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Modèle XLM-RoBERTa
    input_ids_xlm = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask_xlm = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # Créer le modèle XLM-RoBERTa
    xlm = TFXLMRobertaModel.from_pretrained(xlmRoberta_model)
    # Correct call method
    xlm_outputs = xlm(input_ids=input_ids_xlm, attention_mask=attention_mask_xlm)
    cls_token_xlm = xlm_outputs[0][:, 0, :]
    output_layer_xlm = Dense(3, activation='softmax')(cls_token_xlm)
    model_xlm = Model(inputs=[input_ids_xlm, attention_mask_xlm], outputs=output_layer_xlm)
    model_xlm.compile(optimizer=Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model_bert, model_dbert, model_xlm

# Appel de la fonction
model_bert, model_Dbert, model_XLMRoberta = build_model()

# Vérifier les architectures des modèles
print("BERT Model Summary:")
model_bert.summary()

print("\nDistilBERT Model Summary:")
model_Dbert.summary()

print("\nXLM-RoBERTa Model Summary:")
model_XLMRoberta.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer 'tf_bert_model_5' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_bert_model_5' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 50), dtype=int32, sparse=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 50), dtype=int32, sparse=False, name=attention_mask>
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [20]:
epochs = 30  # < change if you wish
n_steps = len(train_data) // batch_size
# Train BERT Model

model_bert.fit(tr_ds_bert,
          steps_per_epoch = n_steps,
          validation_data = val_ds_bert,
          epochs = epochs)

NameError: name 'model_bert' is not defined

In [21]:
model_Dbert.fit(tr_ds_Dbert,
          steps_per_epoch = n_steps,
          validation_data = val_ds_Dbert,
          epochs = epochs)

NameError: name 'model_Dbert' is not defined

In [22]:

model_XLMRoberta.fit(tr_ds_XR,
          steps_per_epoch = n_steps,
          validation_data = val_ds_XR,
          epochs = epochs)

NameError: name 'model_XLMRoberta' is not defined

In [23]:
# Evaluate BERT
res_bert = model_bert.evaluate(val_ds_bert, verbose=0)

# Evaluate DistilBERT
res_Dbert = model_Dbert.evaluate(val_ds_Dbert, verbose=0)

# Evaluate XLM-RoBERTa
res_XlmRoberta = model_XLMRoberta.evaluate(val_ds_XR, verbose=0)

#Tabulate Data
tab_data = [["BERT","30","Adam","128","1e-5",'{:.2%}'.format(res_bert[1])],
            ["DistilBERT","30","Adam","128","1e-5",'{:.2%}'.format(res_Dbert[1])],
            ["XLM-RoBERTa","30","Adam","128","1e-5",'{:.2%}'.format(res_XlmRoberta[1])]]

print(tabulate(tab_data, headers=['Models','Epochs','Optimizer','Batch Size','Learning Rate','Accuracy'], tablefmt='pretty'))

NameError: name 'model_bert' is not defined

Connection to TPU Failed
Using default strategy for CPU and single GPU


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer 'tf_bert_model_6' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_bert_model_6' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 50), dtype=int32, sparse=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 50), dtype=int32, sparse=False, name=attention_mask>
  • token_type_ids=<KerasTensor shape=(None, 50), dtype=int32, sparse=False, name=token_type_ids>
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False