In [1]:
import tensorflow as tf
# Restrict TensorFlow to the first GPU and enable memory growth on it, so
# memory is allocated as needed rather than reserved up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    tf.config.experimental.set_memory_growth(gpus[0], True)
    # Alternative to memory growth: cap the memory TensorFlow may use on this
    # GPU (memory_limit is expressed in MB).
    # tf.config.set_logical_device_configuration(
    #     gpus[0],
    #     [tf.config.LogicalDeviceConfiguration(memory_limit=9216)])
else:
    # Without this guard, `gpus[0]` raises IndexError on a machine where
    # TensorFlow detects no GPU.
    print("No GPU detected by TensorFlow; continuing on CPU.")

In [2]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Hub identifier of the pretrained checkpoint fine-tuned in this notebook.
model_name = "PlanTL-GOB-ES/roberta-large-bne"

# Load the checkpoint from its PyTorch weights (from_pt=True) into a
# TensorFlow sequence-classification model with a 3-class output head.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_pt=True,
    num_labels=3,
)

# Tokenizer matching the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [3]:
from datasets import load_dataset, DatasetDict

# Load the dataset dictionary that was previously saved to disk; the path is
# relative to the notebook's working directory.
raw_dataset = DatasetDict.load_from_disk(".././data")

In [14]:
# Display the available splits with their columns and row counts.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Opinion', 'Polarity', 'Attraction', '__index_level_0__'],
        num_rows: 23944
    })
    test: Dataset({
        features: ['Title', 'Opinion', 'Polarity', 'Attraction', '__index_level_0__'],
        num_rows: 5987
    })
})

In [5]:
def tokenize(example):
    """Tokenize the "Opinion" text and attach "Attraction" as the label.

    ``example`` is indexed by column name. The tokenizer is called with
    truncation enabled, and its output is returned with an extra "label"
    entry copied from the "Attraction" column.
    """
    encoded = tokenizer(example["Opinion"], truncation=True)
    encoded["label"] = example["Attraction"]
    return encoded


# Tokenize every split in batches. The original columns are removed so that
# only the tokenizer outputs and "label" remain.
original_columns = raw_dataset["train"].column_names
tokenized_dataset = raw_dataset.map(
    tokenize,
    batched=True,
    remove_columns=original_columns,
)

Loading cached processed dataset at ../data/train/cache-29588ab3c4762e54.arrow


  0%|          | 0/6 [00:00<?, ?ba/s]

In [6]:
# Inspect the first tokenized training example as a sanity check.
tokenized_dataset["train"][0]

{'input_ids': [0,
  450,
  1934,
  371,
  7638,
  650,
  365,
  341,
  2767,
  313,
  538,
  7638,
  11019,
  68,
  477,
  7085,
  342,
  390,
  22116,
  650,
  27821,
  66,
  625,
  344,
  1416,
  1232,
  8847,
  11019,
  68,
  2337,
  2767,
  5190,
  341,
  5070,
  344,
  17876,
  470,
  1719,
  342,
  341,
  2085,
  26909,
  313,
  1488,
  55,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'label': 0}

In [7]:
from transformers import DataCollatorWithPadding

# Collator used when batching: pads the tokenized examples of a batch and
# returns TensorFlow tensors (return_tensors="tf").
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [8]:
batch_size = 2


def _as_tf_dataset(split, shuffle):
    """Convert one split of ``tokenized_dataset`` into a batched TensorFlow dataset."""
    return tokenized_dataset[split].to_tf_dataset(
        columns=["attention_mask", "input_ids", "label"],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=batch_size,
    )


# Training batches are shuffled; evaluation batches are not.
tf_train_dataset = _as_tf_dataset("train", shuffle=True)

tf_eval_dataset = _as_tf_dataset("test", shuffle=False)

In [9]:
# Confirm the number of labels configured on the loaded model.
model.config.num_labels

3

In [10]:
from tensorflow.keras import mixed_precision
# Set 'mixed_float16' as the global Keras dtype policy.
# NOTE(review): Keras layers normally pick up the global policy when they are
# constructed, and `model` is created in an earlier cell — confirm this call
# actually affects it; it may need to run before the model is loaded.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: GeForce RTX 3060, compute capability 8.6


In [11]:
from transformers import create_optimizer

num_epochs = 3

# Total number of training steps = batches per epoch * number of epochs.
steps_per_epoch = len(tf_train_dataset)
num_train_steps = steps_per_epoch * num_epochs

# Settings for the optimizer and its learning-rate schedule.
optimizer_settings = dict(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
optimizer, schedule = create_optimizer(**optimizer_settings)

# No loss is passed to compile(); only the optimizer is configured here.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [None]:
from transformers.keras_callbacks import PushToHubCallback
# Callback that pushes the model and tokenizer to the Hugging Face Hub,
# using "classificationEsp2_Attraction" as the local output directory.
callback = PushToHubCallback(output_dir="classificationEsp2_Attraction", tokenizer=tokenizer)

# Fine-tune for `num_epochs` epochs, with the test split as validation data.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs
)

/home/javilonso/NLP_Sentiment_UC3M/RoBERTaESP/Attraction/classificationEsp2_Attraction is already a clone of https://huggingface.co/javilonso/classificationEsp2_Attraction. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/3
 2587/11972 [=====>........................] - ETA: 39:41 - loss: 0.4502

In [37]:
import numpy as np
from datasets import load_metric
 
# Metric accumulators for overall accuracy and F1.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
load_accuracy = load_metric("accuracy")
load_f1 = load_metric("f1")

for index, batch in enumerate(tf_eval_dataset):
    # Call the model directly instead of `model.predict(batch)`: `predict` is
    # meant for whole datasets and adds noticeable per-call overhead when it
    # is invoked once per small batch inside a Python loop.
    logits = model(batch, training=False).logits.numpy()
    labels = batch["labels"]
    # Predicted class = index of the largest logit for each example.
    predictions = np.argmax(logits, axis=-1)
    load_accuracy.add_batch(predictions=predictions, references=labels)
    load_f1.add_batch(predictions=predictions, references=labels)
    # Lightweight progress indicator: print every 100th batch index.
    if index % 100 == 0:
        print(index)


accuracy = load_accuracy.compute()["accuracy"]
# average=None returns one F1 score per class rather than a single aggregate.
f1 = load_f1.compute(average=None)["f1"]

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900


In [41]:
# Report the overall accuracy (two decimals) and the per-class F1 scores.
print(f'Accuracy: {accuracy:.2f}')
print(f'F1: {f1}')


Accuracy: 0.55
F1: [0.70579334 0.         0.        ]
