In [3]:
import tensorflow as tf

# Restrict TensorFlow to the first GPU and enable on-demand memory growth.
# The guard keeps the cell from dying with an IndexError (gpus[0]) on a
# machine where TensorFlow sees no GPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    tf.config.experimental.set_memory_growth(gpus[0], True)
    # Alternative to memory growth: give the GPU a fixed memory budget.
    # tf.config.set_logical_device_configuration(
    #     gpus[0],
    #     [tf.config.LogicalDeviceConfiguration(memory_limit=9216)])
else:
    print("No GPU visible to TensorFlow; continuing on CPU.")

In [4]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Hub checkpoint shared by the tokenizer and the classifier.
model_name = "PlanTL-GOB-ES/gpt2-base-bne"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the PyTorch checkpoint (from_pt=True) into a TF sequence-classification
# model with a 5-label head.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_pt=True,
    num_labels=5,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2ForSequenceClassification: ['transformer.h.10.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'lm_head.weight', 'transformer.h.8.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.6.attn.masked_bias']
- This IS expected if you are initializing TFGPT2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2ForSequenceClassif

In [7]:
from datasets import load_dataset, DatasetDict

# NOTE(review): use_auth_token=True suggests the dataset repo needs Hub
# authentication — confirm a login token is available before running.
raw_dataset = load_dataset(
    path="javilonso/mex_data",
    use_auth_token=True,
)

Downloading:   0%|          | 0.00/907 [00:00<?, ?B/s]

Using custom data configuration javilonso--mex_data-0c030e0c8e30da9e


Downloading and preparing dataset None/None (download: 10.73 MiB, generated: 18.02 MiB, post-processed: Unknown size, total: 28.75 MiB) to /home/javilonso/.cache/huggingface/datasets/parquet/javilonso--mex_data-0c030e0c8e30da9e/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.99M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /home/javilonso/.cache/huggingface/datasets/parquet/javilonso--mex_data-0c030e0c8e30da9e/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Display the loaded splits with their column names and row counts.
raw_dataset

DatasetDict({
    test: Dataset({
        features: ['Title', 'Opinion', 'Polarity', 'Attraction', '__index_level_0__'],
        num_rows: 5987
    })
    train: Dataset({
        features: ['Title', 'Opinion', 'Polarity', 'Attraction', '__index_level_0__'],
        num_rows: 23944
    })
})

In [10]:
# GPT-2 has a fixed number of position embeddings, so every review is capped
# at that length.
MAX_LENGTH = model.config.n_positions


def tokenize(example):
    """Tokenize a batch of reviews and attach their polarity as the label.

    Args:
        example: Batch dict supplied by `Dataset.map(..., batched=True)`; only
            the "Opinion" and "Polarity" columns are read.

    Returns:
        The tokenizer output with an extra "label" entry copied from "Polarity".
    """
    # `truncation=True` on its own is a no-op here: the tokenizer has no
    # predefined maximum length, so an explicit `max_length` is needed for
    # truncation to actually happen.
    tokenized_example = tokenizer(
        example["Opinion"], truncation=True, max_length=MAX_LENGTH
    )
    # NOTE(review): the classifier was built with num_labels=5, so Polarity
    # presumably has to be in 0..4 already — confirm it is not 1..5.
    tokenized_example["label"] = example["Polarity"]
    return tokenized_example


# Drop every original column so only the tokenizer fields and "label" remain.
tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=raw_dataset["train"].column_names)

  0%|          | 0/6 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/24 [00:00<?, ?ba/s]

In [11]:
# Inspect the first tokenized training example (input_ids, attention_mask, label).
tokenized_dataset["train"][0]

{'input_ids': [2,
  454,
  1936,
  369,
  7658,
  644,
  364,
  341,
  2762,
  313,
  533,
  7658,
  11062,
  68,
  859,
  7082,
  342,
  387,
  22319,
  644,
  28172,
  66,
  623,
  344,
  1416,
  1233,
  8883,
  11062,
  68,
  4742,
  2762,
  5209,
  341,
  5089,
  344,
  17984,
  467,
  1713,
  342,
  341,
  2090,
  27152,
  313,
  1488,
  55],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'label': 4}

In [12]:
from transformers import DataCollatorWithPadding

# Collator built around the shared tokenizer, configured to return
# TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    return_tensors="tf",
    tokenizer=tokenizer,
)

In [14]:
batch_size = 4

# Register a padding token so variable-length reviews can be batched together.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# The model was loaded before the pad token was registered. If the token got an
# id beyond the pretrained vocabulary, grow the embedding matrix to cover it;
# otherwise padded positions would index outside the embeddings.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

# The GPT-2 classification head finds the last real token of each row through
# `config.pad_token_id`, so it must match the id the collator pads with.
model.config.pad_token_id = tokenizer.pad_token_id

# Training batches are shuffled; padding is applied per batch by the collator.
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=batch_size,
)

# Evaluation batches keep the dataset order.
tf_eval_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=batch_size,
)

In [15]:
# Sanity check: the classification head should report the 5 labels requested at load time.
model.config.num_labels

5

In [16]:
from tensorflow.keras import mixed_precision
# Switch the Keras global dtype policy to mixed_float16.
# NOTE(review): `model` was already created in an earlier cell; presumably
# Keras layers pick up the dtype policy when they are constructed, so this may
# not affect that model — confirm, and consider running this cell before the
# model is loaded.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: GeForce RTX 3060, compute capability 8.6


In [17]:
from transformers import create_optimizer

num_epochs = 3
# Total optimizer steps: batches per epoch times the number of epochs.
num_train_steps = num_epochs * len(tf_train_dataset)

optimizer_kwargs = dict(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    weight_decay_rate=0.01,
)
optimizer, schedule = create_optimizer(**optimizer_kwargs)

# No loss is passed on purpose: compile() then falls back to the model's
# internal loss, which needs the labels to be part of the input dict.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Callback tied to the local "classificationPolEsp2" directory; it clones the
# matching Hub repo and uploads the model files together with the tokenizer.
callback = PushToHubCallback(
    output_dir="classificationPolEsp2",
    tokenizer=tokenizer,
)

model.fit(
    tf_train_dataset,
    epochs=num_epochs,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
)

Cloning https://huggingface.co/javilonso/classificationPolEsp2 into local empty directory.


Epoch 1/3
Epoch 2/3
Epoch 3/3


Upload file tf_model.h5:   0%|          | 32.0k/473M [00:00<?, ?B/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [22]:
import numpy as np
from datasets import load_metric

# Accumulate accuracy and F1 over the evaluation set, one batch at a time.
load_accuracy = load_metric("accuracy")
load_f1 = load_metric("f1")
for i, batch in enumerate(tf_eval_dataset):
    if i % 100 == 0:
        print(i)
    # Call the model directly instead of `model.predict(batch)`: predict() is
    # meant for whole datasets and is very slow when invoked once per small
    # batch inside a Python loop.
    logits = model(batch, training=False)["logits"]
    # Hand plain NumPy arrays to the metrics rather than TensorFlow tensors.
    labels = batch["labels"].numpy()
    predictions = np.argmax(logits.numpy(), axis=-1)
    load_accuracy.add_batch(predictions=predictions, references=labels)
    load_f1.add_batch(predictions=predictions, references=labels)


accuracy = load_accuracy.compute()["accuracy"]
# average=None requests one F1 score per class instead of a single aggregate.
f1 = load_f1.compute(average=None)["f1"]

KeyboardInterrupt: 

In [20]:
# Report the metrics computed by the evaluation cell; that cell must have run
# to completion first, otherwise `accuracy` and `f1` do not exist.
print(f'Accuracy: {accuracy:.4f}', f'F1: {f1}', sep='\n')


NameError: name 'accuracy' is not defined

In [21]:
# Repeats the accuracy computation from the evaluation cell; it needs
# `load_accuracy` from that cell to exist in the current kernel.
accuracy = load_accuracy.compute()["accuracy"]

NameError: name 'load_accuracy' is not defined