In [11]:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[0], 'GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)
# tf.config.set_logical_device_configuration(
# gpus[0],
# [tf.config.LogicalDeviceConfiguration(memory_limit=9216  )])

In [12]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

model_name = "PlanTL-GOB-ES/gpt2-base-bne"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3,
                                                             from_pt=True)

Downloading:   0%|          | 0.00/681 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2ForSequenceClassification: ['transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'lm_head.weight', 'transformer.h.2.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.4.attn.masked_bias']
- This IS expected if you are initializing TFGPT2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassificat

In [17]:
from datasets import load_dataset, DatasetDict

raw_dataset = DatasetDict.load_from_disk("../../RoBERTaESP/data")

In [18]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Opinion', 'Polarity', 'Attraction', '__index_level_0__'],
        num_rows: 23944
    })
    test: Dataset({
        features: ['Title', 'Opinion', 'Polarity', 'Attraction', '__index_level_0__'],
        num_rows: 5987
    })
})

In [19]:
def tokenize(example):
    tokenized_example = tokenizer(example["Opinion"], truncation=True)
    tokenized_example["label"] = example["Attraction"]
    return tokenized_example


tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=raw_dataset["train"].column_names)

  0%|          | 0/24 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/6 [00:00<?, ?ba/s]

In [20]:
tokenized_dataset["train"][0]

{'input_ids': [2,
  454,
  1936,
  369,
  7658,
  644,
  364,
  341,
  2762,
  313,
  533,
  7658,
  11062,
  68,
  859,
  7082,
  342,
  387,
  22319,
  644,
  28172,
  66,
  623,
  344,
  1416,
  1233,
  8883,
  11062,
  68,
  4742,
  2762,
  5209,
  341,
  5089,
  344,
  17984,
  467,
  1713,
  342,
  341,
  2090,
  27152,
  313,
  1488,
  55],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'label': 0}

In [21]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [22]:
batch_size = 8

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=batch_size,
)

tf_eval_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=batch_size,
)

Using pad_token, but it is not set yet.


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [23]:
model.config.num_labels

3

In [None]:
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [3]:
from transformers import create_optimizer

num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

NameError: name 'tf_train_dataset' is not defined

In [None]:
from transformers.keras_callbacks import PushToHubCallback
callback = PushToHubCallback(output_dir="classificationEsp3_Attraction", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs
)

Cloning https://huggingface.co/javilonso/classificationEsp1_Attraction into local empty directory.


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
import numpy as np
from datasets import load_metric
 
load_accuracy = load_metric("accuracy")
load_f1 = load_metric("f1")
for batch in tf_eval_dataset:
    logits = model.predict(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    load_accuracy.add_batch(predictions=predictions, references=labels)
    load_f1.add_batch(predictions=predictions, references=labels)


accuracy = load_accuracy.compute()["accuracy"]
f1 = load_f1.compute(average="None")["f1"]

In [None]:
print(f'Accuracy: {accuracy:.2f}')
print(f'F1: {f1:.2f}')


In [None]:
# model.push_to_hub("classificationEsp1")
# tokenizer.push_to_hub("classificationEsp1")

In [4]:
# Tests ------

check_point = "javilonso/classificationEsp1_Attraction"

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
model = TFAutoModelForSequenceClassification.from_pretrained(check_point)
tokenizer = AutoTokenizer.from_pretrained(check_point)


from datasets import load_dataset, DatasetDict

raw_dataset = DatasetDict.load_from_disk(".././data")

def tokenize(example):
    tokenized_example = tokenizer(example["Opinion"], truncation=True)
    tokenized_example["label"] = example["Attraction"]
    return tokenized_example


tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=raw_dataset["train"].column_names)


from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attenti on_mask", "input_ids", "label"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

import numpy as np
from datasets import load_metric
 
load_accuracy = load_metric("accuracy")
load_f1 = load_metric("f1")
for batch in tf_eval_dataset:
    logits = model.predict(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    load_accuracy.add_batch(predictions=predictions, references=labels)
    load_f1.add_batch(predictions=predictions, references=labels)


accuracy = load_accuracy.compute()["accuracy"]
f1 = load_f1.compute(average=None)["f1"]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at javilonso/classificationEsp1_Attraction.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Loading cached processed dataset at ../data/train/cache-1f28644c3516951f.arrow
Loading cached processed dataset at ../data/test/cache-82babf341eaded68.arrow


In [5]:
print(f'Accuracy: {accuracy:.2f}')
print(f'F1: {f1}')


Accuracy: 0.99
F1: [0.9903655  0.97865945 0.99335863]
