In [1]:
# Install the Hugging Face `transformers` package into the notebook runtime.
# NOTE(review): the version is unpinned, so a re-run may pick up a newer
# release than the one this notebook was written against — consider pinning.
!pip install transformers



In [0]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *

# Fix TensorFlow's global random seed for this session.
tf.random.set_seed(123)

In [3]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint.
checkpoint = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = TFBertForSequenceClassification.from_pretrained(checkpoint)

## Dataset: MRPC (The Microsoft Research Paraphrase Corpus), loaded from GLUE.
## It is made of pairs of English sentences; the task is to decide whether
## the two sentences are semantically equivalent or not.
data = tensorflow_datasets.load('glue/mrpc')

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/root/tensorflow_datasets/glue/mrpc/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split None, from /root/tensorflow_datasets/glue/mrpc/1.0.0


In [4]:
## Look at one example: it has the index (idx), the class label
## (0: not equivalent, 1: equivalent) and the two sentences.
## Only the first element is pulled from the dataset iterator; building a list
## of the whole training split just to index [0] would read every example.
next(iter(data["train"]))

{'idx': <tf.Tensor: shape=(), dtype=int32, numpy=1680>,
 'label': <tf.Tensor: shape=(), dtype=int64, numpy=0>,
 'sentence1': <tf.Tensor: shape=(), dtype=string, numpy=b'The identical rovers will act as robotic geologists , searching for evidence of past water .'>,
 'sentence2': <tf.Tensor: shape=(), dtype=string, numpy=b'The rovers act as robotic geologists , moving on six wheels .'>}

In [0]:
# To make things easy, transformers ships a helper that formats and converts
# each GLUE task into a dataset for TensorFlow. This makes it much simpler to
# compare models, test ideas and try modifications without worrying much about
# dataset preprocessing.
# Each split is converted to features and then grouped into batches
# (32 examples per batch for training, 64 for validation).
train_dataset = glue_convert_examples_to_features(
    data['train'], tokenizer, max_length=128, task='mrpc'
).batch(32)
valid_dataset = glue_convert_examples_to_features(
    data['validation'], tokenizer, max_length=128, task='mrpc'
).batch(64)

In [0]:
## The conversion gives us an iterable; let's see what is inside.
## Take just the first (features, labels) batch from the iterator instead of
## listing every batch of the training set and keeping only element [0].
prueba_x, prueba_y = next(iter(train_dataset))

In [7]:
## Nice: the batch already comes with the attention masks, the tokenized inputs
## converted to vocabulary indices, and the token type ids!
## Show the attention mask of the first example in the batch.
prueba_x["attention_mask"][0]

<tf.Tensor: shape=(128,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>

In [0]:
## Compile the model.
# The loss is configured with from_logits=True, i.e. it is fed unnormalized
# model outputs together with integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(
    learning_rate=5e-5,
    epsilon=1e-8,
    clipnorm=1.0,
)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)


In [9]:
# Train for 3 epochs, evaluating on the validation set along the way; the
# value returned by fit() is kept in `history`.
history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=3,
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
## Quick sanity check: compare one reference sentence against two candidates.
sentence_0 = "I like studying every day."
sentence_1 = "I enjoy studying every day."
sentence_2 = "I never want to study."


def _encode_pair(first, second):
    """Tokenize a sentence pair with the notebook's tokenizer, returning TF tensors."""
    return tokenizer.encode_plus(first, second, add_special_tokens=True, return_tensors='tf')


def _predicted_class(encoded_pair):
    """Run the model on one encoded pair and return the index of its top-scoring class."""
    scores = model(encoded_pair)[0]
    # argmax over axis 1 picks the class per row; [0] takes the single row.
    return scores.numpy().argmax(1)[0]


inputs_1 = _encode_pair(sentence_0, sentence_1)
inputs_2 = _encode_pair(sentence_0, sentence_2)

pred_1 = _predicted_class(inputs_1)
pred_2 = _predicted_class(inputs_2)

# A truthy prediction (class 1) is reported as a paraphrase, class 0 as not.
verdict_1 = "a paraphrase" if pred_1 else "not a paraphrase"
verdict_2 = "a paraphrase" if pred_2 else "not a paraphrase"

print("sentence_1 is", verdict_1, "of sentence_0")
print("sentence_2 is", verdict_2, "of sentence_0")

sentence_1 is a paraphrase of sentence_0
sentence_2 is not a paraphrase of sentence_0
