In [None]:
! pip install transformers
! pip install datasets
! pip install evaluate
! pip install scipy
! pip install sklearn
! pip install tensorflow

In [11]:
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer
from transformers import AutoModel, TFAutoModelForSequenceClassification
import tensorflow as tf
from datasets import Dataset
from transformers import create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
import numpy as np

In [12]:
# --- Run configuration -------------------------------------------------------
# What to fine-tune and how it is scored.
task = "stsb"  # GLUE config name used to load the metric; also selects the branch in compute_metrics
model_checkpoint = "prajjwal1/bert-tiny"  # Hub id used for both the tokenizer and the model weights
num_labels = 1  # forwarded to from_pretrained when the model is built
max_length = 512  # upper bound on tokens per example -- TODO confirm it is wired into tokenization

# Optimisation settings.
batch_size = 14  # batch size of the tf.data pipelines; also used to derive the step count
num_epochs = 1
learning_rate = 1e-5  # initial learning rate handed to create_optimizer

In [None]:
# Download (or reuse from the local cache) every split of the Yelp review dataset
# and show the resulting split layout.
dataset = load_dataset(path="yelp_review_full")
print(dataset)

In [None]:
# Load the GLUE metric for the configured task.
# Example inputs for metric.compute (parallel lists of floats):
#   references  = [0., 1., 2., 3., 4., 5.]
#   predictions = [0., 1., 2., 3., 4., 5.]
metric = load(path="glue", config_name=task)
print(metric)

In [None]:
# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint
)

Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
def tokenize_data(data):
  """Tokenize the ``"text"`` column of a batch of examples.

  Args:
    data: mapping with a ``"text"`` entry (a single string or a list of strings,
      as supplied by ``Dataset.map(..., batched=True)``).

  Returns:
    Whatever the module-level ``tokenizer`` returns for that text.
  """
  # Pass the configured cap explicitly: `truncation=True` on its own only truncates
  # to the tokenizer's own model_max_length, which may be unset for a checkpoint,
  # leaving long reviews untruncated.
  return tokenizer(data["text"], truncation=True, max_length=max_length)

In [None]:
# Run the tokenizer over every split in batches; the tokenizer outputs are added
# as new columns next to the original ones.
encoded_dataset = dataset.map(function=tokenize_data, batched=True)
print(encoded_dataset)

In [None]:
# The id -> name map is 0-based so it lines up with the dataset's `label` column
# (yelp_review_full encodes the rating as class ids 0-4 -- see the dataset card).
id2label = {0: '1 star', 1: '2 star', 2: '3 stars', 3: '4 stars', 4: '5 stars'}
label2id = {val: key for key, val in id2label.items()}

# Only attach the label maps when they agree with num_labels: with num_labels = 1
# (a single-output head) a 5-entry map contradicts the requested head size.
label_maps = (
    {"id2label": id2label, "label2id": label2id}
    if num_labels == len(id2label)
    else {}
)

# Use the TensorFlow sequence-classification class: the bare `AutoModel` is the
# head-less PyTorch model and has none of the Keras methods the following cells call
# (prepare_tf_dataset / compile / fit).
# from_pt=True converts PyTorch-format weights; it requires `torch` to be installed.
# NOTE(review): this assumes the checkpoint is published with PyTorch weights only --
# confirm on the Hub.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=num_labels, from_pt=True, **label_maps
)

In [None]:
def _keep_first(n_rows):
    """Return an index predicate that accepts rows positioned below ``n_rows``."""
    return lambda _row, position: position < n_rows


# Small slices for a quick run: the filter is applied to every split of the
# DatasetDict, so each result still contains both "train" and "test".
train_dataset = encoded_dataset.filter(_keep_first(100), with_indices=True)
test_dataset = encoded_dataset.filter(_keep_first(10), with_indices=True)


In [None]:
# Show the split layout and row counts of both reduced dataset dicts.
for subset in (train_dataset, test_dataset):
    print(subset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})
DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10
    })
})


In [None]:
def _as_tf_dataset(split, shuffle):
    """Wrap one dataset split as a tf.data pipeline using the shared batch size and tokenizer."""
    return model.prepare_tf_dataset(
        split,
        shuffle=shuffle,
        batch_size=batch_size,
        tokenizer=tokenizer,
    )


# Training batches come from the "train" split of the 100-row subset (shuffled);
# validation batches come from the "test" split of the 10-row subset (in order).
tf_train_dataset = _as_tf_dataset(train_dataset["train"], shuffle=True)
tf_validation_dataset = _as_tf_dataset(test_dataset["test"], shuffle=False)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Inspect the structure (shapes / dtypes) of the batches each pipeline yields.
for tf_dataset in (tf_train_dataset, tf_validation_dataset):
    print(tf_dataset.element_spec)

In [None]:
# Size the learning-rate schedule from the rows that are actually trained on:
# tf_train_dataset is built from train_dataset["train"], not from the full
# encoded_dataset["train"], so counting the full split would stretch the decay
# over far more steps than training ever runs.
# max(1, ...) keeps the schedule valid when the subset is smaller than one batch.
batches_per_epoch = max(1, len(train_dataset["train"]) // batch_size)
total_train_steps = int(batches_per_epoch * num_epochs)

# create_optimizer returns the optimizer together with its learning-rate schedule.
optimizer, schedule = create_optimizer(
    init_lr=learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps
)
# No loss is passed: the model's internal loss computation is used.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.losses import SparseCategoricalCrossentropy

# # model.compile(optimizer=Adam(learning_rate=learning_rate))
# model.compile(
#     optimizer="adam",
#     loss=SparseCategoricalCrossentropy(from_logits=True),
#     metrics=["accuracy"],
# )

In [None]:
# model.fit(
#     tf_train_dataset,
#     validation_data=tf_validation_dataset,
# )

In [None]:
def compute_metrics(eval_predictions):
    """Score model outputs against the reference labels with the loaded metric.

    Args:
        eval_predictions: a ``(predictions, labels)`` pair; ``predictions`` is
            indexed as a 2-D array (one row per example).

    Returns:
        The result of ``metric.compute`` for the reduced predictions.
    """
    raw_outputs, references = eval_predictions
    if task == "stsb":
        # Single-output case: take the first column as the predicted score.
        scores = raw_outputs[:, 0]
    else:
        # Otherwise pick the index of the highest-scoring column per row.
        scores = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=scores, references=references)


# Keras callback that evaluates compute_metrics on the validation pipeline.
metric_callback = KerasMetricCallback(
    eval_dataset=tf_validation_dataset,
    metric_fn=compute_metrics,
)

In [None]:
# from transformers.keras_callbacks import PushToHubCallback
# from tensorflow.keras.callbacks import TensorBoard

# model_name = model_checkpoint.split("/")[-1]
# push_to_hub_model_id = f"{model_name}-finetuned-{task}"

# tensorboard_callback = TensorBoard(log_dir="./text_classification_model_save/logs")

# push_to_hub_callback = PushToHubCallback(
#     output_dir="./text_classification_model_save",
#     tokenizer=tokenizer,
#     hub_model_id=push_to_hub_model_id,
# )

# callbacks = [metric_callback]

# Fine-tune the model. metric_callback is registered here so the GLUE metric is
# actually computed on the validation data; it was constructed above but never
# handed to fit.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    callbacks=[metric_callback],
)