In [49]:
import numpy as np
from scipy.special import softmax

from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

from datasets import load_metric
from datasets import load_dataset
from datasets import load_from_disk

from sklearn.metrics import accuracy_score, recall_score, f1_score

from transformers import AutoTokenizer, DataCollatorWithPadding, BertForSequenceClassification

In [50]:
import os
# Print the kernel's current working directory; the relative paths used later in
# this notebook ("data", "test-trainer", "logs") are resolved against it.
print(os.getcwd())

/content


In [51]:
# !pip install datasets

In [52]:
# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Dataset previously saved to disk in the relative "data" directory.
raw_datasets = load_from_disk("data")

def tokenize_function(example):
    """Tokenize the ``text`` column of an example (or batch of examples).

    Sequences are truncated but deliberately left unpadded. When this function
    is applied through ``Dataset.map(batched=True)``, ``padding=True`` would pad
    every row to the longest text in the whole map batch, storing needless pad
    tokens and defeating the per-batch (dynamic) padding that
    ``DataCollatorWithPadding`` performs at batching time.

    Args:
        example: Mapping with a ``"text"`` entry (a string, or a list of
            strings when mapped in batched mode).

    Returns:
        The tokenizer's encoding for the given text(s).
    """
    return tokenizer(example["text"], truncation=True)

# Tokenize every split, passing the rows to tokenize_function in batches.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Dynamic padding: when elements are batched together, pad all of them to the
# length of the longest element in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/4952 [00:00<?, ? examples/s]

In [53]:
# Display the tokenized DatasetDict to check the splits and the columns added by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11552
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4952
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5791
    })
})

In [54]:
# Display the untokenized training split for comparison.
raw_datasets['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 11552
})

In [55]:
# Load the GLUE/MRPC metric; the runs recorded in this notebook report "accuracy" and "f1" from it.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm against the installed `datasets` version.
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    """Score one evaluation pass for the ``Trainer``.

    ``eval_preds`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the index of its largest logit along the last axis, and the
    return value is whatever ``metric.compute`` yields for those predictions
    against the reference labels.
    """
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [56]:
# !pip install transformers[torch]
# !pip install accelerate -U

In [57]:
# !pip install accelerate==0.24.0

In [58]:
import accelerate
# Confirm which accelerate version the kernel actually picked up.
print(accelerate.__version__)

0.24.0


In [59]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch so that the checkpoint with the best validation accuracy can be
# reloaded when training ends.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="test-trainer",
    logging_dir="logs",
    logging_steps=10,  # log training progress every 10 steps
    # Schedule and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=512,
    # Warm-up: over the first 500 training steps the learning rate ramps up
    # gradually from 0 (or a small base rate) to the configured learning rate.
    # This stabilises early training by preventing overly large updates too soon.
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and save on the same schedule (the save strategy is set to match
    # the evaluation strategy), then restore the best checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# BERT with a two-label sequence-classification head.
# Alternative kept from the original notebook:
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
# Run fine-tuning with the Trainer built in the previous cell.
# NOTE(review): the recorded output below shows a single epoch ending at step 722, which does not
# match num_train_epochs=10 / per_device_train_batch_size=64 — presumably it comes from an
# earlier configuration; re-run to refresh.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4525,0.429035,0.809168,0.80382


TrainOutput(global_step=722, training_loss=0.5098302537714675, metrics={'train_runtime': 227.2055, 'train_samples_per_second': 50.844, 'train_steps_per_second': 3.178, 'total_flos': 596554984705920.0, 'train_loss': 0.5098302537714675, 'epoch': 1.0})

In [62]:
# Run inference on the test split, then print the shapes of the returned
# logits array and label-id array as a quick sanity check.
predictions = trainer.predict(test_dataset=tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(5791, 2) (5791,)


In [63]:
# Display the full PredictionOutput (logits, label ids and test-split metrics).
predictions

PredictionOutput(predictions=array([[-0.01370484, -0.49870977],
       [ 0.87771946, -1.6862223 ],
       [-0.49065596,  0.36398625],
       ...,
       [-0.27373514,  0.2231947 ],
       [ 0.3937695 , -1.1255963 ],
       [ 0.40439972, -1.1857532 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.6783167719841003, 'test_accuracy': 0.635986876187187, 'test_f1': 0.5528213831141281, 'test_runtime': 19.3903, 'test_samples_per_second': 298.655, 'test_steps_per_second': 1.186})

In [64]:
# Apply softmax to convert logits to probabilities
probabilities = softmax(predictions.predictions, axis=1)

# Get the predicted class labels
predicted_labels = np.argmax(probabilities, axis=1)

print("Probabilities:\n", probabilities)
print("Predicted Labels:\n", predicted_labels)

Probabilities:
 [[0.618929   0.38107097]
 [0.9285046  0.07149544]
 [0.29845995 0.70154005]
 ...
 [0.37826243 0.6217376 ]
 [0.82044506 0.17955494]
 [0.83063763 0.16936238]]
Predicted Labels:
 [0 0 1 ... 1 0 0]


In [65]:
# Gold labels of the test split, in the same order as the predictions.
true_labels = tokenized_datasets["test"]["label"]

# Support-weighted scores over the classes.
# NOTE(review): with average="weighted", recall equals accuracy by construction
# (the recorded output shows identical values), and this weighted F1 is a
# different number from the F1 the Trainer reports for the same split —
# confirm that the weighted averages are what is intended here.
accuracy = accuracy_score(y_true=true_labels, y_pred=predicted_labels)
recall = recall_score(y_true=true_labels, y_pred=predicted_labels, average="weighted")
f1 = f1_score(y_true=true_labels, y_pred=predicted_labels, average="weighted")

print("Accuracy:", accuracy)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.635986876187187
Recall: 0.635986876187187
F1 Score: 0.6420656585871106
