In [None]:
!pip install transformers datasets

In [None]:
from transformers import (
    DistilBertConfig,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
)

# Hub checkpoint shared by the tokenizer, config and model loaded in this cell.
MODEL_PATH = "distilbert-base-uncased"

# Fast tokenizer matching the checkpoint's vocabulary.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)

# num_labels=1 requests a single-output head, i.e. one score per sentence pair.
config = DistilBertConfig.from_pretrained(
    MODEL_PATH,
    num_labels=1,
)

# Pretrained encoder weights combined with the single-output config above.
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_PATH,
    config=config,
)

In [None]:
import datasets
from datasets import load_dataset

# Number of shuffled validation examples kept for validation during training;
# the remaining examples form the held-out test set.
N_VAL = 750

stsb_train = load_dataset("glue", "stsb", split="train")

# Shuffle with a fixed seed so the validation/test partition is reproducible.
stsb_validation = load_dataset("glue", "stsb", split="validation").shuffle(seed=42)

# `select` keeps the original feature schema and avoids materialising the
# slices as Python dicts (which `Dataset.from_dict(ds[a:b])` would do).
# `min` mirrors slice semantics if the split has fewer than N_VAL rows.
_n_val = min(N_VAL, stsb_validation.num_rows)
stsb_val = stsb_validation.select(range(_n_val))
stsb_test = stsb_validation.select(range(_n_val, stsb_validation.num_rows))

In [None]:
import pandas as pd

# Preview only the first rows: converting with `to_pandas()` avoids building
# the frame row by row, and `.head()` keeps the cell output small.
stsb_train.to_pandas().head()

In [None]:
# (rows, columns) of the train, validation and test splits, in that order.
tuple(split.shape for split in (stsb_train, stsb_val, stsb_test))

In [None]:
# Number of examples handed to the tokenizer per `map` call.
TOKENIZE_BATCH_SIZE = 1000


def tokenize_pairs(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: Batch passed by ``Dataset.map(batched=True)``; the
            ``"sentence1"`` and ``"sentence2"`` entries are read from it.

    Returns:
        The tokenizer output for the paired sentences, padded to the longest
        sequence in this batch and truncated to the tokenizer's limit.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding=True,
        truncation=True,
    )


# Apply the same tokenization to every split so the three encodings agree.
enc_train = stsb_train.map(
    tokenize_pairs, batched=True, batch_size=TOKENIZE_BATCH_SIZE
)
enc_val = stsb_val.map(
    tokenize_pairs, batched=True, batch_size=TOKENIZE_BATCH_SIZE
)
enc_test = stsb_test.map(
    tokenize_pairs, batched=True, batch_size=TOKENIZE_BATCH_SIZE
)

In [None]:
import pandas as pd

# Preview the first encoded rows only; the full frame (with token id lists in
# every row) is large and slow to build row by row.
enc_train.to_pandas().head()

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
import torch

# Logging, evaluation and checkpointing all share this step interval.
# `load_best_model_at_end=True` requires the evaluation and save strategies to
# match (every checkpoint must have an evaluation to rank it by); the previous
# "steps" evaluation + "epoch" saving combination is rejected by current
# transformers releases.
EVAL_EVERY_N_STEPS = 50

training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir="./stsb-model",
    do_train=True,
    do_eval=True,
    # The number of epochs, defaults to 3.0
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    # TensorBoard log directory
    logging_strategy="steps",
    logging_dir="./logs",
    logging_steps=EVAL_EVERY_N_STEPS,
    # other options : no, epoch
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version rejects this name.
    evaluation_strategy="steps",
    eval_steps=EVAL_EVERY_N_STEPS,
    save_strategy="steps",
    save_steps=EVAL_EVERY_N_STEPS,
    # Step-wise saving produces many checkpoints; cap how many are kept on disk.
    save_total_limit=2,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only machines.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
)

In [None]:
from torch import cuda

# Prefer the GPU when CUDA is usable, otherwise run on the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr


def compute_metrics(pred):
    """Compute regression metrics for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` array-likes
            holding one predicted score and one reference score per example.

    Returns:
        dict mapping ``"MSE"``, ``"RMSE"``, ``"MAE"``, ``"Pearson"`` and
        ``"Spearman's Rank"`` to plain Python floats.
    """
    # Flatten with reshape(-1) instead of squeeze so a one-example batch stays
    # 1-D, and promote to float64 so low-precision model outputs do not degrade
    # the error and correlation computations.
    preds = np.asarray(pred.predictions, dtype=np.float64).reshape(-1)
    labels = np.asarray(pred.label_ids, dtype=np.float64).reshape(-1)

    errors = preds - labels
    mse = float(np.mean(errors**2))  # computed once, reused for RMSE

    return {
        "MSE": mse,
        "RMSE": float(np.sqrt(mse)),
        "MAE": float(np.mean(np.abs(errors))),
        # Index 0 is the correlation coefficient; convert the NumPy scalar to a
        # Python float so every metric has the same type.
        "Pearson": float(pearsonr(preds, labels)[0]),
        "Spearman's Rank": float(spearmanr(preds, labels)[0]),
    }

In [None]:
# Bundle the model, training configuration, encoded splits, tokenizer and
# metric function into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=enc_train,  # used for optimisation
    eval_dataset=enc_val,  # used for evaluation during training
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above and keep the returned
# result object; its `metrics` attribute is stored separately for inspection.
train_result = trainer.train()
metrics = train_result.metrics

In [None]:
# Example sentence pair scored in the next cell.
s1 = "A plane is taking off."
s2 = "An air plane is taking off."

In [None]:
import torch

# The model may still be in training mode after `trainer.train()`; eval mode
# disables dropout so the predicted score is deterministic.
model.eval()

encoding = tokenizer(
    s1, s2, return_tensors="pt", padding=True, truncation=True, max_length=512
)
# Send inputs to wherever the model's weights actually live rather than to a
# separately computed device string, so the two can never disagree.
input_ids = encoding["input_ids"].to(model.device)
attention_mask = encoding["attention_mask"].to(model.device)

# No gradients are needed for inference; skip building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Single pair + single-output head -> one scalar score.
outputs.logits.item()

In [None]:
# Second example sentence pair.
s1 = "The men are playing soccer."
s2 = "A man is riding a motorcycle."

In [None]:
import torch

# The model may still be in training mode after `trainer.train()`; eval mode
# disables dropout so the predicted score is deterministic.
model.eval()

# NOTE: this cell scores the two literal strings below, not the `s1`/`s2`
# pair assigned in the preceding cell.
encoding = tokenizer(
    "hey how are you there",
    "hey how are you",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
# Send inputs to wherever the model's weights actually live rather than to a
# separately computed device string, so the two can never disagree.
input_ids = encoding["input_ids"].to(model.device)
attention_mask = encoding["attention_mask"].to(model.device)

# No gradients are needed for inference; skip building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Single pair + single-output head -> one scalar score.
outputs.logits.item()

In [None]:
# Evaluate the trained model on each split; dict insertion order fixes both
# the evaluation order and the row labels of the summary table.
splits = {"train": enc_train, "val": enc_val, "test": enc_test}
q = [trainer.evaluate(eval_dataset=split) for split in splits.values()]

# Keep only the first six metric columns of the per-split results.
pd.DataFrame(q, index=list(splits)).iloc[:, :6]

In [None]:
# Write the trained model (via the Trainer) and the tokenizer into the same
# output directory. The final call's return value is shown as the cell output.
model_path = "sentence-pair-regression-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)