# Data2vec vs. SBERT
https://www.kaggle.com/datasets/shivamkushwaha/bbc-full-text-document-classification

In [1]:
!pip install transformers memory_profiler sentence_transformers scikit-learn-intelex datasets
import time
import datetime
tic = time.time()



In [2]:
import torch
import random
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
%load_ext memory_profiler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("/content/bbc-text.csv")

In [4]:
print(df.category.unique())
print(pd.factorize(df.category)[1])

['tech' 'business' 'sport' 'entertainment' 'politics']
Index(['tech', 'business', 'sport', 'entertainment', 'politics'], dtype='object')


## Set Seed in Front

In [5]:
def set_seed(seed: int):
  random.seed(seed)
  np.random.seed(seed)
  if is_torch_available():
      torch.manual_seed(seed)
      torch.cuda.manual_seed_all(seed)

  if is_tf_available():
      import tensorflow as tf
 
      tf.random.set_seed(seed)
 
set_seed(42)

## Build Classifier with Transformer

In [6]:
def TextClassification_with_Transformer(model_name: str, Data: pd.Series, Target:pd.Series, test_size: np.float64, max_length: int, num_labels: int, num_epochs: int, metrics_name: str):

  # Make data
  X = Data
  y = Target
  y = pd.factorize(y)[0]

  # Load Metrics
  metric = load_metric(metrics_name)

  # Split Data
  X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=test_size)

  # Call the Tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

  # Encode the text
  train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
  valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)



  class MakeTorchData(torch.utils.data.Dataset):
      def __init__(self, encodings, labels):
          self.encodings = encodings
          self.labels = labels

      def __getitem__(self, idx):
          item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
          item["labels"] = torch.tensor([self.labels[idx]])
          return item

      def __len__(self):
          return len(self.labels)

  # convert our tokenized data into a torch Dataset
  train_dataset = MakeTorchData(train_encodings, y_train.ravel())
  valid_dataset = MakeTorchData(valid_encodings, y_test.ravel())


  # Call Model
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_labels).to("cuda")

  # Create Metrics
  def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    return metric.compute(predictions=predictions, references=labels, average="micro")

  # Specifiy the arguments for the trainer  
  training_args = TrainingArguments(
      output_dir='./results',          # output directory
      num_train_epochs=num_epochs,     # total number of training epochs
      per_device_train_batch_size=8,   # batch size per device during training
      per_device_eval_batch_size=20,   # batch size for evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
      metric_for_best_model = metrics_name,    # select the base metrics
      logging_steps=400,               # log & save weights each logging_steps
      save_steps=400,
      evaluation_strategy="steps",     # evaluate each `logging_steps`
  ) 

  # Call the Trainer
  trainer = Trainer(
      model=model,                         # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=valid_dataset,          # evaluation dataset
      compute_metrics=compute_metrics,     # the callback that computes metrics of interest
  )


  # Train the model
  trainer.train()

  return trainer, model

## Sentence BERT

In [7]:
sbert_trainer, sbert_model = TextClassification_with_Transformer(model_name = 'sentence-transformers/all-mpnet-base-v2', 
                                                                 Data = df.text, 
                                                                 Target = df.category, 
                                                                 test_size = 0.33, 
                                                                 max_length = 512, 
                                                                 num_labels = 5, 
                                                                 num_epochs = 10, 
                                                                 metrics_name='f1')

Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a

Step,Training Loss,Validation Loss,F1
400,0.7485,0.200239,0.955102
800,0.0824,0.089828,0.985034
1200,0.0297,0.090437,0.985034
1600,0.0028,0.091037,0.985034


***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_model.bin




In [11]:
sbert_trainer.evaluate()

***** Running Evaluation *****
  Num examples = 735
  Batch size = 20


{'epoch': 10.0,
 'eval_f1': 0.9850340136054422,
 'eval_loss': 0.0898284912109375,
 'eval_runtime': 22.6043,
 'eval_samples_per_second': 32.516,
 'eval_steps_per_second': 1.637}

## Data2vec

In [8]:
d2v_trainer, d2v_model = TextClassification_with_Transformer(model_name = 'facebook/data2vec-text-base', 
                                                             Data = df.text, 
                                                             Target = df.category, 
                                                             test_size = 0.33, 
                                                             max_length = 512, 
                                                             num_labels = 5, 
                                                             num_epochs = 10, 
                                                             metrics_name='f1')

loading file https://huggingface.co/facebook/data2vec-text-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/8a26e82a7994252ab80418013496f0089bbc1f3167ca799b74147245bfa30d89.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05
loading file https://huggingface.co/facebook/data2vec-text-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/db31ffa8db62dd6e18780a442fba9502d3ca43d64d1e714a11be2b063c59cfe4.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/facebook/data2vec-text-base/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/c9deca35b050f45baa1bfef9ac6e3fc10de35be4c63bd065522850597d5dd03b.f6b9dbf7e3ca499065821a947cf6d02c3ba413a3aea9981306737265d081c3ff
loading file https://huggingface.co/facebook/data2vec-text-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/facebook/data2vec-text-base/resolve/

Step,Training Loss,Validation Loss,F1
400,0.6582,0.260448,0.941497
800,0.1308,0.193829,0.967347
1200,0.0522,0.145542,0.97415
1600,0.0333,0.176472,0.967347


***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_model.bin




In [12]:
d2v_trainer.evaluate()

***** Running Evaluation *****
  Num examples = 735
  Batch size = 20


{'epoch': 10.0,
 'eval_f1': 0.9741496598639455,
 'eval_loss': 0.145542174577713,
 'eval_runtime': 21.1854,
 'eval_samples_per_second': 34.694,
 'eval_steps_per_second': 1.746}

In [9]:
toc=time.time()
print(datetime.timedelta(seconds = toc-tic))

0:55:21.348891
