# Data2vec vs. SBERT
https://www.kaggle.com/datasets/shivamkushwaha/bbc-full-text-document-classification

In [1]:
# !pip install transformers
# !pip install sentence_transformers
# !pip install scikit-learn-intelex
# !pip3 install memory_profiler
# 

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearnex import patch_sklearn
patch_sklearn()
%load_ext memory_profiler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
df = pd.read_csv("/content/bbc-text.csv")

In [4]:
X = df.text.copy()
y = df.category.copy()
y = pd.factorize(y)[0]

In [5]:
print(df.category.unique())
print(pd.factorize(y)[1])

['tech' 'business' 'sport' 'entertainment' 'politics']
[0 1 2 3 4]


## SBERT

In [6]:
%%time
%%memit
model = SentenceTransformer('all-mpnet-base-v2',device='cuda')
model.max_seq_length = 150


#Sentences are encoded by calling model.encode()
sentence_embeddings = X.apply(model.encode)
sentence_embeddings = pd.DataFrame(sentence_embeddings.tolist())

peak memory: 4549.16 MiB, increment: 3693.93 MiB
CPU times: user 40.5 s, sys: 2.72 s, total: 43.2 s
Wall time: 46.6 s


In [7]:
%%time
X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, y, test_size=0.2)

CPU times: user 11.5 ms, sys: 32 µs, total: 11.5 ms
Wall time: 11.4 ms


In [8]:
rfc = RandomForestClassifier(n_estimators=500).fit(X_train, y_train)

In [9]:
prediction = rfc.predict(X_test)

In [10]:
print(classification_report(y_test, prediction, target_names=df.category.unique()))

               precision    recall  f1-score   support

         tech       0.96      0.97      0.97        78
     business       0.97      0.95      0.96       116
        sport       0.98      1.00      0.99       100
entertainment       1.00      0.98      0.99        66
     politics       0.94      0.95      0.95        85

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



## Data2vec

In [11]:
%%time
%%memit
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
model = AutoModel.from_pretrained("facebook/data2vec-text-base", num_labels = df.category.nunique()).to("cuda")

#Tokenize sentences
encoded_input = tokenizer(list(X), padding=True, truncation=True, max_length=150, return_tensors='pt').to("cuda")

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = pd.DataFrame(sentence_embeddings.tolist())

Some weights of the model checkpoint at facebook/data2vec-text-base were not used when initializing Data2VecTextModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing Data2VecTextModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecTextModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Data2VecTextModel were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['data2vec_text.pooler.dense.weight', 'data2vec_text.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

peak memory: 4757.41 MiB, increment: 253.38 MiB
CPU times: user 15.3 s, sys: 1.38 s, total: 16.7 s
Wall time: 16.3 s


In [12]:
%%time
X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, y, test_size=0.2)

CPU times: user 13.5 ms, sys: 1.03 ms, total: 14.5 ms
Wall time: 13.7 ms


In [13]:
rfc = RandomForestClassifier(n_estimators=500).fit(X_train, y_train)

In [14]:
prediction = rfc.predict(X_test)

In [15]:
print(classification_report(y_test, prediction, target_names=df.category.unique()))

               precision    recall  f1-score   support

         tech       0.88      0.85      0.87        81
     business       0.84      0.87      0.85       107
        sport       0.97      0.96      0.97       108
entertainment       0.92      0.88      0.90        74
     politics       0.82      0.85      0.84        75

     accuracy                           0.89       445
    macro avg       0.89      0.88      0.88       445
 weighted avg       0.89      0.89      0.89       445

