### Задание

Обучить модель векторизации на задаче мультиклассовой классификации текстов.

1. Выбрать датасет для задачи классификации текстов (https://huggingface.co/datasets)
2. Выбрать модель векторизации (https://huggingface.co/models)
3. Обучить классификатор с mean_pooling.
4. Посчитать метрики: F1-Score, Precision, Recall. (https://lightning.ai/docs/torchmetrics/stable/)
5. Обучить классификатор с cls_pooling.
6. Посчитать метрики: F1-Score, Precision, Recall.
7. Сравнить метрики, полученные при использовании разных стратегий пулинга.

In [None]:
import torch
import pandas as pd
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import datasets
from collections import Counter

### Data preprocessing

In [None]:
# Load the dataset by its Hugging Face Hub id; later cells read its
# 'train', 'validation' and 'test' splits.
ds = datasets.load_dataset("rootacess/math-qa-classification")

In [None]:
# Notebook cell output: show the loaded dataset object.
ds

In [None]:
# Class balance: how many training examples carry each 'category' value.
Counter(ds['train']['category'])

In [None]:
# Category names listed in id order: a name's position is its integer class id.
label2id = dict(
    zip(
        ("general", "physics", "gain", "geometry", "probability", "other"),
        range(6),
    )
)
label2id

In [None]:
# Reverse lookup: integer class id -> category name.
id2label = dict(zip(label2id.values(), label2id.keys()))
id2label

In [None]:
# Length of every training 'Problem' entry (character count for string entries).
text_legnths = list(map(len, ds['train']['Problem']))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the problem lengths on a 10x5 inch figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=text_legnths, ax=ax)
plt.show()

In [None]:
import pandas as pd
# Disable column-width truncation so whole cell contents are visible,
# then preview the first 20 training rows as a table.
pd.set_option('display.max_colwidth', None)
pd.DataFrame(ds['train'][:20])

### Tokenization

In [None]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('ibm-granite/granite-embedding-30m-english')

def tokenize_data(data, max_length=512, text_column='Problem'):
    """Tokenize the text column of a dataset split into fixed-length tensors.

    Args:
        data: Mapping-like split (e.g. ``ds['train']``) whose ``text_column``
            entry is an iterable of strings.
        max_length: Every sequence is truncated and padded to exactly this
            many tokens.
        text_column: Name of the column that holds the raw texts.

    Returns:
        The tokenizer's batch encoding with PyTorch tensors; later cells read
        ``input_ids`` and ``attention_mask`` from it.
    """
    texts = list(data[text_column])
    # Calling the tokenizer directly is the supported batch API;
    # ``batch_encode_plus`` is deprecated in favour of ``__call__``.
    return tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Tokenize all three splits with identical (default) settings.
train_tokens, val_tokens, test_tokens = (
    tokenize_data(ds[split_name])
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Sanity check: (number of training examples, padded sequence length).
print(train_tokens.input_ids.shape)

In [None]:
def _to_tensor_dataset(tokens, split_name):
    """Bundle a split's token ids, attention masks and labels into one dataset."""
    # NOTE(review): torch.tensor needs numeric input, so 'category' is
    # presumably stored as integer ids matching label2id — confirm.
    labels = torch.tensor(ds[split_name]['category'], dtype=torch.long)
    return TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)


train_dataset = _to_tensor_dataset(train_tokens, 'train')
val_dataset = _to_tensor_dataset(val_tokens, 'validation')
test_dataset = _to_tensor_dataset(test_tokens, 'test')

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Only the training split is shuffled; validation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)

### Loading Pretrained

In [None]:
# Pick the accelerator that is actually usable at runtime.
# ``is_built()`` only reports that PyTorch was compiled with MPS support;
# ``is_available()`` is the check that tells whether MPS can be used on
# this machine, so it is the one that must gate the 'mps' choice.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

### Fine-Tuning with Mean Pooling

In [None]:
import torch.nn.functional as F


def avg_train_and_val(
        model,
        loader: DataLoader,
        epoch,
        num_epochs,
        optimizer,
        mode,
):
    """Run one pass over ``loader`` and report the mean batch loss.

    This function does not switch the model between train/eval mode and does
    not disable gradient tracking itself; the caller is expected to do both.

    Args:
        model: Callable returning class logits for ``input_ids`` and
            ``attention_mask`` keyword arguments.
        loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        epoch: Zero-based epoch index; displayed one-based.
        num_epochs: Total number of epochs, used for display only.
        optimizer: Stepped after every batch when ``mode == 'Training'``;
            unused otherwise.
        mode: ``'Training'`` enables parameter updates; any other value
            (e.g. ``'Validating'``) only accumulates the loss.

    Returns:
        Mean cross-entropy loss over the batches, or ``nan`` when the loader
        yields no batches.
    """
    is_training = mode == 'Training'
    total_loss = 0.0

    # The progress bar and the summary line both show the one-based epoch so
    # the two outputs agree with each other.
    progress = tqdm(loader, desc=f'{mode} epoch {epoch + 1}/{num_epochs}...')
    for input_ids, attention_mask, labels in progress:
        if is_training:
            optimizer.zero_grad()

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        outputs_loss = F.cross_entropy(logits, labels)
        total_loss += outputs_loss.item()

        if is_training:
            outputs_loss.backward()
            optimizer.step()

    # Guard against an empty loader instead of raising ZeroDivisionError.
    num_batches = len(loader)
    loss = total_loss / num_batches if num_batches else float('nan')
    print(f'{mode} epoch {epoch + 1}/{num_epochs}: {mode} Loss: {loss:.4f}')
    return loss

In [None]:
def avg_train(
        model,
        train_loader,
        val_loader,
        optimizer,
        num_epochs=3,
):
    """Alternate one training pass and one validation pass per epoch.

    Args:
        model: Model to fine-tune; switched to train mode before each training
            pass and to eval mode before each validation pass.
        train_loader: Batches used for parameter updates.
        val_loader: Batches used for the gradient-free validation pass.
        optimizer: Optimizer forwarded to the per-epoch routine.
        num_epochs: Number of train/validate rounds to run.
    """
    for epoch_idx in range(num_epochs):
        # Training pass.
        model.train()
        avg_train_and_val(
            model,
            train_loader,
            epoch_idx,
            num_epochs,
            optimizer,
            mode='Training',
        )

        # Validation pass: eval mode, no gradient tracking.
        model.eval()
        with torch.no_grad():
            avg_train_and_val(
                model,
                val_loader,
                epoch_idx,
                num_epochs,
                optimizer,
                mode='Validating',
            )

In [None]:
class Classifier(torch.nn.Module):
    """Linear classification head on top of a pooled pretrained encoder.

    Args:
        pretrained: Encoder called with ``input_ids`` and ``attention_mask``
            keyword arguments whose output exposes ``last_hidden_state``.
        pooling_type: ``'mean'`` (mask-aware average over tokens) or ``'cls'``
            (embedding of the first token).
        hidden_size: Width of the encoder's token embeddings. When ``None`` it
            is read from ``pretrained.config.hidden_size`` and falls back to
            384 if the encoder exposes no such attribute.
        num_labels: Number of output classes.

    Raises:
        ValueError: If ``pooling_type`` is not one of ``POOLING_TYPES``.
    """

    POOLING_TYPES = ('mean', 'cls')

    def __init__(self, pretrained, pooling_type, hidden_size=None, num_labels=6):
        super().__init__()
        # Fail at construction time rather than deep inside ``forward``.
        if pooling_type not in self.POOLING_TYPES:
            raise ValueError(
                f'Unsupported pooling_type {pooling_type!r}; '
                f'expected one of {self.POOLING_TYPES}'
            )
        if hidden_size is None:
            config = getattr(pretrained, 'config', None)
            hidden_size = getattr(config, 'hidden_size', 384)

        self.pooling_type = pooling_type
        self.pretrained = pretrained
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def mean_pooling(
        self,
        token_embeddings: torch.Tensor,
        attention_mask: torch.Tensor
    ):
        """Average token embeddings over dim 1, ignoring masked-out positions."""
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.shape).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked row divides by a tiny number, not by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def cls_pooling(
        self,
        token_embeddings: torch.Tensor,
        attention_mask: torch.Tensor
    ):
        """Return the embedding at position 0 of every sequence.

        ``attention_mask`` is accepted only to mirror ``mean_pooling``.
        """
        return token_embeddings[:, 0]

    def forward(self, input_ids, attention_mask):
        """Encode the batch, pool the token embeddings and return class logits."""
        outputs = self.pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        if self.pooling_type == 'mean':
            embeddings = self.mean_pooling(outputs.last_hidden_state, attention_mask)
        elif self.pooling_type == 'cls':
            embeddings = self.cls_pooling(outputs.last_hidden_state, attention_mask)
        else:
            # Reached only if ``pooling_type`` was reassigned after construction.
            raise ValueError(f'Unsupported pooling_type {self.pooling_type!r}')

        return self.classifier(embeddings)


In [None]:
# Fresh encoder weights for the mean-pooling run.
# NOTE(review): Classifier only reads last_hidden_state, so
# output_hidden_states=True looks unnecessary — confirm before removing.
pretrained = AutoModel.from_pretrained(
    'ibm-granite/granite-embedding-30m-english',
    output_hidden_states=True,
)

# Encoder and linear head are optimized together (all parameters are passed).
classifier = Classifier(pretrained, pooling_type='mean').to(device)
optimizer = torch.optim.AdamW(params=classifier.parameters(), lr=5e-5)

In [None]:
# Fine-tune the mean-pooling classifier for avg_train's default 3 epochs.
avg_train(classifier, train_loader, val_loader, optimizer)

In [None]:
def eval_model(model, loader):
    """Predict a class id for every example yielded by ``loader``.

    Args:
        model: Callable returning class logits for ``input_ids`` and
            ``attention_mask`` keyword arguments; put into eval mode here.
        loader: Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``(preds, labels)``: two equally long lists of Python scalars — the
        arg-max class per example and the matching ground-truth label, in
        loader order.
    """
    preds = []
    labels = []
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in tqdm(loader):
            logits = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )

            # Arg-max over dim 1 picks one predicted id per example.
            preds.extend(torch.argmax(logits, dim=1).tolist())
            labels.extend(batch_labels.tolist())

    return preds, labels

In [None]:
# Test-split batches in dataset order, shared by both evaluation runs.
loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

In [None]:
# Test-set predictions and ground-truth labels for the mean-pooling model.
preds_mean, labels_mean = eval_model(classifier, loader)

In [None]:
# Reload the encoder so the CLS-pooling run starts from the pretrained
# weights again instead of the weights fine-tuned in the previous run.
# NOTE(review): Classifier only reads last_hidden_state, so
# output_hidden_states=True looks unnecessary — confirm before removing.
pretrained = AutoModel.from_pretrained(
    'ibm-granite/granite-embedding-30m-english',
    output_hidden_states=True,
)

# Encoder and linear head are optimized together (all parameters are passed).
classifier = Classifier(pretrained, pooling_type='cls').to(device)
optimizer = torch.optim.AdamW(params=classifier.parameters(), lr=5e-5)

In [None]:
# Fine-tune the CLS-pooling classifier for avg_train's default 3 epochs.
avg_train(classifier, train_loader, val_loader, optimizer)

In [None]:
# Test-set predictions and ground-truth labels for the CLS-pooling model.
preds_cls, labels_cls = eval_model(classifier, loader)

In [None]:
# Category names ordered by ascending class id, for use as report row names.
class_labels = [id2label[class_id] for class_id in sorted(id2label)]

## Compare labels and preds here

In [None]:
from sklearn.metrics import classification_report
# Mean-pooling results. The full id set is passed explicitly so the report
# stays aligned with class_labels even if some class never occurs in the
# test labels or predictions.
print(classification_report(
    labels_mean,
    preds_mean,
    labels=sorted(id2label),
    target_names=class_labels,
))

In [None]:
# CLS-pooling results. The full id set is passed explicitly so the report
# stays aligned with class_labels even if some class never occurs in the
# test labels or predictions.
print(classification_report(
    labels_cls,
    preds_cls,
    labels=sorted(id2label),
    target_names=class_labels,
))