# HW 9

## Домашнее задание

1. Возьмите готовую модель из https://huggingface.co/models для классификации сентимента текста.
2. Сделайте предсказания на всем df_val. Посчитайте метрику качества.
3. Дообучите эту модель на df_train. Посчитайте метрику качества на df_val.

Данные на google drive: https://drive.google.com/file/d/1Mev_EEput0LlBj8MDHIJkBtahlJ6J901

---

In [None]:
!pip install transformers

In [None]:
!pip install torchmetrics

In [None]:
# Загрузка библиотек

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm
from collections import Counter
import torchmetrics
import matplotlib.pyplot as plt

import pandas as pd
import transformers
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

---

In [None]:
# Load the pretrained toxicity classifier and report its size.

model_bert = BertForSequenceClassification.from_pretrained(
    'SkolkovoInstitute/russian_toxicity_classifier'
)
print(model_bert)

# Total number of scalar weights across every parameter tensor of the model.
print(
    "Parameters full train:",
    sum(weights.numel() for weights in model_bert.parameters()),
)

In [None]:
# Classification example

sentiment = pipeline(
    task="text-classification",
    model='SkolkovoInstitute/russian_toxicity_classifier',
)
# Bare expression on purpose: the notebook displays the prediction it returns.
sentiment("Этот ресторан отличный")

In [None]:
# Tokenization example

tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')

example_text = 'Пример текста для токенизации'

# Pad/truncate to exactly 10 tokens and return PyTorch tensors.
bert_input = tokenizer(
    example_text,
    return_tensors="pt",
    truncation=True,
    padding='max_length',
    max_length=10,
)

# Token ids first, then the attention mask, one per line.
print(bert_input['input_ids'], bert_input['attention_mask'], sep='\n')

In [None]:
# Turn the token ids of the (single) encoded sequence back into text to see
# what the tokenizer actually produced.
example_text = tokenizer.decode(bert_input['input_ids'][0])

print(example_text)

---

In [None]:
# Load the data

from google.colab import drive

# Locations of the train/validation CSV files inside the mounted drive.
train_csv = '/content/drive/My Drive/data/train.csv'
val_csv = '/content/drive/My Drive/data/val.csv'

# The drive has to be mounted before the paths above can be read.
drive.mount('/content/drive')

df_train = pd.read_csv(filepath_or_buffer=train_csv)
df_val = pd.read_csv(filepath_or_buffer=val_csv)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the validation frame.
df_val.head()

In [None]:
# Compare the stored label of one training sample with the model's prediction.
sentiment = pipeline("text-classification", model='SkolkovoInstitute/russian_toxicity_classifier')

idx = 0
sample = df_train.iloc[idx]

# Run the model once and reuse the result for both the label and the score
# (the pipeline call is a full forward pass, so calling it twice is wasteful).
prediction = sentiment(sample['text'])[0]

print(sample['text'])
print('label is', sample['class'])
print('label by model is', prediction['label'], 'with score', prediction['score'])

---

In [None]:
# Build the dataset and the data loaders

# Class TwitterDataset

class TwitterDataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenized texts paired with their labels.

    All texts are tokenized eagerly in the constructor; ``__getitem__`` only
    looks up the stored encoding and the label at the same position.

    Args:
        txts: iterable of raw text strings.
        labels: indexable collection of labels, aligned with ``txts``.
        max_length: number of tokens every text is padded/truncated to.
            Defaults to 10, the value previously hard-coded here.
        model_name: checkpoint name passed to ``BertTokenizer.from_pretrained``.
            Defaults to the checkpoint used throughout this notebook.
    """

    def __init__(self, txts, labels, max_length=10,
                 model_name='SkolkovoInstitute/russian_toxicity_classifier'):
        self._labels = labels

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # Each text is encoded separately with return_tensors="pt", so after
        # batching by the DataLoader every field carries an extra singleton dim:
        #   'input_ids'      -- (B, 1, max_length) tensor of token ids
        #   'token_type_ids' -- (B, 1, max_length) tensor of token type ids
        #   'attention_mask' -- (B, 1, max_length) tensor marking which tokens
        #                       the model should attend to
        self._txts = [
            self.tokenizer(
                text,
                padding='max_length',
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )
            for text in txts
        ]

    def __len__(self):
        """Return the number of tokenized texts."""
        return len(self._txts)

    def __getitem__(self, index):
        """Return the ``(encoding, label)`` pair stored at ``index``."""
        return self._txts[index], self._labels[index]

In [None]:
# Labels as plain arrays, one entry per row of the corresponding frame.
y_train = df_train['class'].values
y_val = df_val['class'].values

# Tokenization happens here, inside the dataset constructors.
train_dataset = TwitterDataset(txts=df_train['text'], labels=y_train)
valid_dataset = TwitterDataset(txts=df_val['text'], labels=y_val)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=0,
)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
)

In [None]:
# Peek at a single batch to check the structure the loader produces.
for txt, lbl in train_loader:
    # Printed, one per line:
    #   - the batch keys: 'input_ids', 'token_type_ids', 'attention_mask'
    #   - shape of the token ids, (B, 1, max_len)
    #   - shape of the attention mask, (B, 1, max_len), i.e. the positions
    #     the model should attend to
    print(txt.keys(), txt['input_ids'].shape, txt['attention_mask'].shape, sep='\n')
    break

---

In [None]:
# Pretrained model on the validation set

# f1 score

model_bert = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# Make inference mode explicit (dropout off) rather than relying on defaults.
model_bert.eval()

# NOTE(review): recent torchmetrics releases require an explicit `task=`
# argument for F1Score -- confirm against the installed version.
valid_f1 = torchmetrics.F1Score()

# Pure inference: no autograd graph is needed, which saves memory and time.
with torch.no_grad():
    for val_input, val_label in valid_loader:
        # Drop the singleton dim left by per-sample tokenization so the mask
        # has the same (B, max_len) layout as the token ids.
        mask = val_input['attention_mask'].squeeze(1)
        input_id = val_input['input_ids'].squeeze(1)

        # First element of the model output: the class logits.
        output = model_bert(input_id, mask)[0]

        # Accumulate the metric state batch by batch.
        valid_f1(output, val_label)

print(f'Val f1_score: {valid_f1.compute().item():.3f}')

---

In [None]:
# Fine-tuning and new metrics

model = BertForSequenceClassification.from_pretrained(
    'SkolkovoInstitute/russian_toxicity_classifier'
)
print(model)

# Size of the whole network versus the classification head alone.
print(
    "Parameters full train:",
    sum(weights.numel() for weights in model.parameters()),
)
print(
    "Parameters transfer learning:",
    sum(weights.numel() for weights in model.classifier.parameters()),
)

In [None]:
# Loss and optimizer: only the last (classification) layer is trained

criterion = nn.CrossEntropyLoss()

# The optimizer below only ever updates the classifier head, so gradients for
# the rest of the network would be computed and then thrown away. Freeze
# everything except the head to skip that work; the head's updates are unchanged.
for param in model.parameters():
    param.requires_grad_(False)
for param in model.classifier.parameters():
    param.requires_grad_(True)

optimizer = Adam(model.classifier.parameters(), lr=0.001)

In [None]:
# Fine-tuning loop with per-epoch loss and F1 on train and validation data

train_f1 = torchmetrics.F1Score()
valid_f1 = torchmetrics.F1Score()

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss_train = 0.0
    for train_input, train_label in tqdm(train_loader):
        # Drop the singleton dim left by per-sample tokenization so mask and
        # ids share the same (B, max_len) layout.
        mask = train_input['attention_mask'].squeeze(1)
        input_id = train_input['input_ids'].squeeze(1)

        # First element of the model output: the class logits.
        output = model(input_id, mask)[0]

        batch_loss = criterion(output, train_label)
        # The criterion returns the mean over the batch; weight it by the
        # batch size so that dividing by the dataset size below yields the
        # true per-sample average.
        total_loss_train += batch_loss.item() * len(train_label)

        train_f1(output, train_label)

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()

    model.eval()
    total_loss_val = 0.0
    # Validation is pure inference: no autograd graph is needed.
    with torch.no_grad():
        for val_input, val_label in valid_loader:
            mask = val_input['attention_mask'].squeeze(1)
            input_id = val_input['input_ids'].squeeze(1)

            output = model(input_id, mask)[0]

            batch_loss = criterion(output, val_label)
            total_loss_val += batch_loss.item() * len(val_label)

            valid_f1(output, val_label)

    print(
        f'Epochs: {epoch + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
        | Train f1: {train_f1.compute().item(): .3f} \
        | Val Loss: {total_loss_val / len(valid_dataset): .3f} \
        | Val f1: {valid_f1.compute().item(): .3f}')
    # Start the next epoch with empty metric state.
    train_f1.reset()
    valid_f1.reset()

In [None]:
# F1 of the fine-tuned model on the validation set

valid_f1 = torchmetrics.F1Score()
model.eval()

# Pure inference: no autograd graph is needed, which saves memory and time.
with torch.no_grad():
    for val_input, val_label in valid_loader:
        # Drop the singleton dim left by per-sample tokenization so the mask
        # has the same (B, max_len) layout as the token ids.
        mask = val_input['attention_mask'].squeeze(1)
        input_id = val_input['input_ids'].squeeze(1)

        # First element of the model output: the class logits.
        output = model(input_id, mask)[0]

        valid_f1(output, val_label)

print(f'Val f1_score: {valid_f1.compute().item():.3f}')

---

In [None]:
# Adding a sigmoid on top of the model output

class BertClassifier(nn.Module):
    """Pretrained sequence classifier followed by an element-wise sigmoid.

    Args:
        dropout: accepted for interface compatibility; not used anywhere in
            this class.

    NOTE(review): elsewhere in this notebook the output of this module is fed
    to ``nn.CrossEntropyLoss``, which normally expects raw logits. The sigmoid
    confines every score to (0, 1) before that loss -- confirm this is the
    intended experiment.
    """

    def __init__(self, dropout=0.5):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'SkolkovoInstitute/russian_toxicity_classifier'
        )
        self.sigm = nn.Sigmoid()

    def forward(self, x, mask):
        """Return the sigmoid of the wrapped model's logits.

        Args:
            x: token ids, passed to the wrapped model as ``input_ids``.
            mask: attention mask, passed as ``attention_mask``.
        """
        # With return_dict=False the wrapped model returns a tuple whose first
        # element holds the class logits, shape (B, 2).
        bert_outputs = self.bert(input_ids=x, attention_mask=mask, return_dict=False)
        logits = bert_outputs[0]
        return self.sigm(logits)

In [None]:
# Instantiate the sigmoid-wrapped classifier and report its size.
model_sigm = BertClassifier()
print(model_sigm)

# Size of the whole network versus the classification head alone.
print(
    "Parameters full train:",
    sum(weights.numel() for weights in model_sigm.parameters()),
)
print(
    "Parameters transfer learning:",
    sum(weights.numel() for weights in model_sigm.bert.classifier.parameters()),
)

In [None]:
# Loss and optimizer for fine-tuning the sigmoid-wrapped model

criterion = nn.CrossEntropyLoss()

# The optimizer below only ever updates the classifier head, so gradients for
# the rest of the network would be computed and then thrown away. Freeze
# everything except the head to skip that work; the head's updates are unchanged.
for param in model_sigm.parameters():
    param.requires_grad_(False)
for param in model_sigm.bert.classifier.parameters():
    param.requires_grad_(True)

optimizer = Adam(model_sigm.bert.classifier.parameters(), lr=0.001)

In [None]:
# Fine-tuning loop for the sigmoid-wrapped model, with per-epoch loss and F1
train_f1 = torchmetrics.F1Score()
valid_f1 = torchmetrics.F1Score()

epochs = 5

for epoch in range(epochs):
    model_sigm.train()
    total_loss_train = 0.0
    for train_input, train_label in tqdm(train_loader):
        # Drop the singleton dim left by per-sample tokenization so mask and
        # ids share the same (B, max_len) layout.
        mask = train_input['attention_mask'].squeeze(1)
        input_id = train_input['input_ids'].squeeze(1)

        output = model_sigm(input_id, mask)

        batch_loss = criterion(output, train_label)
        # The criterion returns the mean over the batch; weight it by the
        # batch size so that dividing by the dataset size below yields the
        # true per-sample average.
        total_loss_train += batch_loss.item() * len(train_label)

        train_f1(output, train_label)

        model_sigm.zero_grad()
        batch_loss.backward()
        optimizer.step()

    model_sigm.eval()
    total_loss_val = 0.0
    # Validation is pure inference: no autograd graph is needed.
    with torch.no_grad():
        for val_input, val_label in valid_loader:
            mask = val_input['attention_mask'].squeeze(1)
            input_id = val_input['input_ids'].squeeze(1)

            output = model_sigm(input_id, mask)

            batch_loss = criterion(output, val_label)
            total_loss_val += batch_loss.item() * len(val_label)

            valid_f1(output, val_label)

    print(
        f'Epochs: {epoch + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
        | Train f1: {train_f1.compute().item(): .3f} \
        | Val Loss: {total_loss_val / len(valid_dataset): .3f} \
        | Val f1: {valid_f1.compute().item(): .3f}')
    # Start the next epoch with empty metric state.
    train_f1.reset()
    valid_f1.reset()

In [None]:
# Metric of the fine-tuned model on the validation dataset

valid_f1 = torchmetrics.F1Score()
model_sigm.eval()

# Pure inference: no autograd graph is needed, which saves memory and time.
with torch.no_grad():
    for val_input, val_label in valid_loader:
        # Drop the singleton dim left by per-sample tokenization so the mask
        # has the same (B, max_len) layout as the token ids.
        mask = val_input['attention_mask'].squeeze(1)
        input_id = val_input['input_ids'].squeeze(1)

        output = model_sigm(input_id, mask)

        valid_f1(output, val_label)

print(f'Val f1_score: {valid_f1.compute().item():.3f}')