In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/kaggle/input/emotion-anal/data.csv')

# Очистка данных
df['Answer'] = df['Answer'].str.lower().str.replace('[^\w\s]', '', regex=True)

# Разделение данных на признаки и метки
X = df['Answer']
y = df.iloc[:, 1:]

# Разделение данных на тренировочную и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
def tokenize_data(texts, labels, tokenizer, max_length=256):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values, dtype=torch.float)
    
    return input_ids, attention_masks, labels

In [4]:
import torch
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

X_train_ids, X_train_masks, y_train_tensors = tokenize_data(X_train, y_train, tokenizer)
X_test_ids, X_test_masks, y_test_tensors = tokenize_data(X_test, y_test, tokenizer)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [5]:
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=y_train.shape[1])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [8]:
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.metrics import accuracy_score, classification_report

# Создание DataLoader для тренировочных данных
train_data = TensorDataset(X_train_ids, X_train_masks, y_train_tensors)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)

# Создание DataLoader для тестовых данных
test_data = TensorDataset(X_test_ids, X_test_masks, y_test_tensors)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)



In [9]:
def train_model(model, dataloader, optimizer, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for step, batch in enumerate(dataloader):
            batch_input_ids, batch_attention_masks, batch_labels = batch
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_masks = batch_attention_masks.to(device)
            batch_labels = batch_labels.to(device)
            
            model.zero_grad()
            
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_masks,
                labels=batch_labels
            )
            loss = outputs.loss
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch + 1}, Loss: {avg_loss}')

In [18]:
train_model(model, train_dataloader, optimizer, device, 5)

Epoch 1, Loss: 0.023406963090638857
Epoch 2, Loss: 0.02223952181284895
Epoch 3, Loss: 0.01997888132275359
Epoch 4, Loss: 0.019106505860297662
Epoch 5, Loss: 0.01794999240381593


In [16]:
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            batch_input_ids, batch_attention_masks, batch_labels = batch
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_masks = batch_attention_masks.to(device)
            batch_labels = batch_labels.to(device)
            
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_masks
            )
            logits = outputs.logits
            predictions.append(logits.cpu().numpy())
            true_labels.append(batch_labels.cpu().numpy())
    
    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    
    return predictions, true_labels

In [19]:
import numpy as np
from sklearn.metrics import f1_score

predictions, true_labels = evaluate_model(model, test_dataloader, device)

predicted_labels = (predictions > 0.5).astype(int)

acc = []
f1 = []
for i, col in enumerate(y.columns):
    acc.append(accuracy_score(true_labels[:, i], predicted_labels[:, i]))
    f1.append(f1_score(true_labels[:, i], predicted_labels[:, i], zero_division=True))

print("Средняя accuracy по всем классам:", np.mean(acc))
print("Средняя F1 по всем классам: ", np.mean(f1))

Средняя accuracy по всем классам: 0.9309175920514319
Средняя F1 по всем классам:  0.515602152738415


### Выводы
В предыдущем уроке были обучены логистическая регрессия и SVM на фичах TF-IDF. Данные модели показали качества accuracy примерно 90%, после оптимизации гиперпараметров было достигнуто f1 0.34.  

Дообученная NLP модель roberta после примерно 20 эпох обучения показала accuracy 93%, а f1 0.51, что означает, что модель улучшилась.