# Нейросетевое решение задачи

#### Загрузка эмбэддингов

In [70]:
import numpy as np
import pandas as pd
import gensim
import gensim.downloader as api
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from pymorphy2 import MorphAnalyzer
import json
from sklearn.model_selection import train_test_split

In [71]:
stop_words = set(stopwords.words('russian'))
stop_words.update({
    'это', 'очень', 'вообще', 'всё', 'ещё', 'просто', 'почему', 
    'которые', 'который', 'пока', 'хотя', 'вроде', 'тебе', 'твой',
    'чтото', 'такой', 'такие', 'такое', 'какой', 'какие', 'какое',
    'таким', 'такими', 'такому', 'каким', 'какими', 'какому',
    'свой', 'свои', 'свое', 'своим', 'своими', 'своему'
})

morph = MorphAnalyzer()

def preprocess_text(text, use_lemmatization=True, min_length=2):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    tokens = word_tokenize(text, language='russian')
    cleaned_tokens = []
    for token in tokens:
        if (token not in stop_words and 
            token.isalpha() and 
            len(token) >= min_length):
            
            if use_lemmatization and morph:
                lemma = morph.parse(token)[0].normal_form
                cleaned_tokens.append(lemma)
            else:
                cleaned_tokens.append(token)
    
    return cleaned_tokens

In [72]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, word_to_idx, max_len):
        self.texts = texts
        self.labels = labels
        self.word_to_idx = word_to_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        if isinstance(text, str):
            tokens = text.split()
        else:
            tokens = text
            
        seq = []
        for word in tokens:
            if word in self.word_to_idx:
                seq.append(self.word_to_idx[word])
            else:
                seq.append(1)
        
        if len(seq) > self.max_len:
            seq = seq[:self.max_len]
        else:
            padding_length = self.max_len - len(seq)
            seq = seq + [0] * padding_length

        return torch.tensor(seq, dtype=torch.long), torch.tensor(label, dtype=torch.long)
    

In [73]:
class EmbeddingProcessor:
    def __init__(self, model_name='word2vec-ruscorpora-300'):
        self.model_name = model_name
        self.model = None
        self.emb_dim = 300

    def load_model(self):
        try:
            self.model = api.load(self.model_name)
        except Exception as e:
            print(f'Model loading error: {e}')

    def create_embedding_matrix(self, df, text_column_name):
        if not self.model:
            self.load_model()

        processed_data = df[text_column_name].apply(preprocess_text).tolist()

        vocab = set()
        for t in processed_data:
            vocab.update(t)

        vocab_size = len(vocab) + 2
        word_to_idx = {'<PAD>': 0, '<UNK>': 1}
        embedding_matrix = np.zeros((vocab_size, self.emb_dim))

        for i, word in enumerate(vocab, 2):
            word_to_idx[word] = i

            try:
                emb_vec = self.model[word]
                embedding_matrix[i] = emb_vec
            except KeyError:
                embedding_matrix[i] = np.random.normal(scale=0.6, size=(self.emb_dim))

        embedding_matrix[0] = np.zeros(self.emb_dim)
        embedding_matrix[1] = np.random.normal(scale=0.6, size=(self.emb_dim))
        
        return embedding_matrix, word_to_idx, vocab, processed_data
    
    def save_data(self, embedding_matrix, word_to_idx, file_prefix):
        np.save(f'../data/{file_prefix}_embedding_matrix.npy', embedding_matrix)
        with open(f'{file_prefix}_word_to_index.json', 'w', encoding='utf-8') as f:
            json.dump(word_to_idx, f, ensure_ascii=False, indent=2)
    
    def load_data(self, file_prefix):
        embedding_matrix = np.load(f'../data/{file_prefix}_embedding_matrix.npy')
        with open(f'{file_prefix}_word_to_index.json', 'w', encoding='utf-8') as f:
            word_to_idx = json.load(f)
        
        return embedding_matrix, word_to_idx 

In [74]:
def process_data_for_train(df, text_column_name, label_column_name, max_len=100, batch_size=32):
    processor = EmbeddingProcessor()
    
    embedding_matrix, word_to_idx, vocab, processed_texts = processor.create_embedding_matrix(
        df, text_column_name
    )
    
    texts = df[text_column_name].tolist()
    labels = df[label_column_name].tolist()
    
    dataset = TextDataset(texts, labels, word_to_idx, max_len)
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size],
        generator=torch.Generator().manual_seed(42)
    )
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    processor.save_data(embedding_matrix, word_to_idx, 'toxic_comments')

    return train_loader,val_loader, embedding_matrix, word_to_idx, len(vocab) + 1

In [75]:
class TextClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_classes, num_layers=2, dropout=0.3):
        super(TextClassifier, self).__init__()
        self.vocab_size, self.embedding_dim = embedding_matrix.shape

        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix),
            freeze=False,
            padding_idx=0
        )

        self.attention = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1)
        )

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(dropout // 2),
            nn.Linear(hidden_dim // 2, num_classes)
        )

    def forward(self, x):
        embedded = self.embedding(x)

        lstm_out, (hidden, cell) = self.lstm(embedded)

        attention_weights = torch.softmax(self.attention(lstm_out).squeeze(-1), dim=1)
        context_vector = torch.sum(lstm_out * attention_weights.unsqueeze(-1), dim=1)

        output = self.classifier(context_vector)

        return output


In [76]:
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, 
        max_lr=learning_rate,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader)
    )

    train_losses = []
    val_accuracies = []
    
    train_losses = []
    val_accuracies = []
    best_accuracy = 0
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for batch_idx, (data, targets) in enumerate(train_loader):
            data, targets = data.to(device), targets.to(device)
            
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
        
        model.eval()
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for data, targets in val_loader:
                data, targets = data.to(device), targets.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs.data, 1)
                val_total += targets.size(0)
                val_correct += (predicted == targets).sum().item()
        
        avg_loss = total_loss / len(train_loader)
        train_acc = 100 * correct / total
        val_acc = 100 * val_correct / val_total
        
        train_losses.append(avg_loss)
        val_accuracies.append(val_acc)
        
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save(model.state_dict(), '../models/best_nn_model.pth')
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'  Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'  Val Acc: {val_acc:.2f}%, Best Val Acc: {best_accuracy:.2f}%')
        print(f'  Learning Rate: {scheduler.get_last_lr()[0]:.6f}')
        print('-' * 50)
    
    return train_losses, val_accuracies

In [77]:
df = pd.read_csv('../data/raw/labeled.csv')

train_loader, val_loader, embedding_matrix, _, _ = process_data_for_train(
    df, 
    'comment', 
    'toxic', 
    max_len=50,
)

model = TextClassifier(
    embedding_matrix,
    hidden_dim=256,
    num_classes=2
)

train_losses, val_accuracies = train_model(
    model,
    train_loader,
    val_loader,
    10
)

Epoch 1/10:
  Train Loss: 0.6881, Train Acc: 57.80%
  Val Acc: 65.28%, Best Val Acc: 65.28%
  Learning Rate: 0.000280
--------------------------------------------------
Epoch 2/10:
  Train Loss: 0.6479, Train Acc: 65.30%
  Val Acc: 68.16%, Best Val Acc: 68.16%
  Learning Rate: 0.000761
--------------------------------------------------
Epoch 3/10:
  Train Loss: 0.6189, Train Acc: 69.18%
  Val Acc: 71.87%, Best Val Acc: 71.87%
  Learning Rate: 0.001000
--------------------------------------------------
Epoch 4/10:
  Train Loss: 0.5619, Train Acc: 74.10%
  Val Acc: 76.14%, Best Val Acc: 76.14%
  Learning Rate: 0.000950
--------------------------------------------------
Epoch 5/10:
  Train Loss: 0.4941, Train Acc: 80.02%
  Val Acc: 76.27%, Best Val Acc: 76.27%
  Learning Rate: 0.000811
--------------------------------------------------
Epoch 6/10:
  Train Loss: 0.4326, Train Acc: 84.24%
  Val Acc: 74.54%, Best Val Acc: 76.27%
  Learning Rate: 0.000611
-------------------------------------