# Нейросетевое решение задачи

#### Загрузка эмбеддингов

In [None]:
import string
import numpy as np
import pandas as pd
import gensim
from gensim.downloader import api
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from pymorphy2 import MorphAnalyzer
import json

In [None]:
# Stop-word set: NLTK's Russian list plus extra filler words and pronoun
# forms that are listed by hand below.
stop_words = {
    *stopwords.words('russian'),
    'это', 'очень', 'вообще', 'всё', 'ещё', 'просто', 'почему',
    'которые', 'который', 'пока', 'хотя', 'вроде', 'тебе', 'твой',
    'чтото', 'такой', 'такие', 'такое', 'какой', 'какие', 'какое',
    'таким', 'такими', 'такому', 'каким', 'какими', 'какому',
    'свой', 'свои', 'свое', 'своим', 'своими', 'своему',
}

# Shared morphological analyzer used for lemmatization in preprocess_text.
morph = MorphAnalyzer()

def preprocess_text(text, use_lemmatization=True, min_length=2):
    """Turn a raw text into a list of cleaned (optionally lemmatized) tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and the result is
    tokenized. Tokens are dropped when they are stop words, contain
    non-alphabetic characters, or are shorter than ``min_length``.

    Args:
        text: Raw input text. Non-string values (for example ``None`` or a
            NaN from a pandas column) yield an empty token list.
        use_lemmatization: If true, replace each kept token with the normal
            form of the first parse returned by the module-level ``morph``.
        min_length: Minimum number of characters a token must have.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    # Missing values in a DataFrame column arrive as float NaN / None;
    # treat them as empty documents instead of failing on ``.lower()``.
    if not isinstance(text, str):
        return []

    text = re.sub(r'[^\w\s]', ' ', text.lower())

    cleaned_tokens = []
    for token in word_tokenize(text, language='russian'):
        if token in stop_words or not token.isalpha() or len(token) < min_length:
            continue

        if use_lemmatization and morph is not None:
            token = morph.parse(token)[0].normal_form
            # Re-check after lemmatization: an inflected form that is not in
            # the stop list can reduce to a lemma that is.
            if token in stop_words or len(token) < min_length:
                continue

        cleaned_tokens.append(token)

    return cleaned_tokens

In [None]:
class TextDataset(Dataset):
    """Dataset of tokenized texts encoded as fixed-length index sequences.

    Args:
        texts: Sequence of token lists (one list of words per sample).
        labels: Sequence of class labels aligned with ``texts``.
        word_to_idx: Mapping from word to its integer index.
        max_len: Length every encoded sequence is truncated or padded to.
    """

    def __init__(self, texts, labels, word_to_idx, max_len):
        self.texts = texts
        self.labels = labels
        self.word_to_idx = word_to_idx
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` tensors for sample ``idx``.

        The sequence is a ``LongTensor`` of exactly ``max_len`` indices and
        the label is a scalar ``long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Words missing from the mapping get index 0, the same index that is
        # used for padding below.
        seq = [self.word_to_idx.get(word, 0) for word in text][:self.max_len]

        # Right-pad with zeros up to max_len (a no-op when already full).
        seq += [0] * (self.max_len - len(seq))

        return torch.LongTensor(seq), torch.tensor(label, dtype=torch.long)
    

In [None]:
class EmbeddingProcessor:
    """Build, save and load an embedding matrix backed by pretrained vectors.

    Args:
        model_name: Name of the pretrained model passed to ``api.load``.

    Attributes are plain instance fields: ``model_name``, ``model`` (``None``
    until loaded) and ``emb_dim`` (embedding dimensionality, fixed to 300).
    """

    def __init__(self, model_name='word2vec-ruscorpora-300'):
        self.model_name = model_name
        self.model = None
        self.emb_dim = 300

    def load_model(self):
        """Load the pretrained vectors into ``self.model``.

        A loading failure is reported on stdout and leaves ``self.model`` as
        ``None``; ``create_embedding_matrix`` turns that into an error.
        """
        # NOTE(review): `api` is expected to be gensim's downloader module
        # (conventionally `import gensim.downloader as api`) - confirm that
        # the file-level import actually provides it.
        try:
            self.model = api.load(self.model_name)
        except Exception as e:
            print(f'Model loading error: {e}')

    def create_embedding_matrix(self, df, text_column_name, tokenizer=None):
        """Tokenize a text column and build the matching embedding matrix.

        Args:
            df: DataFrame holding the texts.
            text_column_name: Name of the column with raw texts.
            tokenizer: Optional callable mapping one cell value to a list of
                tokens. Defaults to the module-level ``preprocess_text``.

        Returns:
            tuple: ``(embedding_matrix, word_to_idx, vocab, processed_data)``
            where ``embedding_matrix`` has shape
            ``(len(vocab) + 1, emb_dim)`` with an all-zero row 0,
            ``word_to_idx`` maps each word to its row (starting at 1),
            ``vocab`` is the set of distinct tokens and ``processed_data`` is
            the list of token lists, one per row of ``df``.

        Raises:
            RuntimeError: If the pretrained model could not be loaded.
        """
        if self.model is None:
            self.load_model()
        if self.model is None:
            raise RuntimeError(
                f'Embedding model {self.model_name!r} is not available'
            )

        tokenize = preprocess_text if tokenizer is None else tokenizer
        processed_data = df[text_column_name].apply(tokenize).tolist()

        vocab = set()
        for tokens in processed_data:
            vocab.update(tokens)

        # Row 0 stays all-zero; TextDataset-style consumers use index 0 for
        # padding, so real words start at 1.
        embedding_matrix = np.zeros((len(vocab) + 1, self.emb_dim))
        word_to_idx = {}

        # Sorted iteration makes the word -> index assignment reproducible
        # between runs (set order depends on string hashing).
        # NOTE(review): the keys of 'word2vec-ruscorpora-300' presumably carry
        # POS suffixes (e.g. 'слово_NOUN'); if so, plain lemmas will mostly
        # fall into the random-init branch - verify against the model vocab.
        for i, word in enumerate(sorted(vocab), 1):
            word_to_idx[word] = i
            try:
                embedding_matrix[i] = self.model[word]
            except KeyError:
                # Out-of-vocabulary word: random initialisation.
                embedding_matrix[i] = np.random.normal(scale=0.6, size=self.emb_dim)

        return embedding_matrix, word_to_idx, vocab, processed_data

    def save_data(self, embedding_matrix, word_to_idx, file_prefix):
        """Write the matrix to ``.npy`` and the word index to JSON.

        NOTE(review): the matrix goes under ``../data/`` while the JSON is
        written relative to the current directory - confirm this is intended.
        """
        np.save(f'../data/{file_prefix}_embedding_matrix.npy', embedding_matrix)
        with open(f'{file_prefix}_word_to_index.json', 'w', encoding='utf-8') as f:
            json.dump(word_to_idx, f, ensure_ascii=False, indent=2)

    def load_data(self, file_prefix):
        """Read back what ``save_data`` wrote for ``file_prefix``.

        Returns:
            tuple: ``(embedding_matrix, word_to_idx)``.
        """
        embedding_matrix = np.load(f'../data/{file_prefix}_embedding_matrix.npy')
        # Must be opened for reading: 'w' would truncate the saved index.
        with open(f'{file_prefix}_word_to_index.json', 'r', encoding='utf-8') as f:
            word_to_idx = json.load(f)

        return embedding_matrix, word_to_idx

        

In [None]:
def process_data_for_train(df, text_column_name, label_column_name, max_len=100, batch_size=32):
    """Prepare a shuffled training DataLoader and embedding data from a DataFrame.

    Args:
        df: DataFrame with the texts and labels.
        text_column_name: Column holding the raw texts.
        label_column_name: Column holding the labels.
        max_len: Sequence length passed to ``TextDataset``.
        batch_size: Batch size passed to ``DataLoader``.

    Returns:
        tuple: ``(dataloader, embedding_matrix, word_to_idx, vocab_size)``
        where ``vocab_size`` is ``len(vocab) + 1``.
    """
    matrix, index_by_word, vocabulary, token_lists = (
        EmbeddingProcessor().create_embedding_matrix(df, text_column_name)
    )

    targets = df[label_column_name].values

    loader = DataLoader(
        TextDataset(token_lists, targets, index_by_word, max_len),
        batch_size=batch_size,
        shuffle=True,
    )

    vocab_size = len(vocabulary) + 1
    return loader, matrix, index_by_word, vocab_size

In [None]:
class TextClassifier(nn.Module):
    """Bidirectional-LSTM text classifier on top of pretrained embeddings.

    Args:
        embedding_matrix: 2-D array of shape ``(vocab_size, embedding_dim)``
            used to initialise the (trainable) embedding layer; row 0 is the
            padding row.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output classes.
        dropout: Dropout probability used in the classifier head.
    """

    def __init__(self, embedding_matrix, hidden_dim, num_classes, dropout=0.3):
        super().__init__()
        self.vocab_size, self.embedding_dim = embedding_matrix.shape

        self.embedding = nn.Embedding.from_pretrained(
            torch.as_tensor(embedding_matrix, dtype=torch.float32),
            freeze=False,
            padding_idx=0,
        )

        # Single-layer LSTM: the `dropout` argument of nn.LSTM only acts
        # between stacked layers, so passing it here would have no effect
        # besides a warning. Dropout is applied in the classifier head.
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Compute class logits for a batch of index sequences.

        Args:
            x: Integer tensor of token indices, batch first.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        embedded = self.embedding(x)

        _, (hidden, _) = self.lstm(embedded)
        # The last two entries of `hidden` are the final states of the
        # forward and backward directions; join them along the feature axis.
        hidden_combined = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.classifier(hidden_combined)


In [None]:
# Read the labelled comments and build the training loader plus embeddings.
df = pd.read_csv('../data/raw/labeled.csv')

dataloader, embedding_matrix, word_to_idx, vocab_size = process_data_for_train(
    df,
    text_column_name='comment',
    label_column_name='toxic',
    max_len=50,
    batch_size=2,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = TextClassifier(embedding_matrix, hidden_dim=128, num_classes=2)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# One pass over the data: forward, loss, backward, parameter update.
for batch_idx, (data, target) in enumerate(dataloader):
    data = data.to(device)
    target = target.to(device)

    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()