In [12]:
import ast
import itertools
import json

import requests
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import compute_class_weight
from tqdm import tqdm
from bs4 import BeautifulSoup
import lxml
import pandas as pd
import numpy as np
import youtokentome as yttm
import torch
from torch import nn
from torch.nn import functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('data/2ch_posts.csv')

# Byte-pair encoding

In [3]:
with open('data/token_train.txt', 'w', encoding='utf8') as f:
    for text in tqdm(df['text']):
        f.write(text + '\n')

100%|██████████| 13344/13344 [00:00<00:00, 555543.57it/s]


In [4]:
yttm.BPE.train(data='data/token_train.txt', vocab_size=5000, model='models/bpe.model')

<youtokentome.youtokentome.BPE at 0x2080f2ade50>

In [2]:
bpe = yttm.BPE('models/bpe.model')

In [6]:
print(bpe.encode(['привет', 'как', 'дела', 'сап двач мур-мур-мур-мур', 'омежка', 'оп', 'тня', 'тян', 'сычевать', 'бамп'], output_type=yttm.OutputType.SUBWORD))

[['▁привет'], ['▁как'], ['▁дела'], ['▁са', 'п', '▁двач', '▁му', 'р', '-', 'му', 'р', '-', 'му', 'р', '-', 'му', 'р'], ['▁омежка'], ['▁оп'], ['▁т', 'ня'], ['▁тян'], ['▁сы', 'че', 'вать'], ['▁бамп']]


# Токенизация

In [54]:
df = pd.read_csv('data/2ch_labeled_llama.csv', index_col='id')

In [8]:
df['text'] = bpe.encode(df['text'].tolist())

In [9]:
df

Unnamed: 0_level_0,text,toxic
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[1423, 967, 698, 371, 475, 556, 284, 2441, 371...",1
1,"[2348, 436, 312, 1145, 905]",0
2,"[270, 312, 954, 1384, 1419, 37]",1
3,"[1350, 811, 1065, 288, 2425, 2109, 585, 269, 2...",1
4,"[3071, 410, 3988, 2709]",1
...,...,...
13147,"[1033, 33, 2202, 483, 284, 310, 377, 349, 2950...",1
13148,"[313, 1768, 29, 310, 377, 349, 1397, 3548, 270...",0
13149,"[483, 361, 355, 1049, 284, 2098, 2421, 2504, 6...",0
13150,"[1882, 395, 929]",1


In [10]:
df.to_csv('data/2ch_labeled_llama_bpe.csv', index_label='id')

# Обучение модели

In [13]:
df = pd.read_csv('data/2ch_labeled_llama_bpe.csv', index_col='id')
df['text'] = df['text'].apply(lambda x: ast.literal_eval(x))

In [14]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Split the datasets into features (text) and targets (toxic)
train_texts, train_labels = train['text'].tolist(), train['toxic'].tolist()
test_texts, test_labels = test['text'].tolist(), test['toxic'].tolist()

In [15]:
# Convert lists to PyTorch tensors
train_texts = [torch.tensor(seq) for seq in train_texts]
test_texts = [torch.tensor(seq) for seq in test_texts]


In [16]:
# Pad sequences
train_texts = pad_sequence(train_texts, batch_first=True, padding_value=0).to(dtype=torch.int64)
test_texts = pad_sequence(test_texts, batch_first=True, padding_value=0).to(dtype=torch.int64)

In [17]:
# Get the actual maximum length after padding
maxlen = train_texts.shape[1]

In [18]:
# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

In [19]:
class TextClassifier(nn.ModuleList):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_lin_dim, train_mode):
        super(TextClassifier, self).__init__()
        self.train_mode = train_mode

        # Embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        
        # LSTM layer 
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        
        # Output layer
        self.fc1 = nn.Linear(hidden_dim, hidden_lin_dim)
        if self.train_mode:
            self.dropout = nn.Dropout(p=0.2)
        self.batchnorm = nn.BatchNorm1d(hidden_lin_dim, momentum=0.9)
        self.fc2 = nn.Linear(hidden_lin_dim, 2)
        self.activation1 = nn.Sigmoid()
        self.activation2 = nn.Softmax()
        
        torch.nn.init.xavier_uniform(self.fc1.weight)
        torch.nn.init.xavier_uniform(self.fc2.weight)
    
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc1(x[:, -1, :])
        x = self.batchnorm(x)
        x = self.activation1(x)
        if self.train_mode:
            x = self.dropout(x)
        x = self.fc2(x)
        x = self.activation2(x)
        
        return x

In [20]:
hidden_lin_dim = 15
embedding_dim = 30
hidden_dim = 15
batch_size = 450
vocab_size = 5000

train = torch.utils.data.TensorDataset(train_texts, train_labels)
test = torch.utils.data.TensorDataset(test_texts,test_labels)

train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TextClassifier(vocab_size, embedding_dim, hidden_dim, hidden_lin_dim, True).to(device)

# Balance the weights of the loss function since the dataset is imbalanced
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels.numpy())

loss_function = nn.CrossEntropyLoss(weight=torch.FloatTensor(class_weights).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

epochs = 100000

for epoch in range(epochs):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output = model(inputs)
        loss = loss_function(output, labels)
        loss.backward()
        optimizer.step()

    # Testing
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Testing
    correctTrain = 0
    totalTrain = 0
    with torch.no_grad():
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            totalTrain += labels.size(0)
            correctTrain += (predicted == labels).sum().item()
    
    print('Epoch {}/{}...'.format(epoch+1, epochs),
          'Loss: {:.4f}...'.format(loss.item()),
          'Train Accuracy: {:.2f}...'.format(correctTrain / totalTrain * 100),
          'Test Accuracy: {:.2f}...'.format(correct / total * 100))

  torch.nn.init.xavier_uniform(self.fc1.weight)
  torch.nn.init.xavier_uniform(self.fc2.weight)
  return self._call_impl(*args, **kwargs)


Epoch 1/100000... Loss: 0.7367... Train Accuracy: 59.61... Test Accuracy: 57.07...
Epoch 2/100000... Loss: 0.7007... Train Accuracy: 53.57... Test Accuracy: 57.58...
Epoch 3/100000... Loss: 0.6979... Train Accuracy: 45.02... Test Accuracy: 50.90...
Epoch 4/100000... Loss: 0.6958... Train Accuracy: 41.35... Test Accuracy: 44.99...


KeyboardInterrupt: 

# Сохранение модели

In [14]:
torch.save(model.state_dict(), 'models/model.pt')

In [24]:
model = TextClassifier(vocab_size, embedding_dim, hidden_dim, hidden_lin_dim, False).to(device)
model.load_state_dict(torch.load('models/model.pt'))
model.eval()

  torch.nn.init.xavier_uniform(self.fc1.weight)
  torch.nn.init.xavier_uniform(self.fc2.weight)


TextClassifier(
  (0): Embedding(5000, 30, padding_idx=0)
  (1): LSTM(30, 15, batch_first=True)
  (2): Linear(in_features=15, out_features=15, bias=True)
  (3): BatchNorm1d(15, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
  (4): Linear(in_features=15, out_features=2, bias=True)
  (5): Sigmoid()
  (6): Softmax(dim=None)
)

In [25]:
def predict(text):
    text = bpe.encode([text])
    text = torch.tensor(text)
    text = pad_sequence(text, batch_first=True, padding_value=0).to(dtype=torch.int64).to(device)
    output = model(text)
    _, predicted = torch.max(output.data, 1)
    return "Токсичные отходы" if predicted.item() == 1 else "Нормальный пост"

In [77]:
predict('Сап двачику, как найти тян?')

  return self._call_impl(*args, **kwargs)


'Нормальный пост'

In [78]:
predict('ОП, ты серьёзно? Очередной двачевский тред про тян')

  return self._call_impl(*args, **kwargs)


'Токсичные отходы'

In [79]:
# Перформанс от github copilot:
predict('Ебал мать ОПа, сука, ебал её в рот, ебал её в жопу, ебал её в пизду, ебал её в уши, ебал её в нос, ебал её в глаза, ебал её в волосатую пизду, ебал её в волосатую жопу, ебал её в волосатый рот, ебал её в волосатые уши, ебал её в волосатый нос, ебал её в волосатые глаза, ебал её в волосатые волосы, ебал её в волосатые ногти, ебал её в волосатые ресницы, ебал её в волосатые брови, ебал её в вол�')

  return self._call_impl(*args, **kwargs)


'Токсичные отходы'

In [80]:
predict('Почему коммунизм не случился?')

  return self._call_impl(*args, **kwargs)


'Нормальный пост'

In [81]:
predict('Я все пропустил, что за тема с вечеринки евлеевой? Есть пак со всеми фотками?')

  return self._call_impl(*args, **kwargs)


'Нормальный пост'

In [82]:
predict("это результаты полугода работы в зале, шизик. Генетика у него, лол Еще и оправдывается мол дальше не шел")

  return self._call_impl(*args, **kwargs)


'Нормальный пост'

  return self._call_impl(*args, **kwargs)


'Нормальный пост'