In [None]:
import torch
import os
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torchvision.models as models
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
from torch.utils.data.dataloader import DataLoader, Dataset
from torch.utils.data import random_split
import torchvision.models as models
import zipfile
from torchvision import datasets, transforms
import torch.optim as optim
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from pymystem3 import Mystem
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup



In [None]:
# pip install transformers

In [None]:
# Labelled training comments; the file uses ';' as its field separator.
comm = pd.read_csv(filepath_or_buffer='train.csv', sep=';')

In [None]:
# Comments to be scored for the submission; same ';'-separated layout.
comm_test = pd.read_csv(filepath_or_buffer='test.csv', sep=';')

In [None]:
# Preview the first rows of the test frame.
comm_test.head()

Unnamed: 0,id,comment
0,15000,"Или эти программисты, зарабатывающие 3кк с, вс..."
1,15001,"0,3 с коррекцией, т.е в очках или линзах.\n"
2,15002,"...\n\nДа, здесь все идет. Это моя страница об..."
3,15003,Да. Но отчасти в этом есть вина и самой теслы....
4,15004,нужен баланс между труд отдых зачем? бывали сл...


In [None]:
# Column dtypes and non-null counts of the training frame.
comm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       15000 non-null  int64 
 1   comment  15000 non-null  object
 2   toxic    15000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 351.7+ KB


In [None]:
# Number of rows that are exact duplicates of an earlier row.
comm.duplicated().sum()

0

In [None]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Russian stop words, held in a set for the membership test inside prep_text.
stop_words = set(stopwords.words('russian'))
# Shared Mystem instance; prep_text calls its lemmatize() method.
mystem = Mystem()
def prep_text(text):
    """Normalise one comment into an ordered list of unique Russian lemmas.

    Pipeline: lowercase, replace every character outside ``а-яё`` with a
    space, collapse whitespace, lemmatise with the module-level ``mystem``
    object, drop tokens present in the module-level ``stop_words`` set, and
    remove repeated tokens while keeping first-occurrence order.

    Args:
        text: Raw comment. Non-string input no longer raises: a list/tuple of
            tokens (e.g. a cell that was already processed) is joined with
            spaces, and any other value (``None``, NaN, numbers) is passed
            through ``str()`` before cleaning.

    Returns:
        list[str]: Lemmas in order of first appearance; ``[]`` when nothing
        Cyrillic is left after cleaning.
    """
    # The original called text.lower() directly, which fails on NaN floats
    # coming from read_csv and on token lists when the cell is re-run.
    if isinstance(text, (list, tuple)):
        text = ' '.join(map(str, text))
    elif not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'[^а-яё]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Nothing left to lemmatise: skip the lemmatizer call entirely.
    if not text:
        return []

    lemmatized_text = ' '.join(mystem.lemmatize(text))
    tokens = [word for word in lemmatized_text.split() if word not in stop_words]
    # dict.fromkeys de-duplicates while preserving insertion order.
    tokens = list(dict.fromkeys(tokens))
    return tokens
# Replace every raw training comment with the token list returned by prep_text.
# The column is overwritten in place, so the original strings are no longer
# available in `comm` after this line.
comm['comment'] = comm['comment'].apply(prep_text)
# Show the processed frame.
comm

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,comment,toxic
0,0,"[бесполезно, пытаться, доносить, человек, кома...",0
1,1,"[свинья, уметь, читать]",1
2,2,"[червепидорский, страна, парашный, конфедераци...",1
3,3,"[это, зайти, сюда, специально, информация, ниг...",1
4,4,"[дополнение, дентрен, чилийский, грязь, которы...",0
...,...,...,...
14995,14995,"[обращать, внимание, дополнение, категория, по...",0
14996,14996,"[борцовский, арена, собираться, заключать, сде...",0
14997,14997,"[заниматься, свинособака, бесплатно, секс, нед...",1
14998,14998,"[хуйло, почему, считать, постить, аватара, это...",1


In [None]:
# Same preprocessing for the test comments; this also overwrites the raw
# text column in place.
comm_test['comment'] = comm_test['comment'].apply(prep_text)


In [None]:
# Feature frame: everything except the row id and the target.
comm_data = comm.drop(columns=['id', 'toxic'])

In [None]:
# Target frame: everything except the row id and the comment text.
# Note this is a DataFrame, not a Series.
comm_target = comm.drop(columns=['id', 'comment'])

In [None]:
# First cut: 80% train, 20% held out.
(
    comm_data_train,
    comm_data_test,
    comm_target_train,
    comm_target_test,
) = train_test_split(comm_data, comm_target, test_size=0.2, random_state=42)

# Second cut: halve the held-out part into validation and test. The *_test
# names are rebound here to the final test portion.
(
    comm_data_val,
    comm_data_test,
    comm_target_val,
    comm_target_test,
) = train_test_split(comm_data_test, comm_target_test, test_size=0.5, random_state=42)

In [None]:
class CommentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        texts: pandas object read positionally through ``.iloc``. Each element
            is either a string or a list/tuple of tokens.
        labels: pandas Series or single-column DataFrame read positionally
            through ``.iloc``; exactly one integer value per row.
        tokenizer: Object exposing ``encode_plus`` with the keyword arguments
            used in ``__getitem__``.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the comment at position ``idx``.

        Returns:
            dict with keys ``'text'`` (the string given to the tokenizer),
            ``'input_ids'`` and ``'attention_mask'`` (flattened tensors), and
            ``'labels'`` (0-dim ``torch.long`` tensor).

        Raises:
            ValueError: if the label row holds more or fewer than one value.
        """
        raw_text = self.texts.iloc[idx]
        # Token lists must be joined: str(list) would hand the tokenizer the
        # Python repr, i.e. brackets, quotes and commas around every word.
        if isinstance(raw_text, (list, tuple)):
            text = ' '.join(str(token) for token in raw_text)
        else:
            text = str(raw_text)

        # A row of a single-column DataFrame is a length-1 Series. Turning it
        # into a shape-(1,) tensor makes batches (B, 1), which broadcasts
        # against (B,) predictions downstream; always emit a scalar instead.
        label = torch.tensor(np.asarray(self.labels.iloc[idx]), dtype=torch.long).reshape(-1)
        if label.numel() != 1:
            raise ValueError(
                f'expected exactly one label per row, got {label.numel()} at position {idx}'
            )

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label[0],
        }


In [None]:
# Every comment is padded or truncated to this many tokens.
max_len = 128
# Tokenizer for the same checkpoint name the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

In [None]:
# One dataset per split; each pairs the comment column with its target frame.
train_dataset = CommentDataset(
    texts=comm_data_train['comment'],
    labels=comm_target_train,
    tokenizer=tokenizer,
    max_len=max_len,
)
val_dataset = CommentDataset(
    texts=comm_data_val['comment'],
    labels=comm_target_val,
    tokenizer=tokenizer,
    max_len=max_len,
)
test_dataset = CommentDataset(
    texts=comm_data_test['comment'],
    labels=comm_target_test,
    tokenizer=tokenizer,
    max_len=max_len,
)

In [None]:
# Submission dataset: the `labels` slot carries the row ids, not targets, so
# each prediction can be matched back to its comment.
test = CommentDataset(
    texts=comm_test['comment'],
    labels=comm_test['id'],
    tokenizer=tokenizer,
    max_len=max_len,
)

In [None]:
# Only the training loader shuffles; validation and test keep dataset order
# (shuffle=False is the DataLoader default, spelled out here).
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
# `test` is rebound here from the submission Dataset to its DataLoader.
# Unwrap an existing loader first so that re-running this cell does not wrap
# a DataLoader inside another DataLoader.
test = DataLoader(
    test.dataset if isinstance(test, DataLoader) else test,
    batch_size=16,
    shuffle=False,
)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Two-class sequence classifier on top of the RuBERT checkpoint, moved to the
# chosen device right after loading.
model = AutoModelForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=2,
).to(device)

# NOTE(review): this AdamW is the one imported from `transformers`; confirm
# the installed version still provides it.
optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=False)


def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` and returning an object with ``.loss`` and
            ``.logits``. The loss is assumed to be a mean over the batch.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. Labels may be
            shaped ``(B,)`` or ``(B, 1)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the
        per-example average loss (float) and ``accuracy`` is a 0-dim float64
        tensor in ``[0, 1]``.

    Raises:
        ValueError: if ``data_loader`` yields no examples.
    """
    model = model.train()
    total_loss = 0.0
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_examples = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Flatten to (B,): with (B, 1) labels, `preds == labels` broadcasts to
        # a (B, B) matrix and the "accuracy" ends up far above 1.
        labels = batch['labels'].to(device).reshape(-1)
        batch_size = labels.size(0)

        # Clear gradients before the forward pass so leftovers from an
        # interrupted earlier run cannot leak into this step.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        preds = outputs.logits.argmax(dim=1)

        # Weight the batch-mean loss by the batch size so the final division
        # by the example count gives a true per-example mean.
        total_loss += loss.item() * batch_size
        correct_predictions += (preds == labels).sum()
        total_examples += batch_size

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    if total_examples == 0:
        raise ValueError('data_loader yielded no examples')

    return total_loss / total_examples, correct_predictions.double() / total_examples

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` and returning an object with ``.loss`` and
            ``.logits``. The loss is assumed to be a mean over the batch.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. Labels may be
            shaped ``(B,)`` or ``(B, 1)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the
        per-example average loss (float) and ``accuracy`` is a 0-dim float64
        tensor in ``[0, 1]``.

    Raises:
        ValueError: if ``data_loader`` yields no examples.
    """
    model = model.eval()
    total_loss = 0.0
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_examples = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Flatten to (B,): with (B, 1) labels, `preds == labels`
            # broadcasts to (B, B) and inflates the correct count.
            labels = batch['labels'].to(device).reshape(-1)
            batch_size = labels.size(0)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            preds = outputs.logits.argmax(dim=1)

            # Batch-mean loss times batch size, so the final division by the
            # example count is a true per-example mean.
            total_loss += outputs.loss.item() * batch_size
            correct_predictions += (preds == labels).sum()
            total_examples += batch_size

    if total_examples == 0:
        raise ValueError('data_loader yielded no examples')

    return total_loss / total_examples, correct_predictions.double() / total_examples

# Number of full passes over the training split.
num_epochs = 3
for epoch in range(num_epochs):
    # Epoch banner.
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    # Optimise on the training split, then report its metrics.
    train_loss, train_acc = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        device=device,
    )
    print(f'Train loss: {train_loss} Train accuracy: {train_acc}')

    # Score the validation split with the freshly updated weights.
    val_loss, val_acc = eval_model(
        model=model,
        data_loader=val_loader,
        device=device,
    )
    print(f'Val loss: {val_loss} Val accuracy: {val_acc}')

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------




Train loss: 0.022120505399846783 Train accuracy: 10.2845
Val loss: 0.015840234244863194 Val accuracy: 9.918666666666667
Epoch 2/3
----------
Train loss: 0.015173710732487961 Train accuracy: 10.200333333333333
Val loss: 0.01801094836369157 Val accuracy: 9.790666666666667
Epoch 3/3
----------
Train loss: 0.011007535487180576 Train accuracy: 10.182833333333333
Val loss: 0.020808576577653486 Val accuracy: 9.966666666666667


In [None]:
def predict_model(model, data_loader, device):
    """Predict a class for every example in ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning an object with ``.logits``.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. ``'labels'`` is
            passed through as the example ids and is never shown to the
            model; it may be shaped ``(B,)`` or ``(B, 1)``.
        device: Device the model inputs are moved to.

    Returns:
        tuple: ``(predictions, ids)`` — two equally long lists of NumPy
        integer scalars, in loader order.
    """
    model = model.eval()
    predictions = []
    ids = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

            predictions.extend(preds.cpu().numpy())
            # Ids are bookkeeping only, so they are not sent to the device.
            # Flatten them so (B, 1) ids yield scalars rather than 1-element
            # arrays.
            ids.extend(batch['labels'].reshape(-1).cpu().numpy())

    return predictions, ids

In [None]:
# Score the submission loader; `ids` are the comment ids carried in the
# loader's `labels` slot.
predictions, ids = predict_model(model=model, data_loader=test, device=device)


In [None]:
# Pair each comment id with its predicted class.
results_df = pd.DataFrame(
    data={
        'ID': ids,
        'prediction': predictions,
    },
)


In [None]:
# Display the submission frame (ID, prediction).
results_df

Unnamed: 0,ID,prediction
0,15000,0
1,15001,0
2,15002,1
3,15003,0
4,15004,0
...,...,...
5407,20407,0
5408,20408,0
5409,20409,0
5410,20410,0


In [None]:
# Write the submission file without the positional index column.
results_df.to_csv(path_or_buf='saaample.csv', index=False)
