**Обучим модель классифицировать токсичность**

In [16]:
import pandas as pd
import re
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_curve, classification_report, f1_score, precision_score, recall_score
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Sampler, TensorDataset
from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification, \
 get_linear_schedule_with_warmup, AdamW


In [5]:
# Load the Jigsaw toxic-comment training CSV and display the resulting frame.
train_csv_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
df = pd.read_csv(train_csv_path)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
def preprocess(text):
    """Normalize a raw comment into lowercase words separated by single spaces.

    Steps, in order: lowercase and trim, drop ``<...>`` tag-like spans, replace
    bracketed numeric references such as ``[12]`` with a space, delete every
    character that is neither a word character nor whitespace, replace digits
    with spaces, and finally collapse whitespace runs and trim the edges.

    Args:
        text: The comment to clean (a ``str``).

    Returns:
        The cleaned string. It may be empty, and never has leading or
        trailing whitespace.
    """
    text = text.lower().strip()
    # Non-greedy so each tag is removed separately; '.' does not cross newlines.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)
    # Collapse and trim last: the substitutions above insert spaces, so doing
    # this earlier would leave stray whitespace at the edges of the result.
    return re.sub(r'\s+', ' ', text).strip()

In [6]:
# The six binary label columns of the Jigsaw training set.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Build a fresh two-column frame: the cleaned text and the toxicity level
# (number of labels set on the comment, 0..6).
# Constructing a new DataFrame avoids assigning into a column-sliced view
# (SettingWithCopyWarning), and selecting the labels by name makes a repeated
# run fail loudly instead of silently summing an empty slice to 0.
df = pd.DataFrame({
    'comment_text': df['comment_text'].apply(preprocess),
    'toxicity': df[LABEL_COLUMNS].sum(axis=1),
})
df['toxicity'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0    143346
1      6360
3      4209
2      3480
4      1760
5       385
6        31
Name: toxicity, dtype: int64

In [None]:
SAMPLE_SEED = 42      # fixed seed so the downsampled subset is reproducible
MAX_NON_TOXIC = 7000  # cap on the number of non-toxic (level 0) comments kept

# Drop toxicity levels 5 and 6: there are too few examples of them.
df = df.loc[~df['toxicity'].isin([5, 6])]

# Downsample the non-toxic class explicitly instead of overwriting rows with a
# sample and filtering out the NaN rows that leaves behind.
is_non_toxic = df['toxicity'] == 0
non_toxic = df[is_non_toxic]
non_toxic = non_toxic.sample(n=min(MAX_NON_TOXIC, len(non_toxic)), random_state=SAMPLE_SEED)
df = pd.concat([df[~is_non_toxic], non_toxic]).sort_index()

# Remove rows without text, keep the label column integral, and renumber the
# rows so positional and label-based indexing agree downstream.
df = df[df['comment_text'].notna()]
df = df.astype({'toxicity': 'int'}).reset_index(drop=True)

In [12]:
# Class distribution after filtering and downsampling.
df['toxicity'].value_counts()

0    7000
1    6360
3    4209
2    3480
4    1760
Name: toxicity, dtype: int64

In [None]:
# Load the lower-casing BERT tokenizer and a sequence classifier with five
# output labels that also returns its hidden states.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT, do_lower_case=True)
berttoxic = BertForSequenceClassification.from_pretrained(
    BERT_CHECKPOINT,
    num_labels=5,
    output_hidden_states=True,
)


In [19]:
def convert_to_dataset_torch(data: pd.Series, max_length: int = 512):
    """Tokenize texts into fixed-length id and attention-mask tensors.

    Uses the module-level ``tokenizer``. Every text is truncated or padded to
    exactly ``max_length`` tokens.

    Args:
        data: Sized iterable of strings (e.g. a pandas Series of comments).
            Do not pass a DataFrame: iterating one yields column names.
        max_length: Token length of every encoded row.

    Returns:
        ``(input_ids, attention_masks)``: two ``torch.long`` tensors of shape
        ``(len(data), max_length)``.
    """
    input_ids = []
    attention_masks = []

    for text in tqdm(data, total=len(data)):
        encoded = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    if not input_ids:
        # torch.cat rejects an empty list; return correctly shaped empty tensors.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # Tensor.to returns a new tensor, so the result has to be kept.
    input_ids = torch.cat(input_ids, dim=0).to(dtype=torch.long)
    attention_masks = torch.cat(attention_masks, dim=0).to(dtype=torch.long)

    return input_ids, attention_masks

In [17]:
def dataprep(data: pd.Series, batch_size: int, sampler: "type[Sampler]", labels=None,
             drop_last: bool = True):
    """Tokenize ``data`` and wrap it (optionally with labels) in a DataLoader.

    Args:
        data: Sized iterable of strings, forwarded to
            ``convert_to_dataset_torch``.
        batch_size: Number of samples per batch.
        sampler: Sampler *class* (e.g. ``RandomSampler``); it is instantiated
            here with the encoded dataset.
        labels: Optional array-like of targets aligned with ``data``. When
            given, each batch is ``(input_ids, attention_mask, labels)``;
            otherwise ``(input_ids, attention_mask)``.
        drop_last: Whether to drop a final incomplete batch. Pass ``False``
            for inference so that no sample is silently skipped.

    Returns:
        A ``torch.utils.data.DataLoader`` over the encoded dataset.
    """
    inps, masks = convert_to_dataset_torch(data)

    if labels is not None:
        # Go through NumPy: torch.tensor() on a pandas Series looks elements up
        # by index label, which fails when rows were filtered out and the
        # index no longer starts at 0.
        label_tensor = torch.as_tensor(np.asarray(labels))
        encoded = TensorDataset(inps, masks, label_tensor)
    else:
        encoded = TensorDataset(inps, masks)

    return DataLoader(
        encoded,
        sampler=sampler(encoded),
        batch_size=batch_size,
        num_workers=0,
        drop_last=drop_last,
    )

In [None]:
# Build the shuffled training loader. Labels are passed as a plain NumPy array:
# after the filtering above the frame's index has gaps, and converting a Series
# with such an index to a tensor relies on label-based lookup that can fail.
train_dataloader = dataprep(
    data=df['comment_text'],
    labels=df['toxicity'].to_numpy(),
    batch_size=16,
    sampler=RandomSampler,
)


In [None]:
# AdamW over all classifier parameters.
learning_rate = 2e-5
adam_epsilon = 1e-8
optimizer = AdamW(berttoxic.parameters(), lr=learning_rate, eps=adam_epsilon)

In [15]:
# Learning-rate schedule with zero warmup steps, sized for `epochs` full passes
# over the training loader (one scheduler step per batch).
epochs = 5
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def trainloop(model, optimizer, scheduler, dataloader, device, epochs):
    """Fine-tune ``model`` on ``dataloader`` and return the trained model.

    Each batch must be ``(input_ids, attention_mask, labels)``. The running
    mean loss and accuracy are printed every 20 batches, and the mean loss of
    the epoch is printed when the epoch ends.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with ``loss`` and ``logits``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        dataloader: Iterable of training batches.
        device: Device (string or ``torch.device``) to train on.
        epochs: Number of passes over ``dataloader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    # Module.to moves the parameters in place, so the optimizer that was
    # created earlier keeps pointing at the right tensors.
    model.to(device)
    num_batches = len(dataloader)

    for epoch_i in tqdm(range(epochs)):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        total_train_loss = 0.0
        window_losses = []
        window_acc = []
        model.train()

        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss

            # Order matters: compute gradients, clip them, apply the update,
            # and only then advance the learning-rate schedule.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            batch_loss = loss.item()
            total_train_loss += batch_loss
            window_losses.append(batch_loss)

            predictions = outputs.logits.detach().argmax(dim=1)
            window_acc.append((predictions == b_labels).float().mean().item())

            if len(window_losses) == 20:
                print("Loss: {}".format(sum(window_losses) / len(window_losses)))
                print("Accuracy: {}".format(sum(window_acc) / len(window_acc)))
                window_losses = []
                window_acc = []

        # Guard against an empty loader so the summary never divides by zero.
        avg_train_loss = total_train_loss / num_batches if num_batches else 0.0

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

    return model

In [None]:
# Train for the same number of epochs the scheduler was sized for (`epochs`);
# a different count here would end training at the wrong point of the schedule.
# Fall back to the CPU when no GPU is available.
trained_model = trainloop(
    model=berttoxic,
    optimizer=optimizer,
    scheduler=scheduler,
    dataloader=train_dataloader,
    device='cuda:0' if torch.cuda.is_available() else 'cpu',
    epochs=epochs,
)


In [18]:
# trainloop updates `berttoxic` in place. Fall back to that object when
# `trained_model` is None so the checkpoint never contains just `None`.
model_to_save = trained_model if trained_model is not None else berttoxic
torch.save(model_to_save, 'toxic5class5eps.pth')