In [None]:
# Install the notebook's third-party dependencies and fetch the dataset repo.
# NOTE(review): no versions are pinned, so a fresh run picks up whatever pip
# resolves at that moment; the Trainer arguments used further down
# (`evaluate_during_training`) look tied to an older transformers release —
# TODO confirm and pin.
!pip install transformers 
!pip install comet-ml
!pip install pytorch-lightning
# The clone creates ./AbusiveLanguageDataset, which the data-loading cell reads.
!git clone https://github.com/bohdan1/AbusiveLanguageDataset

In [2]:
import pandas as pd 
import numpy as np
import os
import torch 
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments


In [3]:
# Root of the cloned dataset repository.
data_path = './AbusiveLanguageDataset/'

# Read both CSV files in one pass: `data` is the full comment table,
# `labeled` is the annotated table used for training.
data, labeled = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('data.csv', 'labled.csv')
)

In [25]:
# Preview the first rows of the comment table read from data.csv.
data.head()

Unnamed: 0,id,video_id,author,text
0,UgygtiZyJIQoKYMr5-14AaABAg,_k-gDXrfu-s,Taki Mizu,Нужно было из града всю толпу поливать﻿
1,UgxdRnsYt37Ega0lMfJ4AaABAg,_k-gDXrfu-s,Дональд Трамп,"Слава Беркуту,спасибо зато, что хотели уберечь..."
2,UgyM1RlUb-cSyMCfGTt4AaABAg,_k-gDXrfu-s,Светлана Агзамова,беркуту позор﻿
3,Ugy_jfinirWXeqK3f5p4AaABAg,_k-gDXrfu-s,Sonya Nishpal,как можно быть такими жестокими?(! извиняюсь з...
4,Ugx85dWKg6e-RcpEuv94AaABAg,_k-gDXrfu-s,Bro Rik,в итоге продали свою жопу) за что дохли как со...


In [4]:
# Split the annotated frame into model inputs and targets.
# Despite its name, `labels` holds the comment texts; `targets` holds the
# 'abusive' column that the classifier is trained to predict.
labels, targets = labeled['text'], labeled['abusive']

In [5]:
# Hold out 20% of the annotated rows as the test set. `labels` carries the
# comment texts and `targets` the 'abusive' flags.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    labels, targets, test_size=0.2, random_state=42)

# Carve a validation set (20% of what is left) out of the training portion.
# The seed is fixed here as well: without it the train/validation partition
# changed on every run, so reported validation metrics were not reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42)


In [17]:
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertTokenizer
from transformers import AutoTokenizer, AutoModel

# Single source of truth for the checkpoint so the model and its tokenizer
# are always loaded from the same place.
MODEL_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

# Pretrained BERT encoder plus a sequence-classification head. The head
# weights are not in the checkpoint and are newly initialised, so the model
# has to be fine-tuned before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,  # a *cased* checkpoint, as its name says
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# No explicit device placement happens in this cell.

# `model_max_length` is the tokenizer setting that records the maximum input
# length; the previous `max_length` keyword is a per-call argument and is not
# the setting that configures the tokenizer's limit at load time.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, model_max_length=512)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/bert-base-bg-cs-pl-ru-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def _encode(texts):
    """Tokenise an iterable of strings, truncating at 512 tokens and padding the batch."""
    return tokenizer(list(texts), truncation=True, padding=True, max_length=512)


train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

# Multiplying by 1 yields numeric labels; each split ends up as a plain list
# so it can be indexed by position in the dataset class.
train_labels, test_labels, val_labels = (
    list(split * 1) for split in (train_labels, test_labels, val_labels)
)

In [9]:
class ToxDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence
    (anything exposing `.items()`); `labels` is a sequence indexable by
    position. The dataset length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the label under 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split's encodings and labels in a ToxDataset.
train_dataset, val_dataset, test_dataset = (
    ToxDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [10]:
def save_model(save_path='saved_model'):
    """Write the global `model` and `tokenizer` to `save_path` via `save_pretrained`."""
    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)


# NOTE(review): this call sits before the Trainer cell, so on a top-to-bottom
# run it snapshots the model as loaded, not a fine-tuned one — confirm intent.
save_model()

In [23]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a batch of predictions.

    `pred` must expose `label_ids` (true classes) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    Precision, recall and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Result tuple is (precision, recall, f1, support); support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# Fine-tuning configuration: 5 epochs, small train batches, checkpoints in
# ./results and logs in ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # NOTE(review): this flag appears to be accepted only by older transformers
    # releases (later ones use an evaluation-strategy argument instead) —
    # confirm against the installed version and pin it in the install cell.
    evaluate_during_training=True,
    logging_dir='./logs',
)

# Train on the training split and evaluate on the validation split, scoring
# with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Run the fine-tuning loop. The notebook had no training call at all, so a
# fresh-kernel "Run All" evaluated a classifier head that was never trained,
# even though the recorded evaluation output reports epoch 5.0.
trainer.train()



In [26]:
# Evaluate on the validation split given to the Trainer as `eval_dataset`;
# the returned dict includes the loss and the compute_metrics scores.
# NOTE(review): `test_dataset` is built earlier but never evaluated in the
# cells visible here.
trainer.evaluate()

{'epoch': 5.0,
 'eval_accuracy': 0.74375,
 'eval_f1': 0.6132075471698113,
 'eval_loss': 1.5390530824661255,
 'eval_precision': 0.6190476190476191,
 'eval_recall': 0.6074766355140186,
 'total_flos': 3496771151462400}