In [1]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

# --- Configuration: everything a reader might want to tune -------------------
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 2e-5
MODEL_SAVE_PATH = 'model/bert_toxic_model.pth'
SEED = 42          # shared by the subsample, the train/val split and torch
SAMPLE_FRAC = 0.2  # fraction of the CSV rows kept for this run
VAL_FRAC = 0.2     # fraction of the kept rows held out for validation
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Seed torch so that weight initialisation and DataLoader shuffling performed
# later in the notebook are repeatable across "Restart & Run All".
torch.manual_seed(SEED)

# --- Load data ---------------------------------------------------------------
# Read and subsample in one expression so re-running this cell always starts
# from the file on disk rather than from an already-subsampled `df`.
df = pd.read_csv('data/train.csv').sample(frac=SAMPLE_FRAC, random_state=SEED)
comments = df['comment_text'].values
labels = df[LABEL_COLUMNS].values

# --- Train / validation split ------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    comments, labels, test_size=VAL_FRAC, random_state=SEED
)

# --- Tokenizer ---------------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class CommentDataset(Dataset):
    """Map-style dataset that pairs raw comments with their label vectors.

    Tokenization happens lazily, one comment at a time, inside
    ``__getitem__``.

    Args:
        comments: Indexable collection of comment texts.
        labels: Indexable collection of per-comment label vectors, aligned
            with ``comments``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every comment is padded or truncated to.
    """

    def __init__(self, comments, labels, tokenizer, max_len):
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Tokenize the comment at ``idx`` and return it with its labels.

        Returns:
            dict with keys ``'comment_text'`` (the original text),
            ``'input_ids'`` and ``'attention_mask'`` (flattened to 1-D) and
            ``'labels'`` (a float tensor built from the label vector).
        """
        comment = self.comments[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            comment,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'comment_text': comment,
            # flatten() collapses the returned tensors to 1-D so that the
            # DataLoader's batching adds the only batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float targets, as required by BCEWithLogitsLoss.
            'labels': torch.tensor(label, dtype=torch.float)
        }

# --- Datasets and loaders ----------------------------------------------------
train_dataset = CommentDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = CommentDataset(X_val, y_val, tokenizer, MAX_LEN)

# Only the training data is shuffled; validation order is left as-is.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# --- Model, optimizer, loss --------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Size the classification head from the label matrix instead of hard-coding
# it, so it cannot drift from the label columns selected when loading data.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=y_train.shape[1])
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Loss is applied to raw logits, with an independent sigmoid per label.
criterion = torch.nn.BCEWithLogitsLoss().to(device)


def train_epoch(model, data_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader`` and report metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``.logits`` tensor.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. Accuracy is element-wise: the fraction of
        individual label entries where ``sigmoid(logit) > 0.5`` equals the
        target, over the entries actually seen in this pass. Both values are
        NaN when the loader yields no batches.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    # Count what was actually scored rather than relying on
    # len(data_loader.dataset): the two differ whenever the loader skips
    # samples (e.g. drop_last=True or a custom sampler).
    total_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        predictions = (outputs.logits.sigmoid() > 0.5).float()
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.numel()
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if total_predictions == 0:
        # Nothing was processed, so the metrics are undefined.
        return float("nan"), float("nan")
    return correct_predictions / total_predictions, np.mean(losses)

def eval_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``.logits`` tensor.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. Accuracy is element-wise: the fraction of
        individual label entries where ``sigmoid(logit) > 0.5`` equals the
        target, over the entries actually seen. Both values are NaN when the
        loader yields no batches.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    # Count what was actually scored rather than relying on
    # len(data_loader.dataset): the two differ whenever the loader skips
    # samples (e.g. drop_last=True or a custom sampler).
    total_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            predictions = (outputs.logits.sigmoid() > 0.5).float()
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.numel()
            losses.append(loss.item())

    if total_predictions == 0:
        # Nothing was evaluated, so the metrics are undefined.
        return float("nan"), float("nan")
    return correct_predictions / total_predictions, np.mean(losses)


# Create the checkpoint directory before training starts, so a missing or
# unwritable folder fails immediately instead of after every epoch has run.
save_dir = os.path.dirname(MODEL_SAVE_PATH)
if save_dir:  # dirname is '' for a bare filename; makedirs('') would raise
    os.makedirs(save_dir, exist_ok=True)

# Per-epoch metrics, one entry appended to each list per epoch.
history = {'train_acc': [], 'train_loss': [], 'val_acc': [], 'val_loss': []}

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)

    print(f'Train loss: {train_loss}, accuracy: {train_acc}')
    print(f'Validation loss: {val_loss}, accuracy: {val_acc}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

# Persist only the weights (state_dict) from the final epoch.
torch.save(model.state_dict(), MODEL_SAVE_PATH)
print(f'Model saved to {MODEL_SAVE_PATH}')



/home/toros/projects/fp/myenv/bin/python


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Train loss: 0.0839381535061376, accuracy: 0.9752066115702479
Validation loss: 0.048235390153713524, accuracy: 0.9819833933886887
Epoch 2/3
Train loss: 0.0405847345870385, accuracy: 0.9852728056088677
Validation loss: 0.043212368363747374, accuracy: 0.9837589430257455
Epoch 3/3
Train loss: 0.031039373273436064, accuracy: 0.9883801391772095
Validation loss: 0.04468099536956288, accuracy: 0.9833672776646301
Model saved to model/bert_toxic_model.pth
