In [None]:
pip install sentencepiece


In [None]:
pip install torch

In [None]:
pip install transformers

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import MT5Tokenizer, MT5ForSequenceClassification
import pandas as pd
from tqdm import tqdm
import os

# Dataset class
class TextDataset(Dataset):
    """Map-style dataset that pairs raw texts with integer class labels.

    Tokenization is done lazily, one example at a time, inside
    ``__getitem__``; nothing is pre-computed or cached.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors="pt")``;
            its result is indexed with ``'input_ids'`` and
            ``'attention_mask'``.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its model-input dict.

        Returns:
            Dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` (a ``torch.long`` tensor built from the label).
        """
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        # squeeze(0) drops the leading dimension of the tokenizer output
        # (assumed to be a batch dimension of size 1 for a single text).
        item = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the train/test CSVs (paths are relative to the working directory).
train_df = pd.read_csv('toxic_eng/train.csv')
test_df = pd.read_csv('toxic_eng/test.csv')

# Text comes from the 'comment_text' column, the label from 'toxic'.
train_texts = train_df['comment_text'].tolist()
train_labels = train_df['toxic'].tolist()
test_texts = test_df['comment_text'].tolist()
test_labels = test_df['toxic'].tolist()

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-base")

MAX_LEN = 128
full_train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length=MAX_LEN)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, max_length=MAX_LEN)

# Split the training set further into training (90%) and validation (10%).
train_size = int(0.9 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size

# Seed the split so the train/validation partition is identical on every run.
# Training can resume from a saved checkpoint; with an unseeded split a
# restarted session would draw a different partition, and examples the model
# already trained on would end up in the validation set.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    full_train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader shuffles; validation/test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

def save_checkpoint(model, optimizer, epoch, checkpoint_dir, model_name, best_val_loss=None):
    """Write a per-epoch training checkpoint to disk.

    The file is named ``<model_name>_epoch<epoch+1>.pt`` inside
    ``checkpoint_dir`` (created if missing) and holds the model and optimizer
    state dicts, the 1-based epoch number and ``best_val_loss``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        epoch: Zero-based index of the epoch that just finished.
        checkpoint_dir: Target directory.
        model_name: Prefix of the checkpoint file name.
        best_val_loss: Value stored under ``'best_val_loss'``.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Checkpoints are numbered from 1, i.e. by count of completed epochs.
    completed_epochs = epoch + 1
    state = {
        'epoch': completed_epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_val_loss': best_val_loss,
    }

    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_epoch{completed_epochs}.pt")
    torch.save(state, checkpoint_path)
    print(f"Checkpoint saved at: {checkpoint_path}")

def compute_val_loss(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled for the whole pass. The result is the sum of the per-batch
    ``criterion`` values divided by the number of batches, so a smaller final
    batch carries the same weight as a full one.

    Args:
        model: Callable accepting ``input_ids``/``attention_mask`` keywords
            and returning an object with a ``.logits`` attribute.
        val_loader: Sized iterable of dict batches with keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in val_loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            targets = batch['labels'].to(device)
            batch_losses.append(criterion(logits, targets).item())
    return sum(batch_losses) / len(val_loader)

# A checkpoint is loaded only when one exists on disk.
def load_checkpoint(model, optimizer, checkpoint_dir, model_name):
    """Restore training state from ``<model_name>_best.pt`` if it exists.

    Args:
        model: Module that receives the saved ``'model_state_dict'``.
        optimizer: Optimizer that receives the saved ``'optimizer_state_dict'``.
        checkpoint_dir: Directory searched for the checkpoint file.
        model_name: Prefix of the checkpoint file name.

    Returns:
        Tuple ``(epoch, best_val_loss)`` read from the checkpoint, or
        ``(0, float('inf'))`` when no checkpoint file is present (in which
        case ``model`` and ``optimizer`` are left untouched).
    """
    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_best.pt")
    if not os.path.exists(checkpoint_path):
        print("nemame ziaden checkpoin ideme znova .")
        return 0, float('inf')

    # Deserialize onto the CPU: without map_location, torch.load tries to put
    # tensors back on the device they were saved from, which fails when a
    # GPU-trained checkpoint is resumed on a CPU-only machine. The
    # load_state_dict calls below copy the values into the model's existing
    # parameters / the optimizer's state for those parameters.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    epoch = checkpoint['epoch']
    best_val_loss = checkpoint['best_val_loss']
    print(f"pokracjeme epohom{epoch + 1} s najlepsou validacnou stratou: {best_val_loss:.4f}")
    return epoch, best_val_loss

def train_with_early_stopping(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, checkpoint_dir, model_name, patience=3):
    """Train ``model`` with per-epoch checkpoints and early stopping.

    Training resumes from whatever ``load_checkpoint`` returns for
    ``<model_name>_best.pt``. After every epoch the validation loss is
    computed, a per-epoch checkpoint is written, and if the validation loss
    improved the ``_best.pt`` checkpoint is overwritten. Training stops once
    the validation loss has failed to improve for ``patience`` consecutive
    epochs (counted within this call only).

    Note that the model is not reloaded from the best checkpoint at the end:
    on return it holds the weights of the last epoch that ran.

    Args:
        model: Callable accepting ``input_ids``/``attention_mask`` keywords
            and returning an object with ``.logits``.
        train_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        val_loader: Loader handed to ``compute_val_loss``.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        num_epochs: Upper bound (exclusive) on the zero-based epoch index.
        checkpoint_dir: Directory for all checkpoint files.
        model_name: Prefix for checkpoint file names.
        patience: Number of consecutive non-improving epochs tolerated.

    Returns:
        None.
    """
    patience_counter = 0
    start_epoch, best_val_loss = load_checkpoint(model, optimizer, checkpoint_dir, model_name)
    best_path = os.path.join(checkpoint_dir, f"{model_name}_best.pt")

    for epoch in range(start_epoch, num_epochs):
        model.train()  # compute_val_loss leaves the model in eval mode
        total_loss = 0.0
        num_batches = 0
        correct = 0
        total = 0
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            _, predicted = torch.max(logits, dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

            loop.set_postfix(loss=loss.item())

        # Report the mean per-batch training loss so it is on the same scale
        # as the (per-batch averaged) validation loss printed next to it.
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        train_loss = total_loss / max(num_batches, 1)
        train_accuracy = correct / max(total, 1)
        val_loss = compute_val_loss(model, val_loader, criterion, device)
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}")

        # Update the running best BEFORE writing the per-epoch checkpoint so
        # that file records the same best_val_loss as the _best.pt file
        # written in this epoch, rather than the previous epoch's value.
        improved = val_loss < best_val_loss
        if improved:
            best_val_loss = val_loss

        save_checkpoint(model, optimizer, epoch, checkpoint_dir, model_name, best_val_loss)

        if improved:
            patience_counter = 0
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_loss': best_val_loss
            }, best_path)
            print(f"Best model saved at: {best_path}")
        else:
            patience_counter += 1
            print(f"Early stopping patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break


def evaluate_mt5_model(model, test_loader, device):
    """Print the classification accuracy of ``model`` on ``test_loader``.

    The model is put in eval mode and run without gradients; the predicted
    class is the index of the largest logit along dimension 1. Nothing is
    returned; the accuracy is printed as ``Test Accuracy: <value>``.

    Args:
        model: Callable invoked positionally as ``model(input_ids,
            attention_mask)`` and returning an object with ``.logits``.
        test_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to.
    """
    model.eval()
    num_correct, num_seen = 0, 0
    progress = tqdm(test_loader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in progress:
            targets = batch['labels'].to(device)
            outputs = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            predictions = torch.max(outputs.logits, 1).indices
            num_correct += (predictions == targets).sum().item()
            num_seen += targets.size(0)

    accuracy = num_correct / num_seen
    print(f"Test Accuracy: {accuracy:.4f}")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-label classifier on top of the pretrained mT5 base checkpoint.
model = MT5ForSequenceClassification.from_pretrained("google/mt5-base", num_labels=2)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)




The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
Some weights of MT5ForSequenceClassification were not initialized from the model checkpoint at google/mt5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune for at most 10 epochs, stopping early after 3 epochs without
# validation improvement; checkpoints go to "checkpoints_full".
train_with_early_stopping(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=10,
    checkpoint_dir="checkpoints_full",
    model_name="mt5_toxicity_full",
    patience=3,
)

# Score the model as it stands after training on the held-out test loader.
evaluate_mt5_model(model=model, test_loader=test_loader, device=device)


In [None]:
def load_best_model(model_class, checkpoint_dir, model_name, device,
                    pretrained_name="google/mt5-base", num_labels=2):
    """Build a model and load the weights stored in ``<model_name>_best.pt``.

    Args:
        model_class: Class exposing ``from_pretrained(name, num_labels=...)``.
        checkpoint_dir: Directory containing the checkpoint.
        model_name: Checkpoint prefix; ``_best.pt`` is appended.
        device: Device used both as ``map_location`` when reading the
            checkpoint and as the final location of the model.
        pretrained_name: Identifier passed to ``from_pretrained``. Defaults
            to the previously hard-coded ``"google/mt5-base"``.
        num_labels: Number of output labels passed to ``from_pretrained``.
            Defaults to the previously hard-coded ``2``.

    Returns:
        The instantiated model with the checkpoint's ``'model_state_dict'``
        loaded, moved to ``device``.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_best.pt")
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Best checkpoint not found at {checkpoint_path}")

    print(f"Loading best model from {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # The architecture must match the one that produced the checkpoint,
    # otherwise load_state_dict below rejects the saved weights.
    model = model_class.from_pretrained(pretrained_name, num_labels=num_labels)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    return model

# Reload the best-validation checkpoint and score it on the test loader.
best_model = load_best_model(
    model_class=MT5ForSequenceClassification,
    checkpoint_dir="checkpoints_full",
    model_name="mt5_toxicity_full",
    device=device,
)
evaluate_mt5_model(model=best_model, test_loader=test_loader, device=device)


Loading best model from checkpoints_full/mt5_toxicity_full_best.pt


Some weights of MT5ForSequenceClassification were not initialized from the model checkpoint at google/mt5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                             

Test Accuracy: 0.9405




In [None]:
def load_best_model(model_class, checkpoint_dir, model_name, device,
                    pretrained_name="google/mt5-base", num_labels=2):
    """Build a model and load the weights stored in ``<model_name>.pt``.

    Despite the function name, this variant loads exactly the checkpoint
    named by ``model_name`` (no ``_best`` suffix is appended), so it can be
    pointed at any saved file, e.g. a per-epoch checkpoint.

    Args:
        model_class: Class exposing ``from_pretrained(name, num_labels=...)``.
        checkpoint_dir: Directory containing the checkpoint.
        model_name: Checkpoint file name without the ``.pt`` extension.
        device: Device used both as ``map_location`` when reading the
            checkpoint and as the final location of the model.
        pretrained_name: Identifier passed to ``from_pretrained``. Defaults
            to the previously hard-coded ``"google/mt5-base"``.
        num_labels: Number of output labels passed to ``from_pretrained``.
            Defaults to the previously hard-coded ``2``.

    Returns:
        The instantiated model with the checkpoint's ``'model_state_dict'``
        loaded, moved to ``device``.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}.pt")
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Best checkpoint not found at {checkpoint_path}")

    print(f"Loading best model from {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # The architecture must match the one that produced the checkpoint,
    # otherwise load_state_dict below rejects the saved weights.
    model = model_class.from_pretrained(pretrained_name, num_labels=num_labels)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    return model

# Load the epoch-10 checkpoint by its exact file stem and score it on the
# test loader.
best_model = load_best_model(
    model_class=MT5ForSequenceClassification,
    checkpoint_dir="checkpoints_full",
    model_name="mt5_toxicity_full_epoch10",
    device=device,
)
evaluate_mt5_model(model=best_model, test_loader=test_loader, device=device)


Loading best model from checkpoints_full/mt5_toxicity_full_epoch10.pt


Some weights of MT5ForSequenceClassification were not initialized from the model checkpoint at google/mt5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                             

Test Accuracy: 0.9423




In [None]:
import os

# Print a bulleted listing of every entry in the "checkpoints" directory.
checkpoint_dir = "checkpoints"
files = os.listdir(checkpoint_dir)
print("Obsah priečinka checkpoints:")
for entry in files:
    print("- " + entry)


In [None]:
import torch
from transformers import MT5ForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import os
# Reload both CSV splits and extract the text and label columns.
# NOTE: `pd` and `MT5Tokenizer` are not imported in this cell; it relies on
# the imports executed by an earlier cell.
train_df = pd.read_csv('toxic_eng/train.csv')
test_df = pd.read_csv('toxic_eng/test.csv')

train_texts, train_labels = (
    train_df[column].tolist() for column in ('comment_text', 'toxic')
)
test_texts, test_labels = (
    test_df[column].tolist() for column in ('comment_text', 'toxic')
)

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-base")
def load_best_model(model_class, checkpoint_dir, model_name, device,
                    pretrained_name="google/mt5-base", num_labels=2):
    """Build a model and load the weights stored in ``<model_name>.pt``.

    Despite the function name, the checkpoint loaded is exactly the one
    named by ``model_name`` (no ``_best`` suffix is appended).

    Args:
        model_class: Class exposing ``from_pretrained(name, num_labels=...)``.
        checkpoint_dir: Directory containing the checkpoint.
        model_name: Checkpoint file name without the ``.pt`` extension.
        device: Device used both as ``map_location`` when reading the
            checkpoint and as the final location of the model.
        pretrained_name: Identifier passed to ``from_pretrained``. Defaults
            to the previously hard-coded ``"google/mt5-base"``.
        num_labels: Number of output labels passed to ``from_pretrained``.
            Defaults to the previously hard-coded ``2``.

    Returns:
        The instantiated model with the checkpoint's ``'model_state_dict'``
        loaded, moved to ``device``.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}.pt")
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Best checkpoint not found at {checkpoint_path}")

    print(f"Loading best model from {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # The architecture must match the one that produced the checkpoint,
    # otherwise load_state_dict below rejects the saved weights.
    model = model_class.from_pretrained(pretrained_name, num_labels=num_labels)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    return model

def evaluate_mt5_model(model, test_loader, device):
    """Print accuracy, a classification report and a confusion matrix.

    The model is run in eval mode without gradients over ``test_loader``;
    the predicted class is the argmax of the logits along dimension 1.
    Predictions and labels are accumulated on the CPU and then passed to
    ``classification_report`` and ``confusion_matrix``. Nothing is returned.

    Args:
        model: Callable accepting ``input_ids``/``attention_mask`` keywords
            and returning an object with ``.logits``.
        test_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Vyhodnocovanie"):
            targets = batch['labels'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            batch_preds = torch.argmax(logits, dim=1)

            # Collect on the CPU so the metric helpers below can consume them.
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    matches = torch.tensor(all_preds) == torch.tensor(all_labels)
    accuracy = torch.sum(matches).item() / len(all_labels)

    print(f"\nPresnosť na testovacích dátach: {accuracy:.4f}")
    print("\nKlasifikačná správa:")
    print(classification_report(all_labels, all_preds, digits=4))
    print("\nConfusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the epoch-10 checkpoint and print the full evaluation report.
# NOTE: `test_loader` is not built in this cell; it comes from an earlier one.
best_model = load_best_model(
    model_class=MT5ForSequenceClassification,
    checkpoint_dir="checkpoints_full",
    model_name="mt5_toxicity_full_epoch10",
    device=device,
)

evaluate_mt5_model(model=best_model, test_loader=test_loader, device=device)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.


Loading best model from checkpoints_full/mt5_toxicity_full_epoch10.pt


Some weights of MT5ForSequenceClassification were not initialized from the model checkpoint at google/mt5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Vyhodnocovanie:   1%|          | 7/625 [00:11<16:58,  1.65s/it]