In [1]:
# Import the required libraries
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the train / validation / test splits into DataFrames.
# NOTE(review): the files carry an .xls extension but are parsed as CSV here —
# confirm the on-disk format.
train_df, val_df, test_df = (
    pd.read_csv(split_path)
    for split_path in ("clean_train.xls", "clean_val.xls", "clean_test.xls")
)


In [3]:
# Load the tokenizer for the DistilBERT checkpoint (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
    use_fast=True,
)

In [4]:
# Dataset preparation
def prepare_dataset(df, max_length=128):
    """Tokenize a DataFrame and wrap the result in a ``TensorDataset``.

    The texts are tokenized with the module-level ``tokenizer``; every
    sequence is truncated or padded to ``max_length`` tokens so the encodings
    can be stacked into rectangular tensors.

    Args:
        df: DataFrame with a ``'text'`` column (the inputs) and a ``'labels'``
            column whose values can be cast to ``int``.
        max_length: Fixed token length passed to the tokenizer. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, label)`` tuples.
    """
    encodings = tokenizer(
        df['text'].tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    labels = df['labels'].astype(int).values

    # Force int64 labels: ``astype(int)`` can produce a 32-bit array on some
    # platforms, and the cross-entropy loss used for classification expects
    # class-index targets of dtype ``torch.long``.
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(labels, dtype=torch.long),
    )

# Build one tensor dataset per split, in train / validation / test order
train_dataset, val_dataset, test_dataset = map(
    prepare_dataset, (train_df, val_df, test_df)
)


In [5]:
# Load DistilBERT for sequence classification; the number of output labels is
# the count of distinct values in the training 'labels' column.
# NOTE(review): this assumes the label ids are contiguous 0..K-1 — confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=train_df['labels'].nunique(dropna=False),
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Move the model to the GPU when CUDA is available, otherwise keep it on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

In [7]:
# Training function
def train_model(model, train_dataset, val_dataset, epochs=10, batch_size=8, lr=2e-5, patience=2):
    """Fine-tune ``model`` with AdamW and early stopping on validation F1.

    Each epoch runs one pass over ``train_dataset`` followed by an evaluation
    pass over ``val_dataset``. Training stops early once the weighted
    validation F1 has failed to improve for ``patience`` consecutive epochs.
    Whenever the F1 improves, a CPU copy of the weights is kept, and those
    best weights are loaded back into ``model`` before returning, so the model
    the caller evaluates or saves afterwards is the best one seen rather than
    the one from the last epoch.

    Batches are moved to the module-level ``device``.

    Args:
        model: Model that accepts ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returns an object exposing
            ``.loss`` and ``.logits``. Modified in place.
        train_dataset: Dataset yielding ``(input_ids, attention_mask, labels)``.
        val_dataset: Dataset with the same item layout, used for validation.
        epochs: Maximum number of epochs.
        batch_size: Batch size for both loaders.
        lr: AdamW learning rate.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping.

    Returns:
        dict with per-epoch lists under the keys ``'train_loss'``,
        ``'val_loss'``, ``'val_acc'`` and ``'val_f1'``.

    Raises:
        ValueError: If either dataset is empty (the per-epoch averages would
            otherwise divide by zero).
    """
    if len(train_dataset) == 0 or len(val_dataset) == 0:
        raise ValueError("train_dataset and val_dataset must both be non-empty")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    history = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}
    # Start below any attainable score so the first epoch always sets a baseline
    # (and therefore always produces a snapshot to restore).
    best_f1 = float("-inf")
    best_state = None
    no_improve = 0

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0

        for input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Mean of the per-batch losses
        avg_train_loss = train_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                # Predicted class = index of the largest logit
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        val_acc = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')

        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"\nEpoch {epoch+1} Metrics:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss: {avg_val_loss:.4f}")
        print(f"  Val Acc: {val_acc:.4f}")
        print(f"  Val F1: {val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            no_improve = 0
            # Keep an independent CPU copy of the weights; state_dict() returns
            # references to the live tensors, which later epochs overwrite.
            best_state = {
                name: tensor.detach().to("cpu", copy=True)
                for name, tensor in model.state_dict().items()
            }
            print(" New best model saved!")
        else:
            no_improve += 1
            print(f" No improvement ({no_improve}/{patience})")

            if no_improve >= patience:
                print(f"\n Early stopping - epoch {epoch+1}!")
                break

    # Roll back to the best epoch so callers do not end up with weights from
    # the non-improving epochs that triggered early stopping.
    if best_state is not None:
        model.load_state_dict(best_state)

    return history

In [None]:

# Start training (patience is raised to 3 here, overriding the function default of 2)
print("Trénovanie modelu DistilBERT...")
history = train_model(
    model,
    train_dataset,
    val_dataset,
    epochs=10,
    patience=3,
)

# Evaluate on the test split
print("\nVyhodnocovanie na testovacích dátach...")
test_loader = DataLoader(test_dataset, batch_size=8)
model.eval()

test_loss = 0
all_preds, all_labels = [], []

# No gradients are needed for evaluation
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (part.to(device) for part in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        test_loss += outputs.loss.item()

        # Predicted class = index of the largest logit
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute the test metrics (the loss is the mean of the per-batch losses)
test_acc = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average='weighted')
avg_test_loss = test_loss / len(test_loader)

# Print the results
print("\nFinalné testovacie výsledky pre DistilBERT:")
for metric_name, metric_value in (
    ("Test Loss", avg_test_loss),
    ("Test Accuracy", test_acc),
    ("Test F1", test_f1),
):
    print(f"  {metric_name}: {metric_value:.4f}")

# Confusion matrix heatmap (y axis: true class, x axis: predicted class)
# NOTE(review): the class names are hard-coded for two classes, while the model's
# num_labels is derived from the data — confirm the task is strictly binary.
class_names = ['Human 0', 'AI 1']
cm = confusion_matrix(all_labels, all_preds)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - DistilBERT')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
# Save before show() so the file is written from the fully drawn figure
fig.savefig('confusion_matrix_distilbert.png')
plt.show()

# Classification report for the same predictions
print("\nMetriky pre DistilBERT:")
print(classification_report(all_labels, all_preds, target_names=class_names))

# Save the model and its tokenizer side by side in one directory
print("\nUloženie finálneho modelu DistilBERT")
os.makedirs("distilbert_model", exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained("./distilbert_model")

print("\nTrénovanie DistilBERT je dokončené")