Kód na tréning modelu BERTweet 

Pred začatím tréningu bolo potrebné nainštalovať chýbajúce knižnice:

pip install torch transformers matplotlib

In [None]:
# Import the required libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.metrics import confusion_matrix
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [None]:

# Read the train / validation / test splits into DataFrames.
# NOTE(review): the files carry an .xls extension but are parsed as CSV — confirm the on-disk format.
train_df, val_df, test_df = (
    pd.read_csv(split_path)
    for split_path in ("clean_train.xls", "clean_val.xls", "clean_test.xls")
)


In [None]:

# Fetch the BERTweet tokenizer from the Hugging Face Hub (fast implementation requested).
# The loader prints a notice when the optional `emoji` package is missing; in that case
# emoticons/emojis are not converted into text.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    use_fast=True,
)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [4]:
# Dataset preparation for model training
# Tokenization - converts raw text into numeric model inputs

def prepare_dataset(df, max_length=128, text_tokenizer=None):
    """Tokenize the ``text`` column of ``df`` and pack it with its labels.

    Args:
        df: DataFrame with a ``text`` column (strings) and a ``labels`` column
            (values convertible to ``int``).
        max_length: Length every sequence is truncated / padded to.
        text_tokenizer: Callable used for tokenization. Defaults to the
            notebook-level ``tokenizer``.

    Returns:
        TensorDataset of ``(input_ids, attention_mask, labels)``.
    """
    tokenize = tokenizer if text_tokenizer is None else text_tokenizer
    encodings = tokenize(
        df['text'].tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    labels = df['labels'].astype(int).values

    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        # Force int64: `astype(int)` can yield int32 on some platforms, while
        # the classification loss expects class indices as long tensors.
        torch.tensor(labels, dtype=torch.long),
    )

# Build one tensor dataset per split (train / validation / test).
train_dataset, val_dataset, test_dataset = (
    prepare_dataset(split_df) for split_df in (train_df, val_df, test_df)
)

In [5]:
# Load the pretrained BERTweet checkpoint for sequence classification; the number
# of output labels equals the number of distinct values in the training labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=train_df['labels'].nunique(dropna=False),
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training routine for the model with early stopping

def train_model(model, train_dataset, val_dataset, epochs=10, batch_size=8, lr=2e-5, patience=2):
    """Fine-tune ``model`` and validate after every epoch, with early stopping.

    Each epoch runs one optimisation pass over ``train_dataset`` (AdamW) and
    then evaluates on ``val_dataset``. Training stops early once the weighted
    validation F1 has not improved for ``patience`` consecutive epochs. The
    weights of the best-scoring epoch are kept in memory and loaded back into
    ``model`` before returning.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_dataset: Dataset yielding ``(input_ids, attention_mask, labels)``.
        val_dataset: Dataset with the same layout, used for validation.
        epochs: Maximum number of epochs.
        batch_size: Batch size for both data loaders.
        lr: Learning rate passed to AdamW.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        dict with per-epoch lists under ``train_loss``, ``val_loss``,
        ``val_acc`` and ``val_f1``.
    """
    # Batches are moved to wherever the model's parameters live, so the
    # function works unchanged for a model on CPU or on an accelerator.
    device = next(model.parameters()).device

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Optimizer and training history
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_acc': [],
        'val_f1': []
    }

    best_f1 = 0        # Best validation F1 seen so far
    best_state = None  # CPU copy of the weights that achieved best_f1
    no_improve = 0     # Consecutive epochs without improvement

    # Main training loop
    for epoch in range(epochs):

        model.train()
        train_loss = 0

        # Iterate over the training data batch by batch
        for input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()   # Reset gradients before every batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()         # Compute gradients
            optimizer.step()        # Update weights

            # Accumulate the loss to report the epoch average
            train_loss += loss.item()

        # Average training loss for the epoch
        avg_train_loss = train_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # Validation phase
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []

        # Gradients are not needed for evaluation
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                # Collect predictions together with their labels; the original
                # code never stored the predictions, leaving all_preds empty.
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # Validation metrics
        avg_val_loss = val_loss / len(val_loader)
        val_acc = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')

        # Store metrics in the history
        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        # Report metrics for the current epoch
        print(f"\nEpoch {epoch+1} Metrics:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss: {avg_val_loss:.4f}")
        print(f"  Val Acc: {val_acc:.4f}")
        print(f"  Val F1: {val_f1:.4f}")

        # Early stopping bookkeeping
        if val_f1 > best_f1:
            best_f1 = val_f1
            no_improve = 0
            # Keep an in-memory snapshot so the message below is truthful and
            # the best weights can be restored after training.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            print(" New best model saved!")
        else:
            no_improve += 1
            print(f" No improvement ({no_improve}/{patience})")

            if no_improve >= patience:
                print(f"\n Early stopping - epoch {epoch+1}!")
                break

    # Leave the model with the weights of its best validation epoch rather
    # than those of the last (possibly worse) epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    return history

In [None]:
# 5. Run training
print("Trénovanie modelu...")
history = train_model(model, train_dataset, val_dataset, epochs=10, patience=3)

# 6. Evaluation on the held-out test set
# (the original status message wrongly said "validation data")
print("\nVyhodnocovanie na testovacích dátach...")
test_loader = DataLoader(test_dataset, batch_size=8)
model.eval()

# `device` was never defined in the notebook (NameError on the first batch);
# take it from the model so batches always land where the weights are.
device = next(model.parameters()).device

all_preds = []
all_labels = []
test_loss = 0

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        test_loss += outputs.loss.item()

        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Test metrics
test_acc = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average='weighted')
avg_test_loss = test_loss / len(test_loader)

# Report the results
print("\nFinalné testovacie výsledky:")
print(f"  Test Loss: {avg_test_loss:.4f}")
print(f"  Test Accuracy: {test_acc:.4f}")
print(f"  Test F1: {test_f1:.4f}")


# Confusion matrix
cm = confusion_matrix(all_labels, all_preds)

# Visualise the matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Human 0', 'AI 1'],  # Adjust to match your classes
            yticklabels=['Human 0', 'AI 1'])
plt.title('Confusion Matrix')
plt.xlabel('Predikované triedy')
plt.ylabel('Skutočné triedy')
plt.savefig('confusion_matrix.png')  # Save the matrix as an image
plt.show()

# Per-class metrics
print("\nMetriky:")
print(classification_report(all_labels, all_preds, target_names=['Human 0', 'AI 1']))


# 7. Save the final model and tokenizer
print("\nUloženie finálého modelu")
os.makedirs("final_model", exist_ok=True)
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

print("\nTrénovanie je dokončené")



Trénovanie modelu...


Epoch 1:   0%|          | 2/2589 [00:05<2:03:47,  2.87s/it]


KeyboardInterrupt: 