In [1]:
import torch
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import re

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Stiahnutie dataset
train_data = pd.read_csv('data/train.csv')
val_data = pd.read_csv('data/val.csv')

def clean_text(text):
    #Previesť text na malé písmená
    text = text.lower()
    
    #Odstránenie adries URL
    text = re.sub(r'http\S+', '', text)
    
    #Odstránenie hashtagov (iba symbol „#“, text ponechajte)
    text = re.sub(r'#', '', text)
    
    #Odstránenie špeciálnych znakov a čísel
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    
    #Odstránenie zbytočných medzier
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

train_data['cleaned_tweet'] = train_data['tweet'].apply(clean_text)
val_data['cleaned_tweet'] = val_data['tweet'].apply(clean_text)

In [4]:
#Vytvorenie triedy pre dátasety
class TweetDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tweet = self.data.iloc[idx]['cleaned_tweet']
        label = self.data.iloc[idx]['label']
        
        encoding = self.tokenizer(
            tweet,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [5]:
#Stiahnutie tokenizéra a základného modelu RoBERT
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
#Definovanie triedy pre klasifikátor modelu
class RobertaClassifier(nn.Module):
    def __init__(self, model, num_classes=2):
        super(RobertaClassifier, self).__init__()
        self.roberta = model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


In [7]:
#Previesť label (real -> 0, fake -> 1)
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
val_data['label'] = label_encoder.transform(val_data['label'])

In [8]:
#Definovanie hyperparametrov
MAX_LEN = 128
BATCH_SIZE = 16

#Vytváranie datasetov a súborov loadov
train_dataset = TweetDataset(train_data, tokenizer, MAX_LEN)
val_dataset = TweetDataset(val_data, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [9]:
#Definícia zariadenia
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaClassifier(model).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

In [10]:
#Funkcia pre výpočet presnosti
def calculate_accuracy(preds, labels):
    return (preds.argmax(dim=1) == labels).float().mean()

In [12]:
#Definovanie epoch
NUM_EPOCHS = 10

#Školenie a validacia modelu
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += calculate_accuracy(outputs, labels).item()

    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    model.eval()
    val_loss, val_acc = 0, 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            val_acc += calculate_accuracy(outputs, labels).item()

    val_loss /= len(val_loader)
    val_acc /= len(val_loader)

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

Epoch 1/10
Train Loss: 0.2687, Train Accuracy: 0.8884
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 2/10
Train Loss: 0.1260, Train Accuracy: 0.9532
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 3/10
Train Loss: 0.0622, Train Accuracy: 0.9767
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 4/10
Train Loss: 0.0339, Train Accuracy: 0.9891
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 5/10
Train Loss: 0.0309, Train Accuracy: 0.9887
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 6/10
Train Loss: 0.0209, Train Accuracy: 0.9924
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 7/10
Train Loss: 0.0177, Train Accuracy: 0.9939
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 8/10
Train Loss: 0.0172, Train Accuracy: 0.9947
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 9/10
Train Loss: 0.0133, Train Accuracy: 0.9960
Val Loss: 0.0126, Val Accuracy: 0.9960
Epoch 10/10
Train Loss: 0.0126, Train Accuracy: 0.9960
Val Loss: 0.0126, Val Accuracy: 0.9960


In [15]:
#Uloženie modelu
torch.save(model.state_dict(), 'robert_model.pth')