In [1]:
#Import knižníc
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
import os
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Stiahnutie dataset
train_data = pd.read_csv('data/train.csv')
val_data = pd.read_csv('data/val.csv')

In [3]:
def clean_text(text):
    #Previesť text na malé písmená
    text = text.lower()
    
    #Odstránenie adries URL
    text = re.sub(r'http\S+', '', text)
    
    #Odstránenie hashtagov (iba symbol „#“, text ponechajte)
    text = re.sub(r'#', '', text)
    
    #Odstránenie špeciálnych znakov a čísel
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    
    #Odstránenie zbytočných medzier
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [4]:
train_data['cleaned_tweet'] = train_data['tweet'].apply(clean_text)
val_data['cleaned_tweet'] = val_data['tweet'].apply(clean_text)

In [5]:
#Stiahnutie tokenizéra
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [6]:
#Tokenizácia textu s obmedzením dĺžky
def tokenize_text(text, tokenizer, max_length=128):
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return tokens['input_ids'], tokens['attention_mask']

In [7]:
#Vytvorenie nových stĺpcov pre tokeny
train_data['input_ids'] = train_data['cleaned_tweet'].apply(lambda x: tokenize_text(x, tokenizer)[0])
train_data['attention_mask'] = train_data['cleaned_tweet'].apply(lambda x: tokenize_text(x, tokenizer)[1])

val_data['input_ids'] = val_data['cleaned_tweet'].apply(lambda x: tokenize_text(x, tokenizer)[0])
val_data['attention_mask'] = val_data['cleaned_tweet'].apply(lambda x: tokenize_text(x, tokenizer)[1])

In [8]:
#Previesť label ('real' -> 0, 'fake' -> 1)
train_data['label'] = train_data['label'].map({'real': 0, 'fake': 1})
val_data['label'] = val_data['label'].map({'real': 0, 'fake': 1})

In [9]:
#Vytvorenie triedy pre dátasety
class NewsDataset(Dataset):
    def __init__(self, data):
        self.input_ids = torch.cat(data['input_ids'].values.tolist(), dim=0)
        self.attention_masks = torch.cat(data['attention_mask'].values.tolist(), dim=0)
        self.labels = torch.tensor(data['label'].values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

#Vytvorenie dátasetov
train_dataset = NewsDataset(train_data)
val_dataset = NewsDataset(val_data)

#Vytvorenie DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
#Definícia zariadenia
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Stiahnutie modelu BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
#Stiahnutie optimalizátor
optimizer = AdamW(model.parameters(), lr=2e-5)

#Definovanie epoch
epochs = 10



In [12]:
#Kontrola preds a labelov
def compute_accuracy(preds, labels):
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    return accuracy_score(labels, preds)

In [13]:
#Funkcia pre skolenie modelu
def train_model(model, train_loader, optimizer, device):
    model.train()
    total_loss, total_accuracy = 0, 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        total_accuracy += compute_accuracy(preds, labels)

    avg_loss = total_loss / len(train_loader)
    avg_accuracy = total_accuracy / len(train_loader)
    return avg_loss, avg_accuracy

In [14]:
#Funkcia doučovanie modelu
def validate_model(model, val_loader, device):
    model.eval()
    total_loss, total_accuracy = 0, 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            total_accuracy += compute_accuracy(preds, labels)

    avg_loss = total_loss / len(val_loader)
    avg_accuracy = total_accuracy / len(val_loader)
    return avg_loss, avg_accuracy

In [15]:
#Kontrola jedného batchu
for batch in train_loader:
    print("Input IDs shape:", batch['input_ids'].shape)
    print("Attention mask shape:", batch['attention_mask'].shape)
    print("Labels shape:", batch['labels'].shape)
    break

Input IDs shape: torch.Size([16, 128])
Attention mask shape: torch.Size([16, 128])
Labels shape: torch.Size([16])


In [16]:
#Školenie a validacia modelu
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    #Školenie
    train_loss, train_acc = train_model(model, train_loader, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

    #Validacia
    val_loss, val_acc = validate_model(model, val_loader, device)
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

Epoch 1/10
Train Loss: 0.2854, Train Accuracy: 0.8845
Val Loss: 0.1620, Val Accuracy: 0.9389
Epoch 2/10
Train Loss: 0.1114, Train Accuracy: 0.9605
Val Loss: 0.1336, Val Accuracy: 0.9436
Epoch 3/10
Train Loss: 0.0537, Train Accuracy: 0.9829
Val Loss: 0.1736, Val Accuracy: 0.9492
Epoch 4/10
Train Loss: 0.0324, Train Accuracy: 0.9890
Val Loss: 0.1553, Val Accuracy: 0.9506
Epoch 5/10
Train Loss: 0.0165, Train Accuracy: 0.9950
Val Loss: 0.1756, Val Accuracy: 0.9548
Epoch 6/10
Train Loss: 0.0092, Train Accuracy: 0.9975
Val Loss: 0.2267, Val Accuracy: 0.9454
Epoch 7/10
Train Loss: 0.0118, Train Accuracy: 0.9964
Val Loss: 0.2396, Val Accuracy: 0.9411
Epoch 8/10
Train Loss: 0.0123, Train Accuracy: 0.9961
Val Loss: 0.1825, Val Accuracy: 0.9538
Epoch 9/10
Train Loss: 0.0049, Train Accuracy: 0.9988
Val Loss: 0.2276, Val Accuracy: 0.9566
Epoch 10/10
Train Loss: 0.0076, Train Accuracy: 0.9975
Val Loss: 0.1946, Val Accuracy: 0.9562


In [None]:
#Uloženie modelu
save_directory = "bert_seq"

model.save_pretrained(save_directory)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.save_pretrained(save_directory)

torch.save(optimizer.state_dict(), os.path.join(save_directory, 'optimizer.pt'))

config = model.config
config.save_pretrained(save_directory)