In [2]:
import os
import torch
import pandas as pd
import numpy as np
import tldextract
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# =============================================================================
# Directory Setup
# =============================================================================
SAVE_DIR = "saved_model_fakenewsnet_roberta_metadata"
os.makedirs(SAVE_DIR, exist_ok=True)

# =============================================================================
# Step 1: Load and Prepare Data (using all meta data)
# =============================================================================
fake_news = pd.read_csv('data/fake_with_metadata.csv')
real_news = pd.read_csv('data/real_with_metadata.csv')

# Drop unnecessary columns if they exist
fake_news.drop(columns=['id'], errors='ignore', inplace=True)
real_news.drop(columns=['id'], errors='ignore', inplace=True)

# Assign labels: 1 for Fake and 0 for Real
fake_news['labels'] = 1
real_news['labels'] = 0

# -----------------------------------------------------
# Combine the data based on meta columns
# -----------------------------------------------------
meta_cols = ['title', 'meta_title', 'meta_description', 'news_url']
fake_selected = fake_news[meta_cols + ['labels']]
real_selected = real_news[meta_cols + ['labels']]
data = pd.concat([fake_selected, real_selected], ignore_index=True)

# Drop records missing main text ('title') and fill missing meta fields
data.dropna(subset=['title'], inplace=True)
for col in ['meta_title', 'meta_description', 'news_url']:
    data[col] = data[col].fillna("")

# =============================================================================
# Step 2: Combine Meta Data into a Single Field
# =============================================================================
def combine_metadata(row):
    texts = [str(row.get(col, "")) for col in ['title', 'meta_title', 'meta_description']]
    return " ".join(texts).strip()

data['combined_text'] = data.apply(combine_metadata, axis=1)

# =============================================================================
# Step 3: Extract URL Features
# =============================================================================
def extract_url_features(url):
    if pd.isna(url) or not isinstance(url, str):
        return {'url_length': 0, 'has_query': 0, 'has_hyphens': 0, 'has_numbers': 0}
    return {
        'url_length': len(url),
        'has_query': 1 if '?' in url else 0,
        'has_hyphens': 1 if '-' in url else 0,
        'has_numbers': 1 if any(c.isdigit() for c in url) else 0,
    }

url_features = data['news_url'].apply(extract_url_features)
url_features_df = pd.DataFrame(url_features.tolist())

In [4]:
# =============================================================================
# Step 4: Vectorize Combined Meta Data Text Using TF-IDF
# =============================================================================
vectorizer = TfidfVectorizer(max_features=5000)
meta_text_features_vectorized = vectorizer.fit_transform(data['combined_text']).toarray()

# =============================================================================
# Step 5: Normalize URL Features
# =============================================================================
scaler = StandardScaler()
url_features_scaled = pd.DataFrame(scaler.fit_transform(url_features_df), 
                                   columns=url_features_df.columns)

# Save the scaler and vectorizer for future use
torch.save(scaler, os.path.join(SAVE_DIR, "scaler.pth"))
torch.save(vectorizer, os.path.join(SAVE_DIR, "vectorizer.pth"))
print("Saved scaler and vectorizer")

Saved scaler and vectorizer


In [5]:
# =============================================================================
# Step 6: Align Data Dimensions
# =============================================================================
min_length = min(len(meta_text_features_vectorized), len(url_features_scaled))
meta_text_features_vectorized = meta_text_features_vectorized[:min_length]
url_features_scaled = url_features_scaled.iloc[:min_length]
data = data.iloc[:min_length]

# =============================================================================
# Step 7: Apply SMOTE to Handle Class Imbalance
# =============================================================================
combined_features = np.hstack([meta_text_features_vectorized, url_features_scaled])
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(combined_features, data['labels'])



In [6]:
# =============================================================================
# Step 8: Split Data into Training & Testing Sets
# =============================================================================
num_text_features = meta_text_features_vectorized.shape[1]
train_texts, test_texts, train_labels, test_labels, train_urls, test_urls = train_test_split(
    X_resampled[:, :num_text_features],
    y_resampled,
    X_resampled[:, num_text_features:],
    test_size=0.2,
    random_state=42
)

# Convert labels and URL features to tensors
train_labels = torch.tensor(np.array(train_labels), dtype=torch.long)
test_labels = torch.tensor(np.array(test_labels), dtype=torch.long)
train_urls = torch.tensor(np.array(train_urls), dtype=torch.float)
test_urls = torch.tensor(np.array(test_urls), dtype=torch.float)

In [7]:
# =============================================================================
# Step 9: Tokenize Text for RoBERTa
# =============================================================================
# Since SMOTE was applied on TF-IDF features, we convert these back into text.
train_text_strings = [
    " ".join(vectorizer.inverse_transform(text_row.reshape(1, -1))[0])
    for text_row in train_texts
]
test_text_strings = [
    " ".join(vectorizer.inverse_transform(text_row.reshape(1, -1))[0])
    for text_row in test_texts
]

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(
    train_text_strings,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt"
)
test_encodings = tokenizer(
    test_text_strings,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Save the tokenizer
tokenizer.save_pretrained(SAVE_DIR)
print("Saved tokenizer")

Saved tokenizer


In [8]:
# =============================================================================
# Step 10: Define Custom Dataset Class for Combined Features
# =============================================================================
class CombinedDataset(Dataset):
    def __init__(self, encodings, labels, url_features):
        self.encodings = encodings
        self.labels = labels
        self.url_features = url_features

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        item['url_features'] = self.url_features[idx]
        return item

# Create datasets and dataloaders
train_dataset = CombinedDataset(train_encodings, train_labels, train_urls)
test_dataset = CombinedDataset(test_encodings, test_labels, test_urls)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=16)

In [9]:
# =============================================================================
# Step 11: Define the RoBERTaWithFeatures Model (Including URL Features)
# =============================================================================
class RoBERTaWithFeatures(nn.Module):
    def __init__(self, num_labels=2, url_feature_dim=4):
        super(RoBERTaWithFeatures, self).__init__()
        # Load the RoBERTa encoder
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        hidden_size = self.roberta.config.hidden_size  # typically 768 for roberta-base
        # Fully-connected layer for URL features
        self.url_fc = nn.Linear(url_feature_dim, 32)
        self.dropout = nn.Dropout(0.3)
        # Final classifier head: concatenated features from text and URL (hidden_size + 32)
        self.fc = nn.Linear(hidden_size + 32, num_labels)

    def forward(self, input_ids, attention_mask, url_features):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first token as sentence representation
        pooled_output = outputs[0][:, 0]
        url_emb = torch.relu(self.url_fc(url_features))
        combined = torch.cat((pooled_output, url_emb), dim=1)
        combined = self.dropout(combined)
        logits = self.fc(combined)
        return logits

In [10]:
# =============================================================================
# Step 12: Train the Model
# =============================================================================
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = RoBERTaWithFeatures(num_labels=2, url_feature_dim=4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
loss_fct = nn.CrossEntropyLoss()
num_epochs = 5

print("Starting training...")
for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_dataloader, leave=True, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in loop:
        # Move all batch tensors to the GPU (or CPU)
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(batch['input_ids'], batch['attention_mask'], batch['url_features'])
        loss = loss_fct(logits, batch['labels'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loop.set_postfix(loss=loss.item())
print("Training complete.")

# Save the trained model
model_save_path = os.path.join(SAVE_DIR, "roberta_fakenewsnet_model.pth")
torch.save(model.state_dict(), model_save_path)
print("Saved model")

Using device: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch 1/5: 100%|██████████| 1745/1745 [04:52<00:00,  5.97it/s, loss=1.23] 
Epoch 2/5: 100%|██████████| 1745/1745 [04:51<00:00,  5.98it/s, loss=0.202] 
Epoch 3/5: 100%|██████████| 1745/1745 [04:51<00:00,  5.98it/s, loss=0.0684]
Epoch 4/5: 100%|██████████| 1745/1745 [04:51<00:00,  5.99it/s, loss=0.00437]
Epoch 5/5: 100%|██████████| 1745/1745 [04:51<00:00,  5.99it/s, loss=0.000131]


Training complete.
Saved model


In [11]:
# =============================================================================
# Step 13: Evaluate the Model
# =============================================================================
model.eval()
predictions, true_labels = [], []
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    # Move batch to device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(batch['input_ids'], batch['attention_mask'], batch['url_features'])
    preds = torch.argmax(logits, dim=1)
    predictions.extend(preds.cpu().numpy())
    true_labels.extend(batch["labels"].cpu().numpy())

print("\nEvaluation results:")
print("Accuracy:", accuracy_score(true_labels, predictions))
print(classification_report(true_labels, predictions, target_names=['Real', 'Fake']))

Evaluating: 100%|██████████| 437/437 [00:13<00:00, 32.55it/s]


Evaluation results:
Accuracy: 0.9002436577325498
              precision    recall  f1-score   support

        Real       0.94      0.85      0.89      3471
        Fake       0.87      0.95      0.91      3506

    accuracy                           0.90      6977
   macro avg       0.90      0.90      0.90      6977
weighted avg       0.90      0.90      0.90      6977




