In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import tldextract
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [2]:
# Step 1: Load and Prepare Data
fake_news = pd.read_csv('data/fake_gossipcop.csv')  # Replace with actual path
real_news = pd.read_csv('data/real_gossipcop.csv')  # Replace with actual path

fake_news.drop(columns=['id'], errors='ignore', inplace=True)
real_news.drop(columns=['id'], errors='ignore', inplace=True)
fake_news['labels'] = 1
real_news['labels'] = 0

# Combine and clean data
data = pd.concat([fake_news[['title', 'labels', 'news_url']], real_news[['title', 'labels', 'news_url']]], ignore_index=True)
data.dropna(subset=['title'], inplace=True)  # Drop missing titles

# Step 2: Extract URL Features
def extract_url_features(url):
    if pd.isna(url) or not isinstance(url, str):
        return {'url_length': 0, 'has_query': 0, 'has_hyphens': 0, 'has_numbers': 0}
    
    ext = tldextract.extract(url)
    return {
        'url_length': len(url),
        'has_query': 1 if '?' in url else 0,
        'has_hyphens': 1 if '-' in url else 0,
        'has_numbers': 1 if any(c.isdigit() for c in url) else 0,
    }

url_features = data['news_url'].apply(extract_url_features)
url_features_df = pd.DataFrame(url_features.tolist())

# Normalize URL features
scaler = StandardScaler()
url_features_scaled = pd.DataFrame(scaler.fit_transform(url_features_df), columns=url_features_df.columns)

In [3]:
# Step 3: Vectorize Titles using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
title_features_vectorized = vectorizer.fit_transform(data['title']).toarray()

# Step 4: Align Data
min_length = min(len(title_features_vectorized), len(url_features_scaled))
title_features_vectorized = title_features_vectorized[:min_length]
url_features_scaled = url_features_scaled.iloc[:min_length]
data = data.iloc[:min_length]

# Step 5: Apply SMOTE
combined_features = np.hstack([title_features_vectorized, url_features_scaled])
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(combined_features, data['labels'])

# Step 6: Split Data into Training & Testing
train_texts, test_texts, train_labels, test_labels, train_urls, test_urls = train_test_split(
    X_resampled[:, :title_features_vectorized.shape[1]], 
    y_resampled, 
    X_resampled[:, title_features_vectorized.shape[1]:], 
    test_size=0.2, 
    random_state=42
)

# Convert labels and url_features to tensors
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)
train_urls = torch.tensor(train_urls, dtype=torch.float)
test_urls = torch.tensor(test_urls, dtype=torch.float)



In [4]:
# Step 7: Tokenize Text for RoBERTa
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(
    [" ".join(vectorizer.inverse_transform(text_row.reshape(1, -1))[0]) for text_row in train_texts],
    truncation=True, padding=True, max_length=512, return_tensors="pt"
)
test_encodings = tokenizer(
    [" ".join(vectorizer.inverse_transform(text_row.reshape(1, -1))[0]) for text_row in test_texts],
    truncation=True, padding=True, max_length=512, return_tensors="pt"
)

# Step 8: Custom Dataset Class
class CombinedDataset(Dataset):
    def __init__(self, encodings, labels, url_features):
        self.encodings = encodings
        self.labels = labels
        self.url_features = url_features

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        item['url_features'] = self.url_features[idx]
        return item

# Create datasets
train_dataset = CombinedDataset(train_encodings, train_labels, train_urls)
test_dataset = CombinedDataset(test_encodings, test_labels, test_urls)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=16)

In [5]:
# Step 9: Define RoBERTaWithFeatures Model
class RoBERTaWithFeatures(nn.Module):
    def __init__(self, num_labels=2, url_feature_dim=4):  
        super(RoBERTaWithFeatures, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
        self.url_fc = nn.Linear(url_feature_dim, 32)  
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(32 + 768, num_labels)  

    def forward(self, input_ids, attention_mask, url_features):
        outputs = self.roberta.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Take [CLS] token representation

        url_emb = torch.relu(self.url_fc(url_features))  
        combined = torch.cat((pooled_output, url_emb), dim=1)  
        combined = self.dropout(combined)
        logits = self.fc(combined)
        return logits

In [6]:
# Step 10: Train Model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = RoBERTaWithFeatures().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

num_epochs = 5
loss_fct = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_dataloader, leave=True)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        logits = model(batch['input_ids'], batch['attention_mask'], batch['url_features'])
        loss = loss_fct(logits, batch['labels'])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}/{num_epochs}")
        loop.set_postfix(loss=loss.item())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|█████████████████████████████████████████████████████| 1682/1682 [2:49:11<00:00,  6.04s/it, loss=0.472]
Epoch 2/5: 100%|█████████████████████████████████████████████████████| 1682/1682 [2:49:05<00:00,  6.03s/it, loss=0.268]
Epoch 3/5: 100%|████████████████████████████████████████████████████| 1682/1682 [2:47:41<00:00,  5.98s/it, loss=0.0464]
Epoch 4/5: 100%|█████████████████████████████████████████████████████| 1682/1682 [2:48:52<00:00,  6.02s/it, loss=0.191]
Epoch 5/5: 100%|█████████████████████████████████████████████████████| 1682/1682 [2:49:21<00:00,  6.04s/it, loss=0.231]


In [7]:
# Step 11: Evaluate Model
model.eval()
predictions, true_labels = [], []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(batch['input_ids'], batch['attention_mask'], batch['url_features'])
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch["labels"].cpu().numpy())

# Display Metrics
print("Accuracy:", accuracy_score(true_labels, predictions))
print(classification_report(true_labels, predictions, target_names=['Real', 'Fake']))

# Save Model and Tokenizer
torch.save(model.state_dict(), "fake_news_roberta_model-gossicop.pth")
tokenizer.save_pretrained("roberta_tokenizer")

print("Model and tokenizer saved successfully!")

Accuracy: 0.9064962093057827
              precision    recall  f1-score   support

        Real       0.89      0.93      0.91      3380
        Fake       0.93      0.88      0.90      3347

    accuracy                           0.91      6727
   macro avg       0.91      0.91      0.91      6727
weighted avg       0.91      0.91      0.91      6727

Model and tokenizer saved successfully!
