In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import tldextract
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Load and Prepare Data
news_data = pd.read_csv('data/news_articles_with_metadata.csv')  # Update with actual path

# Drop unnecessary columns
news_data = news_data.drop(columns=['author', 'published', 'text', 'language', 'main_img_url', 'type', 
                                    'title_without_stopwords', 'text_without_stopwords', 'hasImage'], errors='ignore')

# Fix label inconsistencies
news_data = news_data[news_data['label'].apply(lambda x: isinstance(x, str))]
news_data['label'] = news_data['label'].replace('Realk', 'Real')  # Fix any typos

# Convert labels: 1 for Fake, 0 for Real
news_data['labels'] = news_data['label'].apply(lambda x: 1 if x.lower() == 'fake' else 0)

# Keep only relevant columns
data = news_data[['title', 'labels', 'site_url', 'meta_title', 'meta_description']]
data.dropna(subset=['title'], inplace=True)
data['title'] = data['title'].astype(str)  # Ensure titles are strings

# Step 2: Extract URL Features
def extract_url_features(url):
    if pd.isna(url) or not isinstance(url, str):
        return {'url_length': 0, 'has_query': 0, 'has_hyphens': 0, 'has_numbers': 0}
    
    ext = tldextract.extract(url)
    return {
        'url_length': len(url),
        'has_query': 1 if '?' in url else 0,
        'has_hyphens': 1 if '-' in url else 0,
        'has_numbers': 1 if any(c.isdigit() for c in url) else 0,
    }

url_features = data['site_url'].apply(extract_url_features)
url_features_df = pd.DataFrame(url_features.tolist())

# Normalize URL features
scaler = StandardScaler()
url_features_scaled = pd.DataFrame(scaler.fit_transform(url_features_df), columns=url_features_df.columns)

# Step 3: Vectorize Titles using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
title_features_vectorized = vectorizer.fit_transform(data['title'] + ' ' + data['meta_title'].fillna('') + ' ' + data['meta_description'].fillna('')).toarray()

# Step 4: Align Data
min_length = min(len(title_features_vectorized), len(url_features_scaled))
title_features_vectorized = title_features_vectorized[:min_length]
url_features_scaled = url_features_scaled.iloc[:min_length]
data = data.iloc[:min_length]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['title'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['title'] = data['title'].astype(str)  # Ensure titles are strings


In [5]:
# Step 5: Apply SMOTE
combined_features = np.hstack([title_features_vectorized, url_features_scaled])
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(combined_features, data['labels'])

# Step 6: Split Data into Training & Testing
train_texts, test_texts, train_labels, test_labels, train_urls, test_urls = train_test_split(
    X_resampled[:, :title_features_vectorized.shape[1]], 
    y_resampled, 
    X_resampled[:, title_features_vectorized.shape[1]:], 
    test_size=0.2, 
    random_state=42
)

# Convert labels and URL features to tensors
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)
train_urls = torch.tensor(train_urls, dtype=torch.float)
test_urls = torch.tensor(test_urls, dtype=torch.float)



In [6]:
# Step 7: Tokenize Text for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(
    [" ".join(vectorizer.inverse_transform(text_row.reshape(1, -1))[0]) for text_row in train_texts],
    truncation=True, padding=True, max_length=512, return_tensors="pt"
)
test_encodings = tokenizer(
    [" ".join(vectorizer.inverse_transform(text_row.reshape(1, -1))[0]) for text_row in test_texts],
    truncation=True, padding=True, max_length=512, return_tensors="pt"
)

In [7]:
# Step 8: Custom Dataset Class
class CombinedDataset(Dataset):
    def __init__(self, encodings, labels, url_features):
        self.encodings = encodings
        self.labels = labels
        self.url_features = url_features

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        item['url_features'] = self.url_features[idx]
        return item

# Create datasets
train_dataset = CombinedDataset(train_encodings, train_labels, train_urls)
test_dataset = CombinedDataset(test_encodings, test_labels, test_urls)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=16)

# Step 9: Define BertWithFeatures Model
class BertWithFeatures(nn.Module):
    def __init__(self, num_labels=2, url_feature_dim=4):  
        super(BertWithFeatures, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
        self.url_fc = nn.Linear(url_feature_dim, 32)  
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(32 + 768, num_labels)  

    def forward(self, input_ids, attention_mask, url_features):
        outputs = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  

        url_emb = torch.relu(self.url_fc(url_features))  
        combined = torch.cat((pooled_output, url_emb), dim=1)  
        combined = self.dropout(combined)
        logits = self.fc(combined)
        return logits


In [8]:
# Step 10: Train Model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BertWithFeatures().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

num_epochs = 5
loss_fct = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_dataloader, leave=True)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        logits = model(batch['input_ids'], batch['attention_mask'], batch['url_features'])
        loss = loss_fct(logits, batch['labels'])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}/{num_epochs}")
        loop.set_postfix(loss=loss.item())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 130/130 [00:07<00:00, 16.85it/s, loss=0.132]
Epoch 2/5: 100%|██████████| 130/130 [00:07<00:00, 17.59it/s, loss=0.117] 
Epoch 3/5: 100%|██████████| 130/130 [00:07<00:00, 17.65it/s, loss=0.171]  
Epoch 4/5: 100%|██████████| 130/130 [00:07<00:00, 17.60it/s, loss=0.00148]
Epoch 5/5: 100%|██████████| 130/130 [00:07<00:00, 17.62it/s, loss=0.0119] 


In [9]:
# Step 11: Evaluate Model
model.eval()
predictions, true_labels = [], []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(batch['input_ids'], batch['attention_mask'], batch['url_features'])
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch["labels"].cpu().numpy())

# Display Metrics
print("Accuracy:", accuracy_score(true_labels, predictions))
print(classification_report(true_labels, predictions, target_names=['Real', 'Fake']))

# Save the model state dictionary
model_save_path = "optimal_bert_with_features_newsarticle.pth"
torch.save(model.state_dict(), model_save_path)

# Save the tokenizer
tokenizer_save_path = "tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)

print("Model and tokenizer saved successfully!")


Accuracy: 0.9073359073359073
              precision    recall  f1-score   support

        Real       0.88      0.94      0.91       255
        Fake       0.94      0.87      0.91       263

    accuracy                           0.91       518
   macro avg       0.91      0.91      0.91       518
weighted avg       0.91      0.91      0.91       518

Model and tokenizer saved successfully!
