In [25]:
# Fake News Detection Using BERT
# Step-by-Step Implementation in a Jupyter Notebook

# Step 1: Install Required Libraries
# Uncomment the following lines if you haven't installed these libraries yet:
# !pip install transformers
# !pip install datasets
# !pip install torch
# !pip install scikit-learn

# Step 2: Import Libraries
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Step 3: Load Dataset (ensure the file paths are correct)
# Each source file is read, stripped of its optional 'id' column, and tagged
# with its class label in a single chain: 1 marks fake news, 0 marks real news.
fake_news = (
    pd.read_csv('data/fake.csv')  # Replace with the correct path
    .drop(columns=['id'], errors='ignore')  # No error when 'id' is absent
    .assign(labels=1)
)
real_news = (
    pd.read_csv('data/real.csv')  # Replace with the correct path
    .drop(columns=['id'], errors='ignore')  # No error when 'id' is absent
    .assign(labels=0)
)

# Stack the headline/label pairs from both sources into one frame with a
# fresh 0..n-1 index.
data = pd.concat(
    [frame[['title', 'labels']] for frame in (fake_news, real_news)],
    ignore_index=True,
)

# Rows without a headline cannot be tokenized: drop them, then force every
# remaining title to str in case some were parsed as another type.
data = (
    data
    .dropna(subset=['title'])
    .assign(title=lambda frame: frame['title'].astype(str))
)

# Step 4: Split Data into Training and Testing Sets
# 80/20 split; the fixed random_state makes the shuffle repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['title'],
    data['labels'],
    test_size=0.2,
    random_state=42,
)

# Step 5: Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Tokenization Function
def tokenize_function(texts, max_length=512):
    """Tokenize one text or a collection of texts with the notebook's BERT tokenizer.

    Args:
        texts: A single string, or any iterable of strings (list, tuple,
            pandas Series, ...). Anything that is not a plain string is
            materialized into a list before being handed to the tokenizer, so
            callers no longer need to call ``.tolist()`` themselves.
        max_length: Upper bound on the number of tokens per text; longer inputs
            are truncated. Defaults to 512, the value that used to be
            hard-coded in this function.

    Returns:
        The tokenizer's output with PyTorch tensors (``return_tensors="pt"``).
        ``padding=True`` pads every text to the longest one in the call.
    """
    if not isinstance(texts, str):
        # list() discards any pandas index and gives the tokenizer a plain
        # Python list regardless of the container the caller used.
        texts = list(texts)
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Step 6: Tokenize the Data (Ensure text is in correct format)
# train_test_split hands back pandas Series; turn them into plain lists of
# strings before tokenizing.
if isinstance(train_texts, pd.Series):
    train_texts = train_texts.tolist()

if isinstance(test_texts, pd.Series):
    test_texts = test_texts.tolist()

# Print some samples to verify
print("Sample train texts:", train_texts[:5])  # Print first 5 samples
print("Sample test texts:", test_texts[:5])    # Print first 5 samples

# Tokenize each split exactly once. tokenize_function is already defined in
# Step 5, so it is not re-defined here, and the encodings are not recomputed
# at the end of the cell.
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Debugging: Check the shape and keys of the encodings
print("Train encodings keys:", train_encodings.keys())
print("Length of input_ids:", len(train_encodings['input_ids']))
print("Length of attention_mask:", len(train_encodings['attention_mask']))

# Check the structure and length of labels
print("Length of train_labels:", len(train_labels))
print("Sample labels:", train_labels[:5])  # Print the first 5 labels for inspection

# Verify that the lengths match for both splits
assert len(train_encodings['input_ids']) == len(train_labels), \
    f"Length mismatch: Encodings: {len(train_encodings['input_ids'])}, Labels: {len(train_labels)}"
assert len(test_encodings['input_ids']) == len(test_labels), \
    f"Length mismatch: Encodings: {len(test_encodings['input_ids'])}, Labels: {len(test_labels)}"




Sample train texts: ['Alec Baldwin Admits He\'s "Bullied Women," Calls for a Change in Hollywood', 'Sorry Everyone, Dean Unglert Is Probably Not The Next Bachelor · Betches', "Ashley Graham's newest swimwear line uses unedited photos", 'Haunted Hollywood: Tinseltown Terrors, Filmdom Phantoms, and Movieland Mayhem, Second Edition: Ogden: 9781493015771: Amazon.com: Books', 'Bobbi Kristina Brown']
Sample test texts: ['Will Bindi Irwin Get Married to Boyfriend Chandler Powell?', 'When is The Crown season 3 on Netflix? Who is in the cast, and what is going to happen?', '8 Super Relatable Products You’ll Find in Khloe Kardashian’s Purse', "Chrissy Metz Almost Got Lube On Hugh Jackman's Suit At The MTV Movie Awards", 'Billionaire Wissam Al Mana Sends Message About Split From Janet Jackson']
Train encodings keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Length of input_ids: 18556
Length of attention_mask: 18556
Length of train_labels: 18556
Sample labels: 9251     0
15039  

In [27]:
# Debugging: show which tensors the tokenizer returned and how many rows each holds.
# This inspects train_encodings / train_labels as left behind by the previous cell.
print("Train encodings keys:", train_encodings.keys())
for caption, field in (
    ("Length of input_ids:", 'input_ids'),
    ("Length of attention_mask:", 'attention_mask'),
):
    print(caption, len(train_encodings[field]))

# Labels: row count plus a peek at the first five entries
print("Length of train_labels:", len(train_labels))
print("Sample labels:", train_labels[:5])

# Every encoded row needs exactly one label
rows_match = len(train_encodings['input_ids']) == len(train_labels)
assert rows_match, \
    f"Length mismatch: Encodings: {len(train_encodings['input_ids'])}, Labels: {len(train_labels)}"

# Step 7: Rebuild the split so texts, labels, and encodings line up by position
# Start again from `data`; the fixed random_state reproduces the same 80/20 split
# on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['title'], data['labels'], test_size=0.2, random_state=42
)

# train_test_split keeps the shuffled original index on every Series it returns.
# Reset it on texts and labels alike so that position i refers to the same row
# in all of them (previously only the labels were reset, via a typo'd alias and
# a discarded reset_index call).
train_texts = train_texts.reset_index(drop=True)
test_texts = test_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)

# Tokenize the texts; row i of each encoding tensor corresponds to text i.
train_encodings = tokenize_function(train_texts.tolist())
test_encodings = tokenize_function(test_texts.tolist())

# Keep the encodings as plain dicts of tensors, as before, but without slicing:
# slicing to len(labels) was a no-op when lengths agree and would only have
# hidden a real mismatch by silently truncating.
train_encodings = {key: val for key, val in train_encodings.items()}
test_encodings = {key: val for key, val in test_encodings.items()}

# Fail loudly if any encoding field does not have exactly one row per label.
assert all(len(val) == len(train_labels) for val in train_encodings.values()), \
    "Train encodings and train_labels have different lengths"
assert all(len(val) == len(test_labels) for val in test_encodings.values()), \
    "Test encodings and test_labels have different lengths"

# Step 3: Define the Dataset class
class FakeNewsDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one integer label per row.

    Args:
        encodings: Mapping from field name to an indexable of per-row values;
            it must contain ``'input_ids'``, and ``encodings[key][i]`` is taken
            as row ``i`` of that field.
        labels: Integer class labels, one per row, in the same order as the
            encodings. May be a list, NumPy array, tensor, or pandas Series.
            A Series is read by position, so its index does not need to be
            reset beforehand.

    Raises:
        ValueError: If the number of ``input_ids`` rows differs from the
            number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings

        # Indexing a pandas Series with an int looks the value up by index
        # label, not by position. After a shuffled split that is either a
        # KeyError or the wrong row, so strip the index and store the labels
        # as a positional long tensor.
        if hasattr(labels, 'to_numpy'):
            labels = labels.to_numpy()
        self.labels = torch.as_tensor(labels, dtype=torch.long)

        # Ensure the lengths of encodings and labels match
        if len(self.encodings['input_ids']) != len(self.labels):
            raise ValueError(f"Mismatch: encodings length ({len(self.encodings['input_ids'])}) does not match labels length ({len(self.labels)})")

    def __len__(self):
        """Return the number of labelled rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of every encoding field plus ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        # clone() hands back an independent tensor rather than a view into
        # the stored label tensor.
        item['labels'] = self.labels[idx].clone()
        return item

# Step 4: Create Dataset Objects for Training and Testing
train_dataset = FakeNewsDataset(train_encodings, train_labels)
test_dataset = FakeNewsDataset(test_encodings, test_labels)

# Step 5: Create the DataLoader with the dataset
# DataLoader lives in torch.utils.data and is imported with the other imports at
# the top of the notebook, so this cell also runs on a fresh kernel.
# This loader is only used for the smoke check below.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Check the first batch to ensure correct data loading
for batch in train_dataloader:
    print("Batch input ids:", batch['input_ids'].shape)
    print("Batch labels:", batch['labels'].shape)
    # Each example in the batch must come with exactly one label.
    assert batch['input_ids'].shape[0] == batch['labels'].shape[0], \
        "Batch size of input_ids and labels differ"
    break  # Check the first batch and stop


# Step 8: Load BERT Model for Sequence Classification
# num_labels=2 matches the two label values used in this notebook (0 and 1).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 9: Set Up Training Arguments
# NOTE(review): recent transformers releases name the first schedule argument
# `eval_strategy` instead of `evaluation_strategy` — confirm against the
# installed version before upgrading.
training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir='./results',          # model checkpoints
    logging_dir='./logs',            # training logs
    # --- schedule ---
    num_train_epochs=3,              # passes over the training set
    evaluation_strategy="epoch",     # evaluate once per epoch
    save_strategy="epoch",           # checkpoint once per epoch
    logging_steps=10,                # log every 10 steps
    # --- optimization ---
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Step 10: Set Up Trainer
# The held-out test split doubles as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Step 11: Train the Model
trainer.train()

# Step 12: Evaluate the Model
# predict() returns per-class scores for each test row; the predicted class is
# the column with the highest score.
predictions = trainer.predict(test_dataset)
pred_labels = torch.tensor(predictions.predictions).argmax(dim=1)

# Step 13: Print Evaluation Metrics
# target_names follows the label encoding: index 0 -> 'Real', index 1 -> 'Fake'.
print("Accuracy:", accuracy_score(y_true=test_labels, y_pred=pred_labels))
print(
    classification_report(
        y_true=test_labels,
        y_pred=pred_labels,
        target_names=['Real', 'Fake'],
    )
)

# Step 14: Save the Trained Model (Optional)
# Persist the fine-tuned weights and the tokenizer files so both can be reloaded later.
# The tokenizer call stays last so the cell still displays the saved file paths.
model.save_pretrained('fake_news_bert_model')
tokenizer.save_pretrained('fake_news_bert_tokenizer')

Train encodings keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Length of input_ids: 18556
Length of attention_mask: 18556
Length of train_labels: 18556
Sample labels: 0    0
1    0
2    0
3    0
4    0
Name: labels, dtype: int64
Batch input ids: torch.Size([16, 72])
Batch labels: torch.Size([16])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3769,0.356508
2,0.3562,0.35164
3,0.2381,0.42128


Accuracy: 0.8620689655172413
              precision    recall  f1-score   support

        Real       0.90      0.92      0.91      3481
        Fake       0.75      0.68      0.71      1159

    accuracy                           0.86      4640
   macro avg       0.82      0.80      0.81      4640
weighted avg       0.86      0.86      0.86      4640



('fake_news_bert_tokenizer\\tokenizer_config.json',
 'fake_news_bert_tokenizer\\special_tokens_map.json',
 'fake_news_bert_tokenizer\\vocab.txt',
 'fake_news_bert_tokenizer\\added_tokens.json')