In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load and Prepare Data
fake_news = pd.read_csv('data/fake.csv')  # Replace with the correct path
real_news = pd.read_csv('data/real.csv')  # Replace with the correct path

# Drop the 'id' column (ensure 'id' is the column you want to remove)
fake_news = fake_news.drop(columns=['id'], errors='ignore')  # Ignores if 'id' does not exist
real_news = real_news.drop(columns=['id'], errors='ignore')  # Ignores if 'id' does not exist

# Add labels: 1 for fake, 0 for real
fake_news['labels'] = 1
real_news['labels'] = 0

# Combine datasets into one dataframe
data = pd.concat([fake_news[['title', 'labels', 'news_url']], real_news[['title', 'labels', 'news_url']]], ignore_index=True)

# Clean the data by removing any NaN values in the 'title' column
data.dropna(subset=['title'], inplace=True)

# Convert the titles to strings explicitly (in case some are not strings)
data['title'] = data['title'].astype(str)

# Step 2: Split Data into Training and Testing Sets
train_texts, test_texts, train_labels, test_labels, train_urls, test_urls = train_test_split(
    data['title'], data['labels'], data['news_url'], test_size=0.2, random_state=42
)

# Step 3: Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization Function
def tokenize_function(texts):
    return tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Ensure train_texts and test_texts are lists of strings
if isinstance(train_texts, pd.Series):
    train_texts = train_texts.tolist()

if isinstance(test_texts, pd.Series):
    test_texts = test_texts.tolist()

# Tokenize the data
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Step 4: Create Dataset Class
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels, urls):
        self.encodings = encodings
        self.labels = labels
        self.urls = urls

        # Ensure the lengths of encodings, labels, and urls match
        if len(self.encodings['input_ids']) != len(self.labels) or len(self.labels) != len(self.urls):
            raise ValueError(f"Mismatch: encodings length ({len(self.encodings['input_ids'])}) does not match labels length ({len(self.labels)})")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        item['news_url'] = self.urls.iloc[idx]
        return item

# Step 5: Create Dataset Objects for Training and Testing
train_dataset = FakeNewsDataset(train_encodings, train_labels, train_urls)
test_dataset = FakeNewsDataset(test_encodings, test_labels, test_urls)

# Step 6: Create the DataLoader with the dataset
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Check the first batch to ensure correct data loading
for batch in train_dataloader:
    print("Batch input ids:", batch['input_ids'].shape)
    print("Batch labels:", batch['labels'].shape)
    print("Batch URLs:", batch['news_url'])  # Display URLs for the first batch
    break  # Check the first batch and stop

# Step 7: Load BERT Model for Sequence Classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 8: Set Up Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for model checkpoints
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    save_strategy="epoch",           # Save the model every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Weight decay strength
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
)

# Step 9: Set Up Trainer
trainer = Trainer(
    model=model,                         # Pretrained BERT model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset            # Evaluation dataset
)

# Step 10: Train the Model
trainer.train()

# Step 11: Evaluate the Model
predictions = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(predictions.predictions), dim=1)

# Step 12: Print Evaluation Metrics
print("Accuracy:", accuracy_score(test_labels, pred_labels))
print(classification_report(test_labels, pred_labels, target_names=['Real', 'Fake']))

# Step 13: Save the Trained Model (Optional)
# You can save the trained model to disk if you want to reload it later.
model.save_pretrained('fake_news_bert_model')
tokenizer.save_pretrained('fake_news_bert_tokenizer')


Batch input ids: torch.Size([16, 72])
Batch labels: torch.Size([16])
Batch URLs: ['www.usmagazine.com/celebrity-news/news/o-j-simpson-im-not-khloe-kardashians-dad/', 'https://people.com/tv/wells-adams-sarah-hyland-relationship-weirdly-normal/', 'www.ok.co.uk/celebrity-news/1234596/margot-robbie-pregnant-husband-tom-ackerley-expecting-baby-child-i-tonya-wolf-wall-street', 'https://www.tmz.com/2018/07/20/george-clooney-work-scooter-accident/', 'http://www.k92radio.com/news/patricia-clarkson-gets-very-candid-about-justin-timberlakes-penis-size', 'https://www.refinery29.com/en-us/2018/02/190315/julianne-moore-defends-alicia-vikander-on-set', 'hollywoodlife.com/2010/11/03/gwen-stefani-miscarriage-gavin-rossdale-national-enquirer-adoption/', 'thehill.com/policy/national-security/355749-fbi-uncovered-russian-bribery-plot-before-obama-administration', 'https://www.esquire.com/style/mens-fashion/advice/g524/kentucky-derby-attire/', 'https://medium.com/@ChristianWelch18837Uj/heres-why-paris-jack

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3833,0.349719
2,0.3604,0.354216
3,0.2077,0.435459


Accuracy: 0.8625
              precision    recall  f1-score   support

        Real       0.90      0.92      0.91      3481
        Fake       0.74      0.69      0.72      1159

    accuracy                           0.86      4640
   macro avg       0.82      0.81      0.81      4640
weighted avg       0.86      0.86      0.86      4640



('fake_news_bert_tokenizer\\tokenizer_config.json',
 'fake_news_bert_tokenizer\\special_tokens_map.json',
 'fake_news_bert_tokenizer\\vocab.txt',
 'fake_news_bert_tokenizer\\added_tokens.json')