In [1]:
# Ensure compatible versions of libraries are installed
!pip install --upgrade transformers datasets scikit-learn pandas lightgbm numpy



In [2]:
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import pandas as pd
import lightgbm as lgb

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Datasets/combined/combined_dataset_200samples_50-50split.csv')

# Preprocess labels
df['label'] = df['label'].map({'real': 0, 'fake': 1})

# Optional: concatenate header + article
df['text'] = df['header'] + " " + df['article']

raw_dataset = Dataset.from_dict({'text': df['text'].tolist(), 'label': df['label'].tolist()})

In [4]:
# Define the train-test split ratio
train_test_split_ratio = 0.2

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=train_test_split_ratio,
    stratify=df['label'].tolist(), # ensures that the proportion of classes in your training and test sets is the same as in the original dataset, preventing biased splits
    random_state=42
)

# Create Dataset objects for training and validation sets
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset  # It's common to use 'validation' instead of 'test' for the split used during training
})

print("Dataset structure:")
print(dataset_dict)
print("\nExample from training set:")
print(dataset_dict['train'][0])
print("\nExample from validation set:")
print(dataset_dict['validation'][0])


Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 40
    })
})

Example from training set:
{'text': 'Was, wenn er nicht geht? Mehr als 13 Millionen Menschen lebten 2019 in Armut – so viele, wie seit der Wiedervereinigung nicht mehr. Laut Experten wird die Pandemie das Problem weiter verschärfen.\nDie Armut in Deutschland befindet sich derzeit auf einem Rekordhoch. Wie aus dem Armutsbericht des Paritätischen Wohlfahrtsverbandes hervorgeht, lebten im vergangenen Jahr 13,2 Millionen Menschen in Armut. Damit habe die Quote mit 15,9 Prozent den höchsten Wert seit der Wiedervereinigung erreicht. \nBei den besonders betroffenen Gruppen, wie Alleinerziehenden, Arbeitslosen und kinderreichen Familien, habe die Armut von 2018 auf 2019 noch einmal zugenommen. Jeder dritte erwachsene Arme ist dem Bericht zufolge erwerbstätig (33 Prozent) oder in Ren

In [5]:
# --- 3. Load Tokenizer and Model ---

# Use AutoModel instead of AutoModelForSequenceClassification
# because we want raw embeddings, not the classification head.

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# --- 4. Preprocess the Dataset (Tokenization and Truncation) ---

def preprocess_function(entry):
    return tokenizer(entry["text"],
                     truncation=True,
                     padding="max_length", # fill [PAD] token to end of sequence
                     max_length=tokenizer.model_max_length # 512 in this case
                     )


tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)
print("Tokenized dataset structure (uncut):")
print(tokenized_datasets)

tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # for hugging face trainer
tokenized_datasets.set_format("torch")

print("\nTokenized dataset structure:")
print(tokenized_datasets)

first_entry = tokenized_datasets['train'][0]

print("\nExample of tokenized training data (first entry):\n")

print(f"Input IDs shape: {first_entry['input_ids'].shape}")
print(f"Attention Mask shape: {first_entry['attention_mask'].shape}")
print(f"Labels: {first_entry['labels']}")

print("\nDecoded text from input_ids (first 100 chars):")
print(tokenizer.decode(first_entry['input_ids'], skip_special_tokens=True))

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Tokenized dataset structure (uncut):
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40
    })
})

Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40
    })
})

Example of tokenized training data (first entry):

Input IDs shape: torch.Size([512])
Attention Mask shape: torch.Size([512])
Labels: 0

Decoded text from input_ids (first 100 chars):
was, wenn er nicht geht? mehr als 13 millionen menschen lebten 2019 in armut – so viele, wie seit der wiedervereinigung nicht mehr. laut experten wird die pandemie das problem 

In [18]:
# --- 5. Custom Function to Extract Embeddings from BERT ---

# 2. DataLoader for Efficient Batch Processing
from torch.utils.data import DataLoader

# Custom collate_fn to handle the tokenized dataset
# The default collator works fine if you set_format("torch")
# But for clarity, we can define it.
def collate_fn(batch):
    # This function takes a list of individual dataset items (dictionaries)
    # and stacks their components into tensors suitable for a batch.
    # It ensures that input_ids, attention_mask are tensors if they aren't already
    input_ids = torch.stack([torch.tensor(item['input_ids']) if not isinstance(item['input_ids'], torch.Tensor) else item['input_ids'] for item in batch])
    attention_mask = torch.stack([torch.tensor(item['attention_mask']) if not isinstance(item['attention_mask'], torch.Tensor) else item['attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch]) # Labels are usually single integers, so directly convert to tensor
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


def get_bert_embeddings(model, tokenizer, dataset, batch_size=32):
    # 1. Setup and Device Placement
    model.eval() # Set model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_embeddings = [] # List to store extracted embeddings from all batches
    all_labels = []     # List to store corresponding labels from all batches


    # Creates an iterable over your dataset that yields batches of data
    data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

    # 3. Inference Loop (No Gradient Calculation)
    with torch.no_grad(): # Disable gradient calculation for inference
        for batch in data_loader: # Iterate over batches provided by the DataLoader
            # Move the batch tensors to the appropriate device (GPU/CPU)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # 4. BERT Model Forward Pass for Hidden States
            # Get model outputs. output_hidden_states=True is crucial to get all layer outputs.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)

            # outputs.hidden_states is a tuple of 13 tensors for BERT-base:
            # - Element 0: Embedding layer output (before the first Transformer layer)
            # - Elements 1 to 12: Outputs of the 12 Transformer encoder layers
            # We want the last 4 layers, which represent the most abstract and contextualized representations.
            # So, indices -1, -2, -3, -4 correspond to the last four encoder layers.
            last_4_layers_hidden_states = outputs.hidden_states[-4:] # Tuple of 4 tensors, each (batch_size, seq_len, hidden_size)

            # 5. Aggregating Across Layers (Max Pooling)
            # Stacks the 4 tensors along a new dimension (dimension 0).
            # Shape changes from (4 x (batch_size, seq_len, hidden_size)) to (4, batch_size, seq_len, hidden_size)
            stacked_hidden_states = torch.stack(last_4_layers_hidden_states)

            # Takes the maximum value element-wise across the new 'layer' dimension (dim=0).
            # This means for each token and each hidden dimension, it selects the maximum value
            # from its representation in the last 4 layers.
            # Resulting shape: (batch_size, seq_len, hidden_size)
            max_pooled_layers = torch.max(stacked_hidden_states, dim=0).values

            # 6. Aggregating Across Tokens (Attention Masking and Average Pooling)
            # The max_pooled_layers still contain embeddings for padding tokens (0s).
            # We need to exclude these when averaging to get a meaningful sentence embedding.

            # Expands the attention mask (batch_size, seq_len) to match the hidden_size dimension.
            # Shape becomes (batch_size, seq_len, 1), then expanded to (batch_size, seq_len, hidden_size).
            # This allows element-wise multiplication with the embeddings.
            attention_mask_expanded = attention_mask.unsqueeze(-1).expand_as(max_pooled_layers)

            # Multiplies the embeddings by the expanded attention mask.
            # This sets the embeddings of padding tokens to zero, effectively ignoring them.
            masked_embeddings = max_pooled_layers * attention_mask_expanded

            # Sums the embeddings along the sequence length dimension (dim=1) for each sample in the batch.
            # This gives a single vector for each sequence representing the sum of its token embeddings.
            # Resulting shape: (batch_size, hidden_size)
            sum_embeddings = torch.sum(masked_embeddings, dim=1)

            # Counts the number of non-padding tokens in each sequence.
            # Summing the attention mask (which contains 1s for real tokens and 0s for padding)
            # gives the actual length of each original sequence.
            # unsqueeze(-1) adds a dimension for broadcasting during division.
            # Shape: (batch_size, 1)
            num_tokens = torch.sum(attention_mask, dim=1).unsqueeze(-1)

            # Prevents division by zero if an attention mask somehow ends up with zero tokens (unlikely for valid inputs).
            num_tokens = torch.clamp(num_tokens, min=1e-9)

            # Calculates the average embedding for each sequence by dividing the sum of embeddings
            # by the number of actual tokens. This is the final document-level embedding.
            # Resulting shape: (batch_size, hidden_size)
            sentence_embeddings = sum_embeddings / num_tokens

            # 7. Collect Results
            # Appends the numpy representation of the batch's sentence embeddings to a list.
            all_embeddings.append(sentence_embeddings.cpu().numpy())
            # Appends the numpy representation of the batch's labels to a list.
            all_labels.append(labels.cpu().numpy())

    # 8. Final Output
    # Stacks all collected batch embeddings vertically to form a single NumPy array.
    # Resulting shape: (total_samples, hidden_size)
    # Stacks all collected batch labels horizontally to form a single 1D NumPy array.
    # Resulting shape: (total_samples,)
    return np.vstack(all_embeddings), np.hstack(all_labels)


# --- How the function is called ---

# 9. Extracting Embeddings for Training Data
print("\nExtracting BERT embeddings for training data...")
# Calls the function for your training dataset.
# `fine_tuned_bert_embeddings_extractor` is the BERT model backbone (AutoModel)
# that has been fine-tuned on your fake news data.
train_embeddings, train_labels = get_bert_embeddings(bert_model, tokenizer, tokenized_datasets['train'])
print(f"Train Embeddings shape: {train_embeddings.shape}") # Prints the shape of the resulting array (e.g., (N_train_samples, 768))
print(f"Train Labels shape: {train_labels.shape}")       # Prints the shape of the labels array (e.g., (N_train_samples,))

# 10. Extracting Embeddings for Test Data
print("\nExtracting BERT embeddings for test data...")
# Calls the function similarly for your test dataset.
# The 'validation' key here should correspond to your test split (as per your DatasetDict)
test_embeddings, test_labels = get_bert_embeddings(bert_model, tokenizer, tokenized_datasets['validation'])
print(f"Test Embeddings shape: {test_embeddings.shape}") # Prints the shape (e.g., (N_test_samples, 768))
print(f"Test Labels shape: {test_labels.shape}")       # Prints the shape (e.g., (N_test_samples,))


Extracting BERT embeddings for training data...
Train Embeddings shape: (160, 768)
Train Labels shape: (160,)

Extracting BERT embeddings for test data...
Test Embeddings shape: (40, 768)
Test Labels shape: (40,)


In [12]:
# --- 6. Train LightGBM Model ---
print("\nTraining LightGBM model...")

# Initialize LightGBM classifier
# You can tune these parameters further
lgb_clf = lgb.LGBMClassifier(objective='binary',
                             metric='binary_logloss',
                             n_estimators=1000, # Number of boosting rounds
                             learning_rate=0.05,
                             num_leaves=31,
                             max_depth=-1,
                             random_state=42,
                             n_jobs=-1, # Use all available cores
                             colsample_bytree=0.8,
                             subsample=0.8,
                             reg_alpha=0.1,
                             reg_lambda=0.1)

# Train the model
lgb_clf.fit(train_embeddings, train_labels)

print("LightGBM training complete.")


Training LightGBM model...
[LightGBM] [Info] Number of positive: 80, number of negative: 80
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 42240
[LightGBM] [Info] Number of data points in the train set: 160, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM training complete.


In [13]:
# --- 7. Evaluate LightGBM Model ---
print("\nEvaluating LightGBM model on test embeddings...")

# Make predictions
lgb_predictions_proba = lgb_clf.predict_proba(test_embeddings)[:, 1] # Probability of positive class
lgb_predictions = (lgb_predictions_proba > 0.5).astype(int) # Convert probabilities to binary predictions

# Calculate metrics
lgb_accuracy = accuracy_score(test_labels, lgb_predictions)
lgb_f1 = f1_score(test_labels, lgb_predictions)
lgb_precision = precision_score(test_labels, lgb_predictions)
lgb_recall = recall_score(test_labels, lgb_predictions)

print(f"LightGBM Accuracy: {lgb_accuracy:.4f}")
print(f"LightGBM F1 Score: {lgb_f1:.4f}")
print(f"LightGBM Precision: {lgb_precision:.4f}")
print(f"LightGBM Recall: {lgb_recall:.4f}")


Evaluating LightGBM model on test embeddings...
LightGBM Accuracy: 0.6500
LightGBM F1 Score: 0.6500
LightGBM Precision: 0.6500
LightGBM Recall: 0.6500




In [14]:
# --- 8. Make Predictions with LightGBM on new data (example) ---
print("\nMaking predictions on new articles with the LightGBM model...")

new_articles = [
    "A cat was seen driving a car down the highway at 100 mph. This incredible event was captured on video.", # Likely fake
    "The prime minister announced new policies to combat climate change, focusing on renewable energy investments and carbon emission reduction targets. Experts praise the move.", # Likely real
    "Breaking news: Scientists confirm the existence of a parallel universe accessible via your kitchen sink. Visit your sink now to experience it!" # Definitely fake
]

# Tokenize new articles using the same truncation strategy
# `return_tensors="pt"` makes these PyTorch tensors directly
new_inputs = tokenizer(new_articles, truncation=True, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt")


# --- MODIFIED SECTION FOR EXTRACTING EMBEDDINGS FOR NEW ARTICLES ---
bert_model.eval() # Ensure model is in evaluation mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

with torch.no_grad():
    input_ids = new_inputs['input_ids'].to(device)
    attention_mask = new_inputs['attention_mask'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)

    last_4_layers_hidden_states = outputs.hidden_states[-4:]
    stacked_hidden_states = torch.stack(last_4_layers_hidden_states)
    max_pooled_layers = torch.max(stacked_hidden_states, dim=0).values

    attention_mask_expanded = attention_mask.unsqueeze(-1).expand_as(max_pooled_layers)
    masked_embeddings = max_pooled_layers * attention_mask_expanded
    sum_embeddings = torch.sum(masked_embeddings, dim=1)
    num_tokens = torch.sum(attention_mask, dim=1).unsqueeze(-1)
    num_tokens = torch.clamp(num_tokens, min=1e-9)

    # This directly gives the batch of new article embeddings
    new_embeddings = (sum_embeddings / num_tokens).cpu().numpy()


# Make predictions using LightGBM
new_lgb_predictions_proba = lgb_clf.predict_proba(new_embeddings)[:, 1]
new_lgb_predictions = (new_lgb_predictions_proba > 0.5).astype(int)

print("\nPredictions on new articles (via LightGBM):")
for i, article in enumerate(new_articles):
    predicted_class_label = 'FAKE' if new_lgb_predictions[i] == 1 else 'REAL'
    print(f"Article: \"{article[:100]}...\"")
    print(f"Predicted Class: {predicted_class_label} (Probability of Fake: {new_lgb_predictions_proba[i]:.4f})")
    print("-" * 50)


Making predictions on new articles with the LightGBM model...

Predictions on new articles (via LightGBM):
Article: "A cat was seen driving a car down the highway at 100 mph. This incredible event was captured on vide..."
Predicted Class: FAKE (Probability of Fake: 0.6583)
--------------------------------------------------
Article: "The prime minister announced new policies to combat climate change, focusing on renewable energy inv..."
Predicted Class: REAL (Probability of Fake: 0.0219)
--------------------------------------------------
Article: "Breaking news: Scientists confirm the existence of a parallel universe accessible via your kitchen s..."
Predicted Class: FAKE (Probability of Fake: 0.9365)
--------------------------------------------------


