In [1]:
# Ensure compatible versions of libraries are installed
!pip install --upgrade transformers datasets scikit-learn pandas lightgbm numpy



In [2]:
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import pandas as pd
import lightgbm as lgb

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Datasets/final_dataset.csv')

# Combine 'header' and 'article' into a single text column
# Handle potential NaN values by converting them to empty strings
df['text'] = df['header'].fillna('') + " " + df['article'].fillna('')

# Rename 'label' column to 'labels' as Hugging Face Trainer expects this
df = df.rename(columns={'label': 'labels'})

# Select only the 'text' and 'labels' columns
df = df[['text', 'labels']]

# Display some info
print(f"DataFrame shape: {df.shape}")
print(df.head())
print(df['labels'].value_counts())

DataFrame shape: (45869, 2)
                                                text  labels
0  Corona-Krise soll ausgenutzt werden, um Mensch...       1
1  Vermerke für »Mutations«-Impfungen bereits vor...       1
2  Merkel droht mit totaler Sperrung: Machtergrei...       1
3  Aber nicht genügend Testmöglichkeiten vorhande...       1
4  AfD-Mitbegründer siegt vor Bundesverfassungsge...       1
labels
0    28056
1    17813
Name: count, dtype: int64


In [4]:
# Split the data into training, validation, and test sets
# It's good practice to have a test set for final evaluation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['labels'])

print(f"Train set shape: {train_df.shape}")
print(f"Validation set shape: {val_df.shape}")

# Convert Pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
})

print("\nRaw datasets structure:")
print(dataset_dict)

Train set shape: (36695, 2)
Validation set shape: (9174, 2)

Raw datasets structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 36695
    })
    validation: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 9174
    })
})


In [7]:
# --- 3. Load Tokenizer and Model ---

# Use AutoModel instead of AutoModelForSequenceClassification
# because we want raw embeddings, not the classification head.

model_name = "krisschaaf/roberta-base-fake-news-german"

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at krisschaaf/roberta-base-fake-news-german and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# --- 4. Preprocess the Dataset (Tokenization and Truncation) ---

def preprocess_function(entry):
    return tokenizer(entry["text"],
                     truncation=True,
                     padding="max_length", # fill [PAD] token to end of sequence
                     max_length=tokenizer.model_max_length # 512 in this case
                     )


tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)
print("Tokenized dataset structure (uncut):")
print(tokenized_datasets)

tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

print("\nTokenized dataset structure:")
print(tokenized_datasets)

first_entry = tokenized_datasets['train'][0]

print("\nExample of tokenized training data (first entry):\n")

print(f"Input IDs shape: {first_entry['input_ids'].shape}")
print(f"Attention Mask shape: {first_entry['attention_mask'].shape}")
print(f"Labels: {first_entry['labels']}")

print("\nDecoded text from input_ids (first 100 chars):")
print(tokenizer.decode(first_entry['input_ids'], skip_special_tokens=True))

Map:   0%|          | 0/36695 [00:00<?, ? examples/s]

Map:   0%|          | 0/9174 [00:00<?, ? examples/s]

Tokenized dataset structure (uncut):
DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 36695
    })
    validation: Dataset({
        features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 9174
    })
})

Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 36695
    })
    validation: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 9174
    })
})

Example of tokenized training data (first entry):

Input IDs shape: torch.Size([512])
Attention Mask shape: torch.Size([512])
Labels: 1

Decoded text from input_ids (first 100 chars):
Vergleichszahlen sprechen deutliche Sprache
Studie: Schulschließung ohne Einfluss auf Corona-Pandemie 
                Das Weiße Haus in Washington gab am

In [9]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer # Assuming you're loading these like this elsewhere

# Custom collate_fn to handle the tokenized dataset
# The default collator works fine if you set_format("torch")
# But for clarity, we can define it.
def collate_fn(batch):
    # This function takes a list of individual dataset items (dictionaries)
    # and stacks their components into tensors suitable for a batch.
    # It ensures that input_ids, attention_mask are tensors if they aren't already
    input_ids = torch.stack([torch.tensor(item['input_ids']) if not isinstance(item['input_ids'], torch.Tensor) else item['input_ids'] for item in batch])
    attention_mask = torch.stack([torch.tensor(item['attention_mask']) if not isinstance(item['attention_mask'], torch.Tensor) else item['attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch]) # Labels are usually single integers, so directly convert to tensor
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


def get_bert_embeddings(model, tokenizer, dataset, batch_size=32):
    # 1. Setup and Device Placement
    model.eval() # Set model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_embeddings = [] # List to store extracted embeddings from all batches
    all_labels = []     # List to store corresponding labels from all batches


    # Creates an iterable over your dataset that yields batches of data
    data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

    # 3. Inference Loop (No Gradient Calculation)
    with torch.no_grad(): # Disable gradient calculation for inference
        for batch in data_loader: # Iterate over batches provided by the DataLoader
            # Move the batch tensors to the appropriate device (GPU/CPU)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # 4. BERT Model Forward Pass for Hidden States
            # Get model outputs. output_hidden_states=True is crucial to get all layer outputs.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)

            # outputs.hidden_states is a tuple of 13 tensors for BERT-base:
            # - Element 0: Embedding layer output (before the first Transformer layer)
            # - Elements 1 to 12: Outputs of the 12 Transformer encoder layers
            # We want the last 4 layers, which represent the most abstract and contextualized representations.
            # So, indices -1, -2, -3, -4 correspond to the last four encoder layers.
            last_4_layers_hidden_states = outputs.hidden_states[-4:] # Tuple of 4 tensors, each (batch_size, seq_len, hidden_size)

            # 5. Aggregating Across Layers (Average Pooling instead of Max Pooling)
            # Stacks the 4 tensors along a new dimension (dimension 0).
            # Shape changes from (4 x (batch_size, seq_len, hidden_size)) to (4, batch_size, seq_len, hidden_size)
            stacked_hidden_states = torch.stack(last_4_layers_hidden_states)

            # --- *** MODIFIED LINE HERE: Changed from torch.max to torch.mean *** ---
            # Takes the mean value element-wise across the new 'layer' dimension (dim=0).
            # This means for each token and each hidden dimension, it calculates the average value
            # from its representation in the last 4 layers.
            # Resulting shape: (batch_size, seq_len, hidden_size)
            avg_pooled_layers = torch.mean(stacked_hidden_states, dim=0)
            # --- *** END MODIFIED LINE *** ---


            # 6. Aggregating Across Tokens (Attention Masking and Average Pooling)
            # The avg_pooled_layers still contain embeddings for padding tokens (0s).
            # We need to exclude these when averaging to get a meaningful sentence embedding.

            # Expands the attention mask (batch_size, seq_len) to match the hidden_size dimension.
            # Shape becomes (batch_size, seq_len, 1), then expanded to (batch_size, seq_len, hidden_size).
            # This allows element-wise multiplication with the embeddings.
            attention_mask_expanded = attention_mask.unsqueeze(-1).expand_as(avg_pooled_layers)

            # Multiplies the embeddings by the expanded attention mask.
            # This sets the embeddings of padding tokens to zero, effectively ignoring them.
            masked_embeddings = avg_pooled_layers * attention_mask_expanded

            # Sums the embeddings along the sequence length dimension (dim=1) for each sample in the batch.
            # This gives a single vector for each sequence representing the sum of its token embeddings.
            # Resulting shape: (batch_size, hidden_size)
            sum_embeddings = torch.sum(masked_embeddings, dim=1)

            # Counts the number of non-padding tokens in each sequence.
            # Summing the attention mask (which contains 1s for real tokens and 0s for padding)
            # gives the actual length of each original sequence.
            # unsqueeze(-1) adds a dimension for broadcasting during division.
            # Shape: (batch_size, 1)
            num_tokens = torch.sum(attention_mask, dim=1).unsqueeze(-1)

            # Prevents division by zero if an attention mask somehow ends up with zero tokens (unlikely for valid inputs).
            num_tokens = torch.clamp(num_tokens, min=1e-9)

            # Calculates the average embedding for each sequence by dividing the sum of embeddings
            # by the number of actual tokens. This is the final document-level embedding.
            # Resulting shape: (batch_size, hidden_size)
            sentence_embeddings = sum_embeddings / num_tokens

            # 7. Collect Results
            # Appends the numpy representation of the batch's sentence embeddings to a list.
            all_embeddings.append(sentence_embeddings.cpu().numpy())
            # Appends the numpy representation of the batch's labels to a list.
            all_labels.append(labels.cpu().numpy())

    # 8. Final Output
    # Stacks all collected batch embeddings vertically to form a single NumPy array.
    # Resulting shape: (total_samples, hidden_size)
    # Stacks all collected batch labels horizontally to form a single 1D NumPy array.
    # Resulting shape: (total_samples,)
    return np.vstack(all_embeddings), np.hstack(all_labels)


# --- How the function is called ---

# 9. Extracting Embeddings for Training Data
print("\nExtracting BERT embeddings for training data (avg pooling last 4 layers)...")
# Calls the function for your training dataset.
# `fine_tuned_bert_embeddings_extractor` is the BERT model backbone (AutoModel)
# that has been fine-tuned on your fake news data.
train_embeddings, train_labels = get_bert_embeddings(bert_model, tokenizer, tokenized_datasets['train'])
print(f"Train Embeddings shape: {train_embeddings.shape}") # Prints the shape of the resulting array (e.g., (N_train_samples, 768))
print(f"Train Labels shape: {train_labels.shape}")       # Prints the shape of the labels array (e.g., (N_train_samples,))

# 10. Extracting Embeddings for Test Data
print("\nExtracting BERT embeddings for test data (avg pooling last 4 layers)...")
# Calls the function similarly for your test dataset.
# The 'validation' key here should correspond to your test split (as per your DatasetDict)
test_embeddings, test_labels = get_bert_embeddings(bert_model, tokenizer, tokenized_datasets['validation'])
print(f"Test Embeddings shape: {test_embeddings.shape}") # Prints the shape (e.g., (N_test_samples, 768))
print(f"Test Labels shape: {test_labels.shape}")       # Prints the shape (e.g., (N_test_samples,))


Extracting BERT embeddings for training data (avg pooling last 4 layers)...
Train Embeddings shape: (36695, 768)
Train Labels shape: (36695,)

Extracting BERT embeddings for test data (avg pooling last 4 layers)...
Test Embeddings shape: (9174, 768)
Test Labels shape: (9174,)


In [10]:
import joblib

# Create a dictionary to hold the data
data_to_save = {
    'train_embeddings': train_embeddings,
    'train_labels': train_labels,
    'test_embeddings': test_embeddings,
    'test_labels': test_labels
}

# Save the dictionary to a single file
joblib.dump(data_to_save, '/content/gdrive/MyDrive/Colab Notebooks/Datasets/final/embeddings-and-labels_last-4-layers-avg_roberta-base.pkl')

print("Train and test embeddings and labels saved using Joblib.")

Train and test embeddings and labels saved using Joblib.
