In [1]:
!pip install -U transformers>=4.48.0 datasets lomo-optim pytorch-optimizer
# !pip install flash-attn

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
import random

# --- Configuration ---
DATASET_NAME = "math-ai/AutoMathText"  # Or your chosen subset "math-ai/AutoMathText"
DATASET_SUBSET = "web-0.80-to-1.00"
MODEL_NAME = "answerdotai/ModernBERT-base" # Or your chosen pre-trained model
MAX_SEQ_LEN = 512 # Or your chosen maximum sequence length
TOP_K_WORDS = 150  # Number of frequent words for sampling
DATASET_SIZE = 30000
DIFFICULTY_LEVELS = {
    "hard": (1, 10),
    "medium": (10, 50),
    "easy": (50, 100),
}

# --- Load Dataset and Tokenizer ---
dataset = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)
dataset = dataset.take(DATASET_SIZE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- Calculate Word Frequencies and Embeddings ---

# 1. Calculate word frequencies (can be done offline and cached)
word_counts = {}
for item in dataset:
    tokens = tokenizer.tokenize(item["text"])
    for token in tokens:
        word_counts[token] = word_counts.get(token, 0) + 1

# 2. Get top K frequent words
top_k_words = [
    word
    for word, count in sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    [:TOP_K_WORDS]
]

# 3. Calculate embeddings for top K words (can be done offline and cached)
# Assuming you have a way to get embeddings, e.g., from the pre-trained model
# For example (this part depends on how you want to get the embeddings):
model = AutoModel.from_pretrained(MODEL_NAME)
embeddings = {}
with torch.no_grad():
    for word in top_k_words:
        inputs = tokenizer(word, return_tensors="pt")
        outputs = model(**inputs)
        embeddings[word] = outputs.last_hidden_state.mean(dim=1).squeeze()

# --- Function to Create Impostor Words ---
def get_impostor_word(word, embeddings, difficulty="medium"):
    if word not in embeddings:
        return random.choice(top_k_words)  # Handle out-of-vocabulary words

    # 4. Calculate distances to other words
    distances = {}
    word_embedding = embeddings[word]
    with torch.no_grad():
        for other_word, other_embedding in embeddings.items():
            if word != other_word:
                # Use cosine similarity, or other distance metrics
                distance = 1 - torch.nn.functional.cosine_similarity(
                    word_embedding, other_embedding, dim=0
                )
                distances[other_word] = distance.item()

    # 5. Sample based on difficulty
    min_rank, max_rank = DIFFICULTY_LEVELS[difficulty]
    sorted_words = sorted(distances.keys(), key=lambda w: distances[w])
    impostor_word = random.choice(
        sorted_words[min_rank:max_rank]
    )  # Adjust range based on difficulty

    return impostor_word

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (12113 > 8192). Running this sequence through the model will result in indexing errors


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

In [3]:
class ImpostorDataset(Dataset):
    def __init__(self, dataset, tokenizer, embeddings, max_seq_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.embeddings = embeddings
        self.max_seq_len = max_seq_len
        # store the data in a list since IterableDataset has no len
        self.data = list(self.dataset)

    def __len__(self):
        # return length of data list
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item["text"]

        tokens = self.tokenizer.tokenize(text)

        # --- Truncate or pad the sequence ---
        if len(tokens) > self.max_seq_len - 2:
            tokens = tokens[: self.max_seq_len - 2]

        # --- Add special tokens ---
        tokens = ["[CLS]"] + tokens + ["[SEP]"]

        # --- Variable Impostor Rate ---
        impostor_rate = random.uniform(0.0, 0.40)  # Example: 0% to 50%

        # --- Create impostor tokens ---
        labels = [0] * len(tokens)  # 0 indicates original token
        corrupted_tokens = []

        for i, token in enumerate(tokens):
            if random.random() < impostor_rate and token not in ["[CLS]", "[SEP]"]:
                # Randomly choose difficulty
                difficulty = random.choice(list(DIFFICULTY_LEVELS.keys()))
                # Get impostor word
                impostor_word = get_impostor_word(
                    token, self.embeddings, difficulty
                )

                corrupted_tokens.append(impostor_word)
                labels[i] = 1  # 1 indicates impostor token
            else:
                corrupted_tokens.append(token)

        # --- Convert to IDs ---
        input_ids = self.tokenizer.convert_tokens_to_ids(corrupted_tokens)

        # --- Pad sequences ---
        padding_length = self.max_seq_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        labels = labels + ([0] * padding_length)

        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor([1] * len(corrupted_tokens) + ([0] * padding_length)),
            "labels": torch.tensor(labels),
        }

In [4]:
from transformers import AutoModelForSequenceClassification, AutoConfig
import torch.nn as nn

class ImpostorVerifier(nn.Module):
    def __init__(self, model_name, max_seq_len):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)

        # Add a linear layer for probability output
        self.config.num_labels = 1
        self.impostor_layer = nn.Linear(self.config.hidden_size, 1)

        # Use a pooling layer
        self.pooler = nn.AdaptiveMaxPool1d(1)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Get the token outputs
        token_outputs = outputs.last_hidden_state

        # Pool the sequence outputs
        pooled_outputs = self.pooler(token_outputs.permute(0, 2, 1)).squeeze(-1)

        # Impostor probability prediction
        impostor_logits = self.impostor_layer(token_outputs).squeeze(-1)
        impostor_probs = torch.sigmoid(impostor_logits)

        # --- Calculate sequence quality score (using log-likelihood) ---
        sequence_quality = -torch.sum(torch.log(1 - impostor_probs + 1e-12), dim=1)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            token_losses = loss_fn(impostor_logits, labels.float())

            # --- Apply padding mask to token losses ---
            masked_token_losses = token_losses * attention_mask # Element-wise multiplication
            loss = torch.sum(masked_token_losses) / torch.sum(attention_mask) # Average over non-padding tokens

        return {
            "loss": loss,
            "token_probs": impostor_probs,
            "sequence_quality": sequence_quality,
        }

In [5]:
from pytorch_optimizer import create_optimizer, get_wsd_schedule
from tqdm.auto import tqdm
import torch.optim as optim # Import the base optimizer package

# --- Hyperparameters ---
LEARNING_RATE = 1e-5
NUM_EPOCHS = 1
BATCH_SIZE = 16 # Adjust based on your GPU memory
WEIGHT_DECAY = 1e-2

# --- Load pre-trained embeddings ---
# ... (load your pre-calculated embeddings here) ...

# --- Create Dataset and DataLoader ---
train_dataset = ImpostorDataset(
    dataset, tokenizer, embeddings, MAX_SEQ_LEN
)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

# --- Model Initialization ---
model = ImpostorVerifier(MODEL_NAME, MAX_SEQ_LEN)
model.to("cuda")  # Move model to GPU

model = torch.compile(model)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# optimizer = create_optimizer(
#     model,
#     'adalomo',
#     lr=LEARNING_RATE,
#     weight_decay=WEIGHT_DECAY,
# )

num_training_steps = (DATASET_SIZE // BATCH_SIZE + 1) * NUM_EPOCHS

scheduler = get_wsd_schedule(
    optimizer, num_warmup_steps=10, num_stable_steps=num_training_steps, num_decay_steps=num_training_steps
)

# --- Training Loop ---
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    # Wrap the dataloader with tqdm
    with tqdm(train_dataloader, desc=f"Epoch {epoch + 1}") as pbar:
        for batch in pbar:
            batch = {k: v.to("cuda") for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs["loss"]

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Update tqdm's display
            pbar.set_postfix({"Batch Loss": loss.item()})
            total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch} average loss: {avg_train_loss}")

Epoch 1:   0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 0 average loss: 0.0995991471350193


In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import random

# --- Load Tokenizer and Model ---
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")  # Or other tokenizer
model.to("cuda")
model.eval()

# --- Hardcoded Examples ---
examples = [
    {
        "text": "Habibullah Akbar",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "Cat sat on the mat",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "The cat sat on the mat, basking in the warm sunlight streaming through the window, its tail gently flicking back and forth as it dozed off into a peaceful nap.",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "As a professional AI language model, I don't have personal experiences or emotions, nor do I engage in hobbies or leisure activities. My purpose is to provide accurate and informative responses to assist users with their queries, and I do not possess the capacity to experience personal preferences or enjoyment. I am solely focused on delivering high-quality information and maintaining a professional tone in my interactions.",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, we can follow a few steps: Step 1: Distribute the division symbol by multiplying the expression by the reciprocal of the denominator. The reciprocal of `2x` is `1/(2x)`, so the expression becomes `(3x^2 - 4y^3) * (1/(2x))`. Step 2: Simplify within the parentheses by dividing each term separately. - For the first term, `3x^2`, divide `3x^2` by `2x`. This gives us `(3x^2) / (2x) = (3/2) * (x^2 / x) = (3/2) * x`. - For the second term, `-4y^3`, divide `-4y^3` by `2x`. This gives us `(-4y^3) / (2x) = (-2) * (y^3 / x)`. Step 3: Combine the simplified terms from Step 2. The expression now becomes `(3/2) * x - 2 * (y^3 / x)`. So, the simplified form of the algebraic expression `(3x^2 - 4y^3) / (2x)` is `(3/2) * x - 2 * (y^3 / x)`.",
        "expected_quality": "higher score",
    },
    {
        "text": "To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, you can divide each term in the numerator by the denominator. First, let's divide `3x^2` by `2x`. Since both terms have a common factor of `x`, we can simplify this expression to `3x`. Next, we divide `-4y^3` by `2x`. We can simplify this expression by dividing each term separately. Dividing `-4` by `2` gives `-2`. Then, dividing `y^3` by `x` gives `y^3/x`. So, the simplified form of `(3x^2 - 4y^3) / (2x)` is `3x - 2y^3/x`.",
        "expected_quality": "lower score",
    },
    {
        "text": "Proof that 1 = 2. Let’s start with two equal numbers, \( a = b \). 1. Multiply both sides by \( a \): \( a^2 = ab \). 2. Subtract \( b^2 \) from both sides: \( a^2 - b^2 = ab - b^2 \). 3. Factor both sides: \( (a - b)(a + b) = b(a - b) \). 4. Divide both sides by \( (a - b) \): \( a + b = b \). 5. Since \( a = b \), substitute \( b \) for \( a \): \( b + b = b \) → \( 2b = b \). 6. Divide both sides by \( b \): \( 2 = 1 \).",
        "expected_quality": "logical fallacy",
    },
    {
        "text": "Let’s start with two equal numbers, \( a = b \). 1. Multiply both sides by \( a \): \( a^2 = ab \). 2. Subtract \( b^2 \) from both sides: \( a^2 - b^2 = ab - b^2 \). 3. Factor both sides: \( (a - b)(a + b) = b(a - b) \). 4. Divide both sides by \( (a - b) \): \( a + b = b \). 5. Since \( a = b \), substitute \( b \) for \( a \): \( b + b = b \) → \( 2b = b \). 6. Divide both sides by \( b \): \( 2 = 1 \).",
        "expected_quality": "logical fallacy",
    },
    {
        "text": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Natalia sold 48/2 = <<48/2=24>>24 clips in May. Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May. #### 72",
        "expected_quality": "right answer",
    },
    {
        "text": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50. Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30. This means, Betty needs 100 - 50 - 30 - 15 = $<<100-50-30-15=5>>5 more. #### 5",
        "expected_quality": "wrong answer",
    }
]

# --- Function to Perturb Input Sequence (for testing with perturbations) ---
def perturb_sequence(tokens, embeddings, impostor_prob, difficulty="medium"):
    corrupted_tokens = []
    labels = []

    for i, token in enumerate(tokens):
        if random.random() < impostor_prob and token not in ["[CLS]", "[SEP]"]:
            impostor_word = get_impostor_word(token, embeddings, difficulty)
            corrupted_tokens.append(impostor_word)
            labels.append(1) # 1 indicates impostor
        else:
            corrupted_tokens.append(token)
            labels.append(0) # 0 indicates original token

    return corrupted_tokens, labels

def min_max_scale(scores, min_score, max_score):
    return (scores - min_score) / (max_score - min_score)

# --- Evaluation Loop ---
for example in examples:
    text = example["text"]
    expected_quality = example["expected_quality"]

    # --- Tokenize ---
    tokens = tokenizer.tokenize(text)
    if len(tokens) > MAX_SEQ_LEN - 2:
        tokens = tokens[: MAX_SEQ_LEN - 2]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # --- Create attention mask ---
    attention_mask = [1] * len(input_ids)

    # --- Pad ---
    padding_length = MAX_SEQ_LEN - len(input_ids)
    input_ids = input_ids + ([tokenizer.pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)

    # --- Convert to tensors ---
    input_ids = torch.tensor(input_ids).unsqueeze(0).to("cuda")
    attention_mask = torch.tensor(attention_mask).unsqueeze(0).to("cuda")

    # --- Inference ---
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # --- Analyze Outputs ---
    token_probs = outputs["token_probs"].squeeze(0).tolist()  # Probabilities for each token
    sequence_quality = outputs["sequence_quality"].squeeze(0).item()  # Overall quality

    # Get the min and max from your training data or a validation set
    min_score = -100 # Replace with the actual minimum score from your data
    max_score = 0  # Replace with the actual maximum score from your data

    sequence_quality_revised = min_max_scale(-sequence_quality, min_score, max_score)

    # --- Print Results ---
    print(f"Text: {text}")
    print(f"Expected Quality: {expected_quality}")
    print(f"Token Probabilities: {token_probs}")
    print(f"Sequence Quality (Negative log-likelihood): {sequence_quality}")
    print(f"Sequence Quality Revised: {sequence_quality_revised}")

    # --- (Optional) Perturb the input and re-evaluate ---
    # corrupted_tokens, labels = perturb_sequence(
    #     tokens, embeddings, impostor_prob=0.15, difficulty="medium"
    # )
    # corrupted_input_ids = tokenizer.convert_tokens_to_ids(corrupted_tokens)
    # corrupted_input_ids = torch.tensor(corrupted_input_ids).unsqueeze(0).to("cuda")
    # with torch.no_grad():
    #     corrupted_outputs = model(
    #         input_ids=corrupted_input_ids, attention_mask=attention_mask
    #     )
    # print(f"Corrupted Sequence Quality: {corrupted_outputs['sequence_quality'].squeeze(0).item()}")

    print("-" * 20)

Text: Habibullah Akbar
Expected Quality: irrelevant sequence
Token Probabilities: [0.028320850804448128, 0.010990917682647705, 0.014322571456432343, 0.011345336213707924, 0.0074210697785019875, 0.006900782231241465, 0.0033075492829084396, 0.11365248262882233, 0.02063322439789772, 0.010503252036869526, 0.017339305952191353, 0.01597386784851551, 0.03608580678701401, 0.01609954796731472, 0.023221289739012718, 0.03274920955300331, 0.11479821056127548, 0.09121297299861908, 0.015302415937185287, 0.010096275247633457, 0.019389675930142403, 0.010809618979692459, 0.02045370824635029, 0.06253998726606369, 0.11180567741394043, 0.049891840666532516, 0.017496904358267784, 0.020976325497031212, 0.013353194110095501, 0.014178751967847347, 0.019170965999364853, 0.06477706879377365, 0.09584327042102814, 0.04849819466471672, 0.1113014966249466, 0.08655872941017151, 0.053546469658613205, 0.05327814817428589, 0.06498932093381882, 0.06989496946334839, 0.0698612704873085, 0.07698162645101547, 0.056850384920