In [None]:
!pip install --upgrade torch torchvision torchaudio



In [None]:
!pip install -U transformers>=4.48.0 datasets triton lightning lightning[extra] pytorch_optimizer pandas scipy heavyball
# !pip install flash_attn==1.0.5

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
)
from datasets import load_dataset
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from huggingface_hub import upload_folder
import os
from torch.optim.lr_scheduler import LambdaLR
from pytorch_optimizer import load_optimizer
import heavyball

import torch._dynamo
torch._dynamo.config.suppress_errors = True

torch.set_float32_matmul_precision('high')

In [None]:
cache_dir = "/content/huggingface_cache"
os.makedirs(cache_dir, exist_ok=True)

# Set ALL Hugging Face related cache directories
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets")
os.environ["HF_HOME"] = os.path.join(cache_dir, "hf_home")
os.environ["HF_ASSETS_CACHE"] = os.path.join(cache_dir, "assets")
os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(cache_dir, "hub")
os.environ["HF_MODULES_CACHE"] = os.path.join(cache_dir, "modules")

# Create all directories
for dir_path in [os.environ["TRANSFORMERS_CACHE"],
                os.environ["HF_DATASETS_CACHE"],
                os.environ["HF_HOME"],
                os.environ["HF_ASSETS_CACHE"],
                os.environ["HUGGINGFACE_HUB_CACHE"],
                os.environ["HF_MODULES_CACHE"]]:
    os.makedirs(dir_path, exist_ok=True)

# Force datasets to use the new cache
from datasets import config
config.HF_DATASETS_CACHE = os.environ["HF_DATASETS_CACHE"]

# --- Configuration ---
class Config:
    DATASET_NAME = "kreasof-ai/SPL-100K-AutoMathText-llm-deviated"
    MODEL_NAME = "answerdotai/ModernBERT-base"
    DIFFICULTY_LEVELS = {
        "hard": (1, 10),
        "medium": (10, 50),
        "easy": (50, 100),
    }
    BATCH_SIZE = 32
    LEARNING_RATE = 1e-3
    WEIGHT_DECAY = 1e-2
    LOSS_SCALING_FACTOR = 1
    NUM_EPOCHS = 1
    COMPILE_MODE = "reduce-overhead"  # Options: "default", "reduce-overhead", "max-autotune"
    USE_COMPILE = False  # Easily toggle compilation
    DYNAMIC_SHAPES = False  # Set True for variable-length inputs
    HF_USERNAME = "kreasof-ai"  # Replace with your HuggingFace username
    MODEL_REPO_ID = "kreasof-ai/SPL-better-dataset-base-f32"  # Format: "username/model-name"
    PUSH_TO_HUB = True

cfg = Config()

# --- Enhanced Lightning Module ---
class ImpostorVerifier(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()
        self.config = config
        self._init_model()
        self.automatic_optimization=False

    def _init_model(self):
        # Initialize core components
        self.model = AutoModel.from_pretrained(self.config.MODEL_NAME)
        self.config_hf = AutoConfig.from_pretrained(self.config.MODEL_NAME)
        self.impostor_layer = torch.nn.Sequential(
                                torch.nn.Linear(self.config_hf.hidden_size, self.config_hf.hidden_size * 4),
                                torch.nn.GELU(),
                                torch.nn.Linear(self.config_hf.hidden_size * 4, 2)
                              )

        # Initial compilation if requested
        if self.config.USE_COMPILE:
            compile_kwargs = {
                "mode": self.config.COMPILE_MODE,
                "dynamic": self.config.DYNAMIC_SHAPES
            }
            self.model = torch.compile(self.model, **compile_kwargs)
            self.impostor_layer = torch.compile(self.impostor_layer, **compile_kwargs)

    def forward(self, input_ids, attention_mask):
        torch.compiler.cudagraph_mark_step_begin()
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        token_outputs = outputs.last_hidden_state
        return self.impostor_layer(token_outputs)

    def focal_loss(self, inputs, targets, alpha=0.25, gamma=2.0, reduction="mean"):
        # Standard cross entropy loss
        ce_loss = F.cross_entropy(inputs, targets, reduction="none")

        # Get probabilities for the correct classes
        pt = torch.exp(-ce_loss)

        # Apply the focusing term
        focal_weight = (1 - pt) ** gamma

        # Apply the alpha balancing term if using it
        if alpha is not None:
            focal_weight = alpha * focal_weight

        # Final focal loss
        focal_loss = focal_weight * ce_loss

        # Apply reduction
        if reduction == "mean":
            return focal_loss.mean()
        elif reduction == "sum":
            return focal_loss.sum()
        else:
            return focal_loss

    def training_step(self, batch, batch_idx):
        logits = self.forward(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

        # Convert labels to integers (required for CrossEntropy)
        labels = batch["labels"].long()

        # Compute loss, ignoring padding tokens
        loss = self.focal_loss(logits.view(-1, 2), labels.view(-1)) * self.config.LOSS_SCALING_FACTOR
        masked_loss = (loss * batch["attention_mask"].view(-1)).sum() / batch["attention_mask"].sum()

        self.log("train_loss", masked_loss, prog_bar=True)

        opt = self.optimizers()
        opt.zero_grad()
        self.manual_backward(masked_loss)
        self.clip_gradients(opt, gradient_clip_val=1.0, gradient_clip_algorithm="norm")
        opt.step()

        return masked_loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.config.LEARNING_RATE,
            weight_decay=self.config.WEIGHT_DECAY
        )

        return optimizer

model = ImpostorVerifier(cfg)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

In [None]:
!wget -O last.ckpt https://huggingface.co/kreasof-ai/SPL-better-dataset-base-f32/resolve/main/last.ckpt

--2025-03-25 08:09:50--  https://huggingface.co/kreasof-ai/SPL-better-dataset-base-f32/resolve/main/last.ckpt
Resolving huggingface.co (huggingface.co)... 13.35.202.121, 13.35.202.34, 13.35.202.97, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.121|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/6c/ff/6cffa8b6c27cbcc536282092112361fb913f630544af4ecbb2b487260c61430c/993477d78625daa38693ed89dff23e70aa1905c37b6f0ef118a4bc728232e237?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27last.ckpt%3B+filename%3D%22last.ckpt%22%3B&Expires=1742893790&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0Mjg5Mzc5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzZjL2ZmLzZjZmZhOGI2YzI3Y2JjYzUzNjI4MjA5MjExMjM2MWZiOTEzZjYzMDU0NGFmNGVjYmIyYjQ4NzI2MGM2MTQzMGMvOTkzNDc3ZDc4NjI1ZGFhMzg2OTNlZDg5ZGZmMjNlNzBhYTE5MDVjMzdiNmYwZWYxMThhNGJjNzI4MjMyZTIzNz9yZXNwb25zZS1jb250ZW5

In [None]:
examples = [
    {
        "text": "Habibullah Akbar",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "Cat sat on the mat",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "The cat sat on the mat, basking in the warm sunlight streaming through the window, its tail gently flicking back and forth as it dozed off into a peaceful nap.",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "As a professional AI language model, I don't have personal experiences or emotions, nor do I engage in hobbies or leisure activities. My purpose is to provide accurate and informative responses to assist users with their queries, and I do not possess the capacity to experience personal preferences or enjoyment. I am solely focused on delivering high-quality information and maintaining a professional tone in my interactions.",
        "expected_quality": "irrelevant sequence",
    },
    {
        "text": "To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, we can follow a few steps: Step 1: Distribute the division symbol by multiplying the expression by the reciprocal of the denominator. The reciprocal of `2x` is `1/(2x)`, so the expression becomes `(3x^2 - 4y^3) * (1/(2x))`. Step 2: Simplify within the parentheses by dividing each term separately. - For the first term, `3x^2`, divide `3x^2` by `2x`. This gives us `(3x^2) / (2x) = (3/2) * (x^2 / x) = (3/2) * x`. - For the second term, `-4y^3`, divide `-4y^3` by `2x`. This gives us `(-4y^3) / (2x) = (-2) * (y^3 / x)`. Step 3: Combine the simplified terms from Step 2. The expression now becomes `(3/2) * x - 2 * (y^3 / x)`. So, the simplified form of the algebraic expression `(3x^2 - 4y^3) / (2x)` is `(3/2) * x - 2 * (y^3 / x)`.",
        "expected_quality": "higher score",
    },
    {
        "text": "To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, you can divide each term in the numerator by the denominator. First, let's divide `3x^2` by `2x`. Since both terms have a common factor of `x`, we can simplify this expression to `3x`. Next, we divide `-4y^3` by `2x`. We can simplify this expression by dividing each term separately. Dividing `-4` by `2` gives `-2`. Then, dividing `y^3` by `x` gives `y^3/x`. So, the simplified form of `(3x^2 - 4y^3) / (2x)` is `3x - 2y^3/x`.",
        "expected_quality": "lower score",
    },
    {
        "text": "Proof that 1 = 2. Let’s start with two equal numbers, \( a = b \). 1. Multiply both sides by \( a \): \( a^2 = ab \). 2. Subtract \( b^2 \) from both sides: \( a^2 - b^2 = ab - b^2 \). 3. Factor both sides: \( (a - b)(a + b) = b(a - b) \). 4. Divide both sides by \( (a - b) \): \( a + b = b \). 5. Since \( a = b \), substitute \( b \) for \( a \): \( b + b = b \) → \( 2b = b \). 6. Divide both sides by \( b \): \( 2 = 1 \).",
        "expected_quality": "logical fallacy",
    },
    {
        "text": "Let’s start with two equal numbers, \( a = b \). 1. Multiply both sides by \( a \): \( a^2 = ab \). 2. Subtract \( b^2 \) from both sides: \( a^2 - b^2 = ab - b^2 \). 3. Factor both sides: \( (a - b)(a + b) = b(a - b) \). 4. Divide both sides by \( (a - b) \): \( a + b = b \). 5. Since \( a = b \), substitute \( b \) for \( a \): \( b + b = b \) → \( 2b = b \). 6. Divide both sides by \( b \): \( 2 = 1 \).",
        "expected_quality": "logical fallacy",
    },
    {
        "text": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Natalia sold 48/2 = <<48/2=24>>24 clips in May. Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May. #### 72",
        "expected_quality": "right answer",
    },
    {
        "text": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50. Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30. This means, Betty needs 100 - 50 - 30 - 15 = $<<100-50-30-15=5>>5 more. #### 5",
        "expected_quality": "wrong answer",
    }
]

In [None]:
model = ImpostorVerifier.load_from_checkpoint('last.ckpt')
tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_NAME)

def min_max_scale(scores, min_score=-150, max_score=0):
    return (scores - min_score) / (max_score - min_score)

model.eval()
model.model.eval()

# model.model = model.model._orig_mod  # Access the original model
# model.impostor_layer = model.impostor_layer._orig_mod

for example in examples:
    text = example["text"]
    expected_quality = example["expected_quality"]

    inputs = tokenizer(
        text,
        padding="max_length",
        max_length=2048,
        truncation=True,
        return_tensors="pt"
    ).to('cuda')

    output_logits = model(**inputs)  # Shape: (batch, seq_len, 2)
    output_probs = torch.softmax(output_logits, dim=-1)  # Apply softmax to get class probabilities
    perturbed_probs = output_probs[:, :, 1]  # Extract probability of class 1 (perturbed token)

    # Compute sequence-level quality using Negative Log-Likelihood
    sequence_quality_raw = -torch.sum(torch.log(1 - perturbed_probs + 1e-12), dim=1).item()
    sequence_quality_revised = min_max_scale(-sequence_quality_raw)

    # --- Print Results ---
    print(f"Text: {text}")
    print(f"Expected Quality: {expected_quality}")
    print(f"Token Probabilities: {output_probs.squeeze(0).tolist()}")
    print(f"Sequence Quality (Negative log-likelihood): {sequence_quality_raw}")
    print(f"Sequence Quality Revised: {sequence_quality_revised}")

    print("-" * 20)

Text: Habibullah Akbar
Expected Quality: irrelevant sequence
Token Probabilities: [[0.422060489654541, 0.577939510345459], [0.4325057864189148, 0.5674942135810852], [0.4328915476799011, 0.5671084523200989], [0.4352264702320099, 0.5647735595703125], [0.420693963766098, 0.5793060660362244], [0.43279072642326355, 0.5672093033790588], [0.4310302734375, 0.5689696669578552], [0.44010603427886963, 0.5598939657211304], [0.5495031476020813, 0.4504968523979187], [0.5917304158210754, 0.4082695543766022], [0.661821722984314, 0.3381783068180084], [0.7406641244888306, 0.25933587551116943], [0.7061005234718323, 0.2938994765281677], [0.6110638976097107, 0.3889361619949341], [0.5622503161430359, 0.4377496540546417], [0.5949892401695251, 0.40501075983047485], [0.7094557881355286, 0.29054421186447144], [0.7796685099601746, 0.22033150494098663], [0.7588849067687988, 0.24111509323120117], [0.6386818289756775, 0.3613181710243225], [0.5617765188217163, 0.4382234513759613], [0.5566512942314148, 0.443348705768

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from datasets import load_dataset

def get_score(problem, answer):
    """Replace with your actual model inference code"""
    inputs = tokenizer(
        "QUESTION: " + problem + "\n\nANSWER: " + answer,
        padding="max_length",
        max_length=2048,
        truncation=True,
        return_tensors="pt"
    ).to('cuda')

    output_logits = model(**inputs)  # Shape: (batch, seq_len, 2)
    output_probs = torch.softmax(output_logits, dim=-1)  # Apply softmax to get class probabilities
    perturbed_probs = output_probs[:, :, 1]  # Extract probability of class 1 (perturbed token)

    # Compute sequence-level quality using Negative Log-Likelihood
    sequence_quality_raw = -torch.sum(torch.log(1 - perturbed_probs + 1e-12), dim=1).item()
    sequence_quality_revised = min_max_scale(-sequence_quality_raw)

    return sequence_quality_revised

# Load dataset
dataset = load_dataset("kreasof-ai/MATH-WD-Lite")
df = dataset['train'].to_pandas()

# Initialize metrics storage
results = {
    'correct_highest': [],
    'delta_max_wrong': [],
    'delta_avg_wrong': [],
    'correct_length': [],
    'decoy_lengths': [],
    'levels': [],
    'level_accuracies': {l: [] for l in range(1, 6)},
    'level_deltas': {l: [] for l in range(1, 6)}
}

# Process each question
for _, row in df.iterrows():
    problem = row['Problem']
    candidates = {
        'correct': row['Answer'],
        'decoy_a': row['Decoy A'],
        'decoy_b': row['Decoy B'],
        'decoy_c': row['Decoy C']
    }

    # Get scores and lengths
    scores, lengths = {}, {}
    for key, answer in candidates.items():
        scores[key] = get_score(problem, answer)
        lengths[key] = len(answer.split())  # Simple length approximation

    # Store results
    decoy_scores = [scores['decoy_a'], scores['decoy_b'], scores['decoy_c']]
    results['correct_highest'].append(scores['correct'] > max(decoy_scores))
    results['delta_max_wrong'].append(scores['correct'] - max(decoy_scores))
    results['delta_avg_wrong'].append(scores['correct'] - np.mean(decoy_scores))
    results['correct_length'].append(lengths['correct'])
    results['decoy_lengths'].extend([lengths[k] for k in ['decoy_a', 'decoy_b', 'decoy_c']])
    results['levels'].append(row['Level'])

    # Track level-based metrics
    results['level_accuracies'][row['Level']].append(results['correct_highest'][-1])
    results['level_deltas'][row['Level']].append(results['delta_max_wrong'][-1])

# Calculate metrics
# 1. Accuracy
accuracy = np.mean(results['correct_highest'])

# 2. Score gaps
mean_delta_max = np.mean(results['delta_max_wrong'])
mean_delta_avg = np.mean(results['delta_avg_wrong'])

# 3. Sequence length analysis
all_lengths = results['correct_length'] + results['decoy_lengths']
all_scores = [scores['correct']] + decoy_scores  # Changed logic here
                                                  # to include scores for all answers

# Now, all_scores will contain:
# - the score for the correct answer
# - the scores for decoy_a, decoy_b, decoy_c

# After this change, you'll need to extend all_scores for each question, similar to how you extended all_lengths:
all_scores = []
for i in range(len(results['correct_highest'])):
    all_scores.extend([results['delta_max_wrong'][i] + scores['correct'] if results['correct_highest'][i] else scores['correct'],
                       scores['decoy_a'],
                       scores['decoy_b'],
                       scores['decoy_c']])

length_corr, _ = pearsonr(all_lengths, all_scores)

# 4. Level correlation
level_accs = [np.mean(results['level_accuracies'][l]) for l in range(1, 6)]
level_deltas = [np.mean(results['level_deltas'][l]) for l in range(1, 6)]
level_numbers = list(range(1, 6))
level_acc_corr, _ = spearmanr(level_numbers, level_accs)
level_delta_corr, _ = spearmanr(level_numbers, level_deltas)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Δ(max wrong): {mean_delta_max:.4f}")
print(f"Mean Δ(avg wrong): {mean_delta_avg:.4f}")
print(f"\nLength-Score Correlation: {length_corr:.4f}")
print(f"\nLevel vs Accuracy Correlation: {level_acc_corr:.4f}")
print(f"Level vs Δ(max wrong) Correlation: {level_delta_corr:.4f}")

# Additional analysis
print("\nAccuracy by Level:")
for l in range(1, 6):
    print(f"Level {l}: {np.mean(results['level_accuracies'][l]):.4f}")

print("\nΔ(max wrong) by Level:")
for l in range(1, 6):
    print(f"Level {l}: {np.mean(results['level_deltas'][l]):.4f}")

Accuracy: 0.3812
Mean Δ(max wrong): -0.3965
Mean Δ(avg wrong): -0.0940

Length-Score Correlation: 0.0286

Level vs Accuracy Correlation: -1.0000
Level vs Δ(max wrong) Correlation: -1.0000

Accuracy by Level:
Level 1: 0.6000
Level 2: 0.4865
Level 3: 0.4194
Level 4: 0.3158
Level 5: 0.2308

Δ(max wrong) by Level:
Level 1: 0.0429
Level 2: -0.0100
Level 3: -0.2590
Level 4: -0.3063
Level 5: -1.1293
