In [1]:
!pip install -U transformers>=4.48.0 datasets lomo-optim

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [2]:
from datasets import load_dataset

dataset = load_dataset("HuggingFaceTB/finemath", "finemath-4plus", split="train", streaming=True)
dataset = dataset.take(5000)
print(dataset.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/128 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

['url', 'fetch_time', 'content_mime_type', 'warc_filename', 'warc_record_offset', 'warc_record_length', 'text', 'token_count', 'char_count', 'metadata', 'score', 'int_score', 'crawl', 'snapshot_type', 'language', 'language_score']


In [3]:
import random
import numpy as np
from collections import Counter

def get_frequent_words(dataset, top_k=1000):
    """Gets the top-k most frequent words from the dataset."""
    word_counts = Counter()
    for example in dataset:
        word_counts.update(example['text'].lower().split())  # Lowercase and split
    return [word for word, count in word_counts.most_common(top_k)]

def calculate_embeddings(words, model, tokenizer):
    """Calculates embeddings for a list of words."""
    inputs = tokenizer(words, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(model.device) for key, value in inputs.items()} # Move inputs to model's device
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average over tokens
    return embeddings.cpu().numpy()

def find_nearest_neighbors(word, embeddings, word_list, top_n=100):
    """Finds the nearest neighbors of a word based on embedding distances."""
    try:
        word_index = word_list.index(word)
    except ValueError:
        return []
    word_embedding = embeddings[word_index]
    distances = np.dot(embeddings, word_embedding)
    nearest_indices = np.argsort(distances)[::-1][1:top_n + 1]  # Exclude the word itself
    return [word_list[i] for i in nearest_indices]


def create_impostor(tokenized_text, frequent_words, embeddings, word_list, difficulty='easy', num_impostors=5, original_score=None):
    """Creates an impostor version of a text and adjusts the score accordingly."""
    words = [tokenizer.decode(token_id) for token_id in tokenized_text]
    score_reduction = 0

    # Filter out words that are not in the frequent_words list
    eligible_words = [word for word in words if word in frequent_words]

    # If no words are eligible, return the original text and score
    if not eligible_words:
        return tokenized_text, original_score

    # Select multiple words to replace
    num_to_replace = min(num_impostors, len(eligible_words))
    words_to_replace = random.sample(eligible_words, num_to_replace)

    new_words = words[:]  # Create a copy to modify

    for original_word in words_to_replace:
        nearest_neighbors = find_nearest_neighbors(original_word, embeddings, word_list)

        if difficulty == 'hard':
            impostor_word = random.choice(nearest_neighbors[:10]) if nearest_neighbors else original_word
            base_reduction = 0.1
        elif difficulty == 'medium':
            impostor_word = random.choice(nearest_neighbors[10:50]) if len(nearest_neighbors) > 10 else original_word
            base_reduction = 0.3
        else:  # difficulty == 'easy'
            impostor_word = random.choice(nearest_neighbors[50:]) if len(nearest_neighbors) > 50 else original_word
            base_reduction = 0.5

        # Replace the original word with the impostor
        original_word_index = new_words.index(original_word)
        new_words[original_word_index] = impostor_word

        # Calculate score reduction for this impostor
        try:
            original_index = word_list.index(original_word)
            impostor_index = word_list.index(impostor_word)
            original_embedding = embeddings[original_index]
            impostor_embedding = embeddings[impostor_index]

            similarity = np.dot(original_embedding, impostor_embedding) / (
                        np.linalg.norm(original_embedding) * np.linalg.norm(impostor_embedding))

            distance_factor = 1 - similarity
            reduction = base_reduction * (1 + distance_factor)
        except ValueError:
            reduction = base_reduction

        score_reduction += reduction

    # Convert back to token IDs
    impostor_text = [tokenizer.encode(word, add_special_tokens=False) for word in new_words]
    # Flatten the list of lists
    impostor_text = [item for sublist in impostor_text for item in sublist]

    # Adjust the score
    if original_score is not None:
        adjusted_score = max(0, min(original_score - score_reduction, 1))
    else:
        adjusted_score = None

    return impostor_text, adjusted_score

In [4]:
import numpy as np

train_scores = []
for example in dataset:
    train_scores.append(example['score'])

mean = np.mean(train_scores)
std = np.std(train_scores)

def normalize(example):
    example['labels'] = (example['score'] - mean) / std
    return example

dataset = dataset.map(normalize)

In [5]:
def cast_labels(example):
    example['labels'] = np.float32(example['labels'])
    return example

dataset = dataset.map(cast_labels)

In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
from dataclasses import dataclass
from typing import Any # Import Any from typing module

model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Get frequent words and their embeddings
frequent_words = get_frequent_words(dataset)

# Extract model and move to the appropriate device (CPU or GPU)
model_for_embedding = AutoModel.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_for_embedding.to(device)

embeddings = calculate_embeddings(frequent_words, model_for_embedding, tokenizer)

# Modify the tokenization function to include SPL
def tokenize_and_perturb(examples, num_impostors_range=(1, 10)):
    tokenized_inputs = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512, return_tensors="pt")

    # Move tokenized inputs to the CPU
    tokenized_inputs = {k: v.cpu() for k, v in tokenized_inputs.items()}

    impostor_texts = []
    adjusted_scores = []
    for text, labels in zip(tokenized_inputs['input_ids'], examples['labels']):
        difficulty = random.choice(['easy', 'medium', 'hard'])
        num_impostors = random.randint(num_impostors_range[0], num_impostors_range[1])  # Randomly choose the number of impostors
        impostor_text, adjusted_score = create_impostor(text.tolist(), frequent_words, embeddings, frequent_words, difficulty=difficulty, num_impostors=num_impostors, original_score=labels)
        impostor_texts.append(impostor_text)
        adjusted_scores.append(adjusted_score)

    # Convert adjusted_scores to a tensor
    tokenized_inputs['impostor_labels'] = torch.tensor(adjusted_scores)

    # Pad and truncate the impostor texts to the maximum length
    padded_impostor_texts = []
    for impostor_text in impostor_texts:
        if len(impostor_text) > tokenizer.model_max_length:
            padded_impostor_texts.append(impostor_text[:tokenizer.model_max_length])
        else:
            padded_impostor_texts.append(impostor_text + [tokenizer.pad_token_id] * (tokenizer.model_max_length - len(impostor_text)))

    tokenized_inputs['impostor_input_ids'] = torch.tensor(padded_impostor_texts)

    return tokenized_inputs

# Map the tokenization and perturbation function to your dataset
tokenized_dataset = dataset.map(tokenize_and_perturb, batched=True)

# Update the data collator to handle the new input
@dataclass
class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
        batch = super().__call__(features)
        # Add impostor_input_ids to the batch if it exists
        if "impostor_input_ids" in features[0]:
          batch["impostor_input_ids"] = torch.stack([torch.tensor(f["impostor_input_ids"]) for f in features])
        return batch

data_collator = CustomDataCollator(tokenizer=tokenizer, pad_to_multiple_of=8)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
model.to(device)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    optim="adalomo",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    max_steps=5000,
    logging_steps=10, # Log every 10 steps
)

In [9]:
from transformers import Trainer
import torch.nn as nn
from typing import Dict # Import Dict from typing module

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        impostor_labels = inputs.pop("impostor_labels", None)
        impostor_input_ids = inputs.pop("impostor_input_ids", None)

        # Calculate loss
        loss_fct = nn.MSELoss()
        loss = None
        outputs = None

        # Forward pass with impostor inputs
        if impostor_input_ids is not None:
            impostor_outputs = model(input_ids=impostor_input_ids, attention_mask=inputs["attention_mask"])
            outputs = impostor_outputs
            loss_impostor = loss_fct(impostor_outputs.logits.view(-1), impostor_labels.view(-1))
            loss = loss_impostor
        else:
            # Forward pass with original inputs
            outputs = model(**inputs)
            loss_original = loss_fct(outputs.logits.view(-1), labels.view(-1))
            loss = loss_original

        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = CustomTrainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
10,1.1733
20,1.0265
30,0.8217
40,1.2215
50,0.8613
60,0.8476
70,1.112
80,0.7068
90,0.8416
100,0.9662


TrainOutput(global_step=5000, training_loss=0.42068009805679324, metrics={'train_runtime': 8354.3242, 'train_samples_per_second': 4.788, 'train_steps_per_second': 0.598, 'total_flos': 1.363021934592e+16, 'train_loss': 0.42068009805679324, 'epoch': 7.125})

In [10]:
trainer.save_model('./best_model')
tokenizer.save_pretrained('./best_model')

('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/tokenizer.json')

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# 1. Load the Model and Tokenizer
model_path = './best_model'  # Replace with your saved model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1)  # Assuming regression

# Move model to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set the model to evaluation mode

# 2. Load Normalization Parameters (if applicable)
# If you normalized the quality scores during training, load the mean and std here
train_scores = []
for example in dataset:
    train_scores.append(example['score'])

mean = np.mean(train_scores)
std = np.std(train_scores)

# 3. Define the Denormalization Function
def denormalize(score, mean=mean, std=std):
    """Denormalizes the score back to the original scale."""
    return score * std + mean

# 4. Define the Impostor Creation Function (adapted from training script)

# Assuming you have your frequent_words, embeddings, word_list ready from training
# You can load them from saved files if you stored them during training

def create_impostor(tokenized_text, frequent_words, embeddings, word_list, difficulty='easy', num_impostors=5):
    """Creates an impostor version of a text by replacing multiple words with neighbors."""
    words = [tokenizer.decode(token_id) for token_id in tokenized_text]

    # Filter out words that are not in the frequent_words list
    eligible_words = [word for word in words if word in frequent_words]

    # If no words are eligible, return the original text
    if not eligible_words:
        return tokenized_text

    # Select multiple words to replace
    num_to_replace = min(num_impostors, len(eligible_words))
    words_to_replace = random.sample(eligible_words, num_to_replace)

    new_words = words[:]  # Create a copy to modify

    for original_word in words_to_replace:
        nearest_neighbors = find_nearest_neighbors(original_word, embeddings, word_list)

        if difficulty == 'hard':
            impostor_word = random.choice(nearest_neighbors[:10]) if nearest_neighbors else original_word
        elif difficulty == 'medium':
            impostor_word = random.choice(nearest_neighbors[10:50]) if len(nearest_neighbors) > 10 else original_word
        else:  # difficulty == 'easy'
            impostor_word = random.choice(nearest_neighbors[50:]) if len(nearest_neighbors) > 50 else original_word

        # Replace the original word with the impostor
        original_word_index = new_words.index(original_word)
        new_words[original_word_index] = impostor_word

    # Convert back to token IDs
    impostor_text = [tokenizer.encode(word, add_special_tokens=False) for word in new_words]
    # Flatten the list of lists
    impostor_text = [item for sublist in impostor_text for item in sublist]

    return impostor_text

# 5. Define the Inference Function
def predict_quality(text, model, tokenizer, frequent_words, embeddings, word_list, difficulty='easy', num_impostors=5):
    """Predicts the quality score of a math text using the SPL verifier model."""

    # Tokenize the original input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Create an impostor version of the text
    tokenized_text = inputs["input_ids"].squeeze().tolist()  # Convert tensor to list
    impostor_text = create_impostor(tokenized_text, frequent_words, embeddings, word_list, difficulty, num_impostors)

    # Convert the impostor text back to tensor and to the model's device
    impostor_inputs = {
        "input_ids": torch.tensor([impostor_text]).to(device),
        "attention_mask": torch.ones_like(torch.tensor([impostor_text])).to(device)  # Assuming all tokens are valid
    }

    # Perform inference
    with torch.no_grad():
        original_outputs = model(**inputs)
        impostor_outputs = model(**impostor_inputs)

    # Get the predicted scores
    original_score = original_outputs.logits.squeeze().item()
    impostor_score = impostor_outputs.logits.squeeze().item()

    # Denormalize the scores
    original_score_dn = denormalize(original_score)
    impostor_score_dn = denormalize(impostor_score)

    return original_score_dn, impostor_score_dn, original_outputs, impostor_outputs, tokenized_text, impostor_text

# Load the frequent words from the saved file
frequent_words =  get_frequent_words(dataset)
embeddings = calculate_embeddings(frequent_words, model_for_embedding, tokenizer)

In [27]:
# Example Usage
text = "The distance between two stars is 6.52 × 10^5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light years) Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8"

original_score, impostor_score, _, _, tokenized_text, impostor_text = predict_quality(text, model, tokenizer, frequent_words, embeddings, frequent_words)

print(f"Original Text: {tokenizer.decode(tokenized_text)}\nPredicted Quality Score: {original_score:.4f}\n")
print(f"Impostor Text: {tokenizer.decode(impostor_text)}\nPredicted Quality Score (Impostor): {impostor_score:.4f}\n")

Original Text: [CLS]The distance between two stars is 6.52 × 10^5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light years) Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8[SEP]
Predicted Quality Score: 3.6581

Impostor Text: [CLS]The distance between two stars is 6defined52 × 10airchoice light yearsgot What is the distance between the two stars in parsecs? (1 parsec = 3.26 light yearschange Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8[SEP]
Predicted Quality Score (Impostor): 3.6324



In [28]:
# Example Usage
text = "The distance between two stars is 6.52 × 10^5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light years) Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8"

original_score, impostor_score, _, _, tokenized_text, impostor_text = predict_quality(text, model, tokenizer, frequent_words, embeddings, frequent_words, difficulty='hard', num_impostors=5)

print(f"Original Text: {tokenizer.decode(tokenized_text)}\nPredicted Quality Score: {original_score:.4f}\n")
print(f"Impostor Text: {tokenizer.decode(impostor_text)}\nPredicted Quality Score (Impostor): {impostor_score:.4f}\n")

Original Text: [CLS]The distance between two stars is 6.52 × 10^5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light years) Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8[SEP]
Predicted Quality Score: 3.6581

Impostor Text: [CLS]The distance between two stars is 6once52 × 10water5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light yearspm Answer Choices: (A) 2 × 10field5 (B) 4 × 10water6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8[SEP]
Predicted Quality Score (Impostor): 3.5992



In [29]:
# Example Usage
text = "The distance between two stars is 6.52 × 10^5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light years) Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8"

original_score, impostor_score, _, _, tokenized_text, impostor_text = predict_quality(text, model, tokenizer, frequent_words, embeddings, frequent_words, difficulty='medium', num_impostors=5)

print(f"Original Text: {tokenizer.decode(tokenized_text)}\nPredicted Quality Score: {original_score:.4f}\n")
print(f"Impostor Text: {tokenizer.decode(impostor_text)}\nPredicted Quality Score (Impostor): {impostor_score:.4f}\n")

Original Text: [CLS]The distance between two stars is 6.52 × 10^5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light years) Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8[SEP]
Predicted Quality Score: 3.6581

Impostor Text: [CLS]The distance between two stars is 6take52 × 10t5 light years. What is the distance between the two stars in parsecqjan (1 parsec = 3.26 light yearsdo Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8[SEP]
Predicted Quality Score (Impostor): 3.6228



In [30]:
text = "To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, we can follow a few steps: Step 1: Distribute the division symbol by multiplying the expression by the reciprocal of the denominator. The reciprocal of `2x` is `1/(2x)`, so the expression becomes `(3x^2 - 4y^3) * (1/(2x))`. Step 2: Simplify within the parentheses by dividing each term separately. - For the first term, `3x^2`, divide `3x^2` by `2x`. This gives us `(3x^2) / (2x) = (3/2) * (x^2 / x) = (3/2) * x`. - For the second term, `-4y^3`, divide `-4y^3` by `2x`. This gives us `(-4y^3) / (2x) = (-2) * (y^3 / x)`. Step 3: Combine the simplified terms from Step 2. The expression now becomes `(3/2) * x - 2 * (y^3 / x)`. So, the simplified form of the algebraic expression `(3x^2 - 4y^3) / (2x)` is `(3/2) * x - 2 * (y^3 / x)`."

original_score, impostor_score, _, _, tokenized_text, impostor_text = predict_quality(text, model, tokenizer, frequent_words, embeddings, frequent_words, difficulty='medium', num_impostors=5)

print(f"Original Text: {tokenizer.decode(tokenized_text)}\nPredicted Quality Score: {original_score:.4f}\n")
print(f"Impostor Text: {tokenizer.decode(impostor_text)}\nPredicted Quality Score (Impostor): {impostor_score:.4f}\n")

Original Text: [CLS]To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, we can follow a few steps: Step 1: Distribute the division symbol by multiplying the expression by the reciprocal of the denominator. The reciprocal of `2x` is `1/(2x)`, so the expression becomes `(3x^2 - 4y^3) * (1/(2x))`. Step 2: Simplify within the parentheses by dividing each term separately. - For the first term, `3x^2`, divide `3x^2` by `2x`. This gives us `(3x^2) / (2x) = (3/2) * (x^2 / x) = (3/2) * x`. - For the second term, `-4y^3`, divide `-4y^3` by `2x`. This gives us `(-4y^3) / (2x) = (-2) * (y^3 / x)`. Step 3: Combine the simplified terms from Step 2. The expression now becomes `(3/2) * x - 2 * (y^3 / x)`. So, the simplified form of the algebraic expression `(3x^2 - 4y^3) / (2x)` is `(3/2) * x - 2 * (y^3 / x)`.[SEP]
Predicted Quality Score: 4.3717

Impostor Text: [CLS]To simplify the algebraic expression `(octconversion^2 - 4kg^3true / (2x)`, we can follow a few steps: Step 1: Distribute the d

In [31]:
text = "To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, you can divide each term in the numerator by the denominator. First, let's divide `3x^2` by `2x`. Since both terms have a common factor of `x`, we can simplify this expression to `3x`. Next, we divide `-4y^3` by `2x`. We can simplify this expression by dividing each term separately. Dividing `-4` by `2` gives `-2`. Then, dividing `y^3` by `x` gives `y^3/x`. So, the simplified form of `(3x^2 - 4y^3) / (2x)` is `3x - 2y^3/x`."

original_score, impostor_score, _, _, tokenized_text, impostor_text = predict_quality(text, model, tokenizer, frequent_words, embeddings, frequent_words, difficulty='medium', num_impostors=5)

print(f"Original Text: {tokenizer.decode(tokenized_text)}\nPredicted Quality Score: {original_score:.4f}\n")
print(f"Impostor Text: {tokenizer.decode(impostor_text)}\nPredicted Quality Score (Impostor): {impostor_score:.4f}\n")

Original Text: [CLS]To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, you can divide each term in the numerator by the denominator. First, let's divide `3x^2` by `2x`. Since both terms have a common factor of `x`, we can simplify this expression to `3x`. Next, we divide `-4y^3` by `2x`. We can simplify this expression by dividing each term separately. Dividing `-4` by `2` gives `-2`. Then, dividing `y^3` by `x` gives `y^3/x`. So, the simplified form of `(3x^2 - 4y^3) / (2x)` is `3x - 2y^3/x`.[SEP]
Predicted Quality Score: 4.2426

Impostor Text: [CLS]To simplify the algebraic expression `(90xheat2 - 4yvelocityh) / (2x)`, you can divide each term in the numerator by the denominator. First, let's divide `3x^2methods by `2x`. Since both terms have a common factor of `x`, we can simplify this expression to `3x`. Next, we divide `-4y^3` by `2x`. We can simplify this expression by dividing each term separately. Dividing `-4` by `2` gives `-2`. Then, dividing `y^3` by `x` gives `y^3

In [32]:
text = "Of course! To simplify the expression 5(2x - 3) + 4(3x + 1), we can apply the distributive property and then combine like terms. First, we distribute the 5 to both terms inside the first parentheses and the 4 to both terms inside the second parentheses. Doing so, we get: 10x - 15 + 12x + 4 Next, we combine like terms. The x terms (10x and 12x) can be added together to give 22x, and the constant terms (-15 and 4) can be added together to give -11. Therefore, the simplified expression is: 22x - 11 And that's how you simplify the expression 5(2x - 3) + 4(3x + 1) to 22x - 11!"

original_score, impostor_score, _, _, tokenized_text, impostor_text = predict_quality(text, model, tokenizer, frequent_words, embeddings, frequent_words, difficulty='medium', num_impostors=5)

print(f"Original Text: {tokenizer.decode(tokenized_text)}\nPredicted Quality Score: {original_score:.4f}\n")
print(f"Impostor Text: {tokenizer.decode(impostor_text)}\nPredicted Quality Score (Impostor): {impostor_score:.4f}\n")

Original Text: [CLS]Of course! To simplify the expression 5(2x - 3) + 4(3x + 1), we can apply the distributive property and then combine like terms. First, we distribute the 5 to both terms inside the first parentheses and the 4 to both terms inside the second parentheses. Doing so, we get: 10x - 15 + 12x + 4 Next, we combine like terms. The x terms (10x and 12x) can be added together to give 22x, and the constant terms (-15 and 4) can be added together to give -11. Therefore, the simplified expression is: 22x - 11 And that's how you simplify the expression 5(2x - 3) + 4(3x + 1) to 22x - 11![SEP]
Predicted Quality Score: 4.4514

Impostor Text: [CLS]Of course! To simplify the expression 5and2how - 3done + 4(3x + 1), we can apply the distributive property and then combine like termschange First, we distribute the 5 to both terms inside the first parentheses and the 4 to both terms inside the second parentheses. Doing so, we gete 10x - 15 + 12x + 4 Next, we combine like terms. The x terms

In [33]:
text = "Sure! Let's simplify the expression step by step: 1. Distribute the multiplication to each term inside the parentheses: 5(2x - 3) + 4(3x + 1) = 10x - 15 + 12x + 4 2. Combine like terms: = (10x + 12x) + (-15 + 4) = 22x - 11 So, the simplified expression is 22x - 11."

original_score, impostor_score, _, _, tokenized_text, impostor_text = predict_quality(text, model, tokenizer, frequent_words, embeddings, frequent_words, difficulty='medium', num_impostors=5)

print(f"Original Text: {tokenizer.decode(tokenized_text)}\nPredicted Quality Score: {original_score:.4f}\n")
print(f"Impostor Text: {tokenizer.decode(impostor_text)}\nPredicted Quality Score (Impostor): {impostor_score:.4f}\n")

Original Text: [CLS]Sure! Let's simplify the expression step by step: 1. Distribute the multiplication to each term inside the parentheses: 5(2x - 3) + 4(3x + 1) = 10x - 15 + 12x + 4 2. Combine like terms: = (10x + 12x) + (-15 + 4) = 22x - 11 So, the simplified expression is 22x - 11.[SEP]
Predicted Quality Score: 4.0727

Impostor Text: [CLS]Sure! Let's simplify the expression step by step: 1oct Distribute the multiplication to each term inside the parentheses: 5he2kg - 3) + 4(18x + 1) = 10x - 15 + 12x + 4 2. Combine like terms: = (10x + 12x) + (-15 + 4) = 22x - 11 Sohe the simplified expression is 22x - 11.[SEP]
Predicted Quality Score (Impostor): 3.9171

