## Imports

In [None]:
!pip install -q transformers torch;

!pip install transformers datasets torch pandas numpy scikit-learn;
!pip install accelerate -U;

!pip install sacrebleu evaluate;
!pip install sentence-transformers;

!pip install rouge_score;
!pip install bert_score;

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packag

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import torch
import re
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
from sklearn.model_selection import train_test_split

import os
from sentence_transformers import SentenceTransformer
from evaluate import load

## Constants

In [None]:
# Placeholder only: the real value is assigned further down and depends on
# whose account is running the notebook.
BASE_GOOGLE_DRIVE_PATH = None

# Disable wandb logging through its environment variable.
os.environ.update({"WANDB_DISABLED": "true"})

## Mount google drive

In [None]:
# Colab-specific: mount Google Drive at /content/drive so the relative
# `drive/MyDrive/...` paths used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

## Set base Google Drive path (run only ONE of the account cells below)

Ben's Account:

In [None]:
!ls drive/MyDrive/ds266/'w266 - Project'

"Kent's Proposal"	 'Project Notes.gdoc'	   t5-detox-model
 paradetox.tsv		 'Project Proposal.gdoc'
 ProjectNotebook1.ipynb   t5-base-detox-model


In [None]:
# Ben's Drive layout: the project folder sits under MyDrive/ds266/.
BASE_GOOGLE_DRIVE_PATH = "drive/MyDrive/ds266/w266 - Project"

Kent's Account:

In [None]:
!ls ./drive/MyDrive/'w266 - Project'

In [None]:
# Kent's Drive layout: the project folder sits directly under MyDrive/.
# No shell-style quotes here: inside a Python string they would become part of
# the directory name, and every path built from this constant would be wrong.
BASE_GOOGLE_DRIVE_PATH = "./drive/MyDrive/w266 - Project"

## Load ParaDetox Dataset

In [None]:
# The ParaDetox file is tab-separated; read_table uses a tab delimiter by default.
data_file_path = BASE_GOOGLE_DRIVE_PATH + '/paradetox.tsv'
df = pd.read_table(data_file_path)

def get_neutral_text(row):
    """Return the first usable neutral paraphrase of a row, or None.

    The columns 'neutral1', 'neutral2' and 'neutral3' are checked in that
    order; a value is usable when it is not missing and is non-empty after
    stripping surrounding whitespace. The stripped text is returned.
    """
    candidates = (row[name] for name in ('neutral1', 'neutral2', 'neutral3'))
    for value in candidates:
        if pd.isna(value):
            continue
        cleaned = value.strip()
        if cleaned:
            return cleaned
    return None

# One example per row: the toxic sentence (with the task prefix) paired with
# the first usable neutral paraphrase. Rows missing either side are dropped.
training_data = [
    {
        'input': f"detoxify: {toxic}",
        'output': neutral
    }
    for toxic, neutral in (
        (record['toxic'], get_neutral_text(record)) for _, record in df.iterrows()
    )
    if neutral and toxic
]

print(f"training examples: {len(training_data)}")

## Create dataset class

In [None]:
class DetoxDataset(Dataset):
    """Torch dataset that tokenizes (toxic prompt, neutral target) pairs on demand.

    Args:
        data: Sequence of dicts with an 'input' string (the prompt) and an
            'output' string (the target text).
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors when called with ``return_tensors='pt'``.
        max_length: Length every sequence is truncated / padded to.
    """

    # Label value excluded from the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string to a fixed-length (padded / truncated) encoding."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return 1-D 'input_ids', 'attention_mask' and 'labels' tensors for one pair."""
        item = self.data[idx]

        input_encoding = self._encode(item['input'])
        target_encoding = self._encode(item['output'])

        # Padded target positions must not contribute to the loss: without this
        # the model is trained (and eval_loss is measured) on predicting padding.
        labels = target_encoding['input_ids'].flatten().clone()
        target_mask = target_encoding['attention_mask'].flatten()
        labels[target_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels
        }

## Initialize model and trainer

In [None]:
def create_augmented_data(df):
    """Build one training example per (toxic sentence, neutral paraphrase) pair.

    Every non-missing, non-blank value in 'neutral1', 'neutral2' and
    'neutral3' yields its own example, so a row can contribute up to three.
    Each example is a dict with the prefixed toxic text under 'input' and the
    stripped paraphrase under 'output'.
    """
    neutral_columns = ('neutral1', 'neutral2', 'neutral3')
    examples = []
    for _, record in df.iterrows():
        prompt = f"detoxify: {record['toxic']}"
        paraphrases = (record[name] for name in neutral_columns)
        examples.extend(
            {'input': prompt, 'output': text.strip()}
            for text in paraphrases
            if pd.notna(text) and text.strip()
        )
    return examples

# Split the source rows BEFORE augmenting. Augmentation emits up to three
# examples sharing the same toxic input; splitting afterwards would place the
# same toxic sentence in train, validation and test, leaking evaluation inputs
# into training.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_data = create_augmented_data(train_df)
val_data = create_augmented_data(val_df)
test_data = create_augmented_data(test_df)

# Kept for backward compatibility: the full augmented example list.
training_data = train_data + val_data + test_data
print(f"augmented examples: {len(training_data)}")

print(f"train: {len(train_data)}")
print(f"val: {len(val_data)}")
print(f"test: {len(test_data)}")

In [None]:
# Pretrained checkpoint that gets fine-tuned.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Wrap each split in the tokenizing dataset (default max_length).
train_dataset, val_dataset, test_dataset = (
    DetoxDataset(split, tokenizer) for split in (train_data, val_data, test_data)
)

training_args = TrainingArguments(
    # Output locations.
    output_dir='./t5-detox-results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging / evaluation / checkpoint cadence, all measured in steps.
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

## Train model

In [None]:
# Run fine-tuning with the `trainer` object built in the previous cell.
trainer.train()

## Inference function

In [None]:
def detoxify_text(text, model, tokenizer, max_length=128):
    """Generate a detoxified rewrite of ``text`` with beam search.

    Args:
        text: Raw input sentence; the "detoxify: " task prefix is added here.
        model: Seq2seq model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        max_length: Truncation length for the prompt and maximum length of the
            generated sequence.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Run on whichever device the model's weights live on.
    device = next(model.parameters()).device

    input_text = f"detoxify: {text}"
    input_ids = tokenizer.encode(
        input_text, return_tensors='pt', max_length=max_length, truncation=True
    ).to(device)

    # generate() does not change the module mode. Right after training the
    # model is still in train mode, so dropout would perturb the output.
    # Switch to eval for generation, then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_length=max_length,
                num_beams=5,
                early_stopping=True,
                no_repeat_ngram_size=2
            )
    finally:
        if was_training:
            model.train()

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick sanity check on a single hand-written sentence.
test_text = "This is a stupid idea"
detoxified = detoxify_text(text=test_text, model=model, tokenizer=tokenizer)
print(f"orig: {test_text}")
print(f"detox: {detoxified}")

## Checkpoint model

In [None]:
# Persist the model first, then the tokenizer, into the same Drive folder so
# the pair can be reloaded together.
checkpoint_path = BASE_GOOGLE_DRIVE_PATH + '/t5-base-detox-model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(checkpoint_path)

## Evaluate Model

### Load checkpointed model

In [None]:
# Recompute the path here so this cell does not depend on the save cell
# having been run in the current session.
checkpoint_path = BASE_GOOGLE_DRIVE_PATH + '/t5-base-detox-model'
tokenizer, model = (
    T5Tokenizer.from_pretrained(checkpoint_path),
    T5ForConditionalGeneration.from_pretrained(checkpoint_path),
)

### Evaluator class

In [None]:
class DetoxEvaluator:
    """Bundle of automatic metrics for detoxification output.

    Holds a toxicity classifier (style transfer accuracy), a sentence encoder
    (content similarity) and the sacrebleu / rouge / bertscore metrics loaded
    through ``evaluate.load``.
    """

    def __init__(self):
        self.toxicity_tokenizer = AutoTokenizer.from_pretrained("s-nlp/roberta_toxicity_classifier")
        self.toxicity_model = AutoModelForSequenceClassification.from_pretrained("s-nlp/roberta_toxicity_classifier")
        self.sim_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.bleu = load("sacrebleu")
        self.rouge = load("rouge")
        self.bertscore = load("bertscore")

    def style_transfer_accuracy(self, texts, batch_size=32):
        """Fraction of ``texts`` the toxicity classifier labels as non-toxic (STA).

        Args:
            texts: A string or a sequence of strings to classify.
            batch_size: Number of texts sent through the classifier at once,
                so memory stays bounded for large inputs.

        Returns:
            A float in [0, 1], or NaN when ``texts`` is empty.
        """
        if isinstance(texts, str):
            # A bare string is one text, not a sequence of characters.
            texts = [texts]
        texts = list(texts)
        if not texts:
            return float('nan')

        non_toxic_count = 0
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = self.toxicity_tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

            with torch.no_grad():
                outputs = self.toxicity_model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                # 0 = nontoxic
                non_toxic_probs = predictions[:, 0]

            non_toxic_count += int((non_toxic_probs > 0.5).sum().item())

        return non_toxic_count / len(texts)

    def content_similarity(self, original_texts, detoxified_texts):
        """Mean cosine similarity between paired sentence embeddings (SIM).

        Texts are paired positionally; the result is returned as a Python float.
        """
        original_embeddings = self.sim_model.encode(original_texts)
        detoxified_embeddings = self.sim_model.encode(detoxified_texts)

        similarities = [
            np.dot(orig, detox) / (np.linalg.norm(orig) * np.linalg.norm(detox))
            for orig, detox in zip(original_embeddings, detoxified_embeddings)
        ]

        return float(np.mean(similarities))

    def bleu_score(self, references, predictions):
        """Corpus BLEU of ``predictions`` against one reference each, scaled to [0, 1]."""
        # sacrebleu wants list of refs
        formatted_refs = [[ref] for ref in references]
        result = self.bleu.compute(predictions=predictions, references=formatted_refs)
        return result['score'] / 100.0

    def evaluate_model(self, original_texts, detoxified_texts, reference_texts=None):
        """Compute all metrics for a set of detoxified outputs.

        Args:
            original_texts: Source (toxic) sentences.
            detoxified_texts: Model outputs, aligned with ``original_texts``.
            reference_texts: Optional human rewrites aligned with the outputs.
                When given, BLEU / ROUGE-L / BERTScore are computed against
                them; when omitted, they are computed against
                ``original_texts`` as before.

        Returns:
            Dict with 'Style_Transfer_Accuracy', 'Content_Similarity', 'BLEU',
            'ROUGE-L', 'BERTScore_F1' and 'J_Score' (STA * SIM).
        """
        references = original_texts if reference_texts is None else reference_texts

        sta = self.style_transfer_accuracy(detoxified_texts)
        sim = self.content_similarity(original_texts, detoxified_texts)
        bleu = self.bleu_score(references, detoxified_texts)
        rouge_scores = self.rouge.compute(predictions=detoxified_texts, references=references)
        bert_scores = self.bertscore.compute(predictions=detoxified_texts, references=references, lang="en")

        return {
            'Style_Transfer_Accuracy': sta,
            'Content_Similarity': sim,
            'BLEU': bleu,
            'ROUGE-L': rouge_scores['rougeL'],
            'BERTScore_F1': np.mean(bert_scores['f1']),
            'J_Score': sta * sim
        }

### Test model

In [None]:
def evaluate_on_test_set(test_data, model, tokenizer, evaluator):
    """Generate predictions for ``test_data`` and score them.

    Args:
        test_data: Iterable of dicts with a prefixed 'input' prompt and an
            'output' reference rewrite.
        model: Seq2seq model passed through to ``detoxify_text``.
        tokenizer: Tokenizer passed through to ``detoxify_text``.
        evaluator: Object providing ``style_transfer_accuracy``,
            ``content_similarity``, ``bleu_score``, ``rouge`` and ``bertscore``.

    Returns:
        Dict with 'Style_Transfer_Accuracy', 'Content_Similarity', 'BLEU',
        'ROUGE-L', 'BERTScore_F1' and 'J_Score' (STA * SIM). BLEU, ROUGE-L and
        BERTScore are computed against the reference rewrites.
    """
    prompt_prefix = 'detoxify: '

    original_texts = []
    ground_truth_texts = []
    predicted_texts = []

    for item in test_data:
        source = item['input']
        # Strip only the leading task prefix; a plain str.replace would also
        # delete the same substring if it occurred inside the sentence.
        toxic_text = source[len(prompt_prefix):] if source.startswith(prompt_prefix) else source
        ground_truth = item['output']
        # detoxify_text re-adds the prefix itself.
        prediction = detoxify_text(toxic_text, model, tokenizer)

        original_texts.append(toxic_text)
        ground_truth_texts.append(ground_truth)
        predicted_texts.append(prediction)

    sta = evaluator.style_transfer_accuracy(predicted_texts)
    sim = evaluator.content_similarity(original_texts, predicted_texts)

    # Reuse the evaluator's helper rather than re-implementing the sacrebleu
    # reference formatting and rescaling here.
    bleu = evaluator.bleu_score(ground_truth_texts, predicted_texts)
    rouge_result = evaluator.rouge.compute(predictions=predicted_texts, references=ground_truth_texts)
    bert_result = evaluator.bertscore.compute(predictions=predicted_texts, references=ground_truth_texts, lang="en")

    return {
        'Style_Transfer_Accuracy': sta,
        'Content_Similarity': sim,
        'BLEU': bleu,
        'ROUGE-L': rouge_result['rougeL'],
        'BERTScore_F1': np.mean(bert_result['f1']),
        'J_Score': sta * sim
    }

In [None]:
print("evaluating...")

evaluator = DetoxEvaluator()

# Score only the first 100 test examples.
real_results = evaluate_on_test_set(test_data[:100], model, tokenizer, evaluator)

print("\nresults")
for metric, score in real_results.items():
    print(f"{metric}: {score:.4f}")

print("\nexamples:")
prompt_prefix = 'detoxify: '
for i, item in enumerate(test_data[:5]):
    source = item['input']
    # Strip only the leading task prefix; str.replace would also remove the
    # same substring if it appeared inside the sentence.
    toxic_text = source[len(prompt_prefix):] if source.startswith(prompt_prefix) else source
    ground_truth = item['output']
    prediction = detoxify_text(toxic_text, model, tokenizer)

    print(f"\n{i+1}:")
    print(f"orig: {toxic_text}")
    print(f"truth: {ground_truth}")
    print(f"pred: {prediction}")

## Potential config-only improvement

In [None]:

# # Initialize tokenizer and model
# model_name = "t5-base"  # Start with small for faster training
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

# # Create train/val/test split
# train_data, temp_data = train_test_split(training_data, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# print(f"Training: {len(train_data)}")      # 80%
# print(f"Validation: {len(val_data)}")      # 10%
# print(f"Test: {len(test_data)}")           # 10%

# # Create datasets
# train_dataset = DetoxDataset(train_data, tokenizer)
# val_dataset = DetoxDataset(val_data, tokenizer)
# test_dataset = DetoxDataset(test_data, tokenizer)

# # Training arguments
# training_args = TrainingArguments(
#     output_dir='./t5-detox-results',
#     num_train_epochs=5,  # More epochs
#     per_device_train_batch_size=4,  # Smaller batch for larger model
#     per_device_eval_batch_size=8,
#     gradient_accumulation_steps=4,  # Effective batch size = 16
#     warmup_steps=1000,  # More warmup
#     weight_decay=0.01,
#     learning_rate=5e-5,  # Lower learning rate
#     logging_dir='./logs',
#     logging_steps=100,
#     evaluation_strategy="steps",
#     eval_steps=500,
#     save_strategy="steps",
#     save_steps=1000,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     greater_is_better=False,
#     save_total_limit=3,
# )

# # Initialize trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
# )

In [None]:
# # Train the model
# trainer.train()

# Code Sandbox

In [None]:
# MODEL_NAME = "unitary/toxic-bert"  # BERT fine-tuned for toxicity

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# device = 0 if torch.cuda.is_available() else -1  # GPU if available

# toxicity_pipeline = pipeline(
#     "text-classification",
#     model=model,
#     tokenizer=tokenizer,
#     device=device,
#     top_k=None,            # return all labels
#     truncation=True,
#     padding=True
# )


In [None]:
# def get_toxicity_score(text: str) -> float:
#     """
#     Returns a scalar toxicity score in [0, 1].
#     For unitary/toxic-bert we take the max prob over all labels.
#     """
#     outputs = toxicity_pipeline(text)[0]  # list of {label, score}
#     # e.g., labels might be: TOXIC, SEVERE_TOXIC, OBSCENE, THREAT, INSULT, IDENTITY_HATE
#     scores = {o["label"]: o["score"] for o in outputs}

#     # Simple heuristic: toxicity = max over all categories
#     tox_score = max(scores.values())
#     return float(tox_score)


In [None]:
# examples = [
#     "I love working with you, this is great.",
#     "You are such an idiot, I hate you.",
# ]

# for s in examples:
#     print(s, "=>", get_toxicity_score(s))


In [None]:
# # Very small example lexicon; extend this list as needed.
# PROFANITY_MAP = {
#     r"\bfuck(ing)?\b": "really",
#     r"\bshit(ty)?\b": "bad",
#     r"\basshole\b": "person",
#     r"\bidiot\b": "person",
#     r"\bstupid\b": "unwise",
#     r"\btrash\b": "not good",
#     r"\bI hate you\b": "I'm really upset with you",
# }

# def simple_neutralize(text: str) -> str:
#     """
#     Replace some common profanities/insults with milder wording.
#     This is just a placeholder; BERT is used as the *judge*,
#     while this function handles *how* we detoxify.
#     """
#     new_text = text
#     for pattern, replacement in PROFANITY_MAP.items():
#         new_text = re.sub(pattern, replacement, new_text, flags=re.IGNORECASE)
#     return new_text


# def detoxify_text(
#     text: str,
#     threshold: float = 0.5,
#     max_passes: int = 2,
#     verbose: bool = False
# ):
#     """
#     Run toxicity detection; if above threshold, apply simple neutralization.
#     Optionally run multiple passes until toxicity falls below threshold
#     or max_passes is reached.
#     """
#     current_text = text
#     history = []

#     for i in range(max_passes + 1):
#         tox = get_toxicity_score(current_text)
#         history.append((current_text, tox))
#         if verbose:
#             print(f"Pass {i}: toxicity={tox:.3f}")
#             print("Text:", current_text)
#             print("-" * 50)

#         # Stop if we are already under the threshold
#         if tox <= threshold:
#             break

#         # Otherwise, neutralize and try again
#         if i < max_passes:
#             current_text = simple_neutralize(current_text)

#     return {
#         "original": text,
#         "final": current_text,
#         "history": history,
#     }


In [None]:
# toxic_sentence = "You are such a fucking idiot, this code is trash."
# result = detoxify_text(toxic_sentence, threshold=0.4, verbose=True)

# print("Original:")
# print(result["original"])
# print("\nDetoxified:")
# print(result["final"])
