# Complete Paraphrase Attack Experiment

This notebook runs **complete** paraphrase attack experiments:
1. Generates watermarked texts for all model/watermarker combinations
2. Applies paraphrase attack using Pegasus
3. Runs watermark detection
4. Outputs comprehensive CSV results

**Models:** OPT-1.3B, GPT-2 (Qwen-7B is optional and disabled by default)  
**Watermarkers:** GPW, GPW-SP, GPW-SP-LOW, GPW-SP-SR, Unigram, KGW  
**Samples:** 50 per combination

**Output:** `paraphrase_complete_results_<timestamp>.csv` with all per-sample results, plus `paraphrase_summary_<timestamp>.csv` and `.json` with per-combination summaries

In [None]:
#@title 1. Install Dependencies
!pip install -q torch transformers sentencepiece tqdm pandas scipy nltk accelerate

In [None]:
#@title 2. Download NLTK Data
import nltk

# Fetch each required NLTK resource in turn.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource, quiet=True)
print("NLTK data downloaded!")

In [None]:
#@title 3. Configuration
import torch

# Core experiment knobs
N_SAMPLES = 50          # texts generated per (model, watermarker) combination
MAX_NEW_TOKENS = 200    # upper bound on tokens sampled per text
Z_THRESHOLD = 4.0       # intended z-score threshold for declaring a detection

# (short label, Hugging Face checkpoint id) pairs to evaluate
MODELS = [
    ("opt-1.3b", "facebook/opt-1.3b"),
    ("gpt2", "gpt2"),
    # ("qwen-7b", "Qwen/Qwen2-7B"),  # Uncomment if you have enough GPU memory
]

# Keys understood by the watermarker factory
WATERMARKERS = ["gpw", "gpw_sp", "gpw_sp_low", "gpw_sp_sr", "unigram", "kgw"]

# Prefer the GPU when one is visible to PyTorch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
if device == "cuda":
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

# Echo the configuration so it is captured in the notebook output
model_labels = [label for label, _path in MODELS]
n_experiments = len(MODELS) * len(WATERMARKERS)
print("\nExperiment Configuration:")
print(f"  Samples per experiment: {N_SAMPLES}")
print(f"  Models: {model_labels}")
print(f"  Watermarkers: {WATERMARKERS}")
print(f"  Total experiments: {n_experiments}")

In [None]:
#@title 4. Prompts for Text Generation
# Seed prompts for text generation: 50 short sentence openers.
# The experiment loop picks prompt i as PROMPTS[i % len(PROMPTS)].
PROMPTS = [
    "The future of artificial intelligence will",
    "Climate change is affecting our planet because",
    "The history of computing began when",
    "In modern society, technology has",
    "Scientists have discovered that",
    "The importance of education lies in",
    "Economic growth depends on",
    "The human brain is remarkable because",
    "Space exploration has revealed",
    "The art of storytelling involves",
    "Medical advances have helped",
    "The ocean covers most of Earth and",
    "Ancient civilizations developed",
    "Music affects human emotions by",
    "The principles of democracy include",
    "Renewable energy sources are",
    "The study of genetics reveals",
    "Urban planning must consider",
    "The psychology of decision making",
    "Global trade has transformed",
    "The evolution of language shows",
    "Environmental conservation requires",
    "The philosophy of ethics addresses",
    "Digital communication has changed",
    "The architecture of ancient Rome",
    "Biodiversity is essential because",
    "The mathematics of probability",
    "Social media platforms have",
    "The chemistry of cooking involves",
    "Historical events shape our",
    "The physics of light explains",
    "Cultural traditions preserve",
    "The economics of healthcare",
    "Artificial neural networks can",
    "The geography of continents",
    "Public health initiatives aim to",
    "The literature of the 19th century",
    "Quantum mechanics describes",
    "The sociology of communities",
    "Agricultural innovation has",
    "The engineering behind bridges",
    "Political systems vary because",
    "The astronomy of distant galaxies",
    "Mental health awareness helps",
    "The anthropology of human cultures",
    "Sustainable development requires",
    "The linguistics of grammar shows",
    "International relations depend on",
    "The meteorology of storms",
    "Educational technology enables",
]

# Report how many prompts are available
print(f"Loaded {len(PROMPTS)} prompts")

In [None]:
#@title 5. Watermarker Implementations
import hashlib
import numpy as np
from scipy import stats
import torch.nn.functional as F

class BaseWatermarker:
    """Shared state for all watermarkers.

    Keeps the language model, tokenizer and device, and records the
    vocabulary size taken from the model's output layer (falling back to
    the tokenizer's ``vocab_size``). Subclasses implement ``generate`` and
    ``detect``.
    """

    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.vocab_size = self._infer_vocab_size()
        print(f"    Using vocab_size: {self.vocab_size}")

    def _infer_vocab_size(self):
        """Return the output dimension of the model, or the tokenizer's vocab size."""
        net = self.model
        # Preferred source: the language-model head's output width
        if hasattr(net, 'lm_head'):
            return net.lm_head.out_features
        # Otherwise ask the model for its output embedding matrix, if it has one
        if hasattr(net, 'get_output_embeddings'):
            output_layer = net.get_output_embeddings()
            if output_layer is not None:
                return output_layer.weight.shape[0]
        # Last resort: whatever the tokenizer reports
        return self.tokenizer.vocab_size

    def generate(self, prompt, max_new_tokens=200):
        """Produce watermarked text for ``prompt``; must be overridden."""
        raise NotImplementedError

    def detect(self, text):
        """Score ``text`` for the watermark; must be overridden."""
        raise NotImplementedError


class GPWWatermarker(BaseWatermarker):
    """GPW watermarker with a green/red list seeded by the previous token.

    During generation the logits of the "green" half of the vocabulary
    (pseudo-randomly selected from the previous token id) are raised by
    ``omega``. Detection counts how often a token lies in the green list
    of its predecessor and turns that count into a one-sided z-score.
    """

    def __init__(self, model, tokenizer, device, omega=2.0, z_threshold=4.0):
        """Store the logit bias ``omega`` and the detection ``z_threshold``."""
        super().__init__(model, tokenizer, device)
        self.omega = omega
        self.z_threshold = z_threshold

    def _get_green_indices(self, prev_token, size):
        """Return the green token ids (half of ``range(size)``) for ``prev_token``.

        The selection is a deterministic function of ``prev_token`` and
        ``size``: a SHA-256 digest of the token id seeds a NumPy RandomState
        that permutes the vocabulary.
        """
        hash_input = str(prev_token).encode()
        hash_val = int(hashlib.sha256(hash_input).hexdigest(), 16)
        rng = np.random.RandomState(hash_val % (2**31))
        green_size = int(size * 0.5)
        indices = rng.permutation(size)
        return indices[:green_size]

    def generate(self, prompt, max_new_tokens=200):
        """Sample up to ``max_new_tokens`` watermarked tokens after ``prompt``.

        Returns the decoded sequence, which includes the prompt itself.
        Sampling uses a fixed temperature of 0.8 and stops early at EOS.
        Only a single sequence (batch index 0) is handled.
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        generated = input_ids.clone()

        for _ in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(generated)
                logits = outputs.logits[:, -1, :].clone()  # Clone to avoid in-place issues
                vocab_size = logits.shape[-1]

            # Green list depends only on the most recent token
            prev_token = generated[0, -1].item()
            green_indices = self._get_green_indices(prev_token, vocab_size)

            # Build the bias in a separate tensor and add it out-of-place
            bias = torch.zeros_like(logits)
            bias[0, green_indices] = self.omega
            logits = logits + bias

            # Sample in float32; clamp + renormalise so multinomial never
            # sees an all-zero or non-finite row
            logits = logits.float()
            probs = F.softmax(logits / 0.8, dim=-1)
            probs = torch.clamp(probs, min=1e-9)
            probs = probs / probs.sum(dim=-1, keepdim=True)

            next_token = torch.multinomial(probs, 1)
            generated = torch.cat([generated, next_token], dim=-1)

            if next_token.item() == self.tokenizer.eos_token_id:
                break

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def detect(self, text):
        """Score ``text`` for the watermark.

        Returns a dict with ``z_score``, ``p_value`` (one-sided, upper tail)
        and ``is_detected`` (``z_score >= self.z_threshold``). Texts with
        fewer than two tokens yield a neutral, non-detected result.
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) < 2:
            return {'z_score': 0, 'p_value': 1.0, 'is_detected': False}

        # The green list is a pure function of the previous token, so build a
        # boolean membership mask once per distinct predecessor instead of
        # re-permuting the vocabulary and building a set for every position.
        mask_cache = {}
        green_count = 0
        for prev_token, curr_token in zip(tokens, tokens[1:]):
            mask = mask_cache.get(prev_token)
            if mask is None:
                mask = np.zeros(self.vocab_size, dtype=bool)
                mask[self._get_green_indices(prev_token, self.vocab_size)] = True
                mask_cache[prev_token] = mask

            # Token ids beyond the model's output width can never be green
            if curr_token < self.vocab_size and mask[curr_token]:
                green_count += 1

        total = len(tokens) - 1  # >= 1 because of the length guard above

        p = 0.5
        observed = green_count / total
        z_score = (observed - p) / np.sqrt(p * (1-p) / total)
        # Survival function keeps precision where 1 - cdf(z) underflows to 0.0
        p_value = stats.norm.sf(z_score)

        return {'z_score': z_score, 'p_value': p_value, 'is_detected': z_score >= self.z_threshold}


class GPWSPWatermarker(GPWWatermarker):
    """GPW-SP: GPW variant whose green-list seed also mixes in a salt."""

    def __init__(self, model, tokenizer, device, omega=2.0, z_threshold=4.0):
        super().__init__(model, tokenizer, device, omega, z_threshold)
        # Salt is drawn from NumPy's global RNG when the instance is created
        self.salt = np.random.randint(0, 2**31)
        print(f"    GPW-SP salt: {self.salt}")

    def _get_green_indices(self, prev_token, size):
        """Return the green half of ``range(size)`` for ``prev_token`` under this salt."""
        digest = hashlib.sha256(f"{prev_token}_{self.salt}".encode()).hexdigest()
        seed = int(digest, 16) % (2**31)
        shuffled = np.random.RandomState(seed).permutation(size)
        return shuffled[:int(size * 0.5)]


class GPWSPLowWatermarker(GPWSPWatermarker):
    """GPW-SP-LOW: the salted GPW variant with a smaller default bias (omega=1.0)."""

    def __init__(self, model, tokenizer, device, omega=1.0, z_threshold=4.0):
        # Only the default for omega differs from the parent class
        super().__init__(
            model,
            tokenizer,
            device,
            omega=omega,
            z_threshold=z_threshold,
        )


class GPWSPSRWatermarker(GPWSPWatermarker):
    """GPW-SP-SR: Salted phase with Semantic Relatedness detection.

    Note: This uses a simplified SR detection based on bigram consistency.
    The full SR implementation would require SentenceTransformer.
    """
    def __init__(self, model, tokenizer, device, omega=2.0, z_threshold=4.0):
        super().__init__(model, tokenizer, device, omega, z_threshold)
        print(f"    GPW-SP-SR (simplified SR detection)")

    def detect(self, text):
        """Simplified SR detection using the median green fraction over sliding windows.

        Windows of 25 tokens are taken every 10 tokens; a text shorter than
        one window is scored as a single window covering the whole text.
        Returns a dict with ``z_score``, ``p_value`` (one-sided, upper tail)
        and ``is_detected``. Texts with fewer than 10 tokens yield a neutral,
        non-detected result.
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) < 10:
            return {'z_score': 0, 'p_value': 1.0, 'is_detected': False}

        window_size = 25
        stride = 10

        # Green/red flag for every transition tokens[j] -> tokens[j+1], computed
        # once. Overlapping windows reuse these flags, and the membership mask
        # is cached per distinct predecessor token.
        mask_cache = {}
        is_green = []
        for prev_token, curr_token in zip(tokens, tokens[1:]):
            mask = mask_cache.get(prev_token)
            if mask is None:
                mask = np.zeros(self.vocab_size, dtype=bool)
                mask[self._get_green_indices(prev_token, self.vocab_size)] = True
                mask_cache[prev_token] = mask
            is_green.append(bool(curr_token < self.vocab_size and mask[curr_token]))

        # Without the max(), texts of 10-24 tokens would produce no window at
        # all and could never be detected despite passing the length guard.
        last_start = max(len(tokens) - window_size, 0)
        window_scores = []
        for start in range(0, last_start + 1, stride):
            end = min(start + window_size, len(tokens))
            # Transitions fully inside tokens[start:end]
            hits = is_green[start:end - 1]
            window_scores.append(sum(hits) / len(hits))

        # Use median of window scores for robustness
        # NOTE(review): the z-score below treats the median window fraction as
        # if it were a pooled fraction over all n transitions; this is a
        # heuristic rather than an exact test.
        observed = np.median(window_scores)
        n = len(tokens) - 1
        p = 0.5
        z_score = (observed - p) / np.sqrt(p * (1-p) / n)
        # Survival function keeps precision where 1 - cdf(z) underflows to 0.0
        p_value = stats.norm.sf(z_score)

        return {'z_score': z_score, 'p_value': p_value, 'is_detected': z_score >= self.z_threshold}


class UnigramWatermarker(BaseWatermarker):
    """Unigram watermarker with a single, fixed green list.

    The green list is half of the vocabulary, chosen once with a fixed seed
    (42), and does not depend on context. Generation adds ``delta`` to the
    logits of green tokens; detection measures the green fraction of the text.
    """
    def __init__(self, model, tokenizer, device, delta=2.0, z_threshold=4.0):
        super().__init__(model, tokenizer, device)
        self.delta = delta
        self.z_threshold = z_threshold
        # Pre-compute green list with fixed seed
        rng = np.random.RandomState(42)
        self.green_indices = rng.permutation(self.vocab_size)[:int(self.vocab_size * 0.5)]
        self.green_set = set(self.green_indices)

    def generate(self, prompt, max_new_tokens=200):
        """Sample up to ``max_new_tokens`` watermarked tokens after ``prompt``.

        Returns the decoded sequence, which includes the prompt itself.
        Sampling uses a fixed temperature of 0.8 and stops early at EOS.
        Only a single sequence (batch index 0) is handled.
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        generated = input_ids.clone()

        # The green list never changes, so the bias tensor is built on the
        # first step (when the logits' shape/dtype/device are known) and reused.
        bias = None

        for _ in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(generated)
                logits = outputs.logits[:, -1, :].clone()

            if bias is None:
                vocab_size = logits.shape[-1]
                bias = torch.zeros_like(logits)
                # Handle vocab size mismatch: drop ids beyond the logits' width
                valid_indices = self.green_indices[self.green_indices < vocab_size]
                bias[0, valid_indices] = self.delta
            logits = logits + bias

            # Sample in float32; clamp + renormalise so multinomial never
            # sees an all-zero or non-finite row
            logits = logits.float()
            probs = F.softmax(logits / 0.8, dim=-1)
            probs = torch.clamp(probs, min=1e-9)
            probs = probs / probs.sum(dim=-1, keepdim=True)

            next_token = torch.multinomial(probs, 1)
            generated = torch.cat([generated, next_token], dim=-1)

            if next_token.item() == self.tokenizer.eos_token_id:
                break

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def detect(self, text):
        """Score ``text`` by the fraction of its tokens in the fixed green list.

        Returns a dict with ``z_score``, ``p_value`` (one-sided, upper tail)
        and ``is_detected`` (``z_score >= self.z_threshold``). Empty text
        yields a neutral, non-detected result.
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) < 1:
            return {'z_score': 0, 'p_value': 1.0, 'is_detected': False}

        green_count = sum(1 for t in tokens if t in self.green_set)
        total = len(tokens)

        p = 0.5
        observed = green_count / total
        z_score = (observed - p) / np.sqrt(p * (1-p) / total)
        # Survival function keeps precision where 1 - cdf(z) underflows to 0.0
        p_value = stats.norm.sf(z_score)

        return {'z_score': z_score, 'p_value': p_value, 'is_detected': z_score >= self.z_threshold}


class KGWWatermarker(BaseWatermarker):
    """KGW watermarker with a context-based green list.

    The green list (a ``gamma`` fraction of the vocabulary) is derived from
    up to the four preceding tokens. Generation adds ``delta`` to green
    logits; detection counts tokens that fall in the green list of their
    context and converts the count into a one-sided z-score.
    """
    def __init__(self, model, tokenizer, device, gamma=0.5, delta=2.0, z_threshold=4.0):
        super().__init__(model, tokenizer, device)
        self.gamma = gamma
        self.delta = delta
        self.z_threshold = z_threshold

    def _get_green_indices(self, context, size):
        """Return the green token ids for ``context`` (a sequence of token ids).

        NOTE(review): the seed comes from Python's built-in ``hash`` of a
        tuple of ints, whose value is interpreter-specific; generation and
        detection should therefore run under the same interpreter.
        """
        context_hash = hash(tuple(context)) % (2**31)
        rng = np.random.RandomState(context_hash)
        green_size = int(size * self.gamma)
        indices = rng.permutation(size)
        return indices[:green_size]

    def generate(self, prompt, max_new_tokens=200):
        """Sample up to ``max_new_tokens`` watermarked tokens after ``prompt``.

        Returns the decoded sequence, which includes the prompt itself.
        Sampling uses a fixed temperature of 0.8 and stops early at EOS.
        Only a single sequence (batch index 0) is handled.
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        generated = input_ids.clone()

        for _ in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(generated)
                logits = outputs.logits[:, -1, :].clone()
                vocab_size = logits.shape[-1]

            # Context = the last (up to) four token ids generated so far
            context = generated[0, -4:].tolist()
            green_indices = self._get_green_indices(context, vocab_size)

            bias = torch.zeros_like(logits)
            bias[0, green_indices] = self.delta
            logits = logits + bias

            # Sample in float32; clamp + renormalise so multinomial never
            # sees an all-zero or non-finite row
            logits = logits.float()
            probs = F.softmax(logits / 0.8, dim=-1)
            probs = torch.clamp(probs, min=1e-9)
            probs = probs / probs.sum(dim=-1, keepdim=True)

            next_token = torch.multinomial(probs, 1)
            generated = torch.cat([generated, next_token], dim=-1)

            if next_token.item() == self.tokenizer.eos_token_id:
                break

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def detect(self, text):
        """Score ``text`` for the watermark.

        Returns a dict with ``z_score``, ``p_value`` (one-sided, upper tail)
        and ``is_detected`` (``z_score >= self.z_threshold``). Texts with
        fewer than two tokens yield a neutral, non-detected result.
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) < 2:
            return {'z_score': 0, 'p_value': 1.0, 'is_detected': False}

        green_count = 0
        for i in range(1, len(tokens)):
            context = tokens[max(0, i-4):i]
            green_indices = self._get_green_indices(context, self.vocab_size)

            # Vectorised membership test; avoids materialising a Python set
            # of the whole green list for every position.
            if tokens[i] < self.vocab_size and np.any(green_indices == tokens[i]):
                green_count += 1

        total = len(tokens) - 1  # >= 1 because of the length guard above

        p = self.gamma
        observed = green_count / total
        z_score = (observed - p) / np.sqrt(p * (1-p) / total)
        # Survival function keeps precision where 1 - cdf(z) underflows to 0.0
        p_value = stats.norm.sf(z_score)

        return {'z_score': z_score, 'p_value': p_value, 'is_detected': z_score >= self.z_threshold}


def get_watermarker(name, model, tokenizer, device):
    """Instantiate the watermarker class registered under ``name``.

    The class is constructed with ``(model, tokenizer, device)`` and its own
    default hyper-parameters. Raises ``ValueError`` for an unknown name.
    """
    registry = {
        "gpw": GPWWatermarker,
        "gpw_sp": GPWSPWatermarker,
        "gpw_sp_low": GPWSPLowWatermarker,
        "gpw_sp_sr": GPWSPSRWatermarker,
        "unigram": UnigramWatermarker,
        "kgw": KGWWatermarker,
    }
    watermarker_cls = registry.get(name)
    if watermarker_cls is None:
        raise ValueError(f"Unknown watermarker: {name}. Available: {list(registry.keys())}")
    return watermarker_cls(model, tokenizer, device)

print("Watermarker classes defined!")

In [None]:
#@title 6. Load Pegasus Paraphraser
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import re

# Load the Pegasus paraphrasing checkpoint once; the tokenizer and model are
# module-level globals that paraphrase_text() reads.
print("Loading Pegasus model for paraphrasing...")
pegasus_model_name = "tuner007/pegasus_paraphrase"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
# Move the model to the device selected in the configuration cell
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_name).to(device)
# Switch to inference mode (the model is only used for generation here)
pegasus_model.eval()
print("Pegasus loaded!")

def paraphrase_text(text):
    """Paraphrase ``text`` one sentence at a time with the Pegasus model.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
    Each sentence is truncated to 400 characters, stripped of non-ASCII
    characters, and paraphrased with 4-beam search (no sampling). If
    paraphrasing a sentence fails, a warning is printed and the cleaned
    original sentence is kept (best effort). Returns the sentences joined
    by single spaces; empty or whitespace-only input yields ``""``.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    paraphrased_sentences = []

    for sentence in sentences:
        # Cap overly long "sentences", then drop characters outside ASCII
        sentence = sentence[:400].encode('ascii', 'ignore').decode('ascii').strip()
        if not sentence:
            continue

        try:
            inputs = pegasus_tokenizer(
                sentence,
                return_tensors="pt",
                max_length=100,
                truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = pegasus_model.generate(
                    **inputs,
                    max_length=100,
                    num_beams=4,
                    do_sample=False,
                )

            paraphrased = pegasus_tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as exc:
            # Best-effort fallback. Catching Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit propagate, and the message
            # makes silent fallbacks visible in the notebook output.
            print(f"    Paraphrase failed ({type(exc).__name__}: {exc}); keeping original sentence")
            paraphrased = sentence

        paraphrased_sentences.append(paraphrased)

    return ' '.join(paraphrased_sentences)

print("Paraphrase function ready!")

In [None]:
#@title 7. Run Complete Experiments
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import pandas as pd
import gc

# Per-sample detection records for every (model, watermarker) combination
all_results = []

for model_name, model_path in MODELS:
    print(f"\n{'='*70}")
    print(f"MODEL: {model_name}")
    print(f"{'='*70}")

    # Load model
    print(f"Loading {model_path}...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    model.eval()
    print(f"Model loaded!")

    # Defined up front so the cleanup below works even if WATERMARKERS is empty
    watermarker = None

    for wm_name in WATERMARKERS:
        print(f"\n  --- Watermarker: {wm_name} ---")

        # Create watermarker and apply the configured detection threshold
        # (the factory builds each class with its own default threshold)
        watermarker = get_watermarker(wm_name, model, tokenizer, device)
        watermarker.z_threshold = Z_THRESHOLD

        # Generate watermarked texts
        print(f"  Generating {N_SAMPLES} watermarked texts...")
        generated_texts = []
        for i in tqdm(range(N_SAMPLES), desc="  Generating", leave=False):
            prompt = PROMPTS[i % len(PROMPTS)]
            try:
                text = watermarker.generate(prompt, max_new_tokens=MAX_NEW_TOKENS)
                generated_texts.append(text)
            except Exception as e:
                # Best effort: keep the sample slot, using the bare prompt
                print(f"    Error generating sample {i}: {e}")
                generated_texts.append(prompt)

        # Paraphrase texts
        print(f"  Paraphrasing {len(generated_texts)} texts...")
        paraphrased_texts = []
        paraphrase_failures = 0
        for i, text in enumerate(tqdm(generated_texts, desc="  Paraphrasing", leave=False)):
            try:
                paraphrased = paraphrase_text(text)
                paraphrased_texts.append(paraphrased)
            except Exception as e:
                # Falling back to the un-attacked text inflates the detection
                # rate, so make the fallback visible instead of silent
                paraphrase_failures += 1
                print(f"    Error paraphrasing sample {i}: {e}")
                paraphrased_texts.append(text)
        if paraphrase_failures:
            print(f"  Warning: {paraphrase_failures} text(s) were NOT paraphrased (original text used)")

        # Detect watermarks
        print(f"  Running detection...")
        detected_count = 0
        for i, text in enumerate(tqdm(paraphrased_texts, desc="  Detecting", leave=False)):
            try:
                result = watermarker.detect(text)
                z_score = result['z_score']
                is_detected = result['is_detected']

                all_results.append({
                    'model': model_name,
                    'watermarker': wm_name,
                    'attack': 'paraphrase',
                    'sample_idx': i,
                    'z_score': z_score,
                    'p_value': result['p_value'],
                    'is_detected': 1 if is_detected else 0
                })

                if is_detected:
                    detected_count += 1
            except Exception as e:
                # Record a neutral row so sample counts stay aligned, but say so
                print(f"    Error detecting sample {i}: {e}")
                all_results.append({
                    'model': model_name,
                    'watermarker': wm_name,
                    'attack': 'paraphrase',
                    'sample_idx': i,
                    'z_score': 0,
                    'p_value': 1.0,
                    'is_detected': 0
                })

        # Guard against N_SAMPLES == 0
        n_texts = len(paraphrased_texts)
        detection_rate = detected_count / n_texts * 100 if n_texts else 0.0
        print(f"  Result: {detection_rate:.1f}% ({detected_count}/{n_texts})")

    # Free model memory. The last watermarker still holds a reference to the
    # model, so it must be dropped too or the weights stay alive while the
    # next model is being loaded.
    del watermarker
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

print(f"\n{'='*70}")
print("ALL EXPERIMENTS COMPLETE!")
print(f"{'='*70}")

In [None]:
#@title 8. Create Results DataFrame and Summary
import json
from datetime import datetime

# Per-sample records -> table
df = pd.DataFrame(all_results)

# One row per (model, watermarker): detections, sample count, rate, mean z-score
summary = (
    df.groupby(['model', 'watermarker'])
    .agg(
        detected_count=('is_detected', 'sum'),
        total_samples=('is_detected', 'count'),
        detection_rate=('is_detected', 'mean'),
        mean_z_score=('z_score', 'mean'),
    )
    .round(4)
)
# Express the detection rate as a percentage
summary['detection_rate'] = (summary['detection_rate'] * 100).round(2)
summary = summary.reset_index()

banner = "=" * 70

print("\n" + banner)
print("SUMMARY OF RESULTS")
print(banner)
print(summary.to_string(index=False))

# Same numbers, grouped under each model
print("\n" + banner)
print("DETAILED VIEW")
print(banner)
for model in df['model'].unique():
    print(f"\n{model}:")
    model_df = summary[summary['model'] == model]
    for _, row in model_df.iterrows():
        print(f"  {row['watermarker']:15} - Detection: {row['detection_rate']:5.1f}% | Mean Z: {row['mean_z_score']:6.2f}")

In [None]:
#@title 9. Save Results to CSV
from google.colab import files

# A single timestamp ties together all output files from this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"paraphrase_complete_results_{timestamp}.csv"
summary_csv = f"paraphrase_summary_{timestamp}.csv"
summary_json = f"paraphrase_summary_{timestamp}.json"

# Per-sample table
df.to_csv(csv_filename, index=False)
print(f"Per-sample results saved to: {csv_filename}")

# Aggregated table
summary.to_csv(summary_csv, index=False)
print(f"Summary saved to: {summary_csv}")

# Same aggregate as a list of records, for easy parsing
summary_dict = summary.to_dict(orient='records')
with open(summary_json, 'w') as f:
    f.write(json.dumps(summary_dict, indent=2))
print(f"Summary JSON saved to: {summary_json}")

# Quick look at the per-sample table
print("\nFirst 10 rows of results:")
display(df.head(10))

In [None]:
#@title 10. Download All Results
# Trigger a browser download for each output file, announcing each one first
downloads = [
    ("Downloading per-sample results CSV...", csv_filename),
    ("Downloading summary CSV...", summary_csv),
    ("Downloading summary JSON...", summary_json),
]
for announcement, filename in downloads:
    print(announcement)
    files.download(filename)

# Closing banner plus a reminder of the unified_results.csv column layout
closing_rule = "=" * 70
print("\n" + closing_rule)
print("COMPLETE!")
print(closing_rule)
print("\nYou can now update unified_results.csv with the paraphrase attack data.")
print("\nFormat for unified_results.csv:")
print("Model,Watermarker,Variant,Alpha,Omega,Z_Threshold,Attack,Detection_Rate,Detection_Count,Total_Samples,Mean_Z_Score,Perplexity,Notes")

## Notes

### To add Qwen-7B:
Uncomment the Qwen line in the MODELS configuration. Note that Qwen-7B requires more GPU memory.

### Updating unified_results.csv:
For each row in the summary, add to unified_results.csv:
```
OPT-1.3B,GPW,GPW (non-salted),3.0,50.0,4.0,paraphrase,XX.X,XX,50,X.XX,-,Paraphrase attack
```

### Troubleshooting:
- If you run out of GPU memory, reduce N_SAMPLES or run one model at a time
- If Pegasus fails on a sentence or text, the original (unparaphrased) text is used and scored as usual; such fallbacks are not flagged in the CSV output