In [1]:
# ===============================
# MODE COLLAPSE TEST NOTEBOOK
# ===============================

# ✅ Install Dependencies
!pip install -q transformers datasets nltk sentence-transformers tqdm pandas numpy

import nltk
nltk.download('punkt', quiet=True)

from transformers import pipeline
from datasets import load_dataset
from nltk import ngrams
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
import pandas as pd

# =====================================
# 1️⃣ Load Dataset (Synthetic Persona-Chat)
# =====================================

print("📦 Loading dataset...")
try:
    ds = load_dataset("google/Synthetic-Persona-Chat", split="train")
    df = pd.DataFrame(ds)
except Exception as e:
    print("⚠️ Couldn't load from hub:", e)
    print("Trying manual CSV load instead...")
    df = pd.read_csv("hf://datasets/google/Synthetic-Persona-Chat/data/Synthetic-Persona-Chat_train.csv")

print("\n✅ Dataset loaded successfully!")
print("Available columns:", df.columns.tolist())
print(df.head(2))

# Automatically detect the most likely dialogue column
dialogue_col = None
for c in df.columns:
    if any(k in c.lower() for k in ["dialog", "conversation", "prompt", "text", "utterance"]):
        dialogue_col = c
        break

if not dialogue_col:
    raise ValueError("❌ No suitable dialogue column found in dataset. Check column names above.")

# Use a subset for testing
prompts = df[dialogue_col].astype(str).head(100).tolist()
print(f"\nUsing column '{dialogue_col}' with {len(prompts)} prompts.")

# =====================================
# 2️⃣ Load Models
# =====================================
print("\n🧠 Loading text-generation models...")
model_names = {
    "GPT-2": "gpt2",
    "DistilGPT-2": "distilgpt2"
}
models = {}

for name, model_id in model_names.items():
    try:
        print(f"→ Loading {name}...")
        models[name] = pipeline("text-generation", model=model_id, device_map="auto")
    except Exception as e:
        print(f"⚠️ Failed to load {name}: {e}")

# =====================================
# 3️⃣ Generate Outputs
# =====================================
def generate_responses(model, prompts, max_new_tokens=40):
    responses = []
    for p in tqdm(prompts, desc="🔁 Generating", leave=False):
        try:
            out = model(p, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.8)
            responses.append(out[0]["generated_text"])
        except Exception as e:
            responses.append("")
    return responses

responses_dict = {}
for name, model in models.items():
    print(f"\n🗣 Generating responses with {name}...")
    responses_dict[name] = generate_responses(model, prompts)

# =====================================
# 4️⃣ Diversity Metrics
# =====================================
def distinct_n(texts, n):
    all_ngrams = [ng for t in texts for ng in ngrams(t.split(), n)]
    if not all_ngrams:
        return 0.0
    return len(set(all_ngrams)) / len(all_ngrams)

def self_bleu(texts, sample_size=30):
    sample_texts = texts[:sample_size]
    smoothie = SmoothingFunction().method1
    scores = []
    for i in range(len(sample_texts)):
        refs = [t.split() for j, t in enumerate(sample_texts) if j != i]
        if len(refs) == 0: continue
        score = sentence_bleu(refs, sample_texts[i].split(), smoothing_function=smoothie)
        scores.append(score)
    return np.mean(scores) if scores else 0.0

print("\n📊 Calculating diversity metrics...")

metrics = {}
for name, responses in responses_dict.items():
    d1 = distinct_n(responses, 1)
    d2 = distinct_n(responses, 2)
    sb = self_bleu(responses)
    metrics[name] = {"Distinct-1": d1, "Distinct-2": d2, "Self-BLEU": sb}
    print(f"\n{name} Metrics:")
    print(f"Distinct-1: {d1:.4f}, Distinct-2: {d2:.4f}, Self-BLEU: {sb:.4f}")

# =====================================
# 5️⃣ Embedding Variance (Semantic Diversity)
# =====================================
print("\n🧩 Computing embedding variance (semantic diversity)...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def embedding_variance(texts):
    embeddings = embedder.encode(texts, show_progress_bar=False)
    return np.mean(np.var(embeddings, axis=0))

for name, responses in responses_dict.items():
    var = embedding_variance(responses)
    metrics[name]["EmbeddingVariance"] = var
    print(f"{name} Embedding Variance: {var:.6f}")

# =====================================
# 6️⃣ Results & Mode Collapse Analysis
# =====================================
print("\n==================== MODE COLLAPSE ANALYSIS ====================")
print("Indicators of mode collapse:")
print("- High Self-BLEU → Less diversity")
print("- Low Distinct-n → Repetitive wording")
print("- Low Embedding Variance → Semantic similarity\n")

for name, vals in metrics.items():
    print(f"\n{name}:")
    for k, v in vals.items():
        print(f"  {k}: {v:.4f}")

# Simple thresholds for interpretation
def detect_collapse(m):
    return m["Self-BLEU"] > 0.8 or m["Distinct-1"] < 0.05 or m["EmbeddingVariance"] < 0.005

print("\n==================== CONCLUSION ====================")
collapse_flags = {name: detect_collapse(vals) for name, vals in metrics.items()}
for name, flag in collapse_flags.items():
    if flag:
        print(f"⚠️ {name}: Possible mode collapse detected.")
    else:
        print(f"✅ {name}: No strong signs of mode collapse.")

# =====================================
# 7️⃣ Summary Table
# =====================================
print("\n📄 Summary Table:")
summary = pd.DataFrame(metrics).T
summary


📦 Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/Synthetic-Persona-Chat_train.csv:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

Synthetic-Persona-Chat_valid.csv: 0.00B [00:00, ?B/s]

Synthetic-Persona-Chat_test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/8938 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/968 [00:00<?, ? examples/s]


✅ Dataset loaded successfully!
Available columns: ['user 1 personas', 'user 2 personas', 'Best Generated Conversation']
                                     user 1 personas  \
0  I am 32.\nI do not want a job.\nI play video g...   
1  I am 32.\nI play video games all day.\nI still...   

                                     user 2 personas  \
0  My favorite drink is iced coffee.\nI have a bl...   
1  I have a ford f150.\nI like ford cars.\nMy tru...   

                         Best Generated Conversation  
0  User 1: Hi! I'm [user 1's name].\nUser 2: Hi [...  
1  User 1: Hey, how's it going?\nUser 2: Good, I'...  

Using column 'Best Generated Conversation' with 100 prompts.

🧠 Loading text-generation models...
→ Loading GPT-2...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


→ Loading DistilGPT-2...


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu



🗣 Generating responses with GPT-2...


🔁 Generating:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   1%|          | 1/100 [00:04<06:43,  4.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   2%|▏         | 2/100 [00:09<07:31,  4.61s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   3%|▎         | 3/100 [00:13<07:41,  4.76s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   4%|▍         | 4/100 [00:17<06:53,  4.30s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   5%|▌         | 5/100 [00:22<06:53,  4.35s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   6%|▌         | 6/100 [00:25<06:34,  4.20s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   7%|▋         | 7/100 [00:30<06:34,  4.24s/it]Setting `pad_tok


🗣 Generating responses with DistilGPT-2...


🔁 Generating:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   1%|          | 1/100 [00:02<03:53,  2.36s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   2%|▏         | 2/100 [00:04<03:58,  2.43s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   3%|▎         | 3/100 [00:08<04:30,  2.79s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   4%|▍         | 4/100 [00:10<04:23,  2.75s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   5%|▌         | 5/100 [00:13<04:07,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   6%|▌         | 6/100 [00:15<03:57,  2.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
🔁 Generating:   7%|▋         | 7/100 [00:18<03:58,  2.56s/it]Setting `pad_tok


📊 Calculating diversity metrics...

GPT-2 Metrics:
Distinct-1: 0.0942, Distinct-2: 0.3084, Self-BLEU: 0.4735

DistilGPT-2 Metrics:
Distinct-1: 0.0919, Distinct-2: 0.3013, Self-BLEU: 0.4684

🧩 Computing embedding variance (semantic diversity)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

GPT-2 Embedding Variance: 0.000990
DistilGPT-2 Embedding Variance: 0.000990

Indicators of mode collapse:
- High Self-BLEU → Less diversity
- Low Distinct-n → Repetitive wording
- Low Embedding Variance → Semantic similarity


GPT-2:
  Distinct-1: 0.0942
  Distinct-2: 0.3084
  Self-BLEU: 0.4735
  EmbeddingVariance: 0.0010

DistilGPT-2:
  Distinct-1: 0.0919
  Distinct-2: 0.3013
  Self-BLEU: 0.4684
  EmbeddingVariance: 0.0010

⚠️ GPT-2: Possible mode collapse detected.
⚠️ DistilGPT-2: Possible mode collapse detected.

📄 Summary Table:


Unnamed: 0,Distinct-1,Distinct-2,Self-BLEU,EmbeddingVariance
GPT-2,0.094223,0.308444,0.473456,0.00099
DistilGPT-2,0.091865,0.301336,0.468446,0.00099
