
# Section 2.3 Perplexity & Sampling Analysis (Fixed Functions & Prints)


In [16]:
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
import re
from typing import Dict
from pathlib import Path

# Load the DistilGPT2 Model and Tokenizer

In [17]:

# Everything in this notebook runs on the CPU; pick the device before loading weights.
device = torch.device("cpu")

# Fetch the pretrained checkpoint and its matching tokenizer by name.
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# If the tokenizer defines no padding token, reuse its end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Place the weights on the chosen device and put the module in evaluation mode.
model = model.to(device)
model.eval()

# Short summary of what was loaded (device, parameter count, vocabulary size).
print("\n".join([
    f"Model loaded on: {device}",
    f"Model parameters: {sum(p.numel() for p in model.parameters()):,}",
    f"Vocabulary size: {tokenizer.vocab_size:,}",
]))


Model loaded on: cpu
Model parameters: 81,912,576
Vocabulary size: 50,257


# Perplexity and paragraph-shuffle functions

In [18]:

def compute_perplexity(text: str, model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer, device: torch.device):
    """Score ``text`` with a language model and return ``(perplexity, loss)``.

    The text is tokenized into PyTorch tensors, the token ids are moved to
    ``device``, and the model is called with those ids as both inputs and
    labels. Perplexity is the exponential of the loss the model reports.

    Args:
        text: Text to score.
        model: Callable model whose output exposes a ``loss`` attribute.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``.
        device: Device the token ids are moved to before the forward pass.

    Returns:
        A ``(perplexity, loss)`` tuple of Python floats.
    """
    token_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(device)

    # Pure evaluation: no gradients are needed for scoring.
    with torch.no_grad():
        # Passing the ids as labels makes the model report its own LM loss
        # (presumably mean next-token cross-entropy - confirm in the HF docs).
        lm_loss = model(input_ids=token_ids, labels=token_ids).loss

    return torch.exp(lm_loss).item(), float(lm_loss)

def shuffle_text(text: str, seed: int = None) -> str:
    """Return ``text`` with its sentences in random order.

    Sentences are obtained by splitting on ``'.'``; surrounding whitespace is
    stripped and empty pieces are dropped. The shuffled sentences are re-joined
    with ``'. '`` and terminated with a single ``'.'``.

    Args:
        text: Paragraph to shuffle.
        seed: Optional seed for a private ``random.Random`` instance, making
            the shuffle reproducible without touching the module-level RNG.
            When ``None`` (default) the module-level ``random`` state is used,
            exactly as before this parameter existed.

    Returns:
        The shuffled paragraph, or ``''`` when ``text`` contains no sentences.
    """
    sentences = [s for s in (piece.strip() for piece in text.split('.')) if s]
    if not sentences:
        return ''

    rng = random if seed is None else random.Random(seed)
    shuffled = sentences.copy()
    rng.shuffle(shuffled)

    # No sentence can contain '.', so the joined text never already ends with
    # one; the closing period is always appended.
    return '. '.join(shuffled) + '.'

# Part a) Perplexity Analysis

In [24]:

# Reference text for the experiment: score it as written, then again with its
# sentences shuffled, and compare perplexity and loss.
test_paragraph = (
    "The unanimous Declaration of the thirteen united States of America, "
    "When in the Course of human events, it becomes necessary for one people to "
    "dissolve the political bands which have connected them with another, and to assume among "
    "the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, "
    "a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation. We hold these truths "
    "to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, "
    "Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the "
    "governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, "
    "laying its foundation on such principles and organizing its powers in such form, as to them shall seem most likely to effect their Safety and Happiness."
)

print(test_paragraph)

# Seed the RNG so the sentence shuffle, and therefore the reported numbers,
# are the same on every Restart & Run All.
# NOTE: the paragraph contains only a few '.'-delimited sentences, so a
# shuffle can occasionally reproduce the original order.
random.seed(42)

results = []

original_perplexity, original_loss = compute_perplexity(test_paragraph, model, tokenizer, device)

shuffled_paragraph = shuffle_text(test_paragraph)
shuffled_perplexity, shuffled_loss = compute_perplexity(shuffled_paragraph, model, tokenizer, device)

result = {
    # Previously referenced the undefined names `i` and `paragraph`, which
    # raised NameError on a fresh kernel.
    'paragraph_num': len(results) + 1,
    'original_text': test_paragraph,
    'shuffled_text': shuffled_paragraph,
    'original_perplexity': original_perplexity,
    'shuffled_perplexity': shuffled_perplexity,
    'original_loss': original_loss,
    'shuffled_loss': shuffled_loss,
    # Guard against division by zero if the original perplexity is 0.
    'perplexity_ratio': (shuffled_perplexity / original_perplexity) if original_perplexity else float('inf'),
    'loss_difference': shuffled_loss - original_loss
}
results.append(result)

print("\n")
print(f"Original perplexity: {original_perplexity:.2f} | loss: {original_loss:.4f}")
print(f"Shuffled  perplexity: {shuffled_perplexity:.2f} | loss: {shuffled_loss:.4f}")
print(f"Ratio (shuffled / original): {result['perplexity_ratio']:.2f}")
print(f"Loss difference (shuffled - original): {result['loss_difference']:.4f}")
print(f"Shuffled preview: {shuffled_paragraph[:200]}{'...' if len(shuffled_paragraph) > 200 else ''}")


The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation. We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation 

# Comment on difference part a):

The shuffled paragraph’s perplexity is higher than that of the original paragraph. This is expected because shuffling disrupts the discourse-level structure and some of the cross-sentence context that the LM relies on for next‑token prediction. Since we shuffled by sentence, most within‑sentence token dependencies remain intact, so the increase is modest compared to word‑level shuffling. If we instead shuffled individual words, the increase in perplexity would be much larger.

# Part b) Sampling Comparison

## Text generation functions: greedy decoding and temperature sampling

In [20]:

def generate_text_greedy(prompt: str, max_length: int = 500, model: GPT2LMHeadModel = None,
                         tokenizer: GPT2Tokenizer = None, device: torch.device = None) -> str:
    """Continue ``prompt`` deterministically (sampling disabled) and return the text.

    Args:
        prompt: Text the generation starts from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model: Generator to use; falls back to the module-level ``model``.
        tokenizer: Tokenizer to use; falls back to the module-level ``tokenizer``.
        device: Device for the encoded prompt; falls back to the module-level ``device``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Anything left as None is resolved from the notebook's module-level names.
    module_scope = globals()
    model = module_scope['model'] if model is None else model
    tokenizer = module_scope['tokenizer'] if tokenizer is None else tokenizer
    device = module_scope['device'] if device is None else device

    encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
    generation_args = dict(
        max_length=max_length,
        do_sample=False,  # no sampling
        pad_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        sequences = model.generate(**encoded_prompt, **generation_args)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)

def generate_text_temperature(prompt: str, temperature: float = 1.0, max_length: int = 500,
                              model: GPT2LMHeadModel = None,
                              tokenizer: GPT2Tokenizer = None, device: torch.device = None) -> str:
    """Continue ``prompt`` by sampling at the given temperature and return the text.

    Args:
        prompt: Text the generation starts from.
        temperature: Sampling temperature. Must be non-negative. ``0`` is
            treated as the greedy limit: sampling is switched off instead of
            forwarding a temperature that ``generate`` would reject.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model: Generator to use; falls back to the module-level ``model``.
        tokenizer: Tokenizer to use; falls back to the module-level ``tokenizer``.
        device: Device for the encoded prompt; falls back to the module-level ``device``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    # Coerce first so integer temperatures (e.g. 1) are accepted as well.
    temperature = float(temperature)
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    if model is None:
        model = globals()['model']
    if tokenizer is None:
        tokenizer = globals()['tokenizer']
    if device is None:
        device = globals()['device']

    gen_kwargs = {
        "max_length": max_length,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature == 0.0:
        # Zero temperature has no valid sampling distribution; decode greedily.
        gen_kwargs["do_sample"] = False
    else:
        # top_k=0 is forwarded unchanged (per the Hugging Face generation docs
        # this disables top-k filtering, so sampling uses the full vocabulary).
        gen_kwargs.update(do_sample=True, temperature=temperature, top_k=0)

    enc = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out_ids = model.generate(**enc, **gen_kwargs)
    return tokenizer.decode(out_ids[0], skip_special_tokens=True)


In [28]:

# Generate one continuation of the same prompt per temperature and save them
# all to a text file for side-by-side comparison.
prompt = "Once upon a time"
max_length = 500
temperatures = [0, 0.3, 0.6, 0.9, 1.2, 1.5]

# Sampling is stochastic: fix the torch RNG so that re-running the notebook on
# the same setup reproduces the texts discussed in the commentary.
torch.manual_seed(42)

# T=0 is handled by greedy decoding.
greedy_text = generate_text_greedy(prompt, max_length)
generated_texts = {0: greedy_text}

# Sample only at strictly positive temperatures, wherever 0 sits in the list
# (the previous `temperatures[1:]` slice assumed it was first).
for temp in temperatures:
    if temp <= 0:
        continue
    temp_text = generate_text_temperature(prompt, temp, max_length)
    generated_texts[temp] = temp_text

# Write one titled section per temperature, in ascending temperature order.
out_path = Path.cwd() / "generated_texts_by_temperature.txt"
with open(out_path, "w", encoding="utf-8") as f:
    for temp, text in sorted(generated_texts.items(), key=lambda kv: float(kv[0])):
        header = "Greedy (T=0)" if float(temp) == 0.0 else f"Temperature {float(temp):.1f}"
        f.write(header + "\n" + "-" * len(header) + "\n")
        f.write(f"Prompt: {prompt}\n")
        f.write(text + "\n")

# Compare diversity and quality:


Greedy (T=0)
Highly repetitive: the output loops the same sentence (“United States… military presence”). It is coherent at the sentence level, but collapses into exact repetition with no narrative progression.

T = 0.3
The output has slightly more variation than greedy decoding, but still shows heavy repetition. Low novelty; minimal narrative development.

T = 0.6
The output has more lexical variety (“Caspian Empire”, for example), but several sentence fragments are still repeated (e.g. “was divided into two parts”).

T = 0.9
Decent diversity and narrative movement (ceasefire, oil fields, characters), but sentences start to contradict each other.

T = 1.2
Highly creative sentences, but with severe incoherence and broken logic.

T = 1.5
Maximum diversity, minimal sense: word salad, entity mashups, formatting glitches, opaque references.

In general, diversity increases monotonically with temperature. The quality of the text, on the other hand, follows an inverted-U shape: it is best around 0.6–0.9 for creative text. Low temperatures cause a lot of repetition, while high temperatures make the text incoherent.