In [1]:
# Install dependencies
!pip install datasets transformers mauve-text

Collecting mauve-text
  Downloading mauve_text-0.4.0-py3-none-any.whl.metadata (3.5 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting faiss-cpu>=1.7.0 (from mauve-text)
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading mauve_text-0.4.0-py3-none-any.whl (21 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.35.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21

In [2]:
# Setup and Implementation
# Part 1

# Loading the Wikitext-103 dataset 
from datasets import load_dataset

# Extract the test dataset from wikitext-103-raw-v1
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="test")

# Obtaining 10 valid prompts to use in contrastive decoding
num_prompts = 10

# All input prompts should be at least 50 words long
min_length = 50

# Stores labels for predicted text
prompt_labels = []

# Stores first 30 words of each prompt
# Use contrastive decoding to predict the remaining words in each prompt
prompts = []

for example in dataset:
    text = example['text'].strip()
    if len(text) >= min_length:
        words = text.split()
        prompt = ' '.join(words[:30])
        labels = ' '.join(words[30:])
        prompts.append(prompt)
        prompt_labels.append(labels)
        if len(prompts) >= num_prompts:
            break
    
print(f"Obtained {len(prompts)} prompts")
print(prompts)
print(prompt_labels)

README.md: 0.00B [00:00, ?B/s]

wikitext-103-raw-v1/test-00000-of-00001.(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00000-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00001-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/validation-00000-of-(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Obtained 10 prompts
['Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was', 'In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared on a 2006 episode of the television series , Doctors , followed', 'In 2000 Boulter had a guest @-@ starring role on the television series The Bill ; he portrayed " Scott Parry " in the episode , " In Safe Hands', 'He had a recurring role in 2003 on two episodes of The Bill , as character " Connor Price " . In 2004 Boulter landed a role as " Craig', 'In 2006 Boulter starred in the play Citizenship written by Mark Ravenhill . The play was part of a series which featured different playwrights , titled Burn / Chatroom /', 'Boulter starred in two films in 2008 , Daylight Robbery by filmmaker Paris Leonti , and Donkey Punch directed by Olly Blackburn . Boulter portrayed a character named " Sean', 'Du Fu ( Wade 

In [39]:
# Setup and Implementation
# Part 2
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import mauve
import numpy as np

class ContrastiveDecoder:
    def __init__(self,
                 expert_model_name="gpt2-xl",
                 amateur_model_name="gpt2",
                 device="cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        
        # Load expert model (gpt2-xl)
        self.expert_model = AutoModelForCausalLM.from_pretrained(expert_model_name).to(device)
        self.expert_model.eval()
        
        # Load amateur model (gpt2)
        self.amateur_model = AutoModelForCausalLM.from_pretrained(amateur_model_name).to(device)
        self.amateur_model.eval()
        
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(expert_model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
            
    def generate_contrastive(self,
                             prompt,
                             prompt_labels,
                             max_new_tokens,
                             alpha,
                             amateur_temperature,
                             amateur_context_window
                            ):
        # Encode prompt
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)        
        generated_tokens = []
        cd_info = {}
        contrastive_probs_list = []
        
        with torch.no_grad():
            # Obtain predictions for at most max_new_tokens
            for step in range(max_new_tokens):
                # Get expert model logits for next token
                expert_outputs = self.expert_model(input_ids)
                expert_logits = expert_outputs.logits[:, -1, :]
                
                # Get amateur model logits with context window restriction for next token
                if amateur_context_window is not None:
                    amateur_input_ids = input_ids[:, -amateur_context_window:]
                else:
                    amateur_input_ids = input_ids
                amateur_outputs = self.amateur_model(amateur_input_ids)
                amateur_logits = amateur_outputs.logits[:, -1, :]
                
                # Apply amateur model temperature to amateur model logits
                amateur_logits = amateur_logits / amateur_temperature
                
                # Convert logits to log probabilities
                expert_model_log_probs = F.log_softmax(expert_logits, dim=-1)
                amateur_model_log_probs = F.log_softmax(amateur_logits, dim=-1)
                
                # Calculating final contrastive score for this prediction
                contrastive_scores = expert_model_log_probs - alpha * amateur_model_log_probs
                next_token_id = torch.argmax(contrastive_scores, dim=-1)

                # Obtain next token id
                token_id = next_token_id.item()

                # Tracking contrastive probability for each predicted token
                contrastive_probs = F.softmax(contrastive_scores, dim=-1)
                contrastive_token_prob = contrastive_probs[0, token_id].item()
                contrastive_probs_list.append(contrastive_token_prob)
                
                # Append token and continue
                generated_tokens.append(token_id)
                next_token = torch.tensor([[token_id]], device=self.device)
                input_ids = torch.cat([input_ids, next_token], dim=-1)
                
                # If next prediction is EOS, break
                if token_id == self.tokenizer.eos_token_id:
                    break
        
        # Decode generated text
        generated_tokens = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        
        # Computing metrics

        # Computing mauve score
        out = mauve.compute_mauve(
            p_text=prompt_labels[:max_new_tokens].split(),  
            q_text=generated_tokens.split(),  
            device_id=0 if torch.cuda.is_available() else -1,
            max_text_length=max_new_tokens,
            featurize_model_name="gpt2"
        )

        # Computing perplexity
        contrastive_log_probs = [np.log(p + 1e-10) for p in contrastive_probs_list]
        contrastive_perplexity = np.exp(-np.mean(contrastive_log_probs))
        
        cd_info = {
            'distinct1': ContrastiveDecoder.distinct_n(generated_tokens, 1),
            'distinct2': ContrastiveDecoder.distinct_n(generated_tokens, 2),
            'distinct3': ContrastiveDecoder.distinct_n(generated_tokens, 3),
            'mauve': out.mauve,  
            'contrastive_perplexity': contrastive_perplexity
        }
        
        return generated_tokens, cd_info

    # Gets all n grams in the list of tokens
    @staticmethod
    def get_ngrams(tokens, n):
        if len(tokens) < n:
            return []
        return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

    # Returns the proportion of unique n grams in the list of tokens
    @staticmethod
    def distinct_n(tokens, n):
        tokens = [token.lower() for token in tokens]
    
        all_ngrams = []
        ngrams = ContrastiveDecoder.get_ngrams(tokens, n)
        all_ngrams.extend(ngrams)
        
        if len(all_ngrams) == 0:
            return 0.0
        
        unique_ngrams = len(set(all_ngrams))
        total_ngrams = len(all_ngrams)
        return unique_ngrams / total_ngrams

In [40]:
# Setup and Implementation
# Part 2
def run_ablation_studies(
    decoder,
    prompts,
    prompt_labels,
    max_new_tokens,
    alpha):
    
    # Varying temperatures of amateur model
    temperature_configs = [0.5, 1.0, 1.5]
    
    # Varying context window of amateur model
    max_context = 1024
    context_window_configs = [None, max_context // 2, 1]
    
    all_results = {}
    all_text_results = {}
    
    # Temperature ablation
    for temp in temperature_configs:
        # Context Window Ablation
        for context_window_restrict in context_window_configs:
            # Iterate through all 10 prompts we're testing
            for i, prompt in enumerate(prompts):
                # conduct contrastive decoding for this prompt
                generated_text, cd_info = decoder.generate_contrastive(
                    prompt=prompt,
                    prompt_labels = prompt_labels[i],
                    max_new_tokens=max_new_tokens,
                    alpha=alpha,
                    amateur_temperature=temp,  
                    amateur_context_window=context_window_restrict)

                # aggregate all the metrics for this Temperature, Context Window configuration
                if i == 0:
                    all_results[(temp, context_window_restrict)] = cd_info
                    all_text_results[(temp, context_window_restrict)] = [generated_text]
                else:
                    for key in cd_info:
                        all_results[(temp, context_window_restrict)][key] += cd_info[key]
                    all_text_results[(temp, context_window_restrict)].append(generated_text)

            # Store average of all metrics across the 10 prompts for this Temperature, Context Window configuration
            for key in cd_info:
                all_results[(temp, context_window_restrict)][key] = all_results[(temp, context_window_restrict)][key] / 10

    # Return metrics across all Temperature, Context Window configurations
    return all_results, all_text_results
                


In [46]:
# Setup and Implementation
# Part 2
# Perform ablation studies
decoder = ContrastiveDecoder(
    expert_model_name="gpt2-xl",
    amateur_model_name="gpt2"
)
all_results, all_text_results = run_ablation_studies(
    decoder=decoder,
    prompts=prompts,
    prompt_labels=prompt_labels,
    # around the max amount of tokens I can generate in each iteration without an OutOfMemoryError being thrown
    max_new_tokens=200,
    alpha=0.5
)

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/65 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/93 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/47 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/74 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/33 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/76 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/74 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/73 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/82 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/68 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/65 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/93 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/47 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/74 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/33 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/76 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/74 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/73 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/82 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/68 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/161 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/122 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/150 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/155 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/180 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/165 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/101 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/132 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/103 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/3 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/153 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/119 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/151 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/104 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/135 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/17 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/127 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/122 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/169 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/3 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/153 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/119 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/151 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/104 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/135 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/17 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/127 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/122 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/169 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/107 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/121 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/157 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/175 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/182 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/155 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/160 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/197 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/198 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/161 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/93 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/137 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/45 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/106 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/162 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/148 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/154 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/148 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/178 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/93 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/137 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/45 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/106 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/162 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/148 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/154 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/148 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/178 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/141 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/38 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/154 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/168 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/39 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/167 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/160 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/46 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/185 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/40 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/78 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/41 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/93 [00:00<?, ?it/s]



Featurizing p:   0%|          | 0/35 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/185 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/42 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/158 [00:00<?, ?it/s]

In [47]:
# Evaluation and Analysis
# Part 1
import pandas as pd
df = pd.DataFrame(all_results)
df.columns = [(0.5, None), (0.5, 512.0), (0.5, 1.0), (1.0, None), (1.0, 512.0), (1.0, 1.0), (1.5, None), (1.5, 512.0), (1.5, 1.0)]
for column in df.columns:
    df[column] = all_results[column]

# Clear table format comparing mean of metrics across all 10 test prompts for all 9 different configurations
# columns = (amateur model temperature, context window restriction(None = no context window restriction, 512 = context window cut in half, 1.0 = context window only includes most previous token))
df

Unnamed: 0,"(0.5, None)","(0.5, 512.0)","(0.5, 1.0)","(1.0, None)","(1.0, 512.0)","(1.0, 1.0)","(1.5, None)","(1.5, 512.0)","(1.5, 1.0)"
distinct1,0.04156,0.04156,0.022797,0.120417,0.120417,0.028238,0.051938,0.051938,0.034732
distinct2,0.281699,0.281699,0.060389,0.349452,0.349452,0.098963,0.240841,0.240841,0.141858
distinct3,0.580029,0.580029,0.075352,0.543129,0.543129,0.137993,0.387044,0.387044,0.211346
mauve,0.176033,0.176033,0.206327,0.882389,0.882389,0.727631,0.912594,0.912594,0.766544
contrastive_perplexity,112.528865,112.528865,1.37062,20.907199,20.907199,1.472258,4.82589,4.82589,1.629788


In [52]:
# Evaluation and Analysis
# Part 2

"""
As stated in the comments, the distinctN metrics store the proportion of unique ngrams among the generated words. The goal in contrastive decoding is to produce more diverse output so according to the table shown above, the (1.0 = temperature, None = context window restriction) and (1.0 = temperature, 512 = context window restriction) configurations produces the best/highest distinct1 score, the (1.0 = temperature, None = context window restriction) and (1.0 = temperature, 512 = context window restriction) configurations produces the best/highest distinct2 score, the (0.5 = temperature, None = context window restriction) and (0.5 = temperature, 512 = context window restriction) configurations produces the best/highest distinct3 score, the (1.5 = temperature, None = context window restriction) and (1.5 = temperature, 512 = context window restriction) configurations produces the best/highest mauve scores, and the (0.5 = temperature, 1.0 = context window restriction) configuration produces the best/lowest perplexity score. It’s kind of hard to choose which configuration is best since the performance of all the configurations across each metric kind of varies, but I guess the (1.0 = temperature, None = context window restriction) and (1.0 = temperature, 512 = context window restriction) configurations for contrastive decoding produced the best overall results since they achieved the best overall scores for the most metrics. In general, apart from the best perplexity score observed for the (0.5 = temperature, 1.0 = context window restriction) configuration, it seems like an increase in temperature drastically improves the perplexity and makes the overall system more confident about the tokens it selects. This makes sense since increasing the temperature will make the amateur model distribution flatter causing the system to not penalize the higher probability tokens as much leading to a less diverse output where the system is more confident in each token. This result is similar to what was presented in the results for the original paper except the original paper used coherence instead of perplexity. In addition, the higher temperature values(1.0 and 1.5) tend to help the system choose tokens more similar to the true human generated text provided in the prompt labels as seen in the relatively high mauve scores produced under these conditions. This is different from the results observed in the paper which show that lower temperature values >= 0.5 produce a higher mauve score than the higher temperature values(1.0, 1.5). One other major takeaway I saw was how the distinctN values generally increase when the temperature changes from 0.5 to 1 but then generally decrease when the temperature changes from 1 to 1.5. This is also different from the results observed in the paper which show that the diversity of the results should actually decrease as temperature goes up since the amateur model distribution should be flatter which causes the higher probability tokens to still be favored more. One other thing I noticed in the results was that the context window restriction mattered for the diversityN metrics. Whenever the context window was decreased to just the most previous token, the diversityN metric always decreased by a considerable amount. Finally, there wasn’t really any difference between the metrics for when the context window restriction was None and when the context window restriction was half of the full context window which indicates that we probably don’t need the full context window in predicting the next token and having access to just half the full context window is suitable for predicting the next token.
"""

'\nAs stated in the comments, the distinctN metrics store the proportion of unique ngrams among the generated words. The goal in contrastive decoding is to produce more diverse output so according to the table shown above, the (1.0 = temperature, None = context window restriction) and (1.0 = temperature, 512 = context window restriction) configurations produces the best/highest distinct1 score, the (1.0 = temperature, None = context window restriction) and (1.0 = temperature, 512 = context window restriction) configurations produces the best/highest distinct2 score, the (0.5 = temperature, None = context window restriction) and (0.5 = temperature, 512 = context window restriction) configurations produces the best/highest distinct3 score, the (1.5 = temperature, None = context window restriction) and (1.5 = temperature, 512 = context window restriction) configurations produces the best/highest mauve scores, and the (0.5 = temperature, 1.0 = context window restriction) configuration prod

In [53]:
# Evaluation and Analysis
# Part 3
#Printing Best Text Generation Results Per Diversity Metric: distinct3
for i in range(0, 2):
    print(all_text_results[(0.5, None)][i])
    print()
print("---------")
#Printing Worst Text Generation Results Per Diversity Metric: distinct3
for i in range(0, 2):
    print(all_text_results[(0.5, 1.0)][i])
    print()
print("---------")
"""
As seen in the output of the best and worst strategies as per the diversity3 metric, the best strategy(0.5 = temperature, None = context window restriction) just produces a bunch of gibberish. So no wonder, it has the highest unique n gram proportion since every word is just random gibberish. On the other hand, the worst strategy(0.5 = temperature, 1.0 = context window restriction) just repeats the same words a bunch of times which explains why it has the smallest unique n gram proportion.
"""

��ertoddiery pan�ernessْvell龍契士aucas Nantoγ DeVos Ichigo Canad�anwhileenei Greggenei affectْivities declined Ichigo Greggعlying bullet Ichigo CanadbanesoDeliveryDate Canad Eisenanding bullet Gregg��γaido Canadedin Gregg 2021γialsddennings CIddennatureconservancy CIapoALTskilledVERTISEMENT dart Canad� Franch Franch Franch Franch Franch Franch Franch espresso Beckenei Bender Beckadoes Canad�inates benzReviewer Canad�isSpecialOrderable vaslining sodium Beckamiyaealuronarin WATCHEDigningeren Manz龍契士urnal Bendereren vacγned Benderental CanadDragonMagazinepentailsashingtonSELECTTEDlightingpenETHariesearchersursesortspentailsashington CanadDragonMagazine FANTceptionsearchersortsursesRatedunertailspentailsittleSELECTioneertoddtingitionallypenqtBALLflushnelsittlepen Fletcher CanadDragonMagazine Quin��itionally Canad bleachnels Xeonestine Bender streakstellitect curvвearchersitectALTflushART Quinione CanadDragonMagazine Nets Franch QuinadoesUTERSitizertoUTERS��itionally CFLflushARTнundredsunn��m

'\nAs seen in the output of the best and worst strategies as per the diversity3 metric, the best strategy(0.5 = temperature, None = context window restriction) just produces a bunch of gibberish. So no wonder, it has the highest unique n gram proportion since every word is just random gibberish. On the other hand, the worst strategy(0.5 = temperature, 1.0 = context window restriction) just repeats the same words a bunch of times which explains why it has the smallest unique n gram proportion.\n'

In [45]:
import gc
gc.collect()
torch.cuda.empty_cache()