In [1]:
# Prompt Engineering with GPT-2 and BERTScore - WORKING VERSION
# Run this in Google Colab

# First install all required packages
!pip install transformers torch bert-score pandas

# Import libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from bert_score import score
import pandas as pd

# Part 1 - Load GPT-2 Model
print("Loading GPT-2 model...")
model_name = "gpt2"  # Using the smallest GPT-2 model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
print("Model loaded successfully!\n")

# Part 2 - Design Your Prompts and Generate Outputs
prompts = [
    # Direct instruction
    "Write a motivational quote about overcoming fear.",

    # Scenario-based
    "Imagine you're helping a friend who failed a test. Write something encouraging.",

    # Persona-based
    "As a wise monk, write a quote about inner strength.",

    # Keyword-based
    "Using the words 'growth', 'struggle', and 'hope', write something inspiring.",

    # Conversational
    "User: I feel like giving up. GPT-2: Here's a quote for you:"
]

def generate_text(prompt, num_outputs=3, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt")

    outputs = []
    for _ in range(num_outputs):
        output = model.generate(
            inputs.input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        outputs.append(generated_text)

    return outputs

# Generate outputs for all prompts
print("Generating outputs for each prompt...")
all_outputs = {}
prompt_types = ["Direct", "Scenario", "Persona", "Keyword", "Conversational"]

for i, prompt in enumerate(prompts):
    all_outputs[prompt_types[i]] = generate_text(prompt)

# Display generated outputs
print("\n=== Generated Outputs ===")
for prompt_type, outputs in all_outputs.items():
    print(f"\nPrompt Type: {prompt_type}")
    for i, output in enumerate(outputs, 1):
        print(f"Output {i}: {output}")

# Part 3 - Human-Written Reference
human_reference = """
Success is not final, failure is not fatal: It is the courage to continue that counts.
- Winston Churchill
"""
print("\n=== Human Reference ===")
print(human_reference)

# Part 4 - Evaluate Outputs Using BERTScore
print("\nCalculating BERTScore (this may take a few minutes)...")

def calculate_bertscore(generated_texts, reference):
    # Make sure we're using CUDA if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    P, R, F1 = score(generated_texts, [reference]*len(generated_texts),
                lang="en", verbose=True, device=device)
    return F1.tolist()

# Calculate BERTScore for all outputs
results = []
for prompt_type, outputs in all_outputs.items():
    f1_scores = calculate_bertscore(outputs, human_reference)
    for i, f1 in enumerate(f1_scores, 1):
        results.append({
            "Prompt Type": prompt_type,
            "Output #": i,
            "BERTScore F1": f"{f1:.4f}"
        })

# Part 5 - Results Table
print("\n=== BERTScore Results ===")
results_df = pd.DataFrame(results)
print(results_df)

# Save results
results_df.to_csv("gpt2_bertscore_results.csv", index=False)
print("\nResults saved to 'gpt2_bertscore_results.csv'")

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collec

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model loaded successfully!

Generating outputs for each prompt...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati


=== Generated Outputs ===

Prompt Type: Direct
Output 1: Write a motivational quote about overcoming fear.

4. Be a positive role model for yourself. This might be the hardest thing to do, but once you've done that, you'll be able to be a better person. It might take a
Output 2: Write a motivational quote about overcoming fear.

You may want to do some research on the subject. It is important to remember that fear is an emotion, and that the negative emotions associated with fear can create some of the same negative effects. Here
Output 3: Write a motivational quote about overcoming fear. Then try to remember this one.

"When you're afraid, you want to keep going, and you don't want the fear to get in the way of your work."
. . . and

Prompt Type: Scenario
Output 1: Imagine you're helping a friend who failed a test. Write something encouraging.

Don't be afraid to speak up. In fact, you can speak your mind. Be the voice of reason. You are the only one who can do this.
Output 2: Imagin

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.01 seconds, 1.49 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.93 seconds, 1.56 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.89 seconds, 1.59 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.90 seconds, 1.04 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.92 seconds, 1.57 sentences/sec

=== BERTScore Results ===
       Prompt Type  Output # BERTScore F1
0           Direct         1       0.8327
1           Direct         2       0.8381
2           Direct         3       0.8422
3         Scenario         1       0.8412
4         Scenario         2       0.8227
5         Scenario         3       0.8325
6          Persona         1       0.8405
7          Persona         2       0.8401
8          Persona         3       0.8262
9          Keyword         1       0.8319
10         Keyword         2       0.8306
11         Keyword         3       0.8397
12  Conversational         1       0.8249
13  Conversational         2       0.8235
14  Conversational         3       0.8270

Results saved to 'gpt2_bertscore_results.csv'
