In [34]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from bert_score import score
from IPython.display import display, HTML

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

def setup_gpt2():
    """Load the GPT-2 model and tokenizer"""
    print("Loading GPT-2 model and tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

def generate_text(prompt, model, tokenizer, max_length=40, num_return=3, 
                 temperature=0.9, top_k=50, top_p=0.95):
    """Generate text from a prompt using GPT-2
    - Using text completion approach rather than instruction following
    - Shorter max_length (40) since we only need to generate the completion
    - Higher temperature (0.9) for creative completions
    """
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    
    # Generate outputs
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_return,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2  # Prevent repetition of n-grams
    )
    
    # Decode and return the generated texts
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

def define_prompts():
    """Define different prompt types for the experiment - formatted for GPT-2 completion style"""
    return {
        # For GPT-2, use partial statements it can complete rather than instructions
        "Direct Instruction": "The key to academic success is",
        "Scenario-Based": "When I failed my exam, my professor told me: \"",
        "Persona-Based": "As a scientist who has experienced many failures, I believe that",
        # More persona variations that worked well
        "Persona-Based 2": "Einstein once said about persistence: \"",
        "Keyword-Based": "Success comes to those who persist and learn because",
        "Conversational": "Student: I failed again.\nMentor: Don't worry, remember that"
    }

def get_human_reference():
    """Return the human-written reference quote"""
    return '"Success is not the absence of failure; it\'s the persistence through failure." - A.P.J. Abdul Kalam, former President of India and aerospace scientist [Source: \'Wings of Fire\' autobiography]'

def process_outputs(outputs, prompts=None, prompt_type=None):
    """Process outputs to clean up and ensure they're motivational
    Added handling for prompt prefix removal and sentence completion
    """
    processed = []
    
    # Get the current prompt if provided
    current_prompt = None
    if prompts and prompt_type:
        current_prompt = prompts[prompt_type]
    
    for output in outputs:
        # First remove the prompt to get just the generated content
        if current_prompt and output.startswith(current_prompt):
            output = output[len(current_prompt):].strip()
        
        # Special handling for conversational outputs - avoid truncation
        if prompt_type == "Conversational":
            # Format the output properly to avoid truncation
            mentor_response = output.split("Mentor: Don't worry, remember that")[1].strip() if "Mentor: Don't worry, remember that" in output else output
            full_output = f"Student: I failed again.\nMentor: Don't worry, remember that {mentor_response}"
            processed.append(full_output)
            continue
            
        # Truncate at sentence end if possible
        end_markers = ['.', '!', '?', '"', '\n']
        for marker in end_markers:
            pos = output.find(marker)
            if pos > 5:  # Ensure we have some content
                output = output[:pos+1]
                break
                
        # Clean up other issues
        output = output.strip('," \n')
        
        # Add quotes for persona outputs if needed
        if prompt_type and "Persona-Based" in prompt_type and not output.startswith('"'):
            output = f'"{output}"'
            
        # Add the prompt back for context
        if current_prompt:
            full_output = f"{current_prompt} {output}"
        else:
            full_output = output
            
        processed.append(full_output)
    
    return processed

def evaluate_with_bertscore(all_outputs, reference):
    """Evaluate outputs using BERTScore against reference"""
    print("Calculating BERTScore...")
    P, R, F1 = score(all_outputs, [reference] * len(all_outputs), lang='en', verbose=True)
    return P, R, F1

def visualize_results(results_df):
    """Create visualizations for the results"""
    # Group and aggregate BERTScore F1 by prompt type
    agg_results = results_df.groupby('Prompt Type', as_index=False)['BERTScore F1'].mean()

    # Plot using seaborn, with hue to avoid warning
    plt.figure(figsize=(10, 6))
    sns.barplot(
        data=agg_results,
        x='Prompt Type',
        y='BERTScore F1',
        hue='Prompt Type',
        palette='muted',
        legend=False  # hide redundant legend
    )
    plt.title('Average BERTScore F1 by Prompt Type')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('bertscore_by_prompt_type.png')
    plt.close()

    return plt.gcf()



def create_results_table(gpt2_outputs, F1):
    """Create dataframe with results"""
    results = []
    idx = 0
    for prompt_type in gpt2_outputs:
        for i in range(len(gpt2_outputs[prompt_type])):
            results.append({
                "Prompt Type": prompt_type,
                "Output #": i+1,
                "BERTScore F1": round(F1[idx].item(), 4),
                "Output Text": gpt2_outputs[prompt_type][i]
            })
            idx += 1
    
    return pd.DataFrame(results)

def generate_summary_stats(results_df):
    """Generate summary statistics for each prompt type"""
    summary = results_df.groupby('Prompt Type')['BERTScore F1'].agg(
        ['mean', 'min', 'max']).round(4).reset_index()
    summary.columns = ['Prompt Type', 'Average F1', 'Min F1', 'Max F1']
    summary = summary.sort_values('Average F1', ascending=False)
    return summary

def print_submission_checklist(prompts, gpt2_outputs, human_reference):
    """Print the submission checklist in the required format"""
    print("\nSubmission Checklist:\n")
    print("1. The 5 prompts designed:")
    for prompt_type, prompt in prompts.items():
        print(f"   {prompt_type}: '{prompt}'")
    
    print("\n2. All 15 GPT-2 outputs (3 per prompt):")
    for prompt_type in gpt2_outputs:
        print(f"\n   {prompt_type}:")
        for i, output in enumerate(gpt2_outputs[prompt_type], 1):
            print(f"      Output {i}: {output}")
    
    print("\n3. Human-written reference:")
    print(f"   {human_reference}")

def main():
    # Part 1: Load GPT-2 Model
    model, tokenizer = setup_gpt2()
    
    # Part 2: Design Prompts and Generate Outputs
    prompts = define_prompts()
    
    # Generate outputs with varied parameters for different prompt types
    gpt2_outputs = {}
    for prompt_type, prompt in prompts.items():
        print(f"\nGenerating for {prompt_type} prompt...")
        
        # Customize parameters based on prompt type
        if "Persona-Based" in prompt_type:
            temp = 0.85
            max_len = 50  # Slightly longer for persona-based
        elif prompt_type == "Direct Instruction":
            temp = 0.8
            max_len = 30  # Short completions for direct style
        elif prompt_type == "Conversational":
            temp = 0.9
            max_len = 60  # Longer for conversational to capture full response
        else:
            temp = 0.9
            max_len = 40
            
        outputs = generate_text(
            prompt, 
            model, 
            tokenizer,
            max_length=max_len,
            num_return=3,
            temperature=temp
        )
        
        # Process outputs with knowledge of prompt context
        gpt2_outputs[prompt_type] = process_outputs(outputs, prompts, prompt_type)
        
        for i, output in enumerate(gpt2_outputs[prompt_type], 1):
            print(f"Output {i}: {output}")
    
    # Part 3: Get Human Reference
    human_reference = get_human_reference()
    print("\nHuman Reference:")
    print(human_reference)
    
    # Part 4: Evaluate Outputs Using BERTScore
    # Flatten outputs for evaluation
    all_outputs = []
    for prompt_type in gpt2_outputs:
        all_outputs.extend(gpt2_outputs[prompt_type])
    
    # Calculate BERTScore
    P, R, F1 = evaluate_with_bertscore(all_outputs, human_reference)
    
    # Part 5: Create Results Table
    results_df = create_results_table(gpt2_outputs, F1)
    
    # Generate summary statistics
    summary_stats = generate_summary_stats(results_df)
    
    # Print submission checklist
    print_submission_checklist(prompts, gpt2_outputs, human_reference)
    
    # Display results
    print("\n4. BERTScore output table:")
    display(results_df[["Prompt Type", "Output #", "BERTScore F1"]])
    
    print("\nSummary Statistics by Prompt Type:")
    display(summary_stats)
    
    print("\nDetailed Outputs with Scores:")
    display(results_df[["Prompt Type", "Output #", "BERTScore F1", "Output Text"]])
    
    # Create visualization
    try:
        fig = visualize_results(results_df)
        plt.close(fig)
        print("\nVisualization saved as 'bertscore_by_prompt_type.png'")
    except Exception as e:
        print(f"Could not create visualization: {e}")
    
    # Save results directly to Excel with proper formatting
    print("\nSaving results to Excel...")
    excel_writer = pd.ExcelWriter('gpt2_prompt_results.xlsx', engine='openpyxl')
    results_df.to_excel(excel_writer, index=False, sheet_name='Results')
    
    # Adjust column widths for better readability in Excel
    worksheet = excel_writer.sheets['Results']
    worksheet.column_dimensions['A'].width = 20  # Prompt Type
    worksheet.column_dimensions['B'].width = 10  # Output #
    worksheet.column_dimensions['C'].width = 15  # BERTScore
    worksheet.column_dimensions['D'].width = 100  # Output Text
    
    excel_writer.close()
    print("\nResults saved to 'gpt2_prompt_results.xlsx' with proper formatting")

if __name__ == "__main__":
    main()

Loading GPT-2 model and tokenizer...

Generating for Direct Instruction prompt...
Output 1: The key to academic success is to work in groups.
Output 2: The key to academic success is to create strong relationships with students and teachers.
Output 3: The key to academic success is finding ways to achieve it.

Generating for Scenario-Based prompt...
Output 1: When I failed my exam, my professor told me: " That means you have to give your full name, your last name (the last syllable of your name), your first name and your second
Output 2: When I failed my exam, my professor told me: " You have got to do what is right.
Output 3: When I failed my exam, my professor told me: " You are the boss, now don't you think you can hold your nose?"

You have never been an American.

Generating for Persona-Based prompt...
Output 1: As a scientist who has experienced many failures, I believe that "the answer is simple: if you are not doing something right, your performance will suffer."
Output 2: As a

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:03<00:00,  3.94s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 15.48it/s]


done in 4.03 seconds, 4.46 sentences/sec

Submission Checklist:

1. The 5 prompts designed:
   Direct Instruction: 'The key to academic success is'
   Scenario-Based: 'When I failed my exam, my professor told me: "'
   Persona-Based: 'As a scientist who has experienced many failures, I believe that'
   Persona-Based 2: 'Einstein once said about persistence: "'
   Keyword-Based: 'Success comes to those who persist and learn because'
   Conversational: 'Student: I failed again.
Mentor: Don't worry, remember that'

2. All 15 GPT-2 outputs (3 per prompt):

   Direct Instruction:
      Output 1: The key to academic success is to work in groups.
      Output 2: The key to academic success is to create strong relationships with students and teachers.
      Output 3: The key to academic success is finding ways to achieve it.

   Scenario-Based:
      Output 1: When I failed my exam, my professor told me: " That means you have to give your full name, your last name (the last syllable of your na

Unnamed: 0,Prompt Type,Output #,BERTScore F1
0,Direct Instruction,1,0.8532
1,Direct Instruction,2,0.8421
2,Direct Instruction,3,0.8481
3,Scenario-Based,1,0.8052
4,Scenario-Based,2,0.8366
5,Scenario-Based,3,0.8222
6,Persona-Based,1,0.8448
7,Persona-Based,2,0.8363
8,Persona-Based,3,0.8534
9,Persona-Based 2,1,0.8463



Summary Statistics by Prompt Type:


Unnamed: 0,Prompt Type,Average F1,Min F1,Max F1
1,Direct Instruction,0.8478,0.8421,0.8532
3,Persona-Based,0.8448,0.8363,0.8534
4,Persona-Based 2,0.8415,0.8378,0.8463
2,Keyword-Based,0.8334,0.8271,0.8387
5,Scenario-Based,0.8213,0.8052,0.8366
0,Conversational,0.8097,0.8095,0.8098



Detailed Outputs with Scores:


Unnamed: 0,Prompt Type,Output #,BERTScore F1,Output Text
0,Direct Instruction,1,0.8532,The key to academic success is to work in groups.
1,Direct Instruction,2,0.8421,The key to academic success is to create stron...
2,Direct Instruction,3,0.8481,The key to academic success is finding ways to...
3,Scenario-Based,1,0.8052,"When I failed my exam, my professor told me: ""..."
4,Scenario-Based,2,0.8366,"When I failed my exam, my professor told me: ""..."
5,Scenario-Based,3,0.8222,"When I failed my exam, my professor told me: ""..."
6,Persona-Based,1,0.8448,As a scientist who has experienced many failur...
7,Persona-Based,2,0.8363,As a scientist who has experienced many failur...
8,Persona-Based,3,0.8534,As a scientist who has experienced many failur...
9,Persona-Based 2,1,0.8463,"Einstein once said about persistence: "" ""There..."



Visualization saved as 'bertscore_by_prompt_type.png'

Saving results to Excel...

Results saved to 'gpt2_prompt_results.xlsx' with proper formatting
