# Prompting Results Analysis

This notebook analyzes the results from attribute prompting experiments.

In [None]:
import json
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
import glob

# Load all JSON files from a folder
results_folder = "path/to/your/results/folder"  # Update this path


def load_result_files(paths):
    """Read every JSON results file and return all records as one flat list.

    Each file must contain a JSON array of objects: every record gets a
    'source_file' key (the file's base name) so it can be traced back later.

    Args:
        paths: Iterable of paths to JSON files.

    Returns:
        A list with the records of all files, in the order of ``paths``.

    Raises:
        ValueError: If a file's top-level JSON value is not an array.
    """
    records = []
    for file_path in paths:
        file_name = os.path.basename(file_path)
        print(f"Loading {file_name}...")
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            file_data = json.load(f)
        if not isinstance(file_data, list):
            raise ValueError(
                f"{file_name}: expected a JSON array of records, "
                f"got {type(file_data).__name__}"
            )
        # Add filename info to each record for tracking
        for record in file_data:
            record['source_file'] = file_name
        records.extend(file_data)
    return records


# Get all JSON files in the folder (sorted so the load order is reproducible)
json_files = sorted(glob.glob(os.path.join(results_folder, "*.json")))
print(f"Found {len(json_files)} JSON files")

# Load and combine all data (a list, so records from every file can be appended)
all_data = load_result_files(json_files)

print(f"\nTotal loaded examples: {len(all_data)}")
data = all_data

In [None]:
# Put the loaded records into a DataFrame so they are easier to analyze
df = pd.DataFrame(data)

# Report the shape, then preview the first rows as the cell's output
print(f"Dataset shape: {df.shape}", "\nFirst few examples:", sep="\n")
df.head()

In [None]:
# Convert responses to numeric format for evaluation
def response_to_numeric(response):
    """Convert a 'True'/'False' response to 1/0.

    Matching ignores case and surrounding whitespace. The value is compared
    through its ``str()`` form, so Python booleans map to 1/0 and values such
    as ``None`` or NaN are reported as unexpected instead of raising
    ``AttributeError`` on ``.strip()``.

    Args:
        response: The raw response value.

    Returns:
        1 for 'true', 0 for 'false', and -1 for anything else. Unexpected
        values are also printed.
    """
    normalized = str(response).strip().lower()
    if normalized == 'true':
        return 1
    if normalized == 'false':
        return 0
    print(f"Unexpected response: {response}")
    return -1

# Map every raw response onto 1 / 0 / -1
df['predicted'] = df['response'].apply(response_to_numeric)

# Collect the rows whose response was neither 'True' nor 'False' and report them
unexpected_responses = df[df['predicted'] == -1]
n_unexpected = len(unexpected_responses)
if n_unexpected == 0:
    print("All responses are 'True' or 'False'")
else:
    print(f"Found {n_unexpected} unexpected responses")
    print(unexpected_responses['response'].unique())

In [None]:
# Keep only the rows whose response parsed to 0 or 1 (copy so later edits
# do not touch df)
is_parsed = df['predicted'] != -1
df_clean = df[is_parsed].copy()
print(f"Clean dataset shape: {df_clean.shape}")

# Ground-truth labels and predictions as arrays for the metric functions
y_true, y_pred = df_clean['label'].values, df_clean['predicted'].values

In [None]:
# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Detailed classification report
# labels is pinned to [0, 1] so the two target_names always line up with the
# classes. Without it, the report raises a ValueError when only one class
# occurs in y_true and y_pred (class count != number of target_names).
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=[0, 1], target_names=['False', 'True']))

In [None]:
# Confusion Matrix
# labels=[0, 1] forces a 2x2 matrix in False/True order, so it always matches
# the hard-coded tick labels below even if one class never occurs.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted False', 'Predicted True'],
            yticklabels=['Actual False', 'Actual True'],
            ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Distribution of responses
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# value_counts() orders classes by frequency, not by value, so the counts are
# reindexed to a fixed [0, 1] order. That keeps bar 0 = False (red) and
# bar 1 = True (green) in line with the hard-coded tick labels, and yields a
# zero-height bar (instead of a missing one) when a class never occurs.
class_order = [0, 1]
class_tick_labels = ['False (0)', 'True (1)']
distribution_panels = [
    # True labels distribution
    (ax1, 'label', 'Distribution of True Labels', 'Label'),
    # Predicted labels distribution
    (ax2, 'predicted', 'Distribution of Predictions', 'Prediction'),
]

for panel_ax, column, title, xlabel in distribution_panels:
    counts = df_clean[column].value_counts().reindex(class_order, fill_value=0)
    counts.plot(kind='bar', ax=panel_ax, color=['red', 'green'])
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Count')
    panel_ax.set_xticklabels(class_tick_labels, rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Analysis by attribute
def summarize_by_group(frame, column):
    """Compute and print accuracy and F1 for each distinct value of ``column``.

    Args:
        frame: DataFrame with 'label' and 'predicted' columns plus ``column``.
        column: Name of the column to group by.

    Returns:
        A DataFrame with columns [column, 'count', 'accuracy', 'f1_score'],
        sorted by accuracy in descending order. It is empty (but still has
        those columns) when ``frame`` has no groups.
    """
    rows = []
    # sort=False keeps groups in first-appearance order. groupby also skips
    # rows whose key is missing, so no empty subset reaches the metrics.
    for key, group in frame.groupby(column, sort=False):
        group_accuracy = accuracy_score(group['label'], group['predicted'])
        group_f1 = f1_score(group['label'], group['predicted'])

        rows.append({
            column: key,
            'count': len(group),
            'accuracy': group_accuracy,
            'f1_score': group_f1
        })

        print(f"{key}: {len(group)} examples, Accuracy: {group_accuracy:.4f}, F1: {group_f1:.4f}")

    # Explicit columns make the sort below safe even when there are no rows.
    summary = pd.DataFrame(rows, columns=[column, 'count', 'accuracy', 'f1_score'])
    return summary.sort_values('accuracy', ascending=False)


print("Performance by attribute:")
attr_df = summarize_by_group(df_clean, 'attribute')

# Analysis by source file (if applicable)
if 'source_file' in df_clean.columns:
    print("\n\nPerformance by source file:")
    file_df = summarize_by_group(df_clean, 'source_file')

attr_df

In [None]:
# Visualize performance by attribute
def _draw_metric_bars(frame, x, y, ax, color, title, ylabel, xlabel=None):
    """Draw one bar panel of metric ``y`` per ``x`` on ``ax`` with a 0-1 y-axis.

    The x-axis label is only set explicitly when ``xlabel`` is given.
    """
    frame.plot(x=x, y=y, kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylim(0, 1)


fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top row: accuracy and F1 score by attribute
_draw_metric_bars(attr_df, 'attribute', 'accuracy', axes[0,0],
                  'skyblue', 'Accuracy by Attribute', 'Accuracy')
_draw_metric_bars(attr_df, 'attribute', 'f1_score', axes[0,1],
                  'lightcoral', 'F1 Score by Attribute', 'F1 Score',
                  xlabel='Attribute')

# Bottom row: performance by source file, only when more than one file exists
show_file_panels = 'source_file' in df_clean.columns and len(file_df) > 1
if show_file_panels:
    _draw_metric_bars(file_df, 'source_file', 'accuracy', axes[1,0],
                      'lightgreen', 'Accuracy by Source File', 'Accuracy')
    _draw_metric_bars(file_df, 'source_file', 'f1_score', axes[1,1],
                      'orange', 'F1 Score by Source File', 'F1 Score',
                      xlabel='Source File')
else:
    # Hide unused subplots if no file data
    for unused_ax in (axes[1,0], axes[1,1]):
        unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics
print("\n=== SUMMARY ===")
print(f"Total examples: {len(df)}")
print(f"Clean examples: {len(df_clean)}")
print(f"Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Overall F1 Score: {f1:.4f}")

n_attributes = len(df_clean['attribute'].unique())
print(f"Number of unique attributes: {n_attributes}")

# attr_df is sorted by accuracy (descending): first row is best, last is worst
best_attr = attr_df.iloc[0]
print(f"Best performing attribute: {best_attr['attribute']} (Accuracy: {best_attr['accuracy']:.4f})")
worst_attr = attr_df.iloc[-1]
print(f"Worst performing attribute: {worst_attr['attribute']} (Accuracy: {worst_attr['accuracy']:.4f})")

# Per-file summary only when results came from more than one source file
if 'source_file' in df_clean.columns and len(file_df) > 1:
    print(f"\nNumber of source files: {len(file_df)}")
    best_file = file_df.iloc[0]
    print(f"Best performing file: {best_file['source_file']} (Accuracy: {best_file['accuracy']:.4f})")
    worst_file = file_df.iloc[-1]
    print(f"Worst performing file: {worst_file['source_file']} (Accuracy: {worst_file['accuracy']:.4f})")