# Perfect Instructions Analysis

Analysis of instructions that achieved 100% training score and at least one correct test prediction.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure how pandas renders DataFrames in this notebook:
# no cap on the number of columns, cells cut at 100 characters,
# and no fixed overall display width.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100
pd.options.display.width = None

## 1. Load Data

In [None]:
# Read the perfect-instructions CSV into the notebook-wide DataFrame `df`
df = pd.read_csv('/data/hjkim/soar2cot/data/perfect_instructions.csv')

# Quick overview: row count, number of distinct task ids, and column names
summary_lines = [
    f"Total records: {len(df)}",
    f"Unique tasks: {df['task_id'].nunique()}",
    f"\nColumns: {list(df.columns)}",
]
print("\n".join(summary_lines))

# Last expression of the cell, so the notebook renders the first rows
df.head()

## 2. Basic Statistics

In [None]:
# Summary statistics and value frequencies for the 'best_test_score' column
test_scores = df['best_test_score']

print("Test Score Statistics:")
print(test_scores.describe())

# Frequency of each distinct score, listed from the highest score downwards
score_counts = test_scores.value_counts().sort_index(ascending=False)
print("\nScore Distribution:")
print(score_counts)

In [None]:
# Breakdown of rows per source model; the whole cell is skipped when the
# CSV has no 'soar_source_model' column.
if 'soar_source_model' in df.columns:
    # Count once and reuse the same Series for the printout and the chart
    model_counts = df['soar_source_model'].value_counts()

    print("Source Model Distribution:")
    print(model_counts)

    # Bar chart of the per-model counts
    fig, ax = plt.subplots(figsize=(10, 5))
    model_counts.plot(kind='bar', ax=ax)
    ax.set_title('Perfect Instructions by Source Model')
    ax.set_xlabel('Model')
    ax.set_ylabel('Count')
    # Slanted, right-aligned tick labels
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

In [None]:
# How many rows fall under each value of the 'is_hindsight' flag
hindsight_flags = df['is_hindsight']

print("Hindsight Distribution:")
print(hindsight_flags.value_counts())

# The per-group mean is only reported when at least one row is flagged
if hindsight_flags.any():
    mean_score_by_hindsight = df.groupby('is_hindsight')['best_test_score'].mean()
    print("\nAverage test score by hindsight:")
    print(mean_score_by_hindsight)

## 3. Top Performing Tasks

In [None]:
# Top 10 rows by test score.
# 'soar_source_model' is treated as an optional column by the other cells of
# this notebook (membership check / row.get fallback), so include it only when
# the CSV actually has it instead of failing with a KeyError.
top_columns = ['task_id', 'training_score', 'best_test_score']
if 'soar_source_model' in df.columns:
    top_columns.append('soar_source_model')

# nlargest keeps the first-seen rows on ties (pandas default keep='first'),
# so when many rows share the top score this is just the first ten of them.
top_10 = df.nlargest(10, 'best_test_score')[top_columns]
print("Top 10 Tasks by Test Score:")
print(top_10.to_string(index=False))

## 4. Instruction Analysis

In [None]:
# Character count of every instruction, stored as a new column on df
df['instruction_length'] = df['instructions'].str.len()
instruction_lengths = df['instruction_length']

print("Instruction Length Statistics:")
print(instruction_lengths.describe())

# Median is used both for the marker line and for its legend label
median_length = instruction_lengths.median()

# Histogram of lengths with a dashed red line at the median
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(instruction_lengths, bins=30, edgecolor='black')
ax.set_title('Distribution of Instruction Length')
ax.set_xlabel('Length (characters)')
ax.set_ylabel('Count')
ax.axvline(
    median_length,
    color='red',
    linestyle='--',
    label=f'Median: {median_length:.0f}',
)
ax.legend()
plt.show()

In [None]:
# Show a few example instructions from rows whose test score is exactly 1.0
perfect_tasks = df[df['best_test_score'] == 1.0]

if not perfect_tasks.empty:
    print("\n=== SAMPLE PERFECT INSTRUCTIONS (Test Score = 1.0) ===")
    print(f"Total: {len(perfect_tasks)}\n")

    # Maximum number of characters of each instruction to display
    preview_chars = 500

    for _, row in perfect_tasks.head(3).iterrows():
        instruction_text = row['instructions']
        # A missing cell is read by pandas as a float NaN, which cannot be
        # sliced; show the same 'N/A' placeholder used for a missing model.
        if not isinstance(instruction_text, str):
            instruction_text = 'N/A'

        # Only add the ellipsis when text was actually cut off, so short
        # instructions are not shown as if they had been truncated.
        preview = instruction_text[:preview_chars]
        if len(instruction_text) > preview_chars:
            preview += "..."

        print(f"Task: {row['task_id']}")
        print(f"Model: {row.get('soar_source_model', 'N/A')}")
        print(f"Instructions:\n{preview}")
        print("\n" + "="*80 + "\n")

## 5. Export Analysis Results

In [None]:
# Write the rows with a test score of exactly 1.0 to their own CSV.
# The subset is recomputed here so this cell does not rely on the sampling
# cell having been run first.
perfect_tasks = df.loc[df['best_test_score'] == 1.0]

# Nothing is written when no row qualifies
if not perfect_tasks.empty:
    output_path = '/data/hjkim/soar2cot/data/perfect_tasks_100percent.csv'
    perfect_tasks.to_csv(output_path, index=False)
    print(f"Saved {len(perfect_tasks)} perfect tasks to: {output_path}")