# SOAR 데이터셋 정답 Solutions 분석

**분석 내용:**
1. 맞은 데이터의 수 (train & test 모두 정답)
2. 원본 ARC 데이터(task 수)와의 비교
3. Task별 데이터 수 통계
4. 모델별 통계

## 1. 데이터 로딩

In [None]:
import pyarrow.parquet as pq
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration: directory holding the SOAR parquet shards and the path to
# the ARC training-challenges JSON file.
SOAR_DATA_DIR = "/data/m-soar/soar_arc_train_5M"
ARC_DATA_PATH = "/home/ubuntu/arc-lang-public/data/arc-prize-2024/arc-agi_training_challenges.json"

# Plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The figures in this notebook use Korean titles and labels. Matplotlib's
# default font (DejaVu Sans) has no Hangul glyphs, so without a Korean-capable
# font those labels render as empty boxes. Pick the first installed candidate.
# This is done after sns.set_style() because seaborn styles also set font
# parameters and would otherwise override the choice.
from matplotlib import font_manager

KOREAN_FONT_CANDIDATES = (
    "NanumGothic",
    "NanumBarunGothic",
    "Noto Sans CJK KR",
    "Noto Sans KR",
    "Malgun Gothic",
    "AppleGothic",
)
_installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
_korean_font = next(
    (name for name in KOREAN_FONT_CANDIDATES if name in _installed_fonts),
    None,
)
if _korean_font is not None:
    plt.rcParams['font.family'] = _korean_font
    # Many CJK fonts lack the Unicode minus sign; fall back to ASCII '-'.
    plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Load every SOAR parquet shard in SOAR_DATA_DIR and concatenate the shards
# into a single DataFrame (df_all).
print("📂 SOAR 데이터 로딩 중...")
parquet_files = sorted(Path(SOAR_DATA_DIR).glob("train_part_*.parquet"))
print(f"   찾은 파일: {len(parquet_files)}개")

# Fail early with an actionable message: with no shards, pd.concat([]) below
# would raise an opaque "No objects to concatenate" ValueError instead.
if not parquet_files:
    raise FileNotFoundError(
        f"No 'train_part_*.parquet' files found in {SOAR_DATA_DIR!r}"
    )

all_data = []
for i, file in enumerate(parquet_files, 1):
    # end=" " keeps the row count printed below on the same progress line.
    print(f"   [{i}/{len(parquet_files)}] {file.name} 읽는 중...", end=" ")
    table = pq.read_table(file)
    df = table.to_pandas()
    all_data.append(df)
    print(f"✓ ({len(df):,} rows)")

# ignore_index=True gives the combined frame a fresh 0..N-1 index instead of
# repeating each shard's own row numbers.
df_all = pd.concat(all_data, ignore_index=True)
print(f"\n✅ 총 데이터: {len(df_all):,} rows")
print(f"   컬럼: {list(df_all.columns)}")

In [None]:
# Load the original ARC task file. Later cells call arc_data.keys() and use
# the keys as task ids, so the JSON is expected to be an object keyed by id.
print("📂 원본 ARC 데이터 로딩 중...")
# JSON is UTF-8 by specification; do not depend on the platform's default
# text encoding when reading it.
with open(ARC_DATA_PATH, 'r', encoding='utf-8') as f:
    arc_data = json.load(f)
print(f"✅ 원본 ARC tasks: {len(arc_data)} tasks")

## 2. 맞은 데이터 필터링

In [None]:
# Keep only the rows in which every flag of 'correct_train_input' AND every
# flag of 'correct_test_input' is truthy.
# NOTE(review): all() of an empty sequence is True, so a row with no flags
# at all would be counted as correct — confirm that cannot occur in the data.
train_all_correct = df_all['correct_train_input'].map(all)
test_all_correct = df_all['correct_test_input'].map(all)
correct_solutions = df_all.loc[train_all_correct & test_all_correct].copy()

n_total_rows = len(df_all)
n_correct_rows = len(correct_solutions)
print(f"전체 데이터: {n_total_rows:,} rows")
print(f"맞은 데이터 (train & test 모두 정답): {n_correct_rows:,} rows")
print(f"비율: {n_correct_rows / n_total_rows * 100:.2f}%")

In [None]:
# Visualise the share of fully-correct solutions versus all remaining rows,
# as a pie chart (left) and as a bar chart with absolute counts (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart
labels = ['Correct Solutions\n(train & test 모두 정답)', 'Hindsight Relabeling\n(하나라도 틀림)']
sizes = [len(correct_solutions), len(df_all) - len(correct_solutions)]
colors = ['#2ecc71', '#e74c3c']
explode = (0.05, 0)  # pull the "correct" wedge slightly out of the pie

ax[0].pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.2f%%',
          shadow=True, startangle=90)
ax[0].set_title('SOAR 데이터셋 구성', fontsize=14, fontweight='bold')

# Bar chart
categories = ['Correct', 'Hindsight']
ax[1].bar(categories, sizes, color=colors)
ax[1].set_ylabel('Solutions 수', fontsize=12)
ax[1].set_title('Solutions 수 비교', fontsize=14, fontweight='bold')
for i, v in enumerate(sizes):
    # Anchor the label at the top of the bar (va='bottom') rather than at a
    # fixed data offset, so the placement is right for any dataset size.
    ax[1].text(i, v, f'{v:,}', ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

## 3. 원본 ARC 데이터와 비교

In [None]:
# Distinct task ids that have at least one fully-correct solution.
solved_tasks = correct_solutions['task_id'].unique()

# NOTE(review): the ratio below assumes every solved task_id is also a key of
# arc_data; ids outside the ARC file would inflate the percentage — confirm.
print(f"원본 ARC tasks: {len(arc_data)} tasks")
print(f"SOAR에서 풀린 tasks: {len(solved_tasks)} tasks")
print(f"풀린 비율: {len(solved_tasks) / len(arc_data) * 100:.1f}%")
print()

# ARC task ids for which no fully-correct solution exists.
unsolved_tasks = set(arc_data) - set(solved_tasks)
print(f"아직 못 푼 tasks: {len(unsolved_tasks)} tasks")
if unsolved_tasks:
    # Sets have no stable iteration order; sort so that the same examples are
    # shown on every run.
    print(f"예시: {sorted(unsolved_tasks)[:10]}")

In [None]:
# Bar chart comparing the number of ARC tasks with the number of tasks that
# have / do not have a fully-correct SOAR solution.
total_arc_tasks = len(arc_data)
categories = ['원본 ARC\nTotal Tasks', 'SOAR에서\n풀린 Tasks', '아직\n못 푼 Tasks']
values = [total_arc_tasks, len(solved_tasks), len(unsolved_tasks)]
colors = ['#3498db', '#2ecc71', '#e74c3c']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(categories, values, color=colors, alpha=0.8)
ax.set_title('원본 ARC vs SOAR 풀린 Tasks 비교', fontsize=14, fontweight='bold')
ax.set_ylabel('Tasks 수', fontsize=12)

# Annotate each bar with its count and its share of all ARC tasks.
for rect, count in zip(bars, values):
    share = count / total_arc_tasks * 100
    x_center = rect.get_x() + rect.get_width() / 2.
    ax.text(
        x_center,
        rect.get_height(),
        f'{count}\n({share:.1f}%)',
        ha='center',
        va='bottom',
        fontsize=11,
        fontweight='bold',
    )

plt.tight_layout()
plt.show()

## 4. Task별 데이터 수 통계

In [None]:
# Per-task statistics over the fully-correct solutions:
#   total_solutions - number of rows with a non-null 'code' value
#   unique_models   - number of distinct 'model' values
#   max_generation  - largest 'generation' value
# Named aggregation with the built-in 'nunique' avoids the slow per-group
# Python call of a lambda and produces the final column names directly.
task_stats = (
    correct_solutions
    .groupby('task_id')
    .agg(
        total_solutions=('code', 'count'),
        unique_models=('model', 'nunique'),
        max_generation=('generation', 'max'),
    )
    .reset_index()
)

# Sort by total solutions, most-solved tasks first.
task_stats = task_stats.sort_values('total_solutions', ascending=False)

print(f"총 {len(task_stats)} tasks가 풀렸습니다.\n")

solutions_per_task = task_stats['total_solutions']
print("▶ 통계 요약:")
print(f"  - 평균 solutions per task: {solutions_per_task.mean():.1f}")
print(f"  - 중앙값: {solutions_per_task.median():.1f}")
print(f"  - 최소: {solutions_per_task.min()}")
print(f"  - 최대: {solutions_per_task.max()}")
print(f"  - 표준편차: {solutions_per_task.std():.1f}")

In [None]:
# Bucket the tasks by their number of correct solutions.
# pd.cut uses right-closed intervals by default: (0, 10], (10, 50], ...,
# (1000, inf], so the '1000+' bucket holds counts strictly above 1000.
bins = [0, 10, 50, 100, 200, 500, 1000, float('inf')]
labels = ['1-10', '11-50', '51-100', '101-200', '201-500', '501-1000', '1000+']
task_stats['solutions_bin'] = pd.cut(
    task_stats['total_solutions'],
    bins=bins,
    labels=labels,
)

# Number of tasks per bucket, listed in bucket order rather than by frequency.
bucket_counts = task_stats['solutions_bin'].value_counts()
distribution = bucket_counts.sort_index()

print("\n▶ Solutions 수 분포:")
for bucket in distribution.index:
    n_tasks = distribution[bucket]
    print(f"  {bucket:>10} solutions: {n_tasks:3} tasks")

In [None]:
# 2x2 overview of the per-task solution counts: histogram, bucketed counts,
# and the 20 tasks with the most / fewest solutions.
def _draw_task_ranking(panel, ranked, color, title):
    """Draw total_solutions per task as horizontal bars, first row on top."""
    positions = range(len(ranked))
    panel.barh(positions, ranked['total_solutions'].values, color=color, alpha=0.7)
    panel.set_yticks(positions)
    panel.set_yticklabels(ranked['task_id'].values, fontsize=9)
    panel.set_xlabel('Solutions 수', fontsize=11)
    panel.set_title(title, fontsize=13, fontweight='bold')
    # barh places position 0 at the bottom; flip so the first row is on top.
    panel.invert_yaxis()


fig, axes = plt.subplots(2, 2, figsize=(16, 12))
hist_panel, bucket_panel = axes[0]
top_panel, bottom_panel = axes[1]

# 1. Histogram with mean / median marker lines
solution_counts = task_stats['total_solutions']
hist_panel.hist(solution_counts, bins=50, color='#3498db', alpha=0.7, edgecolor='black')
hist_panel.axvline(solution_counts.mean(), color='red', linestyle='--', linewidth=2, label='평균')
hist_panel.axvline(solution_counts.median(), color='green', linestyle='--', linewidth=2, label='중앙값')
hist_panel.set_xlabel('Solutions per Task', fontsize=11)
hist_panel.set_ylabel('Tasks 수', fontsize=11)
hist_panel.set_title('Task별 Solutions 수 분포 (Histogram)', fontsize=13, fontweight='bold')
hist_panel.legend()

# 2. Number of tasks in each solutions bucket
bucket_positions = range(len(distribution))
bucket_panel.bar(bucket_positions, distribution.values, color='#2ecc71', alpha=0.7)
bucket_panel.set_xticks(bucket_positions)
bucket_panel.set_xticklabels(distribution.index, rotation=45)
bucket_panel.set_xlabel('Solutions 범위', fontsize=11)
bucket_panel.set_ylabel('Tasks 수', fontsize=11)
bucket_panel.set_title('Solutions 범위별 Task 분포', fontsize=13, fontweight='bold')
for position, n_tasks in enumerate(distribution.values):
    bucket_panel.text(position, n_tasks + 1, str(n_tasks), ha='center', fontsize=10, fontweight='bold')

# 3. Top 20 tasks (task_stats is already sorted by total_solutions, descending)
top20 = task_stats.head(20)
_draw_task_ranking(top_panel, top20, '#e74c3c', '상위 20개 Tasks (Solutions 많은 순)')

# 4. Bottom 20 tasks, re-sorted ascending so the fewest solutions come first
bottom20 = task_stats.tail(20).sort_values('total_solutions')
_draw_task_ranking(bottom_panel, bottom20, '#f39c12', '하위 20개 Tasks (Solutions 적은 순)')

plt.tight_layout()
plt.show()

In [None]:
# Print a fixed-width table of the 20 tasks with the most correct solutions
# (task_stats is sorted by total_solutions in descending order).
print("▶ 상위 20개 Task (solutions 많은 순):")
print(f"{'Rank':<6}{'Task ID':<12}{'Solutions':<12}{'Models':<10}{'Max Gen':<10}")
print("-" * 60)
for rank, task in enumerate(task_stats.head(20).itertuples(index=False), start=1):
    line = (
        f"{rank:<6}"
        f"{task.task_id:<12}"
        f"{task.total_solutions:<12}"
        f"{task.unique_models:<10}"
        f"{task.max_generation:<10}"
    )
    print(line)

In [None]:
# Print a fixed-width table of the 20 tasks with the fewest correct solutions.
# Rows keep the descending order of task_stats, so the printed rank rises and
# the task with the fewest solutions appears last.
print("▶ 하위 20개 Task (solutions 적은 순):")
print(f"{'Rank':<6}{'Task ID':<12}{'Solutions':<12}{'Models':<10}{'Max Gen':<10}")
print("-" * 60)
bottom_tasks = task_stats.tail(20)
# Derive the first rank from the actual number of rows returned: tail(20)
# yields fewer than 20 rows when fewer tasks exist, and a fixed "- 20" would
# then produce zero or negative ranks.
first_rank = len(task_stats) - len(bottom_tasks) + 1
for rank, (idx, row) in enumerate(bottom_tasks.iterrows(), first_rank):
    print(f"{rank:<6}{row['task_id']:<12}{row['total_solutions']:<12}{row['unique_models']:<10}{row['max_generation']:<10}")

## 5. 모델별 통계

In [None]:
# Per-model statistics over the fully-correct solutions:
#   total_solutions - number of rows with a non-null 'code' value
#   tasks_solved    - number of distinct task ids
# The built-in 'nunique' replaces a per-group Python lambda, and named
# aggregation yields the final column names without a separate rename.
model_stats = (
    correct_solutions
    .groupby('model')
    .agg(
        total_solutions=('code', 'count'),
        tasks_solved=('task_id', 'nunique'),
    )
    .reset_index()
    .sort_values('total_solutions', ascending=False)
)

print(f"{'Model':<40}{'Solutions':<15}{'Tasks Solved':<15}")
print("-" * 70)
for _, row in model_stats.iterrows():
    print(f"{row['model']:<40}{row['total_solutions']:<15}{row['tasks_solved']:<15}")

In [None]:
# Two side-by-side horizontal bar charts per model: total correct solutions
# (left) and number of distinct tasks solved (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

model_names = model_stats['model'].values
bar_positions = range(len(model_stats))

# (axes, column to plot, bar colour, x-axis label, title)
panel_specs = [
    (axes[0], 'total_solutions', '#9b59b6', 'Total Solutions', '모델별 Total Solutions'),
    (axes[1], 'tasks_solved', '#1abc9c', 'Tasks Solved', '모델별 풀린 Tasks 수'),
]

for panel, column, bar_color, x_label, panel_title in panel_specs:
    panel.barh(bar_positions, model_stats[column].values, color=bar_color, alpha=0.7)
    panel.set_yticks(bar_positions)
    panel.set_yticklabels(model_names, fontsize=10)
    panel.set_xlabel(x_label, fontsize=11)
    panel.set_title(panel_title, fontsize=13, fontweight='bold')
    # Flip the axis so the first row of model_stats is drawn at the top.
    panel.invert_yaxis()

plt.tight_layout()
plt.show()

## 6. 결과 저장

In [None]:
# Save the summary, per-task and per-model statistics to a JSON file.
output_dir = Path("/home/ubuntu/arc-lang-public/heejun")
output_file = output_dir / "soar_correct_solutions_stats.json"
# Create the target directory if needed so the write below cannot fail on a
# missing path after all the statistics have already been computed.
output_dir.mkdir(parents=True, exist_ok=True)


def _task_records(frame):
    """Convert rows of a task_stats-shaped frame into JSON-serialisable dicts."""
    return [
        {
            "task_id": row['task_id'],
            # int() turns numpy integers into plain ints, which json can encode.
            "total_solutions": int(row['total_solutions']),
            "unique_models": int(row['unique_models']),
            "max_generation": int(row['max_generation']),
        }
        for _, row in frame.iterrows()
    ]


results = {
    "summary": {
        "total_rows": int(len(df_all)),
        "correct_solutions": int(len(correct_solutions)),
        "correct_percentage": float(len(correct_solutions) / len(df_all) * 100),
        "original_arc_tasks": int(len(arc_data)),
        "solved_tasks": int(len(solved_tasks)),
        "solved_percentage": float(len(solved_tasks) / len(arc_data) * 100),
        "unsolved_tasks": int(len(unsolved_tasks)),
    },
    "task_statistics": {
        "mean_solutions_per_task": float(task_stats['total_solutions'].mean()),
        "median_solutions_per_task": float(task_stats['total_solutions'].median()),
        "min_solutions": int(task_stats['total_solutions'].min()),
        "max_solutions": int(task_stats['total_solutions'].max()),
        "std_solutions": float(task_stats['total_solutions'].std()),
    },
    "top_20_tasks": _task_records(task_stats.head(20)),
    "bottom_20_tasks": _task_records(task_stats.tail(20)),
    "model_statistics": [
        {
            "model": row['model'],
            "total_solutions": int(row['total_solutions']),
            "tasks_solved": int(row['tasks_solved']),
        }
        for _, row in model_stats.iterrows()
    ],
    # sorted() accepts the set directly and gives a stable, diff-friendly order.
    "unsolved_task_ids": sorted(unsolved_tasks),
}

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"✅ 결과 저장: {output_file}")

In [None]:
# Write the full per-task statistics table to a CSV file in output_dir,
# without the DataFrame index column.
csv_file = output_dir.joinpath("soar_task_statistics.csv")
task_stats.to_csv(csv_file, index=False)
print(f"✅ Task별 상세 통계 CSV: {csv_file}")

In [None]:
# Print the closing banner: a blank line, then the completion message framed
# by two 80-character rules.
banner_rule = "=" * 80
print("\n" + banner_rule)
print("분석 완료!")
print(banner_rule)