# Fly vs Fly NEAR Results Analysis

This notebook analyzes and compares results from:
- ~Baseline NEAR experiments (from baseline_results.json)~
- Neurosym-lib NEAR experiments (from various results pickle files)
- Reported program from the NEAR paper

## Step 0: Data collection.

Compute results for all neurosym-lib experiments and the reported program
```bash
# Search for the top 40 programs on the Fly-v-Fly dataset
$ CUDA_VISIBLE_DEVICES=0 python notebooks/flyvfly_reproduction/benchmark_flyvfly.py

In [1]:
%load_ext autoreload
%autoreload 2
%cd /home/asehgal/neurosym-lib

import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from neurosym.examples.near.metrics import compute_metrics

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/home/asehgal/neurosym-lib


  from .autonotebook import tqdm as notebook_tqdm


## Load Baseline Results

In [2]:
# baseline results not available as NEAR code
# doesn't include execution code for this.

# # Load best NEAR results
# near_results_pth = Path("outputs/flyvfly_results/near_outputs/")

# baseline_results = {}
# for result_file in near_results_pth.rglob("test_results.json"):
#     print(f"Loading Baseline results from {result_file}")
#     with open(result_file, "r") as f:
#         expt_report = json.load(f)
#         experiment_name = result_file.parent.name
#         y_true = np.array(expt_report["true_vals"])
#         y_scores = np.array(expt_report["predicted_vals"])
#         expt_report["report"] = compute_metrics(y_scores, y_true)
#         baseline_results[experiment_name] = expt_report

## Load Neurosym-lib Results

In [3]:
file_path = "outputs/flyvfly_results/reproduction.pkl"
neurosym_results = {}
if Path(file_path).exists():
    with open(file_path, "rb") as f:
        results = pickle.load(f)
        file_name = Path(file_path).stem
        for i, result in enumerate(results):
            key = f"neurosym_{file_name}_{i:03d}"
            neurosym_results[key] = result
    print(f"  Loaded {len(results)} programs from {file_name}")
else:
    print(f"Warning: {file_path} not found")

  Loaded 40 programs from reproduction


## Create Comparison Table

In [4]:
# Combine all results
all_results = {}
# all_results.update(baseline_results)
all_results.update(neurosym_results)

print(f"Total experiments: {len(all_results)}")

Total experiments: 40


In [5]:
# Create comparison dataframe
table_data = []

for name, result in all_results.items():
    # Extract metrics
    report = result.get("report", {})
    
    # Handle different report structures
    if isinstance(report, dict):
        if "report" in report:
            macro_avg = report["report"].get("macro avg", {})
            hamming_acc = report.get("hamming_accuracy", 0.0)
        elif "macro avg" in report:
            macro_avg = report["macro avg"]
            hamming_acc = report.get("hamming_accuracy", 0.0)
        else:
            # Need to compute metrics
            if "pred_vals" in result and "true_vals" in result:
                pred_vals = np.array(result["pred_vals"])
                true_vals = np.array(result["true_vals"])
                metrics = compute_metrics(pred_vals, true_vals)
                macro_avg = metrics["report"]["macro avg"]
                hamming_acc = metrics["hamming_accuracy"]
            else:
                continue
    else:
        continue
    
    row = {
        "experiment": name,
        "precision": macro_avg.get("precision", 0.0),
        "recall": macro_avg.get("recall", 0.0),
        "f1_score": macro_avg.get("f1-score", 0.0),
        "support": macro_avg.get("support", 0),
        "hamming_accuracy": hamming_acc,
        "time": result.get("time", 0.0),
    }
    table_data.append(row)

df = pd.DataFrame(table_data)

# Sort by f1_score descending
df = df.sort_values("hamming_accuracy", ascending=False)

print("\n" + "=" * 80)
print("RESULTS COMPARISON")
print("=" * 80)
print(df.to_string(index=False))
print("=" * 80)


RESULTS COMPARISON
               experiment  precision   recall  f1_score  support  hamming_accuracy        time
neurosym_reproduction_036   0.231719 0.168844  0.131285   1050.0          0.458095 1997.639672
neurosym_reproduction_005   0.065306 0.142857  0.089636   1050.0          0.457143  175.275669
neurosym_reproduction_010   0.065306 0.142857  0.089636   1050.0          0.457143  228.695995
neurosym_reproduction_015   0.065306 0.142857  0.089636   1050.0          0.457143  287.440017
neurosym_reproduction_016   0.109355 0.163176  0.121230   1050.0          0.450476  287.440111
neurosym_reproduction_001   0.109226 0.163176  0.121111   1050.0          0.450476  124.019105
neurosym_reproduction_006   0.110447 0.163176  0.121485   1050.0          0.450476  175.275752
neurosym_reproduction_011   0.106818 0.161012  0.118857   1050.0          0.449524  228.696096
neurosym_reproduction_035   0.200326 0.153226  0.122567   1050.0          0.434286 1944.769693
neurosym_reproduction_037   0.

## Summary Statistics

In [6]:
print("\nSUMMARY STATISTICS")
print("=" * 80)
print(f"Total experiments: {len(df)}")
print(f"\nBest F1-score: {df['f1_score'].max():.6f}")
print(f"  Experiment: {df.loc[df['f1_score'].idxmax(), 'experiment']}")
print(f"\nBest Hamming accuracy: {df['hamming_accuracy'].max():.6f}")
print(f"  Experiment: {df.loc[df['hamming_accuracy'].idxmax(), 'experiment']}")
print(f"\nAverage F1-score: {df['f1_score'].mean():.6f}")
print(f"Average Hamming accuracy: {df['hamming_accuracy'].mean():.6f}")
print(f"Average training time: {df['time'].mean():.2f} seconds")

# Baseline vs Neurosym comparison
baseline_df = df[df['experiment'].str.contains('baseline|crim13_astar|crim13_iddfs|enumeration', case=False, na=False)]
neurosym_df = df[df['experiment'].str.contains('neurosym|reproduction', case=False, na=False)]

if not baseline_df.empty:
    print(f"\nBaseline experiments: {len(baseline_df)}")
    print(f"  Avg F1: {baseline_df['f1_score'].mean():.6f}")
    print(f"  Avg Hamming Acc: {baseline_df['hamming_accuracy'].mean():.6f}")
    
if not neurosym_df.empty:
    print(f"\nNeurosym-lib experiments: {len(neurosym_df)}")
    print(f"  Avg F1: {neurosym_df['f1_score'].mean():.6f}")
    print(f"  Avg Hamming Acc: {neurosym_df['hamming_accuracy'].mean():.6f}")

# Reported program
reported_df = df[df['experiment'] == 'reported_program']
if not reported_df.empty:
    print("\nReported Program:")
    print(f"  F1: {reported_df['f1_score'].values[0]:.6f}")
    print(f"  Hamming Acc: {reported_df['hamming_accuracy'].values[0]:.6f}")
    print(f"  Time: {reported_df['time'].values[0]:.2f}s")

print("=" * 80)


SUMMARY STATISTICS
Total experiments: 40

Best F1-score: 0.165125
  Experiment: neurosym_reproduction_028

Best Hamming accuracy: 0.458095
  Experiment: neurosym_reproduction_036

Average F1-score: 0.068334
Average Hamming accuracy: 0.216500
Average training time: 712.08 seconds

Neurosym-lib experiments: 40
  Avg F1: 0.068334
  Avg Hamming Acc: 0.216500


## Save Results

In [7]:
# Save CSV
csv_path = "outputs/flyvfly_results/comparison.csv"
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)
print(f"Saved CSV to: {csv_path}")

# Save Markdown
md_path = "outputs/flyvfly_results/comparison.md"
with open(md_path, "w") as f:
    f.write("# MICE-DSL NEAR Results Comparison\n\n")
    f.write(df.to_markdown(index=False))
    f.write("\n")
print(f"Saved Markdown to: {md_path}")

Saved CSV to: outputs/flyvfly_results/comparison.csv
Saved Markdown to: outputs/flyvfly_results/comparison.md


## Display Top Programs

In [8]:
# Show top 10 programs by F1 score
print("\nTop 10 Programs by F1-score:")
print("=" * 80)
top_10 = df.head(10)
print(top_10.to_string(index=False))

# Show program strings for top results if available
print("\nProgram Details:")
print("=" * 80)
for idx, row in top_10.iterrows():
    exp_name = row['experiment']
    if exp_name in all_results:
        result = all_results[exp_name]
        program_str = result.get('program_str', str(result.get('program', 'N/A')))
        print(f"\n{exp_name}:")
        print(f"  Program: {program_str}")
        print(f"  F1: {row['f1_score']:.6f}, Hamming Acc: {row['hamming_accuracy']:.6f}, Time: {row['time']:.2f}s")


Top 10 Programs by F1-score:
               experiment  precision   recall  f1_score  support  hamming_accuracy        time
neurosym_reproduction_036   0.231719 0.168844  0.131285   1050.0          0.458095 1997.639672
neurosym_reproduction_005   0.065306 0.142857  0.089636   1050.0          0.457143  175.275669
neurosym_reproduction_010   0.065306 0.142857  0.089636   1050.0          0.457143  228.695995
neurosym_reproduction_015   0.065306 0.142857  0.089636   1050.0          0.457143  287.440017
neurosym_reproduction_016   0.109355 0.163176  0.121230   1050.0          0.450476  287.440111
neurosym_reproduction_001   0.109226 0.163176  0.121111   1050.0          0.450476  124.019105
neurosym_reproduction_006   0.110447 0.163176  0.121485   1050.0          0.450476  175.275752
neurosym_reproduction_011   0.106818 0.161012  0.118857   1050.0          0.449524  228.696096
neurosym_reproduction_035   0.200326 0.153226  0.122567   1050.0          0.434286 1944.769693
neurosym_reproductio