In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import json
import os

# Merge Comparison

In [153]:
def clean_opensource_dataframe(opensource_path, correct_path):
    opensource_df = pd.read_csv(opensource_path)
    correct_df = pd.read_csv(correct_path)
    sorted_opensource_options_presented = [str(sorted(eval(x))) for x in opensource_df.options_presented_ids]
    opensource_operation_ids = opensource_df.operation_id
    opensource_df['old_operation_id'] = opensource_df.operation_id
    for i, (option_presented, operation_id) in enumerate(zip(correct_df.options_presented_ids, correct_df.operation_id)):
        if str(sorted(eval(option_presented))) in sorted_opensource_options_presented:
            index = sorted_opensource_options_presented.index(str(sorted(eval(option_presented))))
            opensource_df.loc[index, 'operation_id'] = operation_id
        else:
            print(f"Option {option_presented} not found in opensource_df")
    opensource_df = opensource_df[opensource_df['operation_id'] != opensource_df['old_operation_id']]
    opensource_df.to_csv(opensource_path[:-len("_rough.csv")]+".csv", index=False)
    print(opensource_path[:-len("_rough.csv")]+".csv")

In [179]:
import ast
import warnings
import pandas as pd
from pathlib import Path

def clean_opensource_dataframe(
        opensource_path: str,
        correct_path: str,
        output_path: str | None = None,
) -> pd.DataFrame:
    """
    Align *opensource* rows with the correct `operation_id` **and**
    `expected_choice_ids`, matched via `options_presented_ids`.  The
    cleaned frame is sorted by `operation_id` and written to disk.

    Returns
    -------
    pd.DataFrame
        The cleaned, sorted DataFrame.
    """
    # ------------------------------------------------------------------
    # 1. Load CSVs
    # ------------------------------------------------------------------
    opensource_df = pd.read_csv(opensource_path)
    correct_df    = pd.read_csv(correct_path)

    # ------------------------------------------------------------------
    # 2. Canonical key: tuple(sorted(list-of-ids))
    # ------------------------------------------------------------------
    key_func = lambda s: tuple(sorted(ast.literal_eval(str(s))))
    opensource_df["options_key"] = opensource_df["options_presented_ids"].map(key_func)
    correct_df["options_key"]    = correct_df["options_presented_ids"].map(key_func)

    # ------------------------------------------------------------------
    # 3. Ensure each key is unique in *correct*
    # ------------------------------------------------------------------
    dup_keys = correct_df["options_key"][correct_df["options_key"].duplicated()]
    if not dup_keys.empty:
        warnings.warn(
            f"{dup_keys.nunique()} duplicate option-sets in `correct_path`; "
            "only the first occurrence for each is used."
        )
        correct_df = correct_df.drop_duplicates(subset="options_key", keep="first")

    # ------------------------------------------------------------------
    # 4. Build look-up dicts
    # ------------------------------------------------------------------
    opid_map      = dict(zip(correct_df["options_key"], correct_df["operation_id"]))
    expected_map  = dict(zip(correct_df["options_key"], correct_df["expected_choice_ids"]))

    # ------------------------------------------------------------------
    # 5. Overwrite columns (keep originals for provenance if you like)
    # ------------------------------------------------------------------
    opensource_df["old_operation_id"]       = opensource_df["operation_id"]
    opensource_df["old_expected_choice_ids"] = opensource_df.get("expected_choice_ids")

    opensource_df["operation_id"]         = opensource_df["options_key"].map(opid_map)
    opensource_df["expected_choice_ids"]  = opensource_df["options_key"].map(expected_map)

    missing = opensource_df["operation_id"].isna().sum()
    if missing:
        warnings.warn(f"{missing} rows in `opensource_path` had no match in `correct_path`.")

    # ------------------------------------------------------------------
    # 6. Sort, drop helpers, save
    # ------------------------------------------------------------------
    opensource_df = (
        opensource_df
        .sort_values("operation_id", na_position="last")
        .drop(columns=["options_key"])             # remove helper
        .reset_index(drop=True)
    )

    if output_path is None:
        # replace “…_rough.csv” → “….csv” or keep stem otherwise
        output_path = (
            Path(opensource_path)
            .with_suffix("")        # strip .csv
            .with_name(Path(opensource_path).stem.replace("_rough", ""))
            .with_suffix(".csv")
        )

    opensource_df.to_csv(output_path, index=False)

    return opensource_df


In [180]:
clean_opensource_dataframe("/Users/jbrown/Documents/boyden_lab/ai-proofreading/script/output/mouse_merge_2048nm/intern-vl3_merge_comparison_informative_analysis_results_rough.csv", "/Users/jbrown/Documents/boyden_lab/ai-proofreading/script/output/mouse_merge_2048nm/gpt-4.1_merge_identification_informative_analysis_results.csv")



Unnamed: 0,operation_id,base_neuron_id,correct_merged_pair,options_presented_ids,expected_choice_ids,num_options_presented,prompt_options,views,use_zoomed_images,image_paths,option_index_to_id,index,model_chosen_id,model_raw_answer,model_analysis,model,zoom_margin,old_operation_id,old_expected_choice_ids
0,117152,864691135657866370,[],"['864691135896429988', '864691135312634659']",['864691135896429988'],2,"[{'id': 864691135896429988, 'paths': {'zoomed'...",top;front;side,True,{'864691135896429988': {'zoomed': {'front': '/...,"{1: '864691135896429988', 2: '8646911353126346...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n**Top Vi...,intern-vl3,4096,merge_864691135657866370_1186474_658194_876245,['864691135896429988']
1,121378,864691136099868533,[],"['864691135750118549', '864691135749056149']",['864691135749056149'],2,"[{'id': 864691135750118549, 'paths': {'zoomed'...",top;front;side,True,{'864691135750118549': {'zoomed': {'front': '/...,"{1: '864691135750118549', 2: '8646911357490561...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n- **Top ...,intern-vl3,4096,merge_864691136099868533_1306053_550903_723702,['864691135750118549']
2,122837,864691136118515992,[],"['864691136739730286', '864691135736487572']",['864691136739730286'],2,"[{'id': 864691136739730286, 'paths': {'zoomed'...",top;front;side,True,{'864691136739730286': {'zoomed': {'front': '/...,"{1: '864691136739730286', 2: '8646911357364875...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n**Top Vi...,intern-vl3,4096,merge_864691136118515992_1234663_856369_1015602,['864691136739730286']
3,124669,864691135614176459,[],"['864691136008548780', '864691135483246276']",['864691136008548780'],2,"[{'id': 864691136008548780, 'paths': {'zoomed'...",top;front;side,True,{'864691136008548780': {'zoomed': {'front': '/...,"{1: '864691136008548780', 2: '8646911354832462...",2,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n- **Top ...,intern-vl3,4096,merge_864691135614176459_1216530_953683_758509,['864691135483246276']
4,125581,864691136143459636,[],"['864691136108877240', '864691133867151046']",['864691136108877240'],2,"[{'id': 864691136108877240, 'paths': {'zoomed'...",top;front;side,True,{'864691136108877240': {'zoomed': {'front': '/...,"{1: '864691136108877240', 2: '8646911338671510...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n- **Top ...,intern-vl3,4096,merge_864691136143459636_519907_642694_794230,['864691136108877240']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,1393063,864691135760313422,[],"['864691135123677223', '864691134949293052']",['864691134949293052'],2,"[{'id': 864691135123677223, 'paths': {'zoomed'...",top;front;side,True,{'864691135123677223': {'zoomed': {'front': '/...,"{1: '864691135123677223', 2: '8646911349492930...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n**Top Vi...,intern-vl3,4096,merge_864691135760313422_645204_416083_934426,['864691135123677223']
469,1393090,864691135939045380,[],"['864691136787212910', '864691135996376746']",['864691135996376746'],2,"[{'id': 864691136787212910, 'paths': {'zoomed'...",top;front;side,True,{'864691136787212910': {'zoomed': {'front': '/...,"{1: '864691136787212910', 2: '8646911359963767...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n**Top Vi...,intern-vl3,4096,merge_864691135939045380_754162_381391_969927,['864691136787212910']
470,1393183,864691135891033481,[],"['864691135866017662', '864691133174542584']",['864691135866017662'],2,"[{'id': 864691135866017662, 'paths': {'zoomed'...",top;front;side,True,{'864691135866017662': {'zoomed': {'front': '/...,"{1: '864691135866017662', 2: '8646911331745425...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n- **Top ...,intern-vl3,4096,merge_864691135891033481_787374_765954_830623,['864691135866017662']
471,1394583,864691135360485464,[],"['864691135342744005', '864691135067764943']",['864691135342744005'],2,"[{'id': 864691135342744005, 'paths': {'zoomed'...",top;front;side,True,{'864691135342744005': {'zoomed': {'front': '/...,"{1: '864691135342744005', 2: '8646911350677649...",1,8.646911e+17,<analysis>\n### Analysis of Option 1 (Image 1)...,### Analysis of Option 1 (Image 1)\n\n**Top Vi...,intern-vl3,4096,merge_864691135360485464_1229917_516978_1031184,['864691135342744005']


In [181]:
specieses = ["mouse"]
models = [ "claude-3-7-sonnet-20250219", "o4-mini", "gpt-4.1", "gpt-4o", "intern-vl3"]
prompt_modes = ["informative", "null"]
run_indexes =[""]#, "_run2"]
for run_index in run_indexes:
    for species in specieses:
        for model in models:
            for prompt_mode in prompt_modes:
                datapath = f"/Users/jbrown/Documents/boyden_lab/ai-proofreading/script/output/{species}_merge_2048nm/{model}_merge_comparison_{prompt_mode}_analysis_results{run_index}.csv"
                if not os.path.exists(datapath):
                    print(f"Combination {species} {model} {prompt_mode} {run_index} does not exist")
                    continue
                df = pd.read_csv(datapath)


                # answer_balance = Counter([x[x.index("<answer>")+len("<answer>"):x.index("</answer>")].strip() for x in df['model_raw_answer']])
                # print("answer balance", answer_balance)

                ground_truth = []
                predictions = []
                for operation in df['operation_id'].unique():

                    df_operation = df[df['operation_id'] == operation]
                    if model == "intern-vl3":
                        answers = [eval(x)[y] if y != -1 else "none" for x,y in zip(df_operation['option_index_to_id'], df_operation['index'])]
                    else:
                        answers = df_operation['model_chosen_id'].tolist()
                    
                    options_presented = eval(df_operation['options_presented_ids'].iloc[0])
                    if len(options_presented) < 2:
                        continue
                    try:
                        expected_answer = eval(df_operation['expected_choice_ids'].iloc[0])[0]
                    except:
                        print("Error in expected_answer")
                        continue

                    most_common_answer = Counter(answers).most_common(1)[0][0]
                    print(answers)
                    ground_truth.append(expected_answer)
                    predictions.append(most_common_answer)
                Counter(df['index'])
                pred_positive_and_correct = [pred == gt for pred, gt in zip(predictions, ground_truth) if pred != "none" ]
                pred_positive_and_incorrect = [pred != gt for pred, gt in zip(predictions, ground_truth) if pred != "none"]
                pred_negative_and_correct = [pred == gt for pred, gt in zip(predictions, ground_truth) if pred == "none"]
                pred_negative_and_incorrect = [pred != gt for pred, gt in zip(predictions, ground_truth) if pred == "none"]
                print(f"Model: {model}, Prompt Mode: {prompt_mode}, Species {species}, Run Index \"{run_index}\"")
                print(f"Total merge events evaluated: {len(ground_truth)}")
                print(f"Accuracy: {sum(pred_positive_and_correct) / (len(pred_positive_and_correct)+len(pred_negative_and_correct))}")
                print(f"Correctly predicted merge partner: {sum(pred_positive_and_correct)}")
                print(f"Incorrectly predicted merge partner: {sum(pred_positive_and_incorrect)}")
                print(f"Correctly identified no merge: {sum(pred_negative_and_correct)}")
                print(f"Incorrectly identified no merge: {sum(pred_negative_and_incorrect)}")
                print(f"Answer balance: {Counter(df['index'])}")
                print("="*100)

['864691135214939718']
['864691135614153924']
['864691135941420916']
['864691135334739945']
['864691135350062551']
['864691135374896328']
['864691135473456178']
['864691136848395118']
['864691135623854788']
['864691136237702122']
['864691135583085677']
['864691136085621356']
['864691135661932528']
['864691135741888107']
['864691136125173542']
['864691135685721271']
['864691135046559602']
['864691135416632250']
['864691135544447428']
['864691136662846430']
['864691135446280082']
['864691135386069589']
['864691135546078504']
['864691136084986988']
['864691135658570114']
['864691135851826631']
['864691135490379623']
['864691136031929787']
['864691135600165572']
['864691136537233500']
['864691135093685744']
['864691136952257247']
['864691136812338547']
['864691135738888305']
['864691135941552500']
['864691135575649220']
['864691134839552359']
['864691135609931527']
['864691135975974723']
['864691135594567979']
['864691136819564910']
['864691135450909165']
['864691135539132868']
['864691135

# Merge Identification

In [None]:
specieses = ["mouse"]
models = ["o4-mini"]
prompt_modes = ["informative+heuristic1+heuristic2+heuristic3"]
for species in specieses:
    for model in models:
        for prompt_mode in prompt_modes:
            datapath = f"/Users/jbrown/Documents/boyden_lab/ai-proofreading/connectomebench/scripts/output/{species}_merge_2048nm/{model}_merge_identification_{prompt_mode}_analysis_results.csv"
            df = pd.read_csv(datapath)
   
            ground_truth = []
            predictions = []
            for operation in df['id'].unique():
                df_operation = df[df['id'] == operation]

                # options_presented = eval(df_operation['options_presented_ids'].iloc[0])
                # if len(options_presented) < 2:
                #     continue
                for index, df_row in df_operation.iterrows():

                    answers = df_row['model_answer']
                    print(answers)
                    # expected_answer = eval(df_operation['expected_choice_ids'].iloc[0])[0]
                    is_merge = df_row['is_correct_merge']
                    print(is_merge)
                    most_common_answer = Counter(answers).most_common(1)[0][0]

                    if most_common_answer == "none":
                        predictions.append("none")
                    else:
                        predictions.append("1")

                    if is_merge:
                        ground_truth.append(1)
                    else:
                        ground_truth.append("none")

            pred_positive_and_correct = [pred == gt for pred, gt in zip(predictions, ground_truth) if pred != "none" ]
            pred_positive_and_incorrect = [pred != gt for pred, gt in zip(predictions, ground_truth) if pred != "none"]
            pred_negative_and_correct = [pred == gt for pred, gt in zip(predictions, ground_truth) if pred == "none"]
            pred_negative_and_incorrect = [pred != gt for pred, gt in zip(predictions, ground_truth) if pred == "none"]

            accuracy = (sum(pred_positive_and_correct) + sum(pred_negative_and_correct)) / (len(pred_positive_and_correct)+len(pred_negative_and_correct))
            precision = sum(pred_positive_and_correct) / (sum(pred_positive_and_correct) + sum(pred_positive_and_incorrect))
            recall = sum(pred_positive_and_correct) / (sum(pred_positive_and_correct) + sum(pred_negative_and_incorrect))
            print(f"Model: {model}, Species {species}, Prompt Mode {prompt_mode}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")
            print("="*100)


none
True
none
True
none
False
1
False
none
False
1
False
1
False
none
False
none
False
none
False


ZeroDivisionError: division by zero

In [29]:
predictions

['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']

In [15]:
predictions

['none', 'none', 'none', 'none', 'none', 'none', '1', '1', 'none', 'none']

In [16]:
ground_truth

[1, 1, 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none']

# Split Comparison

In [26]:
df['model_prediction']

0       2
1       1
2       1
3       2
4       2
5       1
6    none
7    none
8       1
9       2
Name: model_prediction, dtype: object

In [27]:
df['correct_answer']

0    1
1    2
2    1
3    2
4    1
5    2
6    1
7    2
8    1
9    2
Name: correct_answer, dtype: int64

In [29]:
datapath = "/Users/jbrown/Documents/boyden_lab/ai-proofreading/script/output/fly_split_8192nm/claude-3-7-sonnet-20250219_split_comparison_informative_analysis_results.csv"
df = pd.read_csv(datapath)
np.mean(df['model_prediction'].astype(str) == df['correct_answer'].astype(str))

np.float64(0.4)

In [21]:
specieses = ["mouse", "fly"] #,"mouse"]
models = ["claude-3-7-sonnet-20250219", "o4-mini", "gpt-4.1", "gpt-4o"]
prompt_modes = ["informative", "null"] #, "null"]

summary_list = []
for species in specieses:
    for model in models:
        for prompt_mode in prompt_modes:
            datapath = f"/Users/jbrown/Documents/boyden_lab/ai-proofreading/script/output/{species}_split_8192nm/{model}_split_comparison_{prompt_mode}_analysis_results.csv"
            df = pd.read_csv(datapath)
            print("Answer balance", Counter(df['model_prediction']))
            TP = sum(df['model_prediction'].astype(str) == "1")
            FP = sum(df['model_prediction'].astype(str) == "2")
            FN = sum(df['model_prediction'].astype(str) == "none")

            accuracy = TP/(TP+FP+FN)
            precision = TP/(TP+FP)
            recall = TP/(TP+FN)

            summary_list.append({"model": model, "species": species, "prompt_mode": prompt_mode, "accuracy": accuracy, "precision": precision, "recall": recall})

            print(f"Model: {model}, Species {species}, Prompt Mode {prompt_mode}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")
            print("="*100)
summary_df = pd.DataFrame.from_dict(summary_list)



Answer balance Counter({'1': 86, '2': 31, 'none': 31})
Model: claude-3-7-sonnet-20250219, Species mouse, Prompt Mode informative, Accuracy: 0.581081081081081, Precision: 0.7350427350427351, Recall: 0.7350427350427351
Answer balance Counter({'none': 76, '1': 45, '2': 27})
Model: claude-3-7-sonnet-20250219, Species mouse, Prompt Mode null, Accuracy: 0.30405405405405406, Precision: 0.625, Recall: 0.371900826446281
Answer balance Counter({'1': 111, '2': 36, 'none': 1})
Model: o4-mini, Species mouse, Prompt Mode informative, Accuracy: 0.75, Precision: 0.7551020408163265, Recall: 0.9910714285714286
Answer balance Counter({1: 115, 2: 33})
Model: o4-mini, Species mouse, Prompt Mode null, Accuracy: 0.777027027027027, Precision: 0.777027027027027, Recall: 1.0
Answer balance Counter({'1': 84, 'none': 46, '2': 18})
Model: gpt-4.1, Species mouse, Prompt Mode informative, Accuracy: 0.5675675675675675, Precision: 0.8235294117647058, Recall: 0.6461538461538462
Answer balance Counter({'1': 80, 'none': 

Counter({'none': 71, '1': 38, '2': 28})

# Split Identification

In [142]:
models = ["claude-3-7-sonnet-20250219", "o4-mini", "gpt-4.1", "gpt-4o"]
specieses = ["mouse", "fly"]
prompt_modes = ["informative", "null"]
for species in specieses:
    for model in models:

        for prompt_mode in prompt_modes:
            datapath = f"/Users/jbrown/Documents/boyden_lab/ai-proofreading/script/output/{species}_split_8192nm_paper_final/{model}_split_identification_{prompt_mode}_analysis_results.csv"
            df = pd.read_csv(datapath)
            TP = sum((df['is_split']) & (df['model_prediction'] == "1"))
            FP = sum((~df['is_split']) & (df['model_prediction'] == "1"))
            FN = sum((df['is_split']) & (df['model_prediction'] == "none"))
            TN = sum((~df['is_split']) & (df['model_prediction'] == "none"))
            print(f"TP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}")
            print(f"Model: {model}, Species {species}, Prompt Mode {prompt_mode}. Accuracy: {(TP+TN)/(TP+FP+FN+TN)}, Precision: {TP/(TP+FP)}, Recall: {TP/(TP+FN)}")
            print("="*100)






TP: 53, FP: 33, FN: 95, TN: 115
Model: claude-3-7-sonnet-20250219, Species mouse, Prompt Mode informative. Accuracy: 0.5675675675675675, Precision: 0.6162790697674418, Recall: 0.3581081081081081
TP: 3, FP: 16, FN: 145, TN: 132
Model: claude-3-7-sonnet-20250219, Species mouse, Prompt Mode null. Accuracy: 0.4560810810810811, Precision: 0.15789473684210525, Recall: 0.02027027027027027
TP: 115, FP: 67, FN: 33, TN: 81
Model: o4-mini, Species mouse, Prompt Mode informative. Accuracy: 0.6621621621621622, Precision: 0.6318681318681318, Recall: 0.777027027027027
TP: 117, FP: 92, FN: 31, TN: 56
Model: o4-mini, Species mouse, Prompt Mode null. Accuracy: 0.5844594594594594, Precision: 0.5598086124401914, Recall: 0.7905405405405406
TP: 25, FP: 11, FN: 123, TN: 137
Model: gpt-4.1, Species mouse, Prompt Mode informative. Accuracy: 0.5472972972972973, Precision: 0.6944444444444444, Recall: 0.16891891891891891
TP: 6, FP: 3, FN: 142, TN: 145
Model: gpt-4.1, Species mouse, Prompt Mode null. Accuracy: 0.5

TP: 48, FP: 36, FN: 110, TN: 122
Model: claude-3-7-sonnet-20250219, Accuracy: 0.5379746835443038, Precision: 0.5714285714285714, Recall: 0.3037974683544304


In [380]:
df['is_split']

0       True
1      False
2       True
3      False
4       True
       ...  
311    False
312     True
313    False
314     True
315    False
Name: is_split, Length: 316, dtype: bool