# Sparse-MENDER Results Analysis

### Import Dependencies

In [31]:
import os
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import re
import sys
sys.path.append("..")

## MERFISH Analysis

### Define Directories and Constants

In [32]:
# Define base directory and subdirectories
base_dir = os.path.join(os.path.pardir, "results")
merfish_dir = os.path.join(base_dir, "merfish")
visium_dir = os.path.join(base_dir, "visium")

subdirs = [
    "annoy/pca", "annoy/fa", "annoy/ica", "annoy/nmf",
    "hnsw/pca", "hnsw/fa", "hnsw/ica", "hnsw/nmf",
    "none/pca", "none/fa", "none/ica", "none/nmf"
]
merfish_output_dir = os.path.join(os.path.pardir, "plots", "merfish")
visium_output_dir = os.path.join(os.path.pardir, "plots", "visium")
Path(merfish_output_dir).mkdir(parents=True, exist_ok=True)
Path(visium_output_dir).mkdir(parents=True, exist_ok=True)

### Read Results from JSON files

In [33]:
# Initialize data storage
data = []
performance_data = []

# Regex patterns
batch_pattern = re.compile(r"metrics_per_batch_.*\.json$")
results_pattern = re.compile(r"smender_MERFISH_.*\.json$")

# Read all relevant JSON files
for subdir in subdirs:
    dir_path = os.path.join(merfish_dir, subdir)
    if not os.path.exists(dir_path):
        print(f"Directory not found: {dir_path}")
        continue
    for file in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file)
        
        # Handle batch metrics files
        if batch_pattern.match(file):
            print(f"Parsing batch file: {file}")
            with open(file_path, "r") as f:
                metrics = json.load(f)
            
            # Parse filename
            parts = file.split("_")
            if len(parts) < 6:
                print(f"Invalid batch filename format, skipping: {file}")
                continue
            type_ = parts[3]
            ann_method = parts[4] if parts[4] != "none" else "none"
            dim_reduction = parts[5].split(".")[0]
            print(f"Extracted: type={type_}, ann_method={ann_method}, dim_reduction={dim_reduction}")
            
            # Extract metrics for each donor
            for donor, values in metrics.items():
                row = {
                    "type": type_,
                    "ann_method": ann_method,
                    "dim_reduction": dim_reduction,
                    "donor": donor
                }
                row.update(values)
                data.append(row)
        
        # Handle results files
        elif results_pattern.match(file):
            print(f"Parsing results file: {file}")
            with open(file_path, "r") as f:
                results = json.load(f)
            
            # Parse filename
            parts = file.split("_")
            if len(parts) < 4:
                print(f"Invalid results filename format, skipping: {file}")
                continue
            ann_method = parts[2].lower() if parts[2].lower() != "none" else "none"
            dim_reduction = parts[3].split(".")[0].lower()
            print(f"Extracted: ann_method={ann_method}, dim_reduction={dim_reduction}")
            
            # Extract performance metrics
            perf = results.get("performance", {})
            row = {
                "ann_method": ann_method,
                "dim_reduction": dim_reduction,
                "smender_time_seconds": perf.get("smender_time_seconds", None),
                "smender_memory_mb": perf.get("smender_memory_mb", None),
                "dim_reduction_time_seconds": perf.get("dim_reduction_time_seconds", None),
                "dim_reduction_memory_mb": perf.get("dim_reduction_memory_mb", None),
                "nn_time_seconds": perf.get("nn_time_seconds", None),
                "nn_memory_mb": perf.get("nn_memory_mb", None)
            }
            performance_data.append(row)
        else:
            print(f"Skipping file: {file}")

Parsing batch file: metrics_per_batch_gt_annoy_pca.json
Extracted: type=gt, ann_method=annoy, dim_reduction=pca
Parsing batch file: metrics_per_batch_MENDER_annoy_pca.json
Extracted: type=MENDER, ann_method=annoy, dim_reduction=pca
Parsing results file: smender_MERFISH_Annoy_PCA_results.json
Extracted: ann_method=annoy, dim_reduction=pca
Parsing batch file: metrics_per_batch_gt_annoy_fa.json
Extracted: type=gt, ann_method=annoy, dim_reduction=fa
Parsing batch file: metrics_per_batch_MENDER_annoy_fa.json
Extracted: type=MENDER, ann_method=annoy, dim_reduction=fa
Parsing results file: smender_MERFISH_annoy_fa_results.json
Extracted: ann_method=annoy, dim_reduction=fa
Parsing batch file: metrics_per_batch_gt_annoy_ica.json
Extracted: type=gt, ann_method=annoy, dim_reduction=ica
Parsing batch file: metrics_per_batch_MENDER_annoy_ica.json
Extracted: type=MENDER, ann_method=annoy, dim_reduction=ica
Parsing results file: smender_MERFISH_annoy_ica_results.json
Extracted: ann_method=annoy, dim_

#### Per Batch Boxplots:

In [34]:
# Create DataFrame
df = pd.DataFrame(data)

# Print DataFrame columns and sample for debugging
print("DataFrame columns:", df.columns.tolist())
print("DataFrame sample:\n", df.head())

# Define metrics for MENDER
metrics = {
    "MENDER": ["NMI", "ARI", "PAS", "CHAOS"]
}
ann_methods = ["annoy", "hnsw", "none"]
dim_reductions = ["pca", "fa", "ica", "nmf"]

# Check if 'type' column exists
if 'type' not in df.columns:
    print("Error: 'type' column not found in DataFrame. Check filename parsing.")
    raise KeyError("'type' column missing in DataFrame")

# Function to create box plots
def create_box_plot(df_subset, type_, metric, group_by, group_values, title_prefix, filename_prefix):
    # Filter out rows where the metric is missing (NaN)
    df_subset = df_subset[df_subset[metric].notnull()]
    if df_subset.empty:
        print(f"Warning: No data for {metric} in {type_} for {group_by} plot. Skipping.")
        return
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(
        x=group_by,
        y=metric,
        data=df_subset,
        order=group_values,
        flierprops=dict(marker='o', markerfacecolor='maroon', markeredgecolor='maroon', markersize=8)
    )
    # Set transparency for box plots
    for patch in ax.patches:
        patch.set_alpha(0.4)
    plt.title(f"{title_prefix} SMENDER")
    plt.xlabel(group_by.replace("_", " ").title())
    plt.ylabel(metric)
    plt.tight_layout()
    # Save without redundant metric in filename
    plt.savefig(os.path.join(merfish_output_dir, f"{filename_prefix}.png"))
    plt.close()

# Generate plots for MENDER only
type_ = "MENDER"
df_type = df[df["type"] == type_]
print(f"Processing type: {type_}, number of rows: {len(df_type)}")
for metric in metrics[type_]:
    # Box plot by ANN method (only pca dim_reduction)
    df_ann = df_type[df_type["dim_reduction"] == "pca"]
    print(f"ANN plot for {metric}, pca rows: {len(df_ann)}")
    create_box_plot(
        df_ann,
        type_,
        metric,
        "ann_method",
        ann_methods,
        f"Box Plot of {metric} by ANN Method for",
        f"{type_.lower()}_ann_{metric.lower()}"
    )
    
    # Box plot by dimensionality reduction (only none ann_method)
    df_dim = df_type[df_type["ann_method"] == "none"]
    print(f"Dim reduction plot for {metric}, none ANN rows: {len(df_dim)}")
    create_box_plot(
        df_dim,
        type_,
        metric,
        "dim_reduction",
        dim_reductions,
        f"Box Plot of {metric} by Linear Dimensionality Reduction for",
        f"{type_.lower()}_dim_{metric.lower()}"
    )

print(f"Plots saved to {merfish_output_dir}")

DataFrame columns: ['type', 'ann_method', 'dim_reduction', 'donor', 'PAS', 'CHAOS', 'NMI', 'ARI']
DataFrame sample:
   type ann_method dim_reduction                         donor    PAS  CHAOS  \
0   gt      annoy           pca  MsBrainAgingSpatialDonor_1_0  0.046  0.015   
1   gt      annoy           pca  MsBrainAgingSpatialDonor_2_0  0.039  0.016   
2   gt      annoy           pca  MsBrainAgingSpatialDonor_2_1  0.029  0.013   
3   gt      annoy           pca  MsBrainAgingSpatialDonor_3_0  0.028  0.015   
4   gt      annoy           pca  MsBrainAgingSpatialDonor_3_1  0.028  0.016   

   NMI  ARI  
0  NaN  NaN  
1  NaN  NaN  
2  NaN  NaN  
3  NaN  NaN  
4  NaN  NaN  
Processing type: MENDER, number of rows: 372
ANN plot for NMI, pca rows: 93
Dim reduction plot for NMI, none ANN rows: 124
ANN plot for ARI, pca rows: 93
Dim reduction plot for ARI, none ANN rows: 124
ANN plot for PAS, pca rows: 93
Dim reduction plot for PAS, none ANN rows: 124
ANN plot for CHAOS, pca rows: 93
Dim reductio

#### Performance Bar Charts:

In [None]:
# Create performance DataFrame
perf_df = pd.DataFrame(performance_data)

# Print DataFrame columns and sample for debugging
print("Performance DataFrame columns:", perf_df.columns.tolist())
print("Performance DataFrame sample:\n", perf_df.head())

# Calculate SMENDER remainder for time by subtracting dim_reduction and nn components
perf_df['smender_remainder_time_seconds'] = (
    perf_df['smender_time_seconds'] - 
    perf_df['dim_reduction_time_seconds'] - 
    perf_df['nn_time_seconds']
)

# Clip negative remainders to zero to avoid plotting issues
perf_df['smender_remainder_time_seconds'] = perf_df['smender_remainder_time_seconds'].clip(lower=0)

# Define performance metrics for time
time_metrics = ["smender_remainder_time_seconds", "dim_reduction_time_seconds", "nn_time_seconds"]

def create_stacked_bar_chart(df_subset, group_by, group_values, metrics, title, filename, ylabel):
    df_subset = df_subset.dropna(subset=metrics)
    if df_subset.empty:
        print(f"Warning: No data for {metrics} in {group_by} plot. Skipping.")
        return
    
    # Prepare data for plotting
    means = df_subset.groupby(group_by)[metrics].mean().reindex(group_values)
    
    # Get the total SMENDER time for percentage calculations
    total_means = df_subset.groupby(group_by)['smender_time_seconds'].mean().reindex(group_values)
    
    plt.figure(figsize=(10, 6))
    bottom = None
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # Colors for smender remainder, dim_reduction, nn
    labels = ['SMENDER Remainder', 'Dim Reduction', 'NN']
    
    for i, metric in enumerate(metrics):
        bars = plt.bar(
            means.index,
            means[metric],
            bottom=bottom,
            color=colors[i],
            label=labels[i],
            alpha=0.8
        )
        
        # Add value and percentage labels
        for j, bar in enumerate(bars):
            height = bar.get_height()
            if height > 0.1:  # Only add label if height is significant to avoid clutter
                x = bar.get_x() + bar.get_width() / 2
                value = means[metric].iloc[j]
                # Calculate percentage relative to total SMENDER time
                percentage = (value / total_means.iloc[j] * 100) if total_means.iloc[j] > 0 else 0
                label_text = f'{value:.2f}\n({percentage:.1f}%)'
                
                # Determine label position
                if i == len(metrics) - 1:  # Top bar
                    # Place label above the bar
                    y = bar.get_y() + height + 0.01 * plt.gca().get_ylim()[1]
                    va = 'bottom'  # Align text bottom to anchor above bar
                    bbox = dict(facecolor='black', alpha=0.68, edgecolor='none', pad=3.0)
                    text_color = 'white'
                else:
                    # Center label in the middle of the bar
                    y = bar.get_y() + height / 2
                    va = 'center'  # Center vertically
                    bbox = dict(facecolor='black', alpha=0.5, edgecolor='none', pad=3.0) 
                    text_color = 'white'
                
                plt.text(
                    x, y,
                    label_text,
                    ha='center',
                    va=va,
                    color=text_color,
                    fontsize=9,
                    bbox=bbox
                )
        
        if bottom is None:
            bottom = means[metric]
        else:
            bottom += means[metric]
    
    plt.title(title)
    plt.xlabel(group_by.replace("_", " ").title())
    plt.ylabel(ylabel)
    plt.legend()
    plt.margins(y=0.2)  # Keep margin to accommodate top labels
    plt.tight_layout()
    plt.savefig(os.path.join(merfish_output_dir, f"{filename}.png"))
    plt.close()

# Generate stacked bar charts for time only
# By ANN method (only pca dim_reduction)
df_ann = perf_df[perf_df["dim_reduction"] == "pca"]
create_stacked_bar_chart(
    df_ann,
    "ann_method",
    ann_methods,
    time_metrics,
    "Time Breakdown by ANN Method for SMENDER (PCA)",
    "mender_ann_time_breakdown",
    "Time (seconds)"
)

# By dimensionality reduction (only none ann_method)
df_dim = perf_df[perf_df["ann_method"] == "none"]
create_stacked_bar_chart(
    df_dim,
    "dim_reduction",
    dim_reductions,
    time_metrics,
    "Time Breakdown by Dimensionality Reduction for SMENDER (No ANN)",
    "mender_dim_time_breakdown",
    "Time (seconds)"
)

print(f"Performance plots saved to {merfish_output_dir}")

Performance DataFrame columns: ['ann_method', 'dim_reduction', 'smender_time_seconds', 'smender_memory_mb', 'dim_reduction_time_seconds', 'dim_reduction_memory_mb', 'nn_time_seconds', 'nn_memory_mb']
Performance DataFrame sample:
   ann_method dim_reduction  smender_time_seconds  smender_memory_mb  \
0      annoy           pca           1723.723596        2515.277344   
1      annoy            fa           2086.987428         457.097656   
2      annoy           ica            574.566487        1440.015625   
3      annoy           nmf           1373.537254         562.441406   
4       hnsw           pca           1263.668541         697.589844   

   dim_reduction_time_seconds  dim_reduction_memory_mb  nn_time_seconds  \
0                   92.169614              1279.722656       165.687547   
1                  469.974874              1180.550781       311.413160   
2                  194.863103              1154.699219       127.831301   
3                 1815.967672             

## Visium MOB Analysis

In [36]:
# TODO