# Sparse-MENDER Results Analysis

### Import Dependencies

In [5]:
import os
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import re
import sys
sys.path.append("..")

## MERFISH Analysis

### Define Directories and Constants

In [6]:
# Define base directory and subdirectories
base_dir = os.path.join(os.path.pardir, "results")
merfish_dir = os.path.join(base_dir, "merfish")
visium_dir = os.path.join(base_dir, "visium")

subdirs = [
    "annoy/pca", "annoy/fa", "annoy/ica", "annoy/nmf",
    "hnsw/pca", "hnsw/fa", "hnsw/ica", "hnsw/nmf",
    "none/pca", "none/fa", "none/ica", "none/nmf"
]
merfish_output_dir = os.path.join(os.path.pardir, "plots", "merfish")
visium_output_dir = os.path.join(os.path.pardir, "plots", "visium")
Path(merfish_output_dir).mkdir(parents=True, exist_ok=True)

### Read Results from JSON files

In [7]:
# Initialize data storage
data = []

# Regex pattern to match only metrics_per_batch_*.json files
pattern = re.compile(r"metrics_per_batch_.*\.json$")

# Read all relevant JSON files
for subdir in subdirs:
    dir_path = os.path.join(merfish_dir, subdir)
    if not os.path.exists(dir_path):
        print(f"Directory not found: {dir_path}")
        continue
    for file in os.listdir(dir_path):
        if not pattern.match(file):
            print(f"Skipping file: {file}")
            continue  # Skip files that don't match the pattern
        file_path = os.path.join(dir_path, file)
        with open(file_path, "r") as f:
            metrics = json.load(f)
        
        # Parse filename to extract type, ann_method, dim_reduction
        parts = file.split("_")
        print(f"Parsing file: {file}, parts: {parts}")
        if len(parts) < 6:  # Adjusted for potential extra prefix
            print(f"Invalid filename format, skipping: {file}")
            continue
        type_ = parts[3]  # Should be 'MENDER' (or 'gt', but we'll filter later)
        ann_method = parts[4] if parts[4] != "none" else "none"
        dim_reduction = parts[5].split(".")[0]
        print(f"Extracted: type={type_}, ann_method={ann_method}, dim_reduction={dim_reduction}")
        
        # Extract metrics for each donor
        for donor, values in metrics.items():
            row = {
                "type": type_,
                "ann_method": ann_method,
                "dim_reduction": dim_reduction,
                "donor": donor
            }
            # Include all metrics, even if missing (will be NaN in DataFrame)
            row.update(values)
            data.append(row)


Parsing file: metrics_per_batch_gt_annoy_pca.json, parts: ['metrics', 'per', 'batch', 'gt', 'annoy', 'pca.json']
Extracted: type=gt, ann_method=annoy, dim_reduction=pca
Parsing file: metrics_per_batch_MENDER_annoy_pca.json, parts: ['metrics', 'per', 'batch', 'MENDER', 'annoy', 'pca.json']
Extracted: type=MENDER, ann_method=annoy, dim_reduction=pca
Skipping file: smender_MERFISH_Annoy_PCA_results.json
Parsing file: metrics_per_batch_gt_annoy_fa.json, parts: ['metrics', 'per', 'batch', 'gt', 'annoy', 'fa.json']
Extracted: type=gt, ann_method=annoy, dim_reduction=fa
Parsing file: metrics_per_batch_MENDER_annoy_fa.json, parts: ['metrics', 'per', 'batch', 'MENDER', 'annoy', 'fa.json']
Extracted: type=MENDER, ann_method=annoy, dim_reduction=fa
Skipping file: smender_MERFISH_annoy_fa_results.json
Parsing file: metrics_per_batch_gt_annoy_ica.json, parts: ['metrics', 'per', 'batch', 'gt', 'annoy', 'ica.json']
Extracted: type=gt, ann_method=annoy, dim_reduction=ica
Parsing file: metrics_per_batc

#### Per Batch Boxplots:

In [None]:
# Create DataFrame
df = pd.DataFrame(data)

# Print DataFrame columns and sample for debugging
print("DataFrame columns:", df.columns.tolist())
print("DataFrame sample:\n", df.head())

# Define metrics for MENDER
metrics = {
    "MENDER": ["NMI", "ARI", "PAS", "CHAOS"]
}
ann_methods = ["annoy", "hnsw", "none"]
dim_reductions = ["pca", "fa", "ica", "nmf"]

# Check if 'type' column exists
if 'type' not in df.columns:
    print("Error: 'type' column not found in DataFrame. Check filename parsing.")
    raise KeyError("'type' column missing in DataFrame")

# Function to create box plots
def create_box_plot(df_subset, type_, metric, group_by, group_values, title_prefix, filename_prefix):
    # Filter out rows where the metric is missing (NaN)
    df_subset = df_subset[df_subset[metric].notnull()]
    if df_subset.empty:
        print(f"Warning: No data for {metric} in {type_} for {group_by} plot. Skipping.")
        return
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(
        x=group_by,
        y=metric,
        data=df_subset,
        order=group_values,
        flierprops=dict(marker='o', markerfacecolor='maroon', markeredgecolor='maroon', markersize=8)
    )
    # Set transparency for box plots
    for patch in ax.patches:
        patch.set_alpha(0.4)
    plt.title(f"{title_prefix} SMENDER")
    plt.xlabel(group_by.replace("_", " ").title())
    plt.ylabel(metric)
    plt.tight_layout()
    # Save without redundant metric in filename
    plt.savefig(os.path.join(merfish_output_dir, f"{filename_prefix}.png"))
    plt.close()

# Generate plots for MENDER only
type_ = "MENDER"
df_type = df[df["type"] == type_]
print(f"Processing type: {type_}, number of rows: {len(df_type)}")
for metric in metrics[type_]:
    # Box plot by ANN method (only pca dim_reduction)
    df_ann = df_type[df_type["dim_reduction"] == "pca"]
    print(f"ANN plot for {metric}, pca rows: {len(df_ann)}")
    create_box_plot(
        df_ann,
        type_,
        metric,
        "ann_method",
        ann_methods,
        f"Box Plot of {metric} by ANN Method for",
        f"{type_.lower()}_ann_{metric.lower()}"
    )
    
    # Box plot by dimensionality reduction (only none ann_method)
    df_dim = df_type[df_type["ann_method"] == "none"]
    print(f"Dim reduction plot for {metric}, none ANN rows: {len(df_dim)}")
    create_box_plot(
        df_dim,
        type_,
        metric,
        "dim_reduction",
        dim_reductions,
        f"Box Plot of {metric} by Linear Dimensionality Reduction for",
        f"{type_.lower()}_dim_{metric.lower()}"
    )

print(f"Plots saved to {merfish_output_dir}")

DataFrame columns: ['type', 'ann_method', 'dim_reduction', 'donor', 'PAS', 'CHAOS', 'NMI', 'ARI']
DataFrame sample:
   type ann_method dim_reduction                         donor    PAS  CHAOS  \
0   gt      annoy           pca  MsBrainAgingSpatialDonor_1_0  0.046  0.015   
1   gt      annoy           pca  MsBrainAgingSpatialDonor_2_0  0.039  0.016   
2   gt      annoy           pca  MsBrainAgingSpatialDonor_2_1  0.029  0.013   
3   gt      annoy           pca  MsBrainAgingSpatialDonor_3_0  0.028  0.015   
4   gt      annoy           pca  MsBrainAgingSpatialDonor_3_1  0.028  0.016   

   NMI  ARI  
0  NaN  NaN  
1  NaN  NaN  
2  NaN  NaN  
3  NaN  NaN  
4  NaN  NaN  
Processing type: MENDER, number of rows: 372
ANN plot for NMI, pca rows: 93
Dim reduction plot for NMI, none ANN rows: 124
ANN plot for ARI, pca rows: 93
Dim reduction plot for ARI, none ANN rows: 124
ANN plot for PAS, pca rows: 93
Dim reduction plot for PAS, none ANN rows: 124
ANN plot for CHAOS, pca rows: 93
Dim reductio

NameError: name 'output_dir' is not defined

## Visium MOB Analysis

In [None]:
# TODO