In [None]:
# This notebook runs the NSForest method to identify minimal marker gene sets for the MIST dataset.


In [None]:
import sys
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
CODE_PATH = "/path/to/analysis/directory/nsForest/NSForest" # location of NSForest folder
sys.path.insert(0, os.path.abspath(CODE_PATH))
from nsforest import ns, nsforesting, utils, NSFOREST_VERSION


In [None]:
# Set the thread-count environment variables read by the OpenMP / BLAS / MKL /
# vecLib / numexpr backends to the same value.
# NOTE(review): numpy is imported in an earlier cell; these variables are
# usually only honoured if set before the numerical libraries are first
# loaded -- confirm they take effect here, or move this cell above the imports.
N_THREADS = "48"
for _thread_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_var] = N_THREADS

In [None]:
# Load the labeled ileum AnnData file stored under the Seurat_Files directory.
data_folder = "/path/to/analysis/directory/Seurat_Files/"
file = os.path.join(data_folder, "Analysis2.5_labeled_Ileum.h5ad")
adata = sc.read_h5ad(file)
adata  # last expression of the cell: display the AnnData summary

In [None]:
# Show the distinct labels at each annotation level.
# A notebook cell only displays its LAST bare expression, so the labels are
# printed explicitly; otherwise the first two levels would be computed and
# silently discarded.
for level in ("CoarseCellType", "secondlevel", "FinestCellType"):
    print(f"{level} ({adata.obs[level].nunique()} unique labels):")
    print(adata.obs[level].unique())

In [None]:
# Report which NSForest version is in use.
import nsforest
from importlib import metadata  # stdlib replacement for the deprecated pkg_resources

# Version as exposed by the package itself. Fall back to NSFOREST_VERSION
# (imported from nsforest in the imports cell) if the package does not define
# a __version__ attribute.
print(getattr(nsforest, "__version__", NSFOREST_VERSION))

# Version of the installed distribution, if there is one. The package is
# imported from CODE_PATH via sys.path, so it may not be pip-installed at all;
# report that instead of raising.
try:
    print(metadata.version("nsforest"))
except metadata.PackageNotFoundError:
    print("nsforest is not installed as a distribution (imported from CODE_PATH).")

In [None]:
# Exploratory: list the attribute names exposed by the nsforest package.
import nsforest
dir(nsforest)

In [None]:
# Exploratory: confirm which file nsforest.preprocessing is loaded from and
# which names it provides.
import nsforest.preprocessing as pre
print(pre.__file__)
print(dir(pre))


In [None]:
from nsforest.preprocessing import get_medians, prep_medians, prep_binary_scores


In [None]:
from nsforest.preprocessing import prep_medians, prep_binary_scores
from nsforest.nsforesting import NSForest

# Preprocess for the CoarseCellType level and KEEP the helpers' return values.
# NOTE(review): the NSForest tutorial assigns the result back
# (`adata = prep_medians(...)`); presumably the helpers can return a new
# (e.g. gene-filtered) AnnData instead of annotating the input in place --
# confirm against the installed version. The `None` checks keep this cell
# working if the installed helpers do modify their input in place.

# Add medians
adata_coarse = prep_medians(adata, cluster_header="CoarseCellType")
if adata_coarse is None:
    adata_coarse = adata

# Add binary scores
adata_coarse_scored = prep_binary_scores(adata_coarse, cluster_header="CoarseCellType")
if adata_coarse_scored is not None:
    adata_coarse = adata_coarse_scored

# Run NSForest
# NOTE: this rebinds `ns`, which the imports cell bound to a name imported
# from nsforest. The name is kept because the following cells read the
# results from `ns`.
ns = NSForest(
    adata_coarse,
    cluster_header="CoarseCellType",
    gene_selection="BinaryFirst_high",
    n_trees=1000,
    n_top_genes=100,
    n_binary_genes=50,
    n_genes_eval=10,
    n_jobs=-1,
    beta=1.5
)

In [None]:
help(NSForest)

In [None]:
dir(ns)

In [None]:
# Save the CoarseCellType NSForest results held in `ns`.
import pandas as pd
outdir = "/path/to/analysis/directory/nsForest/NSForest/NSForest_CoarseCellType_results"
os.makedirs(outdir, exist_ok=True)

# Marker results
ns.NSForest_markers.to_csv(os.path.join(outdir, "marker_results.csv"))

# Binary genes table
ns.binary_genes.to_csv(os.path.join(outdir, "binary_genes.csv"))

# Convert each cluster's marker list to a comma-separated string, keyed by the
# cluster NAME. Pairing with ns.clusterName (as the flattening cell does)
# avoids keying the table by whatever index labels `.items()` would yield.
markers_dict = {
    cluster: ",".join(genes)
    for cluster, genes in zip(ns.clusterName, ns.NSForest_markers)
}

# Save as CSV
pd.DataFrame.from_dict(markers_dict, orient="index", columns=["Markers"]).to_csv(
    os.path.join(outdir, "NSForest_markers.csv")
)

# Precision, recall, F-score -- with the cluster name so each row is
# identifiable, matching the tables written by run_nsforest_and_save.
metrics = pd.DataFrame({
    "Cluster": ns.clusterName,
    "precision": ns.precision,
    "recall": ns.recall,
    "f_score": ns.f_score,
})

metrics.to_csv(os.path.join(outdir, "cluster_metrics.csv"))

# Confusion counts
confusion = pd.DataFrame({
    "Cluster": ns.clusterName,
    "TP": ns.TP,
    "FP": ns.FP,
    "TN": ns.TN,
    "FN": ns.FN
})
confusion.to_csv(os.path.join(outdir, "confusion_counts.csv"))


In [None]:
# Long-format table: one row per (cluster, binary gene) pair.
flat_list = [
    {"Cluster": cluster, "Gene": gene}
    for cluster, genes in zip(ns.clusterName, ns.binary_genes)
    for gene in genes
]

flat_df = pd.DataFrame(flat_list)
flat_df.to_csv(
    os.path.join(outdir, "NSForest_CoarseCellType_binary_genes_flat.csv"),
    index=False,
)


In [None]:
import os
import pandas as pd
from nsforest.preprocessing import prep_medians, prep_binary_scores
from nsforest.nsforesting import NSForest


# Names of the adata.obs columns holding the three annotation levels.
top_level = "CoarseCellType"
second_level = "secondlevel"  # level you want to run within each coarse type
third_level = "FinestCellType"
# Parent directory; each NSForest run below writes into its own subfolder of it.
base_outdir = "/path/to/analysis/directory/nsForest/NSForest"



# Helper function - universal flattener

def flatten_nsforest_output(cluster_names, list_of_lists, value_column_name):
    """Flatten per-cluster lists into a long-format DataFrame.

    Parameters
    ----------
    cluster_names : iterable
        One name per cluster.
    list_of_lists : iterable of iterables
        For each cluster, the values (e.g. gene names) belonging to it,
        in the same order as ``cluster_names``.
    value_column_name : str
        Name of the output column that holds the individual values.

    Returns
    -------
    pandas.DataFrame
        One row per (cluster, value) pair with columns ``"Cluster"`` and
        ``value_column_name``. The columns are present even when there are no
        rows, so a CSV written from an empty result still has a header.
    """
    rows = [
        {"Cluster": cluster, value_column_name: value}
        for cluster, values in zip(cluster_names, list_of_lists)
        for value in values
    ]
    # Passing `columns` fixes the schema; without it an empty `rows` list
    # would produce a DataFrame with no columns at all.
    return pd.DataFrame(rows, columns=["Cluster", value_column_name])



# function to Run nsForest and save

def run_nsforest_and_save(adata_sub, level_name, outdir, gene_selection="BinaryFirst_low"):
    """Run NSForest on ``adata_sub`` at one annotation level and write the results.

    Parameters
    ----------
    adata_sub : AnnData
        The (subset of) cells to analyse.
    level_name : str
        Name of the ``adata_sub.obs`` column used as ``cluster_header``.
    outdir : str
        Directory the CSV files are written to; created if it does not exist.
    gene_selection : str, optional
        Passed through to ``NSForest``. Defaults to ``"BinaryFirst_low"``,
        the value this function has always used.

    Files written to ``outdir``
    ---------------------------
    marker_results.csv, binary_genes.csv, NSForest_markers.csv,
    cluster_metrics.csv, confusion_counts.csv,
    ``{level_name}_binary_genes_flat.csv`` and
    ``{level_name}_NSForest_markers_flat.csv``.
    """
    os.makedirs(outdir, exist_ok=True)

    # Prep medians and binary scores, keeping the helpers' return values.
    # NOTE(review): the NSForest tutorial assigns the result back
    # (`adata = prep_medians(...)`); presumably the helpers can return a new
    # AnnData rather than annotate the input in place -- confirm against the
    # installed version. The `None` checks also cover an in-place API.
    prepped = prep_medians(adata_sub, cluster_header=level_name)
    if prepped is not None:
        adata_sub = prepped
    prepped = prep_binary_scores(adata_sub, cluster_header=level_name)
    if prepped is not None:
        adata_sub = prepped

    # Run NSForest. The result is held in `results` rather than `ns` so the
    # `ns` name imported from nsforest is not shadowed inside this function.
    results = NSForest(
        adata_sub,
        cluster_header=level_name,
        gene_selection=gene_selection,
        n_trees=1000,
        n_top_genes=100,
        n_binary_genes=50,
        n_genes_eval=10,
        n_jobs=-1,
        beta=1.5,
    )

    # Main marker output
    results.NSForest_markers.to_csv(os.path.join(outdir, "marker_results.csv"))

    # Raw binary gene table
    results.binary_genes.to_csv(os.path.join(outdir, "binary_genes.csv"))

    # Marker list file, keyed by cluster NAME. Pairing with clusterName avoids
    # keying the table by whatever index labels `.items()` would yield.
    markers_dict = {
        cluster: ",".join(genes)
        for cluster, genes in zip(results.clusterName, results.NSForest_markers)
    }
    pd.DataFrame.from_dict(markers_dict, orient="index", columns=["Markers"]).to_csv(
        os.path.join(outdir, "NSForest_markers.csv")
    )

    # Metrics
    metrics = pd.DataFrame({
        "Cluster": results.clusterName,
        "Markers": results.NSForest_markers,
        "precision": results.precision,
        "recall": results.recall,
        "f_score": results.f_score,
    })
    metrics.to_csv(os.path.join(outdir, "cluster_metrics.csv"))

    # Confusion matrix counts
    confusion = pd.DataFrame({
        "Cluster": results.clusterName,
        "Markers": results.NSForest_markers,
        "TP": results.TP,
        "FP": results.FP,
        "TN": results.TN,
        "FN": results.FN,
    })
    confusion.to_csv(os.path.join(outdir, "confusion_counts.csv"))

    # Flattened (one row per cluster/gene pair) outputs - binary genes
    binary_df = flatten_nsforest_output(results.clusterName, results.binary_genes, "Gene")
    binary_df.to_csv(os.path.join(outdir, f"{level_name}_binary_genes_flat.csv"), index=False)

    # Flattened outputs - top marker genes
    top_df = flatten_nsforest_output(results.clusterName, results.NSForest_markers, "Gene")
    top_df.to_csv(os.path.join(outdir, f"{level_name}_NSForest_markers_flat.csv"), index=False)

    print(f"Finished NSForest for {level_name} → saved to {outdir}")



# Run NSForest on second-level annotations inside each coarse type


for coarse_type in sorted(adata.obs[top_level].unique()):
    print(f"\nChecking {coarse_type} ...")

    # Cells belonging to this coarse type only.
    adata_sub = adata[adata.obs[top_level] == coarse_type].copy()
    n_secondlevel = adata_sub.obs[second_level].nunique()

    if n_secondlevel > 1:
        print(f"=== Processing {coarse_type} ({n_secondlevel} secondlevel groups) ===")
        subset_outdir = os.path.join(base_outdir, f"NSForest_{coarse_type}_{second_level}")
        run_nsforest_and_save(adata_sub, second_level, subset_outdir)
    else:
        # Nothing to discriminate between with fewer than two labels.
        print(f"Skipping {coarse_type} — only {n_secondlevel} unique '{second_level}' label(s).")


In [None]:
# Run NSForest on the finest-level annotations within each coarse cell type.
for coarse_type in sorted(adata.obs[top_level].unique()):
    print(f"\nChecking {coarse_type} ...")

    # Cells belonging to this coarse type only.
    adata_sub = adata[adata.obs[top_level] == coarse_type].copy()
    n_third_level = adata_sub.obs[third_level].nunique()

    if n_third_level > 1:
        print(f"=== Processing {coarse_type} ({n_third_level} third_level groups) ===")
        subset_outdir = os.path.join(base_outdir, f"NSForest_{coarse_type}_{third_level}")
        run_nsforest_and_save(adata_sub, third_level, subset_outdir)
    else:
        # Nothing to discriminate between with fewer than two labels.
        print(f"Skipping {coarse_type} — only {n_third_level} unique '{third_level}' label(s).")


In [None]:
# Exploratory: list the names currently defined in the notebook namespace.
dir()

In [None]:
# Run NSForest on the T cell groups as they appear in the manuscript, separating
# the CD8 / gd T cells, the CD4 / NKT cells, and the ILCs into distinct groups.
# Each list holds FinestCellType labels; they are matched with `isin` further
# down, so every string must match the label in adata.obs exactly.

# Group 1: CD8 and gd T cell populations.
group1 = [
    "CD8 Effector",
    "Cd160 gd T cells",
    "Cycling CD8 T cells",
    "DN ab T cells",
    "Gzma gd T cells",
    "Ikzf2 high CD8 Effector",
    "Itgae positives CD8aa ab T cells",
    "S100a6 high CD8 Effector",
    "Themis positive Cd160 gd T cells",
    "Themis positive Gzma gd T cells",
    "gd T17",
    "Sell high CD8 T cells",
    "Sox4 positive gd T cells"
]

# Group 2: CD4 and NKT populations.
# NOTE(review): "Sell high CD4 T cell" is singular while the CD8 counterpart in
# group1 is plural ("... T cells") -- confirm it matches the label in the data.
group2 = [
    "Ccr2 high Treg",
    "Cycling CD4 T cells",
    "Follicular Treg",
    "Il12rb2 high CD4 Effector",
    "Il12rb2 high NKT",
    "Il17rb high CD4 Effector",
    "Il17rb high NKT",
    "Itgae high CD4 Effector",
    "Klrg1 high Treg",
    "NKT",
    "Rorc high CD4 Effector",
    "S100a6 high CD4 Effector",
    "Sell high CD4 T cell",
    "Tfh",
    "Sell high Treg"
]

# Group 3: ILC and NK populations.
# NOTE(review): the generic "T cells" label is listed with the ILCs -- confirm
# this placement is intended.
group3 = [
    "Ccr9 negative resident ILC2s",
    "Ccr9 positive migratory ILC2s",
    "Cycling ILC2s",
    "Cycling ILC3s",
    "Cycling NK cells",
    "ILC1s",
    "ILC3s",
    "Il1rl1 positive resident ILC2s",
    "LTIs",
    "NK cells",
    "T cells"
]

# Group name (used in the output directory name) -> FinestCellType labels.
groups = {
    "Group1_CD8_and_gdT": group1,
    "Group2_CD4_and_NKT": group2,
    "Group3_ILCs": group3
}


# Subset by coarse type

# Restrict to the single coarse lineage that the manuscript groups are drawn from.
big_coarse_type = "T cells and ILCs"
is_big_coarse_type = adata.obs[top_level] == big_coarse_type
adata_big = adata[is_big_coarse_type].copy()

print(f"Loaded big coarse type: {big_coarse_type}")
print(f"Total cells before splitting: {adata_big.n_obs}")



# Run NSForest at the FinestCellType level separately for each manuscript group.
for group_name, ftypes in groups.items():

    print(f"\n=== Processing {group_name} ({len(ftypes)} FinestCellTypes) ===")

    # `isin` silently ignores labels that do not occur in the data, so a
    # misspelled label would drop a whole cell type from the run without any
    # error. Report such labels explicitly.
    present_labels = set(adata_big.obs[third_level].unique())
    missing_labels = [label for label in ftypes if label not in present_labels]
    if missing_labels:
        print(f"WARNING: labels not found in '{third_level}' for {group_name}: {missing_labels}")

    # Subset ONLY these finest cell types
    adata_sub = adata_big[adata_big.obs[third_level].isin(ftypes)].copy()

    if adata_sub.n_obs == 0:
        print(f"WARNING: No cells found for {group_name}. Skipping.")
        continue

    print(f"Cells in subset: {adata_sub.n_obs}")

    # Same guard as the per-coarse-type loops: marker selection needs at least
    # two labels to discriminate between.
    n_group_labels = adata_sub.obs[third_level].nunique()
    if n_group_labels <= 1:
        print(f"Skipping {group_name} — only {n_group_labels} unique '{third_level}' label(s).")
        continue

    # Output directory. A dedicated name is used so the global `outdir` set by
    # the CoarseCellType save cell is not overwritten by this loop.
    group_outdir = os.path.join(
        base_outdir,
        f"NSForest_{big_coarse_type}_{group_name}"
    )

    # Run NSForest at the FinestCellType level
    run_nsforest_and_save(adata_sub, third_level, group_outdir)

print("\nAll group-level NSForest runs completed.")