In [None]:
# Analysis stack used throughout the notebook.
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os, scipy
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# NOTE(review): second alias for matplotlib.pyplot, which is already imported as `plt` above.
import matplotlib.pyplot as pyplot
from pynndescent import PyNNDescentTransformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
# Allow up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# R interface
from rpy2.robjects import pandas2ri
from rpy2.robjects import r
import rpy2.rinterface_lib.callbacks
import anndata2ri
import rpy2.robjects.numpy2ri
#import numpy2ri
import anndata
# Activate the pandas, AnnData and numpy converters used when objects cross the Python/R boundary.
pandas2ri.activate()
anndata2ri.activate()
rpy2.robjects.numpy2ri.activate()
import scanpy.external as sce

# Load the rpy2 IPython extension that provides the %%R cell magic.
%load_ext rpy2.ipython

# Herb et al., 2023: processing of all fetal cells

In [None]:
%%R -o adata_fetal -o umap_emb_fetal -o adata_neurons -o umap_emb_neurons
# Read the two Seurat objects (all fetal cells, hypothalamic neurons) and export each one
# to Python as a SingleCellExperiment together with its UMAP coordinates.
library(Seurat)

sobj_fetal <- readRDS('/home/bns631/datadir/Herb_neurons/KaZhouAll.rds')
sobj_neurons <- readRDS('/home/bns631/datadir/Herb_neurons/Data/EdKaZhouHypoNeurons.rds')

# Make "RNA" the active assay of both objects before converting them.
DefaultAssay(sobj_fetal) <- "RNA"
DefaultAssay(sobj_neurons) <- "RNA"

# UMAP coordinates are exported separately and attached again on the Python side.
umap_emb_fetal <- Embeddings(sobj_fetal, reduction = "umap")
umap_emb_neurons <- Embeddings(sobj_neurons, reduction = "umap")

adata_fetal <- as.SingleCellExperiment(sobj_fetal)
adata_neurons <- as.SingleCellExperiment(sobj_neurons)

In [None]:
# Attach the UMAP exported from R and reduce the cell metadata to the columns used downstream.
adata_fetal.obsm['umap'] = umap_emb_fetal
adata_fetal.obs = adata_fetal.obs[['Timepoint','sample','Timepoint_Study','ident','Study']]
adata_fetal.obs = adata_fetal.obs.rename(columns={'sample':'batch_key'})
adata_fetal.obs['Stage'] = 'Fetal'

# Cells present in both the all-cells object and the neuron object.
# Index.intersection gives a reproducible order; the previous set-based intersection was rebuilt
# three times and its order (and therefore the order of the categories added below) changed
# between kernel sessions because string hashing is randomised.
shared_cells = adata_fetal.obs_names.intersection(adata_neurons.obs_names)
neuron_labels = adata_neurons.obs.loc[shared_cells, 'AdultFetal_ExtrapolatedNuclei']

# Start from the `ident` annotation and overwrite the shared cells with the neuron object's
# AdultFetal_ExtrapolatedNuclei label (the assignment is aligned on cell name).
adata_fetal.obs['Cell_types'] = adata_fetal.obs['ident']
# The new labels must be registered as categories before they can be assigned.
# NOTE(review): add_categories raises if a label is already a category of `ident` — confirm
# that the two label sets are disjoint.
adata_fetal.obs['Cell_types'] = adata_fetal.obs['Cell_types'].cat.add_categories(neuron_labels.unique())
adata_fetal.obs.loc[shared_cells, 'Cell_types'] = neuron_labels

adata_fetal.obs['Cell_types'] = adata_fetal.obs['Cell_types'].replace({'Unassigned':'Unclassified neurons'})

#del adata_fetal.layers

In [None]:
# Display the reduced per-cell metadata table of the fetal dataset.
adata_fetal.obs

In [None]:
# UMAP of the fetal cells coloured by the `ident` column, rendered at 300 dpi.
with plt.rc_context({ "figure.dpi": 300}): 
    sc.pl.umap(adata_fetal, color=['ident'], legend_fontsize=5)

In [None]:
# Same UMAP coloured by the combined `Cell_types` column built above.
with plt.rc_context({ "figure.dpi": 300}): 
    sc.pl.umap(adata_fetal, color=['Cell_types'], legend_fontsize=5)

In [None]:
from scipy.stats import median_abs_deviation

def is_outlier(adata, metric: str, nmads: float):
    """Flag cells whose `metric` lies more than `nmads` MADs away from the median.

    Parameters
    ----------
    adata
        Object exposing a per-cell table `adata.obs` with a numeric column `metric`.
    metric
        Name of the column in `adata.obs` to test.
    nmads
        Number of median absolute deviations tolerated on either side of the median.
        Fractional values are accepted (the notebook calls this with 3.5).

    Returns
    -------
    Boolean Series aligned with `adata.obs`; True marks an outlier. The comparison is strict,
    so a value exactly on a bound is not flagged.
    """
    values = adata.obs[metric]
    # Compute the centre and the tolerated spread once instead of once per bound.
    center = np.median(values)
    spread = nmads * median_abs_deviation(values)
    return (values < center - spread) | (center + spread < values)

# Per-batch quality control of the fetal data. For every batch: flag outlier cells, keep a
# flagged-but-unfiltered copy for inspection, then drop the flagged cells, genes detected in
# fewer than 3 cells, and mitochondrial/ribosomal genes.
adata_dict_unfiltered = {}
adata_dict_filtered = {}

for batch in adata_fetal.obs.batch_key.unique():

    print(batch)

    batch_mask = adata_fetal.obs.batch_key.isin([batch])
    adata_temp = adata_fetal[batch_mask].copy()

    # Gene-level flags passed to calculate_qc_metrics through `qc_vars`
    gene_names = adata_temp.var_names
    adata_temp.var["mt"] = gene_names.str.startswith("MT-")
    adata_temp.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))

    sc.pp.calculate_qc_metrics(adata_temp, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True)

    # Outliers (3 MADs) in log total counts or log number of detected genes
    depth_outlier = is_outlier(adata_temp, "log1p_total_counts", 3)
    genes_outlier = is_outlier(adata_temp, "log1p_n_genes_by_counts", 3)
    adata_temp.obs["outlier"] = depth_outlier | genes_outlier

    # Outliers (3.5 MADs) in pct_counts_mt, or pct_counts_mt above 8
    mt_fraction_outlier = is_outlier(adata_temp, "pct_counts_mt", 3.5)
    adata_temp.obs["mt_outlier"] = mt_fraction_outlier | (adata_temp.obs["pct_counts_mt"] > 8)

    adata_temp.obs['pass_qc'] = (~adata_temp.obs["outlier"]) & (~adata_temp.obs["mt_outlier"])
    adata_dict_unfiltered[batch] = adata_temp.copy()

    print(f"Total number of cells: {adata_temp.n_obs}")
    adata_temp = adata_temp[adata_temp.obs['pass_qc']].copy()
    print(f"Number of cells after filtering of low quality cells: {adata_temp.n_obs}")
    print('______________________________________________________________________')

    # Remove rare genes
    sc.pp.filter_genes(adata_temp, min_cells=3)

    # Remove mito and ribo genes
    is_mito_or_ribo = adata_temp.var_names.str.startswith(('MT-', 'RPL', "RPS"))
    adata_dict_filtered[batch] = adata_temp[:, ~is_mito_or_ribo].copy()

In [None]:
# List the batch names for which QC results are available.
adata_dict_unfiltered.keys()

In [None]:
# Compare the QC metrics of one batch before and after filtering
batch = 'GW15_P'

with plt.rc_context({ "figure.dpi": 300}):
    # First pass: unfiltered cells; second pass: cells and genes that survived filtering.
    for adata_qc in (adata_dict_unfiltered[batch], adata_dict_filtered[batch]):
        # Recompute the QC metrics on this object (annotate mitochondrial genes as 'mt')
        adata_qc.var['mt'] = adata_qc.var_names.str.startswith('MT-')
        sc.pp.calculate_qc_metrics(adata_qc, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

        # One row of six panels: three violins, two histograms, one scatter
        fig, panels = plt.subplots(1, 6,  figsize=(20,4), gridspec_kw={'wspace':0.5})
        for panel, metric in zip(panels[:3], ("pct_counts_mt", 'n_genes_by_counts', 'total_counts')):
            sc.pl.violin(adata_qc, [metric], jitter=0.5, show=False, ax=panel, stripplot=False)
        sns.histplot(adata_qc.obs["n_genes_by_counts"], ax=panels[3])
        sns.histplot(adata_qc.obs["total_counts"], ax=panels[4])
        sc.pl.scatter(adata_qc, x='total_counts', y='n_genes_by_counts', show=False, ax=panels[5])
        plt.show()
        plt.clf()

# Turn the boolean QC flag into strings before using it as the colour variable
adata_dict_unfiltered[batch].obs['pass_qc'] = adata_dict_unfiltered[batch].obs['pass_qc'].astype(str)

with plt.rc_context({ "figure.dpi": 250}):
    sc.pl.umap(adata_dict_unfiltered[batch], color='pass_qc')

In [None]:
# Reduce every filtered batch to the shared metadata columns, reset its gene annotation,
# and stack all batches back into a single fetal object for later use.
fetal_obs_columns = ['Timepoint', 'batch_key', 'Timepoint_Study', 'Cell_types', 'Study', 'Stage','ident']

result_dict = {}
for batch, adata_temp in adata_dict_filtered.items():
    # The entries of adata_dict_filtered are modified in place here, then copied.
    adata_temp.obs = adata_temp.obs[fetal_obs_columns]
    del adata_temp.var
    result_dict[batch] = adata_temp.copy()

# join='outer' keeps the union of the genes that survived filtering in any batch.
fetal_batches = list(result_dict.values())
adata_fetal = fetal_batches[0].concatenate(fetal_batches[1:], batch_key=None, join='outer')



# Hypomap processing

In [None]:
# Load the adult dataset and replace X with the matrix stored in `.raw`.
# NOTE(review): `.raw` presumably holds the unnormalised counts — confirm.
adata_adult = sc.read('/datasets/renew_kirkeby/erno/human_HYPOMAP.h5ad')
adata_adult.X = adata_adult.raw.to_adata().X.copy()

In [None]:
# Harmonise the adult metadata with the fetal object: same column names for study and batch,
# a `Stage` label, a `Timepoint` string, and the UMAP stored under the key 'umap'.
adult_obs_columns = [
    'Dataset', 'sex', 'age_years', 'celltype_annotation',
    'C0_named', 'C1_named', 'C2_named', 'C3_named', 'C4_named',
    'region', 'Sample_ID',
]
adata_adult.obs = adata_adult.obs[adult_obs_columns].rename(
    columns={'Dataset':'Study','Sample_ID':'batch_key'}
)
adata_adult.obs['Stage'] = 'Adult'

# Keep the UMAP under the fetal object's key and drop the original embeddings.
adata_adult.obsm['umap'] = adata_adult.obsm['X_umap']
del adata_adult.var
for embedding_key in ('X_umap', 'X_scvi'):
    del adata_adult.obsm[embedding_key]

adata_adult.obs['Timepoint'] = 'years' + adata_adult.obs['age_years'].astype(str)




In [None]:
# Collapse the C1 clusters into coarse cell types; every neuronal C1 cluster becomes 'Neurons'.
c1_to_cell_type = {
    'C1-4 Oligo-Mature':'Oligo-Mature',
    'C1-1 Astrocytes':'Astrocytes',
    'C1-5 Pre-1':'Neurons',
    'C1-10 Mid-2':'Neurons',
    'C1-6 Mid-1':'Neurons',
    'C1-11 Post-2':'Neurons',
    'C1-8 Post-1':'Neurons',
    'C1-9 Mid-3':'Neurons',
    'C1-7 Pre-2':'Neurons',
    'C1-3 Oligo-Precursor':'Oligo-Precursor',
    'C1-13 Immune':'Immune',
    'C1-12 Vascular':'Vascular',
    'C1-2 Ependymal':'Ependymal',
}
coarse_types = adata_adult.obs['C1_named'].map(c1_to_cell_type).astype(object)

# Neurons are labelled by their `region`; all other cells keep the coarse type.
# Vectorised replacement for the former row-wise DataFrame.apply(axis=1), which evaluated a
# Python lambda once per cell; the resulting values are the same.
regions = adata_adult.obs['region'].astype(object)
adata_adult.obs['Cell_types'] = regions.where(coarse_types == 'Neurons', coarse_types)

# Rename the 'NA' region label and fix the misspelled 'Thalamaus' label.
adata_adult.obs['Cell_types'] = adata_adult.obs['Cell_types'].replace({'NA':'Unclassified neurons', 'Thalamaus':'Thalamus'})


with plt.rc_context({ "figure.dpi": 300}): 
    sc.pl.umap(adata_adult, color=['C1_named','Cell_types'], legend_loc='on data', legend_fontsize=5)

In [None]:
from scipy.stats import median_abs_deviation

def is_outlier(adata, metric: str, nmads: float):
    """Flag cells whose `metric` lies more than `nmads` MADs away from the median.

    Parameters
    ----------
    adata
        Object exposing a per-cell table `adata.obs` with a numeric column `metric`.
    metric
        Name of the column in `adata.obs` to test.
    nmads
        Number of median absolute deviations tolerated on either side of the median.
        Fractional values are accepted (the notebook calls this with 3.5).

    Returns
    -------
    Boolean Series aligned with `adata.obs`; True marks an outlier. The comparison is strict,
    so a value exactly on a bound is not flagged.
    """
    values = adata.obs[metric]
    # Compute the centre and the tolerated spread once instead of once per bound.
    center = np.median(values)
    spread = nmads * median_abs_deviation(values)
    return (values < center - spread) | (center + spread < values)

# Per-batch quality control of the adult data. For every batch: flag outlier cells, keep a
# flagged-but-unfiltered copy for inspection, then drop the flagged cells, genes detected in
# fewer than 3 cells, and mitochondrial/ribosomal genes.
adata_dict_unfiltered = {}
adata_dict_filtered = {}

for batch in adata_adult.obs.batch_key.unique():

    print(batch)

    batch_mask = adata_adult.obs.batch_key.isin([batch])
    adata_temp = adata_adult[batch_mask].copy()

    # Gene-level flags passed to calculate_qc_metrics through `qc_vars`
    gene_names = adata_temp.var_names
    adata_temp.var["mt"] = gene_names.str.startswith("MT-")
    adata_temp.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))

    sc.pp.calculate_qc_metrics(adata_temp, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True)

    # Outliers (3 MADs) in log total counts or log number of detected genes
    depth_outlier = is_outlier(adata_temp, "log1p_total_counts", 3)
    genes_outlier = is_outlier(adata_temp, "log1p_n_genes_by_counts", 3)
    adata_temp.obs["outlier"] = depth_outlier | genes_outlier

    # Outliers (3.5 MADs) in pct_counts_mt, or pct_counts_mt above 8
    mt_fraction_outlier = is_outlier(adata_temp, "pct_counts_mt", 3.5)
    adata_temp.obs["mt_outlier"] = mt_fraction_outlier | (adata_temp.obs["pct_counts_mt"] > 8)

    adata_temp.obs['pass_qc'] = (~adata_temp.obs["outlier"]) & (~adata_temp.obs["mt_outlier"])
    adata_dict_unfiltered[batch] = adata_temp.copy()

    print(f"Total number of cells: {adata_temp.n_obs}")
    adata_temp = adata_temp[adata_temp.obs['pass_qc']].copy()
    print(f"Number of cells after filtering of low quality cells: {adata_temp.n_obs}")
    print('______________________________________________________________________')

    # Remove rare genes
    sc.pp.filter_genes(adata_temp, min_cells=3)

    # Remove mito and ribo genes
    is_mito_or_ribo = adata_temp.var_names.str.startswith(('MT-', 'RPL', "RPS"))
    adata_dict_filtered[batch] = adata_temp[:, ~is_mito_or_ribo].copy()

In [None]:
# Compare the QC metrics of one batch before and after filtering
#batch = 'znZv1_S10'
batch = 'znZv1_1AS3CB'

with plt.rc_context({ "figure.dpi": 300}):
    # First pass: unfiltered cells; second pass: cells and genes that survived filtering.
    for adata_qc in (adata_dict_unfiltered[batch], adata_dict_filtered[batch]):
        # Recompute the QC metrics on this object (annotate mitochondrial genes as 'mt')
        adata_qc.var['mt'] = adata_qc.var_names.str.startswith('MT-')
        sc.pp.calculate_qc_metrics(adata_qc, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

        # One row of six panels: three violins, two histograms, one scatter
        fig, panels = plt.subplots(1, 6,  figsize=(20,4), gridspec_kw={'wspace':0.5})
        for panel, metric in zip(panels[:3], ("pct_counts_mt", 'n_genes_by_counts', 'total_counts')):
            sc.pl.violin(adata_qc, [metric], jitter=0.5, show=False, ax=panel, stripplot=False)
        sns.histplot(adata_qc.obs["n_genes_by_counts"], ax=panels[3])
        sns.histplot(adata_qc.obs["total_counts"], ax=panels[4])
        sc.pl.scatter(adata_qc, x='total_counts', y='n_genes_by_counts', show=False, ax=panels[5])
        plt.show()
        plt.clf()

# Turn the boolean QC flag into strings before using it as the colour variable
adata_dict_unfiltered[batch].obs['pass_qc'] = adata_dict_unfiltered[batch].obs['pass_qc'].astype(str)

with plt.rc_context({ "figure.dpi": 250}):
    sc.pl.umap(adata_dict_unfiltered[batch], color='pass_qc')

In [None]:
# Reduce every filtered batch to the shared metadata columns, reset its gene annotation,
# and stack all batches back into a single adult object for later use.
adult_obs_columns = [
    'Study', 'sex', 'age_years', 'celltype_annotation',
    'C0_named', 'C1_named', 'C2_named', 'C3_named', 'C4_named',
    'region', 'batch_key','Stage', 'Timepoint', 'Cell_types',
]

result_dict = {}
for batch, adata_temp in adata_dict_filtered.items():
    # The entries of adata_dict_filtered are modified in place here, then copied.
    adata_temp.obs = adata_temp.obs[adult_obs_columns]
    del adata_temp.var
    result_dict[batch] = adata_temp.copy()

# join='outer' keeps the union of the genes that survived filtering in any batch.
adult_batches = list(result_dict.values())
adata_adult = adult_batches[0].concatenate(adult_batches[1:], batch_key=None, join='outer')


# Merge datasets

In [None]:
# Stack the fetal and adult objects, keeping only genes present in both (join='inner'),
# and save the combined reference to disk.
adata_merged = adata_fetal.concatenate(adata_adult, batch_key=None, join='inner')
adata_merged.write('Data/fetal_adult_hypo_ref.h5ad')

In [None]:
# Show a summary of the merged object.
adata_merged

# Label transfer

In [None]:
# Reload the merged reference written above and load the Herb neuron object.
adata_merged = sc.read('Data/fetal_adult_hypo_ref.h5ad')
adata_herb = sc.read('/datasets/renew_kirkeby/erno/herb_neurons.h5ad')

# Replace ':' with '_' in the Herb cell names.
# NOTE(review): presumably this makes them match the naming used in adata_merged — confirm.
adata_herb.obs_names = list(map(lambda name: name.replace(":", "_"), adata_herb.obs_names))


In [None]:
# Drop the last two '-'-separated suffixes from the merged cell names.
# NOTE(review): presumably these are the suffixes appended by the two rounds of concatenation.
# This renames in place, so re-run the cell that loads adata_merged before re-running this one.
adata_merged.obs_names = [item.rsplit('-', 2)[0] for item in adata_merged.obs_names]

# The previous name-based selection failed on duplicated names; keep that failure explicit,
# because the mask-based selection below would otherwise let duplicates through silently.
if not adata_merged.obs_names.is_unique:
    raise ValueError("Cell names in adata_merged are not unique after removing the concatenation suffixes")

# Keep the cells present in both objects, in the order they have in adata_merged.
# A boolean mask replaces list(set(...) & set(...)): the order of a set of strings changes
# between kernel sessions, which made the cell order of every downstream object irreproducible.
in_herb = adata_merged.obs_names.isin(adata_herb.obs_names)
adata_merged = adata_merged[in_herb, :].copy()
adata_herb = adata_herb[adata_merged.obs_names].copy()

In [None]:
# Copy annotations from the merged reference onto the Herb neuron object.
# Both objects hold the same cells under the same names, so assignments align on cell name.
transferred_columns = {
    'adult_nuclei': 'region',
    'adult_subtype': 'C4_named',
    'adult_cell_type': 'celltype_annotation',
    'fetal_cell_type': 'ident',
    'Cell_types': 'Cell_types',
    'Stage': 'Stage',
    'batch_key': 'batch_key',
}
for herb_column, merged_column in transferred_columns.items():
    adata_herb.obs[herb_column] = adata_merged.obs[merged_column]

# Fix the misspelled region label and rename the 'NA' label.
adata_herb.obs['adult_nuclei'] = adata_herb.obs['adult_nuclei'].replace({'Thalamaus':'Thalamus','NA':'Unassigned'})

# `fetal_nuclei` holds the Cell_types label for fetal cells and stays empty for adult cells.
adata_herb.obs['fetal_nuclei'] = None
is_fetal = adata_merged.obs['Stage'] == 'Fetal'
adata_herb.obs.loc[is_fetal, 'fetal_nuclei'] = adata_herb.obs['Cell_types']

# Drop cells whose adult_nuclei label is one of the excluded values.
excluded_nuclei = ['Fx/OT/ac','ME','Vascular']
adata_herb = adata_herb[~adata_herb.obs.adult_nuclei.isin(excluded_nuclei)].copy()

# Keep a copy of X under the layer name passed to as.Seurat(counts = "counts") in R.
adata_herb.layers['counts'] = adata_herb.X.copy()

# Relabel batch GW18_P as GW18_Lane1, then drop the now-unused category.
relabel_mask = adata_herb.obs["batch_key"] == "GW18_P"
adata_herb.obs.loc[relabel_mask, "batch_key"] = 'GW18_Lane1'
adata_herb.obs['batch_key'] = adata_herb.obs['batch_key'].cat.remove_unused_categories()

adata_herb

In [None]:

# Gene filtering of the Herb object, done per batch within each `FetalAdult` stage.
# Each stage ends up as one object: the outer join of its filtered batches.
result_list = []

cell_metadata = adata_herb.obs.columns.tolist()

for stage in adata_herb.obs.FetalAdult.unique():
    stage_mask = adata_herb.obs.FetalAdult.isin([stage])
    adata_stage = adata_herb[stage_mask].copy()

    adata_dict_filtered = {}
    for batch in adata_stage.obs.batch_key.unique():

        print(batch)

        adata_temp = adata_stage[adata_stage.obs.batch_key.isin([batch])].copy()

        # Remove rare genes
        sc.pp.filter_genes(adata_temp, min_cells=3)

        # Remove mito and ribo genes
        is_mito_or_ribo = adata_temp.var_names.str.startswith(('MT-', 'RPL', "RPS"))
        adata_dict_filtered[batch] = adata_temp[:, ~is_mito_or_ribo].copy()

    # Restore the original metadata columns and reset the gene annotation before stacking.
    result_dict = {}
    for batch, adata_temp in adata_dict_filtered.items():
        adata_temp.obs = adata_temp.obs[cell_metadata]
        del adata_temp.var
        result_dict[batch] = adata_temp.copy()

    stage_batches = list(result_dict.values())
    result_list.append(stage_batches[0].concatenate(stage_batches[1:], batch_key=None, join='outer'))


In [None]:
# Combine the per-stage objects, keeping only genes present in every stage (join='inner').
# Passing result_list[1:] instead of result_list[1] gives the same result for two stages and
# no longer silently drops data if `FetalAdult` ever has more than two values.
adata_herb = result_list[0].concatenate(result_list[1:], batch_key=None, join='inner')
adata_herb

In [None]:
# Four UMAP panels of the Herb neuron object, two per row: the three nucleus annotations
# and the Stage label.
with plt.rc_context({ "figure.dpi": 300}): 
    sc.pl.umap(adata_herb, color=['adult_nuclei','AdultFetal_ExtrapolatedNuclei','fetal_nuclei' ,'Stage'], palette='tab20', wspace=0.4, size=10,ncols=2,title=['hypomap_adult_nuclei', 'herb_adult_nuclei','herb_fetal_nuclei' ,'Stage'])

In [None]:
%%R -i adata_herb
# Prepare the per-batch Seurat objects for reciprocal-PCA integration.

# NOTE(review): presumably a workaround for a Matrix/SeuratObject version mismatch — confirm
# it is still required.
Csparse_validate <- "CsparseMatrix_validate"
library(Seurat)
library(dplyr)

# Convert the imported object to Seurat and split it into one object per batch.
sobj <- as.Seurat(adata_herb, counts = "counts", data = NULL)
sobj_list <- SplitObject(sobj, split.by = 'batch_key')

# Normalise each batch and pick its 2000 most variable genes.
normalize_and_select <- function(batch_obj) {
    batch_obj %>%
        NormalizeData() %>%
        FindVariableFeatures(selection.method = "vst", nfeatures = 2000)
}
sobj_list <- lapply(X = sobj_list, FUN = normalize_and_select)

# Genes used for integration across batches.
features <- SelectIntegrationFeatures(object.list = sobj_list, nfeatures=2000)

# Scale and run PCA on the integration genes within each batch.
scale_and_reduce <- function(batch_obj) {
    batch_obj %>%
        ScaleData(features = features, verbose = FALSE) %>%
        RunPCA(features = features, verbose = FALSE)
}
sobj_list <- lapply(X = sobj_list, FUN = scale_and_reduce)

In [None]:
%%R -o adata_integrated -o umap_emb -o pca_emb -o hvg_list
# Integrate the batches with reciprocal PCA, embed the integrated data, and export the results.

anchors <- FindIntegrationAnchors(object.list = sobj_list, anchor.features = features, reduction = "rpca")
sobj <- IntegrateData(anchorset = anchors, k.weight = 60)

# Dimensionality reduction and neighbour graph on the integrated assay.
DefaultAssay(sobj) <- "integrated"
embedding_dims <- 1:40
sobj <- ScaleData(sobj)
sobj <- RunPCA(sobj)
sobj <- RunUMAP(sobj, dims = embedding_dims)
sobj <- FindNeighbors(sobj, dims = embedding_dims)

# Row names of the integrated assay, i.e. the genes carried through integration.
hvg_list <- rownames(sobj)
pca_emb <- Embeddings(sobj, reduction = "pca")

# Switch back to the assay created by as.Seurat before exporting and saving.
DefaultAssay(sobj) <- "originalexp"

umap_emb <- Embeddings(sobj, reduction = "umap")
adata_integrated <- as.SingleCellExperiment(sobj)

saveRDS(sobj,'Data/herb_integration_seurat_filtered.rds')

In [None]:
%%R -o adata_integrated -o umap_emb -o pca_emb -o hvg_list
# Reload the saved integration result so the integration does not have to be re-run.

library(Seurat)
sobj <- UpdateSeuratObject(readRDS('Data/herb_integration_seurat_filtered.rds'))

# Row names of the integrated assay, i.e. the genes carried through integration.
DefaultAssay(sobj) <- "integrated"
hvg_list <- rownames(sobj)

# Export with the original expression assay active.
DefaultAssay(sobj) <- "originalexp"
pca_emb <- Embeddings(sobj, reduction = "pca")
umap_emb <- Embeddings(sobj, reduction = "umap")
adata_integrated <- as.SingleCellExperiment(sobj)

In [None]:
# Attach the embeddings exported from R.
adata_integrated.obsm['X_pca'] = pca_emb
adata_integrated.obsm['X_umap'] = umap_emb

# Adult cells are the labelled reference, fetal cells the query.
ref_idx = adata_integrated.obs["Stage"] == "Adult"
query_idx = adata_integrated.obs["Stage"] == "Fetal"

# Fit a 40-nearest-neighbour classifier on the adult cells in PCA space.
reference = adata_integrated[ref_idx]
train_X = reference.obsm["X_pca"]
train_Y = reference.obs['adult_nuclei'].to_numpy()
knn = KNeighborsClassifier(n_neighbors=40)
knn.fit(train_X, train_Y)

# Class probabilities for the fetal cells, plus the most likely class and its probability.
knn_probs = knn.predict_proba(adata_integrated[query_idx].obsm["X_pca"])
best_class = np.argmax(knn_probs, axis=1)
max_probs = np.max(knn_probs, axis=1)
knn_pred = knn.classes_[best_class]

# Fetal cells whose best class has probability below 0.5 are labelled "Unassigned".
is_confident = max_probs >= 0.5
adata_integrated.obs.loc[query_idx, "predicted_nuclei"] = np.where(is_confident, knn_pred, "Unassigned")

# Cells whose fetal_nuclei label is "Fetal" get the label 'Fetal neuron'.
adata_integrated.obs.loc[adata_integrated.obs["fetal_nuclei"] == "Fetal", "predicted_nuclei"] = 'Fetal neuron'

# Adult cells keep their own nucleus label.
adata_integrated.obs.loc[ref_idx, "predicted_nuclei"] = adata_integrated.obs.loc[ref_idx, "adult_nuclei"]

# Number of fetal cells per predicted label.
adata_integrated.obs.loc[adata_integrated.obs.Stage.isin(['Fetal']), "predicted_nuclei"].value_counts()

In [None]:
# UMAP of the integrated object coloured by stage, adult nucleus label and transferred label.
with plt.rc_context({"figure.dpi": 250}):
    sc.pl.umap( adata_integrated,frameon=False, color=['Stage','adult_nuclei','predicted_nuclei'], size=8,wspace=0.3)

In [None]:
# One UMAP panel per predicted label: all cells as background, cells with that label on top,
# coloured by Stage.

# predicted_nuclei is created as a plain string column; it only becomes categorical once a
# scanpy plot has been drawn from it. Casting here removes that dependency on an earlier cell.
categories = adata_integrated.obs["predicted_nuclei"].astype("category").cat.categories

# Four panels per row and at least the original 5x4 grid; extra rows are added when there are
# more than 20 labels, where the fixed grid used to raise an IndexError.
n_cols = 4
n_rows = max(5, int(np.ceil(len(categories) / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), sharex=True, sharey=True)
axes = axes.flatten()

# NOTE(review): the figure is created outside this context, so "figure.dpi" presumably does
# not affect it; pass dpi=250 to plt.subplots if a higher resolution is wanted.
with plt.rc_context({"figure.dpi": 250}):
    for i, nuclei in enumerate(categories):
        sc.pl.umap( adata_integrated,frameon=False,ax=axes[i], size=8,color=None, show=False)
        sc.pl.umap(adata_integrated[adata_integrated.obs.predicted_nuclei.isin([nuclei])],color=["Stage"],ax=axes[i],  size=8,title=f'{nuclei} label transfer',show=False)

# Hide the panels that were not used.
for ax in axes[len(categories):]:
    ax.axis("off")

fig.tight_layout()
plt.show()

In [None]:
# Keep only the fetal cells of the integrated object and store a copy of X under
# layers['counts']. Note that this reuses the name `adata_fetal`, which earlier in the
# notebook referred to the object holding all fetal cells.
adata_fetal = adata_integrated[adata_integrated.obs.Stage == 'Fetal'].copy()
adata_fetal.layers['counts'] = adata_fetal.X.copy()
adata_fetal

In [None]:
%%R -i adata_fetal
# Prepare the per-batch Seurat objects of the fetal cells for reciprocal-PCA integration.

# NOTE(review): presumably a workaround for a Matrix/SeuratObject version mismatch — confirm
# it is still required.
Csparse_validate <- "CsparseMatrix_validate"
library(Seurat)
library(dplyr)

# Convert the imported object to Seurat and split it into one object per batch.
sobj <- as.Seurat(adata_fetal, counts = "counts", data = NULL)
sobj_list <- SplitObject(sobj, split.by = 'batch_key')

# Normalise each batch and pick its 2000 most variable genes.
normalize_and_select <- function(batch_obj) {
    batch_obj %>%
        NormalizeData() %>%
        FindVariableFeatures(selection.method = "vst", nfeatures = 2000)
}
sobj_list <- lapply(X = sobj_list, FUN = normalize_and_select)

# Genes used for integration across batches.
features <- SelectIntegrationFeatures(object.list = sobj_list, nfeatures=2000)

# Scale and run PCA on the integration genes within each batch.
scale_and_reduce <- function(batch_obj) {
    batch_obj %>%
        ScaleData(features = features, verbose = FALSE) %>%
        RunPCA(features = features, verbose = FALSE)
}
sobj_list <- lapply(X = sobj_list, FUN = scale_and_reduce)

In [None]:
%%R -o adata_fetal_integrated -o umap_emb -o pca_emb -o hvg_list
# Integrate the fetal batches with reciprocal PCA, cluster at several resolutions, and export.

anchors <- FindIntegrationAnchors(object.list = sobj_list, anchor.features = features, reduction = "rpca")
sobj <- IntegrateData(anchorset = anchors, k.weight = 60)

# Dimensionality reduction and neighbour graph on the integrated assay.
DefaultAssay(sobj) <- "integrated"
embedding_dims <- 1:40
sobj <- ScaleData(sobj)
sobj <- RunPCA(sobj)
sobj <- RunUMAP(sobj, dims = embedding_dims)
sobj <- FindNeighbors(sobj, dims = embedding_dims)

# Cluster once per resolution, in this order; each run adds its own metadata column.
for (cluster_resolution in c(1.6, 1.8, 1.9, 2, 2.1, 2.2)) {
    sobj <- FindClusters(sobj, resolution = cluster_resolution)
}

# Row names of the integrated assay, i.e. the genes carried through integration.
hvg_list <- rownames(sobj)
pca_emb <- Embeddings(sobj, reduction = "pca")

# Switch back to the assay created by as.Seurat before exporting and saving.
DefaultAssay(sobj) <- "originalexp"

umap_emb <- Embeddings(sobj, reduction = "umap")
adata_fetal_integrated <- as.SingleCellExperiment(sobj)

saveRDS(sobj,'Data/herb_fetal_integration_seurat_filtered.rds')

In [None]:
%%R -o adata_fetal_integrated -o umap_emb -o pca_emb
# Reload the saved fetal integration result so the integration does not have to be re-run.

library(Seurat)
sobj <- UpdateSeuratObject(readRDS('Data/herb_fetal_integration_seurat_filtered.rds'))

# Export with the original expression assay active.
DefaultAssay(sobj) <- "originalexp"
umap_emb <- Embeddings(sobj, reduction = "umap")
pca_emb <- Embeddings(sobj, reduction = "pca")
adata_fetal_integrated <- as.SingleCellExperiment(sobj)

In [None]:
# Attach the fetal-only UMAP and PCA embeddings exported from R.
# Note: `umap_emb` and `pca_emb` now refer to the fetal integration, not the fetal+adult one.
adata_fetal_integrated.obsm['X_umap'] = umap_emb
adata_fetal_integrated.obsm['X_pca'] = pca_emb


In [None]:
def _subcluster(cluster_id, resolution):
    """Re-cluster the cells of one resolution-1.9 cluster with Leiden at `resolution`.

    Returns a copy of the subset; the sub-cluster labels are stored by scanpy in
    `.obs['leiden']`.
    """
    in_cluster = adata_fetal_integrated.obs['integrated_snn_res.1.9'] == cluster_id
    subset = adata_fetal_integrated[in_cluster].copy()
    sc.pp.neighbors(subset)
    sc.tl.leiden(subset, resolution = resolution)
    return subset


# Clusters 17 and 7 are split further; their sub-clusters are used in the ARC assignment.
adata17 = _subcluster('17', 0.1)
adata7 = _subcluster('7', 0.3)



In [None]:
# Manual refinement of the transferred labels: start from the KNN prediction and overwrite
# it cluster by cluster (Seurat clusters at resolution 1.9).
adata_fetal_integrated.obs['assigned_nuclei'] = adata_fetal_integrated.obs['predicted_nuclei']


def _cells_in_clusters(cluster_ids):
    """Names of the cells whose resolution-1.9 cluster is one of `cluster_ids`."""
    obs = adata_fetal_integrated.obs
    return obs[obs['integrated_snn_res.1.9'].isin(cluster_ids)].index


def _set_assigned(cell_names, label):
    """Write `label` into `assigned_nuclei` for every cell listed in `cell_names`."""
    obs = adata_fetal_integrated.obs
    obs.loc[obs.index.isin(cell_names), "assigned_nuclei"] = label


# Assign ARC cells: three whole clusters plus selected sub-clusters of clusters 17 and 7
arc_cells = (
    set(_cells_in_clusters(['20', '37', '18']))
    | set(adata17.obs[adata17.obs['leiden'].isin(['0'])].index)
    | set(adata7.obs[adata7.obs['leiden'].isin(['0','2'])].index)
)
_set_assigned(arc_cells, "ARC")

# (label, clusters, reset_strays), applied in this order. When reset_strays is True, cells
# that were *predicted* as the label but lie outside its clusters are set to "Unassigned".
# NOTE(review): the reset is order dependent. A stray cell sitting in a cluster handled
# earlier in the list (or in ARC) ends up "Unassigned", whereas one sitting in a cluster
# handled later is overwritten with that later label. Confirm this is intended.
cluster_rules = [
    ("DMH", ['28'], False),
    ("LH", ['25','13'], True),
    ("MAM", ['27','34','9','38','33'], False),
    ("Perivent", ['31'], False),
    ("PVN", ['11'], False),
    ("TMN", ['23','19'], False),
    ("Thalamus", ['29'], True),
    ("VMH", ['10'], False),
]

for label, cluster_ids, reset_strays in cluster_rules:
    members = _cells_in_clusters(cluster_ids)
    _set_assigned(members, label)

    if reset_strays:
        obs = adata_fetal_integrated.obs
        predicted_as_label = obs['predicted_nuclei'] == label
        strays = obs[predicted_as_label & (~obs.index.isin(members))].index
        _set_assigned(strays, "Unassigned")



In [None]:
#nuclei = 'DMH'

# Side-by-side UMAPs of the fetal cells: transferred labels (left) and manually refined
# labels (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5), dpi=250)  # Adjust the figsize as needed
sc.pl.umap(adata_fetal_integrated,frameon=False,
    color=['predicted_nuclei'],layer='logcounts',cmap='jet',size=8, ax=axes[0],show=False)

# The right panel restricts the plot to the listed groups and uses white for everything else.
sc.pl.umap(adata_fetal_integrated,frameon=False,color=['assigned_nuclei'], size=8, ncols=3,ax=axes[1],  show=False, groups=['ARC', 'DMH', 'Fetal neuron', 'LH', 'LPOA', 'LTN', 'MAM', 'MPOA', 'POA',
       'PVN', 'Perivent', 'SCN', 'SON', 'TMN', 'Thalamus', 'VMH','Vent'],na_color='White', na_in_legend=False)
plt.tight_layout()
plt.show() 

In [None]:
# Nucleus to highlight; change this value to inspect another one.
nuclei = 'LH'

fig, axes = plt.subplots(1, 2, figsize=(10, 5), dpi=250)  # Adjust the figsize as needed

# Left: cells predicted as the chosen nucleus.
sc.pl.umap(adata_fetal_integrated,frameon=False, color=['predicted_nuclei'],layer='logcounts',cmap='jet',size=8, ax=axes[0],show=False , groups=nuclei)

# Right: cells assigned to the chosen nucleus after the manual refinement.
sc.pl.umap(adata_fetal_integrated,frameon=False, color=['assigned_nuclei'],layer='logcounts',cmap='jet',size=8, ax=axes[1],show=False , groups=nuclei)


plt.tight_layout()
plt.show() 

#adata_fetal_integrated[adata_fetal_integrated.obs.assigned_nuclei == nuclei].obs.predicted_nuclei.value_counts()

# Merge the assigned annotations with the integrated data

In [None]:
# Every cell starts with its adult nucleus label as a plain string; fetal cells are then
# overwritten with their refined label (the assignment is aligned on cell name).
adata_integrated.obs['assigned_nuclei'] = adata_integrated.obs.adult_nuclei.astype(str)
adata_integrated.obs.loc[adata_integrated.obs_names.isin(adata_fetal_integrated.obs_names), "assigned_nuclei"] = adata_fetal_integrated.obs.assigned_nuclei

# The former `adata_integrated.obsm['X_umap'] = umap_emb` was removed. By this point
# `umap_emb` holds the fetal-only embedding (the fetal integration cells overwrite it), so its
# row count no longer matches adata_integrated and the assignment fails when the notebook is
# run top to bottom. adata_integrated.obsm['X_umap'] was already set to the fetal+adult
# embedding when the labels were transferred.

In [None]:
# Assigned nuclei on the shared UMAP, split by stage: fetal cells left, adult cells right.
nuclei_groups = ['ARC', 'DMH', 'Fetal neuron', 'LH', 'LPOA', 'LTN', 'MAM', 'MPOA', 'POA',
                 'PVN', 'Perivent', 'SCN', 'SON', 'TMN', 'Thalamus', 'VMH','Vent']

fig, (ax_fetal, ax_adult) = plt.subplots(1, 2, figsize=(15, 5), dpi=250)  # Adjust the figsize as needed

fetal_cells = adata_integrated[adata_integrated.obs.Stage == 'Fetal']
sc.pl.umap(
    fetal_cells,
    frameon=False,
    groups=list(nuclei_groups),
    color=['assigned_nuclei'],
    layer='logcounts',
    cmap='jet',
    size=8,
    ax=ax_fetal,
    show=False,
    title='Fetal - Assigned nuclei',
)

adult_cells = adata_integrated[adata_integrated.obs.Stage == 'Adult']
sc.pl.umap(
    adult_cells,
    frameon=False,
    color=['assigned_nuclei'],
    size=8,
    ncols=3,
    ax=ax_adult,
    show=False,
    groups=list(nuclei_groups),
    na_color='White',
    na_in_legend=False,
    title='Adult - Assigned nuclei',
)

plt.tight_layout()
plt.show()

In [None]:
# Assigned nuclei and Stage on the shared UMAP; `groups` lists the values to show for both
# colour variables (nucleus names as well as the two stages).
with plt.rc_context({"figure.dpi": 250}):
    
    sc.pl.umap(adata_integrated,frameon=False, color=['assigned_nuclei','Stage'],layer='logcounts',cmap='jet',size=8, wspace=0.4, groups=['ARC', 'DMH', 'Fetal neuron', 'LH', 'LPOA', 'LTN', 'MAM', 'MPOA', 'POA',
       'PVN', 'Perivent', 'SCN', 'SON', 'TMN', 'Thalamus', 'VMH','Vent', 'Adult','Fetal'])

In [None]:
# Nucleus to highlight; change this value to inspect another one.
nuclei = 'ARC'

fig, axes = plt.subplots(1, 2, figsize=(10, 5), dpi=250)  # Adjust the figsize as needed

# Left: fetal cells assigned to the chosen nucleus.
sc.pl.umap(adata_integrated[adata_integrated.obs.Stage=='Fetal'],frameon=False, color=['assigned_nuclei'],layer='logcounts',cmap='jet',size=8, ax=axes[0],show=False , groups=nuclei, title='Fetal')

# Right: adult cells assigned to the chosen nucleus.
sc.pl.umap(adata_integrated[adata_integrated.obs.Stage=='Adult'],frameon=False, color=['assigned_nuclei'],layer='logcounts',cmap='jet',size=8, ax=axes[1],show=False , groups=nuclei, title='Adult')


plt.tight_layout()
plt.show() 


# Merge annotations with the preprocessed data

In [None]:
# Reload the merged fetal+adult reference from disk (all cells, original cell names).
adata_merged = sc.read('Data/fetal_adult_hypo_ref.h5ad')


In [None]:
# Build comparable cell keys by dropping the last two '-'-separated suffixes from each name.
# NOTE(review): presumably these are the suffixes appended by the two rounds of concatenation.
# The keys are kept in separate lists rather than written back to the objects, so
# adata_fetal_integrated is no longer renamed in place and re-running this cell is safe. It
# also avoids copying the whole of adata_merged only to rename its cells.
fetal_keys = [item.rsplit("-", 2)[0] for item in adata_fetal_integrated.obs_names]
merged_keys = [item.rsplit("-", 2)[0] for item in adata_merged.obs_names]

# Refined nucleus label of every fetal neuron, indexed by the comparable key
fetal_labels = pd.Series(
    adata_fetal_integrated.obs["assigned_nuclei"].astype(str).to_numpy(),
    index=fetal_keys,
)

# Preserve the original annotation once; a re-run must not overwrite it with refined labels.
if "Cell_types_orig" not in adata_merged.obs.columns:
    adata_merged.obs["Cell_types_orig"] = adata_merged.obs["Cell_types"].values.copy()

# Look up the refined label of each reference cell (missing where the cell is not a labelled
# fetal neuron) and fall back to the original annotation for all other cells.
refined = fetal_labels.reindex(merged_keys).to_numpy()
original = adata_merged.obs["Cell_types_orig"].astype(str).to_numpy()
adata_merged.obs["Cell_types"] = np.where(pd.isna(refined), original, refined)

In [None]:
# Save the reference with the refined `Cell_types` and the preserved `Cell_types_orig` columns.
adata_merged.write('Data/fetal_adult_hypo_ref_annotations.h5ad')