In [2]:
import sfaira
import os
import re
import scanpy as sc
import warnings
import pandas as pd
import numpy as np
import scipy.sparse as sp

warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', RuntimeWarning)

In [3]:
# Working directory that holds the `raw/` tree and the GTF file used below.
# Set the SFAIRA_DATA_DIR environment variable to run on another machine;
# without it the original location is used.
os.chdir(os.environ.get("SFAIRA_DATA_DIR", "/Users/hassansaei/Desktop/DIR_SFAIRA/"))

In [4]:
# Load GTF file and collect the names of all protein-coding / lncRNA genes
gtf_file = "Homo_sapiens.GRCh38.104.gtf"
ALLOWED_BIOTYPES = {"protein_coding", "lncRNA"}
genes = []

with open(gtf_file, "r") as f:
    for line in f:
        if line.startswith("#"):
            continue  # Skip headers
        columns = line.strip().split("\t")
        # Only "gene" records are of interest; blank or truncated lines
        # (fewer than 9 tab-separated fields) are ignored instead of raising.
        if len(columns) < 9 or columns[2] != "gene":
            continue
        # Attribute column looks like: key "value"; key "value"; ...
        info = {}
        for kv in columns[8].split("; "):
            key, sep, value = kv.partition(" ")
            if sep:
                # Everything after the key is the value, so values that
                # contain spaces are not cut at the first blank.
                info[key] = value.strip('";')
        if info.get("gene_biotype") in ALLOWED_BIOTYPES:
            genes.append(info.get("gene_name"))

# Create a set of allowed genes (records without a gene_name are dropped)
allowed_genes = {gene for gene in genes if gene and pd.notna(gene)}
print(f"Total protein_coding & lncRNA genes: {len(allowed_genes)}")

Total protein_coding & lncRNA genes: 24100


In [4]:
def harmonize_matrix(df, reference_genes=None):
    """
    Align a genes x cells matrix to a reference gene list.

    Genes present in both `df` and the reference are kept, reference genes
    absent from `df` are added as all-zero sparse rows, and genes that are
    not in the reference are dropped. Rows are ordered by the sorted
    reference gene names before the result is transposed.

    Parameters:
      df (pd.DataFrame): Matrix with genes on the index and cells on the columns.
      reference_genes (iterable of str, optional): Gene names to harmonize
          against. Defaults to the notebook-level `allowed_genes` set.

    Returns:
      pd.DataFrame: cells x genes matrix whose columns are the sorted
      reference genes.
    """
    if reference_genes is None:
        # Backward-compatible default: the gene set built from the GTF file
        reference_genes = allowed_genes
    # Sort once and reuse; this order defines the final gene order
    gene_order = sorted(set(reference_genes))
    # Get existing genes in the dataset
    dataset_genes = set(df.index)
    # Split the reference into genes found in df and genes to be zero-filled
    common_genes = [gene for gene in gene_order if gene in dataset_genes]
    missing_genes = [gene for gene in gene_order if gene not in dataset_genes]
    # Keep only the common genes
    df = df.loc[common_genes, :]
    # Create a zero-filled sparse matrix for missing genes
    zero_matrix = sp.csr_matrix((len(missing_genes), df.shape[1]))  # Sparse zeros
    # Convert missing genes into a sparse DataFrame
    zero_df = pd.DataFrame.sparse.from_spmatrix(zero_matrix, index=missing_genes, columns=df.columns)
    # Append missing genes with zero values
    harmonized_df = pd.concat([df, zero_df])
    # Ensure correct gene order (sorted reference genes)
    harmonized_df = harmonized_df.loc[gene_order, :]
    # Transpose and return (cells x genes)
    return harmonized_df.T

In [5]:
def memory_efficient_harmonize(input_file, output_file, harmonize_matrix):
    """
    Read an h5ad file, filter it, harmonize its genes and write the result.

    Cells with fewer than 300 detected genes and genes seen in fewer than
    3 cells are removed first. The expression matrix is then wrapped in a
    DataFrame (sparse-backed when X is sparse), handed to `harmonize_matrix`
    in genes x cells orientation, and the returned frame is stored as X of a
    new AnnData that keeps the original `obs`.

    Parameters:
      input_file (str): Path to the input h5ad file.
      output_file (str): Path where the output h5ad will be saved.
      harmonize_matrix (callable): Takes a genes x cells DataFrame and returns
          the harmonized DataFrame used as the new X.
    """
    adata = sc.read_h5ad(input_file)

    # Quality filters applied before harmonization
    sc.pp.filter_cells(adata, min_genes=300)
    sc.pp.filter_genes(adata, min_cells=3)

    # Choose the constructor that matches how X is stored, so a sparse X is
    # wrapped column-wise as sparse arrays rather than being densified here.
    build_frame = pd.DataFrame.sparse.from_spmatrix if sp.issparse(adata.X) else pd.DataFrame
    cells_by_genes = build_frame(adata.X, index=adata.obs_names, columns=adata.var_names)

    # harmonize_matrix receives genes on the rows
    harmonized = harmonize_matrix(cells_by_genes.T)

    harmonized_var = pd.DataFrame(index=harmonized.columns)
    sc.AnnData(X=harmonized, obs=adata.obs.copy(), var=harmonized_var).write(output_file)

In [None]:
# 1
# GSE147393: one 10x matrix triplet per sample, told apart by file-name prefix.

basepath = "raw/dno_doi_kidney_organoid_GSE147393/GSE147393_RAW/"
study = "dno_doi_kidney_organoid_2024_GSE147393"

files = os.listdir(basepath)

# Strip the 10x suffix from each file name and de-duplicate -> "<sample>_" prefixes
files = list(set([re.sub(r"(_barcodes\.tsv|_matrix\.mtx|_features\.tsv)\.gz", "", x) + "_" for x in files]))

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        var_names="gene_symbols",
        prefix=prefix  # Ensure prefix is used correctly
    )
    # Sample label = text between the first two underscores of the prefix
    # (falls back to the whole prefix when there is no such pair).
    label = re.search(r'_(.*?)_', prefix)
    adata.obs["sample_id"] = label.group(1) if label else prefix
    adata.obs["study"] = study
    adata.obs['source'] = "Pd_iPSC" # Patient derived iPSC
    adata.obs['diff_protocol'] = "Howden_2019"
    adata.obs['sc_protocol'] = "10x_3_v3.1"
    adata.obs['sequencing'] = "Illumina_NovaSeq_6000"
    adata.obs['genome_build'] = "hg38"
    adata.obs['Age'] = "d25"
    adata.obs['study_id'] = "1"
    adata.obs_names_make_unique()
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

# Define the mapping dictionaries (sample label -> condition / disease)
mapping = {"null": "HNF4A_del", "corrected": "HNF4A_corrected", "patient": "HNF4A_Het"}
mapping2 = {"null": "OMIM_600281", "corrected": "Control", "patient": "OMIM_600281"}

# Create the new columns based on `sample_id`
adata.obs["disease"] = adata.obs["sample_id"].map(mapping2)
adata.obs["condition"] = adata.obs["sample_id"].map(mapping)

# File name matches the path the harmonization step reads
# (dno_doi_kidney_organoid_GSE147393.h5ad, without the "_2024" infix).
adata.write("raw/dno_doi_kidney_organoid_GSE147393/dno_doi_kidney_organoid_GSE147393.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file='raw/dno_doi_kidney_organoid_GSE147393/dno_doi_kidney_organoid_GSE147393.h5ad',
                           output_file='raw/dno_doi_kidney_organoid_GSE147393/dno_doi_kidney_organoid_GSE147393.h5ad',
                           harmonize_matrix= harmonize_matrix)

In [None]:
# 2
# GSE230848: one 10x matrix triplet per sample, file names of the form
# "<sample>.barcodes.tsv.gz" etc.

basepath = "raw/d10_1016_j_celrep_2024_114310/GSE230848_RAW/"
study = "d10_1016_j_celrep_2024_114310"

files = os.listdir(basepath)

# Strip the 10x suffix and de-duplicate -> "<sample>." prefixes. The leading
# dots are escaped so they only match a literal "." separator.
files = list(set([re.sub(r"(\.barcodes\.tsv|\.matrix\.mtx|\.features\.tsv)\.gz", "", x) + "." for x in files]))

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        var_names="gene_symbols",
        prefix=prefix  # Ensure prefix is used correctly
    )
    # Sample label = part of the prefix before the first underscore
    adata.obs["sample_id"] = prefix.split("_")[0]
    adata.obs["study"] = study
    adata.obs['source'] = "HESC"
    adata.obs['diff_protocol'] = "Freedman_2015"
    adata.obs['sc_protocol'] = "10x_3_v3"
    adata.obs['sequencing'] = "Illumina_NovaSeq_6000"
    adata.obs['genome_build'] = "hg38"
    adata.obs['Age'] = "d24"
    adata.obs['study_id'] = "2"
    adata.obs_names_make_unique()
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

adata.write("raw/d10_1016_j_celrep_2024_114310/d10_1016_j_celrep_2024_114310.h5ad")

In [9]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file='raw/d10_1016_j_celrep_2024_114310/d10_1016_j_celrep_2024_114310.h5ad',
                          output_file='raw/d10_1016_j_celrep_2024_114310/d10_1016_j_celrep_2024_114310.h5ad',
                           harmonize_matrix= harmonize_matrix)

In [49]:
# 3
# GSE244608: every entry of the RAW directory is read as a 10x .h5 file.

basepath = "raw/d10_1016_j_stem_2023_12_003/GSE244608_RAW/"
study = "d10_1016_j_stem_2023_12_003"

files = os.listdir("raw/d10_1016_j_stem_2023_12_003/GSE244608_RAW/")

# Leading token of each file name (not used further in this cell)
names = [fname.split("_")[0] for fname in files]

adatas = []

for prefix in files:
    adata = sc.read_10x_h5(basepath + prefix)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    # Per-sample annotation; sample_id is the file name up to the first underscore.
    # No Age column is set for this study.
    sample_columns = {
        "sample_id": prefix.split("_")[0],
        "study": study,
        "source": "Pd_iPSC",  # Patient derived iPSC
        "diff_protocol": "Low_2019",
        "sc_protocol": "10x_ATAC_Gene",
        "sequencing": "Illumina_NovaSeq_6000",
        "genome_build": "hg38",
        "study_id": "3",
        "type": "snRNAseq",
        "disease": "ARPKD",
    }
    for column, value in sample_columns.items():
        adata.obs[column] = value
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

# Sample accession -> genotype / condition
mapping = {
    "GSM7821874": "PKHD1",
    "GSM7821876": "PKHD1_corrected",
    "GSM7821878": "PKHD1 mutation, KIF3A mutation",
}
adata.obs["condition"] = adata.obs["sample_id"].map(mapping)

adata.write("raw/d10_1016_j_stem_2023_12_003/d10_1016_j_stem_2023_12_003.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file='raw/d10_1016_j_stem_2023_12_003/d10_1016_j_stem_2023_12_003.h5ad',
                          output_file='raw/d10_1016_j_stem_2023_12_003/d10_1016_j_stem_2023_12_003.h5ad',
                           harmonize_matrix= harmonize_matrix)

In [None]:
# 4
# GSE132023: a single gzipped, tab-separated count table (genes x cells).

file_path = "raw/d10_1016_j_stem_2019_06_009/GSE132023_Day10_12_14_count.txt.gz"

# Read the tab-separated count matrix
df = pd.read_csv(file_path, sep="\t", index_col=0, compression="gzip")

# The first data row is used as the per-cell sample label; the rest are counts
meta = df.iloc[0]
df = df.iloc[1:]

adata = sc.AnnData(df.T)

adata.obs["sample_id"] = meta.values

adata.obs_names_make_unique()
adata.var_names_make_unique()
adata.obs["study"] = "d10_1016_j_stem_2019_06_009"
adata.obs['source'] = "hPSCs"
adata.obs['diff_protocol'] = "Low_2019"
adata.obs['sc_protocol'] = "10x_3_v1"
# Trailing blank removed from the label so it does not form its own category
adata.obs['sequencing'] = "Illumina_HiSeq_4000"
adata.obs['genome_build'] = "hg38"
adata.obs['study_id'] = "4"
adata.obs['type'] = "scRNAseq"
adata.obs["disease"] = "Control"

# Sample label -> GEO accession / differentiation day
mapping1 = {"Day10rep1": "GSM3834523", "Day10rep2": "GSM3834524", "Day12": "GSM3834525", "Day14rep1": "GSM3834526", "Day14rep2": "GSM3834527"}
mapping2 = {"Day10rep1": "d10", "Day10rep2": "d10", "Day12": "d12", "Day14rep1": "d14", "Day14rep2": "d14"}

# Create the new columns based on `sample_id`
adata.obs["sample"] = adata.obs["sample_id"].map(mapping1)
adata.obs["day"] = adata.obs["sample_id"].map(mapping2)

# Store X as float32; sp.issparse also recognises scipy sparse arrays,
# which are not instances of sp.spmatrix.
if sp.issparse(adata.X):
    adata.X = adata.X.astype(np.float32)
elif isinstance(adata.X, np.ndarray):
    adata.X = np.array(adata.X, dtype=np.float32)
else:
    raise TypeError(f"Unexpected type for adata.X: {type(adata.X)}")

adata.write("raw/d10_1016_j_stem_2019_06_009/d10_1016_j_stem_2019_06_009.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1016_j_stem_2019_06_009/d10_1016_j_stem_2019_06_009.h5ad",
                          output_file="raw/d10_1016_j_stem_2019_06_009/d10_1016_j_stem_2019_06_009.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [None]:
# Correct feature files
# Appends a constant "Gene Expression" column to every *_features.tsv.gz so the
# files carry a feature-type column. The files are rewritten in place.
import glob
import gzip

basepath = "raw/dno_doi_kidney_organoid_GSE241972/GSE241972_RAW/"
feature_files = glob.glob(os.path.join(basepath, "*_features.tsv.gz"))

# Process each file
for file in feature_files:
    # Read features file
    df = pd.read_csv(file, sep="\t", header=None)

    # A file that already has three or more columns was corrected before;
    # skipping it keeps the cell safe to re-run (no extra columns pile up).
    if df.shape[1] >= 3:
        continue

    # Add new column "Gene Expression"
    df["Gene Expression"] = "Gene Expression"

    # Write back to gzipped file without adding extra newlines
    with gzip.open(file, "wt", encoding="utf-8") as f:
        df.to_csv(f, sep="\t", header=False, index=False, lineterminator="\n")  # Ensures no extra newlines

In [53]:
# 5
# GSE241972: one 10x matrix triplet per sample, told apart by file-name prefix.

basepath = "raw/dno_doi_kidney_organoid_GSE241972/GSE241972_RAW/"
study = "dno_doi_kidney_organoid_GSE241972"

files = os.listdir("raw/dno_doi_kidney_organoid_GSE241972/GSE241972_RAW/")

# Strip the 10x suffix from every file name and de-duplicate -> "<sample>_" prefixes
files = list({
    re.sub(r"(_barcodes\.tsv|_matrix\.mtx|_features\.tsv|_genes\.tsv)\.gz", "", x) + "_"
    for x in files
})

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(basepath, prefix=prefix)
    adata.var_names_make_unique()

    # Per-sample annotation; sample_id keeps the full prefix (trailing "_" included).
    # No Age column is set for this study.
    sample_columns = {
        "sample_id": prefix,
        "study": study,
        "source": "KPC",  # Kidney primary cells
        "diff_protocol": "Guo_2023",
        "sc_protocol": "SMARTER",
        "sequencing": "Illumina_HiSeq_2500",
        "genome_build": "hg38",
        "study_id": "5",
        "type": "scRNAseq",
    }
    for column, value in sample_columns.items():
        adata.obs[column] = value
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

adata.write("raw/dno_doi_kidney_organoid_GSE241972/dno_doi_kidney_organoid_GSE241972.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/dno_doi_kidney_organoid_GSE241972/dno_doi_kidney_organoid_GSE241972.h5ad",
                          output_file="raw/dno_doi_kidney_organoid_GSE241972/dno_doi_kidney_organoid_GSE241972.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [34]:
# 6
# GSE184928: read one 10x matrix triplet per file-name prefix, annotate each
# sample, concatenate them and save the combined AnnData.

files = os.listdir("raw/d10_1038_s41467_022_33623_z/GSE184928_RAW/")

# Keep the first two "_"-separated tokens of every file name (plus a trailing
# "_") and de-duplicate, giving one prefix per sample
files = set(["_".join(x.split("_")[:2]) + "_" for x in files])

basepath = "raw/d10_1038_s41467_022_33623_z/GSE184928_RAW/"
study = "d10_1038_s41467_022_33623_z"

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        prefix= prefix,
        var_names="gene_symbols",
    )
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    adata.obs["sample_id"] = prefix # Add metadata to keep track of samples
    # Reduce sample_id to the token before the first underscore
    adata.obs["sample_id"] = [x.split("_")[0] for x in adata.obs.sample_id] 
    adata.obs["study"] = study
    adata.obs['source'] = "hiPSC" #  iPSC
    adata.obs['diff_protocol'] = "Vanslambrouck_2022"
    adata.obs['sc_protocol'] = "10x_3_v3.1"
    adata.obs['sequencing'] = "Illumina_NovaSeq_6000"
    adata.obs['genome_build'] = "hg38"
    #adata.obs['Age'] = "d25"
    #adata.obs['study_id'] = "6"
    adata.obs['type'] = "scRNAseq"
    adata.obs["disease"] = "Control"
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

# Define the mapping dictionary (sample accession -> age)
# NOTE(review): "GSM7821878" is also a key of the condition mapping in cell # 3
# (GSE244608); it looks copy-pasted here -- confirm the accession for this study.
# Any sample_id that is not a key of this dict gets NaN in "Age".
mapping = {"GSM5600482": "d13", "GSM5600483": "d27", "GSM7821878": "d25"}

# Create the new column based on `sample_id`
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)

adata.write("raw/d10_1038_s41467_022_33623_z/d10_1038_s41467_022_33623_z.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1038_s41467_022_33623_z/d10_1038_s41467_022_33623_z.h5ad",
                          output_file="raw/d10_1038_s41467_022_33623_z/d10_1038_s41467_022_33623_z.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [44]:
# 7
# GSE164564: every entry of the RAW directory is read as its own 10x matrix folder.

basepath = "raw/d10_1016_j_stem_2022_06_005/GSE164564_RAW/"
study = "d10_1016_j_stem_2022_06_005"

files = os.listdir("raw/d10_1016_j_stem_2022_06_005/GSE164564_RAW/")

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(basepath + prefix, var_names="gene_symbols")
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    # Per-sample annotation; sample_id is the entry name up to the first underscore
    sample_columns = {
        "sample_id": prefix.split("_")[0],
        "study": study,
        "source": "hiPSC",  # iPSC
        "diff_protocol": "Ransick_2019",
        "sc_protocol": "10x_3_v3",
        "sequencing": "Illumina_NextSeq_500",
        "genome_build": "hg38",
        "type": "scRNAseq",
        "disease": "Control",
    }
    for column, value in sample_columns.items():
        adata.obs[column] = value
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

# Sample accession -> age
mapping = {
    "GSM5014155": "d8",
    "GSM5014156": "d10",
    "GSM5014157": "d14",
    "GSM5014158": "d16",
    "GSM5014159": "d28",
}
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)

adata.write("raw/d10_1016_j_stem_2022_06_005/d10_1016_j_stem_2022_06_005.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1016_j_stem_2022_06_005/d10_1016_j_stem_2022_06_005.h5ad",
                          output_file="raw/d10_1016_j_stem_2022_06_005/d10_1016_j_stem_2022_06_005.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [50]:
# 8
# GSE181954: read the 10x matrix in the RAW directory, annotate it and save it.

files = os.listdir("raw/d10_1242_dev_200198/GSE181954_RAW/")

# Unique leading tokens (text before the first underscore) of the file names
files = set([x.split("_")[0] for x in files])

basepath = "raw/d10_1242_dev_200198/GSE181954_RAW/"
study = "d10_1242_dev_200198"

# NOTE(review): `prefix` is never passed to read_10x_mtx, so every iteration
# reads the same files from `basepath`; only the sample_id label differs.
# The write at the end of the loop body targets one fixed path, so each
# iteration overwrites the previous one and only the last survives. Because
# `files` is a set, which prefix comes last is not a defined order.
# Confirm whether a per-sample prefix + sc.concat (as in the other cells) was intended.
for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        var_names="gene_symbols",
    )
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    adata.obs["sample_id"] = prefix # Add metadata to keep track of samples
    #adata.obs["sample_id"] = [x.split("_")[0] for x in adata.obs.sample_id] 
    adata.obs["study"] = study
    adata.obs['source'] = "hiPSC" #  iPSC
    adata.obs['diff_protocol'] = "Jansen_2022"
    adata.obs['sc_protocol'] = "10x_3_v3.1"
    adata.obs['sequencing'] = "Illumina_NovaSeq_6000"
    adata.obs['genome_build'] = "hg38"
    adata.obs['Age'] = "d24"
    #adata.obs['study_id'] = "6"
    adata.obs['type'] = "scRNAseq"
    adata.obs["disease"] = "Control"
    adata.write("raw/d10_1242_dev_200198/d10_1242_dev_200198.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1242_dev_200198/d10_1242_dev_200198.h5ad",
                          output_file="raw/d10_1242_dev_200198/d10_1242_dev_200198.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [None]:
# Convert the GSE164647 Loupe Browser file (.cloupe) into CSV exports and an .h5ad file.
from cloupe import Cloupe

# parse file
# (load_csr=True presumably also loads the expression matrix -- confirm in the cloupe docs)
c = Cloupe("raw/dno_doi_kidney_organoid_GSE164647/GSE164647_Injured_kidney_organoids_gene_expression_profile_at_single_cell.cloupe", load_csr=True)

# extract barcodes, features and projections to CSVs
c.to_csv(outdir="raw/dno_doi_kidney_organoid_GSE164647")

# write the AnnData to the study's .h5ad path (the same file the next cells read)
c.to_anndata("raw/dno_doi_kidney_organoid_GSE164647/dno_doi_kidney_organoid_GSE164647.h5ad")

In [3]:
# 9
# Load the .h5ad written by the .cloupe conversion cell; no metadata columns
# are added to it in this cell.
adata = sc.read_h5ad("raw/dno_doi_kidney_organoid_GSE164647/dno_doi_kidney_organoid_GSE164647.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/dno_doi_kidney_organoid_GSE164647/dno_doi_kidney_organoid_GSE164647.h5ad",
                          output_file="raw/dno_doi_kidney_organoid_GSE164647/dno_doi_kidney_organoid_GSE164647.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [80]:
# 10
# GSE165266: a single comma-separated count table (genes x cells) for one sample.

file_path = "raw/d10_1093_stmcls_sxac009/GSE165266_RAW/GSM5029240_Sample1.csv"

# Read the comma-separated count matrix
df = pd.read_csv(file_path, sep=",", index_col=0)

adata = sc.AnnData(df.T)

adata.obs["sample_id"] = "GSM5029240"

adata.obs_names_make_unique()
adata.var_names_make_unique()
adata.obs["study"] = "d10_1093_stmcls_sxac009"
adata.obs['source'] = "hiPSC" # iPSC
adata.obs['diff_protocol'] = "Takasato_2016"
adata.obs['sc_protocol'] = "10x_3_v3.1"
adata.obs['sequencing'] = "Illumina_NovaSeq_6000"
adata.obs['genome_build'] = "hg38"
adata.obs['type'] = "scRNAseq"
adata.obs["disease"] = "Control"

# Store X as float32; sp.issparse also recognises scipy sparse arrays,
# which are not instances of sp.spmatrix.
if sp.issparse(adata.X):
    adata.X = adata.X.astype(np.float32)
elif isinstance(adata.X, np.ndarray):
    adata.X = np.array(adata.X, dtype=np.float32)
else:
    raise TypeError(f"Unexpected type for adata.X: {type(adata.X)}")

adata.write("raw/d10_1093_stmcls_sxac009/d10_1093_stmcls_sxac009.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1093_stmcls_sxac009/d10_1093_stmcls_sxac009.h5ad",
                          output_file="raw/d10_1093_stmcls_sxac009/d10_1093_stmcls_sxac009.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [83]:
# 11
# GSE161255: one 10x matrix triplet per sample, told apart by file-name prefix.

files = os.listdir("raw/10_1016_j_stem_2020_12_001/GSE161255_RAW/")

# Keep the first two "_"-separated tokens of every file name (plus a trailing
# "_") and de-duplicate, giving one prefix per sample
files = set(["_".join(x.split("_")[:2]) + "_" for x in files])

# NOTE(review): the input directory and the study label lack the leading "d"
# that the output path below (raw/d10_1016_...) uses -- confirm which is intended.
basepath = "raw/10_1016_j_stem_2020_12_001/GSE161255_RAW/"
study = "10_1016_j_stem_2020_12_001"

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        var_names="gene_symbols",
        prefix= prefix
    )
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    # Sample label = part of the prefix before the first underscore
    adata.obs["sample_id"] = prefix.split("_")[0]
    adata.obs["study"] = study
    adata.obs['source'] = "hiPSC" #  iPSC
    adata.obs['diff_protocol'] = "Howden_2019"
    adata.obs['sc_protocol'] = "10x_3_v3.1"
    adata.obs['sequencing'] = "Illumina_NovaSeq_6000"
    adata.obs['genome_build'] = "hg38"
    adata.obs['Age'] = "d25+14"
    #adata.obs['study_id'] = "6"
    adata.obs['type'] = "scRNAseq"
    # Restored the "Control" literal: a harmonize call had been pasted into the
    # middle of the string, which made the cell a syntax error.
    adata.obs["disease"] = "Control"
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

adata.write("raw/d10_1016_j_stem_2020_12_001/d10_1016_j_stem_2020_12_001.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1016_j_stem_2020_12_001/d10_1016_j_stem_2020_12_001.h5ad",
                          output_file="raw/d10_1016_j_stem_2020_12_001/d10_1016_j_stem_2020_12_001.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [None]:
# 12
# GSE152014: every entry of the RAW directory is read as its own 10x matrix folder.

basepath = "raw/d10_1038_s41563_020_00853_9/GSE152014_RAW/"
study = "d10_1038_s41563_020_00853_9"

files = os.listdir("raw/d10_1038_s41563_020_00853_9/GSE152014_RAW/")

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(basepath + prefix)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    # Per-sample annotation; sample_id keeps the full directory entry name.
    # No Age column is set for this study.
    sample_columns = {
        "sample_id": prefix,
        "study": study,
        "source": "hiPSC",  # iPSC
        "diff_protocol": "Howden_2019",
        "sc_protocol": "10x_3_v3.1",
        "sequencing": "Illumina_NovaSeq_6000",
        "genome_build": "hg38",
        "type": "scRNAseq",
        "disease": "Control",
    }
    for column, value in sample_columns.items():
        adata.obs[column] = value
    adatas.append(adata)

adata = sc.concat(adatas, join='outer')

adata.write("raw/d10_1038_s41563_020_00853_9/d10_1038_s41563_020_00853_9.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1038_s41563_020_00853_9/d10_1038_s41563_020_00853_9.h5ad",
                          output_file="raw/d10_1038_s41563_020_00853_9/d10_1038_s41563_020_00853_9.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [20]:
# Correct feature files
# Appends a copy of the first column and a constant "Gene Expression" column to
# every *_features.tsv.gz. The files are rewritten in place.
import glob
import gzip

basepath = "raw/d10_1038_s41467_019_13382_0/GSE136314_RAW/"
feature_files = glob.glob(os.path.join(basepath, "*_features.tsv.gz"))

# Process each file
for file in feature_files:
    # Read features file
    df = pd.read_csv(file, sep="\t", header=None)

    # A file that already has three or more columns was corrected before;
    # skipping it keeps the cell safe to re-run (no extra columns pile up).
    if df.shape[1] >= 3:
        continue

    # Duplicate the first column, then add the "Gene Expression" column
    df['1'] = df.iloc[:, 0]
    df["Gene Expression"] = "Gene Expression"

    # Write back to gzipped file without adding extra newlines
    with gzip.open(file, "wt", encoding="utf-8") as f:
        df.to_csv(f, sep="\t", header=False, index=False, lineterminator="\n")  # Ensures no extra newlines

In [None]:
# 13
# GSE136314: one 10x matrix triplet per sample, told apart by file-name prefix.

basepath = "raw/d10_1038_s41467_019_13382_0/GSE136314_RAW/"
study = "d10_1038_s41467_019_13382_0"

files = os.listdir("raw/d10_1038_s41467_019_13382_0/GSE136314_RAW/")

# Keep the first four "_"-separated tokens of every file name (plus a trailing
# "_") and de-duplicate, giving one prefix per sample
files = {"_".join(x.split("_")[:4]) + "_" for x in files}

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(basepath, var_names="gene_symbols", prefix=prefix)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    # Per-sample annotation; sample_id is the prefix up to the first underscore
    sample_columns = {
        "sample_id": prefix.split("_")[0],
        "study": study,
        "source": "hiPSC",  # iPSC
        "diff_protocol": "Takasato_2016",
        "sc_protocol": "10x_3_v3.1",
        "sequencing": "HiSeq_X_Ten",
        "genome_build": "hg38",
        "type": "scRNAseq",
        "disease": "Control",
    }
    for column, value in sample_columns.items():
        adata.obs[column] = value
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

adata = sc.concat(adatas, join='outer')

# Sample accession -> age
mapping = {
    "GSM4044536": "d29",
    "GSM4044537": "d15",
    "GSM4044538": "d7",
    "GSM4044539": "d0",
    "GSM4044540": "d29",
    "GSM4044541": "d15",
    "GSM4044542": "d7",
    "GSM4044543": "d0",
    "GSM4044544": "d32",
}
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)

adata.write("raw/d10_1038_s41467_019_13382_0/d10_1038_s41467_019_13382_0.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1038_s41467_019_13382_0/d10_1038_s41467_019_13382_0.h5ad",
                          output_file="raw/d10_1038_s41467_019_13382_0/d10_1038_s41467_019_13382_0.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [None]:
# 14
# GSE124472: every entry of the RAW directory is read as its own 10x matrix folder.

basepath = "raw/d10_1016_j_devcel_2019_06_001/GSE124472_RAW/"
study = "d10_1016_j_devcel_2019_06_001"

files = os.listdir("raw/d10_1016_j_devcel_2019_06_001/GSE124472_RAW/")

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(basepath + prefix)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    # Per-sample annotation; sample_id is the entry name up to the first underscore
    sample_columns = {
        "sample_id": prefix.split("_")[0],
        "study": study,
        "source": "hiPSC",  # iPSC
        "diff_protocol": "Tran_2019",
        "sc_protocol": "10x_3_v2",
        "sequencing": "Illumina_NextSeq_500",
        "genome_build": "hg38",
        "type": "scRNAseq",
        "disease": "Control",
    }
    for column, value in sample_columns.items():
        adata.obs[column] = value
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

adata = sc.concat(adatas, join='outer')

# Sample accession -> age
mapping = {
    "GSM3534660": "d16",
    "GSM3534661": "d16",
    "GSM3534662": "d28",
    "GSM3534663": "d28",
}
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)

adata.write("raw/d10_1016_j_devcel_2019_06_001/d10_1016_j_devcel_2019_06_001.h5ad")

In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1016_j_devcel_2019_06_001/d10_1016_j_devcel_2019_06_001.h5ad",
                          output_file="raw/d10_1016_j_devcel_2019_06_001/d10_1016_j_devcel_2019_06_001.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [29]:
# Correct feature files
# Appends a constant "Gene Expression" column to every *_features.tsv.gz so the
# files carry a feature-type column. The files are rewritten in place.
import glob
import gzip

basepath = "raw/d10_15252_embr_201847483/GSE119561_RAW/"
feature_files = glob.glob(os.path.join(basepath, "*_features.tsv.gz"))

# Process each file
for file in feature_files:
    # Read features file
    df = pd.read_csv(file, sep="\t", header=None)

    # A file that already has three or more columns was corrected before;
    # skipping it keeps the cell safe to re-run (no extra columns pile up).
    if df.shape[1] >= 3:
        continue

    # Add new column "Gene Expression"
    df["Gene Expression"] = "Gene Expression"

    # Write back to gzipped file without adding extra newlines
    with gzip.open(file, "wt", encoding="utf-8") as f:
        df.to_csv(f, sep="\t", header=False, index=False, lineterminator="\n")  # Ensures no extra newlines

In [31]:
# 15
# GSE119561: one 10x matrix triplet per sample, told apart by file-name prefix.

basepath = "raw/d10_15252_embr_201847483/GSE119561_RAW/"
study = "d10_15252_embr_201847483"

files = os.listdir("raw/d10_15252_embr_201847483/GSE119561_RAW/")

# Keep the first two "_"-separated tokens of every file name (plus a trailing
# "_") and de-duplicate, giving one prefix per sample
files = {"_".join(x.split("_")[:2]) + "_" for x in files}

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(basepath, var_names="gene_symbols", prefix=prefix)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    # Per-sample annotation; sample_id is the prefix up to the first underscore
    sample_columns = {
        "sample_id": prefix.split("_")[0],
        "study": study,
        "source": "hiPSC",  # iPSC
        "diff_protocol": "Takasato_2016",
        "sc_protocol": "10x_3_v2",
        "sequencing": "Illumina_HiSeq_2500",
        "genome_build": "hg38",
        "type": "scRNAseq",
        "disease": "Control",
    }
    for column, value in sample_columns.items():
        adata.obs[column] = value
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

adata = sc.concat(adatas, join='outer')

# Sample accession -> age
mapping = {"GSM3377671": "d25", "GSM3377672": "d18"}
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)

adata.write("raw/d10_15252_embr_201847483/d10_15252_embr_201847483.h5ad")

Sample GSM3377671_E6_ is processed
Sample GSM3377672_SIX2_ is processed


In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_15252_embr_201847483/d10_15252_embr_201847483.h5ad",
                          output_file="raw/d10_15252_embr_201847483/d10_15252_embr_201847483.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [32]:
# 16
# GSE114802: one 10x matrix triplet per sample, told apart by file-name prefix.

files = os.listdir("raw/d10_1038_s41592_018_0253_2/")

# Keep the first two "_"-separated tokens of every file name (plus a trailing
# "_") and de-duplicate, giving one prefix per sample
files = set(["_".join(x.split("_")[:2]) + "_" for x in files])

basepath = "raw/d10_1038_s41592_018_0253_2/"
study = "d10_1038_s41592_018_0253_2"

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        var_names="gene_symbols",
        prefix= prefix
    )
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    # The prefixes here are "GSE114802_org4_" and "GSE114802_org_": the first
    # token is the series accession and identical for both, so keeping only it
    # would merge the samples. Use the whole prefix (minus the trailing "_").
    adata.obs["sample_id"] = prefix.rstrip("_")
    adata.obs["study"] = study
    adata.obs['source'] = "hiPSC" #  iPSC
    adata.obs['diff_protocol'] = "Takasato_2016"
    adata.obs['sc_protocol'] = "10x_3_v2"
    adata.obs['sequencing'] = "Illumina_HiSeq_2500"
    adata.obs['genome_build'] = "hg38"
    adata.obs['Age'] = "d25"
    adata.obs['type'] = "scRNAseq"
    adata.obs["disease"] = "Control"
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

adata = sc.concat(adatas, join='outer')

adata.write("raw/d10_1038_s41592_018_0253_2/d10_1038_s41592_018_0253_2.h5ad")

Sample GSE114802_org4_ is processed
Sample GSE114802_org_ is processed


In [None]:
# Filter and harmonize the study file. Input and output paths are identical,
# so the file is overwritten in place.
memory_efficient_harmonize(input_file="raw/d10_1038_s41592_018_0253_2/d10_1038_s41592_018_0253_2.h5ad",
                          output_file="raw/d10_1038_s41592_018_0253_2/d10_1038_s41592_018_0253_2.h5ad",
                           harmonize_matrix= harmonize_matrix)

In [46]:
# 17
# Build one AnnData from the tab-separated .txt count tables of GSE118184.

# Folder containing the per-sample .txt files
folder_path = "raw/d10_1016_j_stem_2018_10_010/GSE118184/"

# Sorted because os.listdir returns entries in arbitrary order; this keeps the
# sample order, and therefore the concatenated cell order, reproducible.
txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".txt"))

# One AnnData per input file
adata_list = []

for file in txt_files:
    file_path = os.path.join(folder_path, file)

    # The full file name (extension included) is used as sample id: the Age
    # mapping keys and the "_"-split metadata below both rely on it.
    sample_name = file

    # Read the table and transpose it so the file's columns become observations
    df = pd.read_csv(file_path, sep="\t", index_col=0)
    df = df.T

    # Sparse storage for the count matrix
    adata = sc.AnnData(X=sp.csr_matrix(df.values))
    adata.obs_names = df.index     # cells (rows after the transpose)
    adata.var_names = df.columns   # genes (columns after the transpose)

    adata.obs["sample_id"] = sample_name
    adata_list.append(adata)

# Outer join keeps the union of genes across samples
adata = sc.concat(adata_list, join='outer')

# File names look like GSE118184_<protocol>_<source>_<condition>_dge.txt,
# so the "_"-separated tokens 1..3 carry the per-sample metadata.
name_parts = adata.obs['sample_id'].str.split('_')
adata.obs['diff_protocol'] = name_parts.str[1]
adata.obs['source'] = name_parts.str[2]
adata.obs['condition'] = name_parts.str[3]
adata.obs['sc_protocol'] = "Dropseq"
adata.obs["disease"] = "Control"
adata.obs['type'] = "scRNAseq"
adata.obs['sequencing'] = "Illumina_HiSeq_2500"
adata.obs["study"] = 'd10_1016_j_stem_2018_10_010'

# Age per file; samples whose file is not listed get NaN here
mapping = {
    "GSE118184_Takasato_ES_Batch1.2_dge.txt": "d26",
    "GSE118184_Morizane_iPS_Batch1.2_dge.txt": "d26",
    "GSE118184_Takasato_iPS_Batch1.2_dge.txt": "d26",
    "GSE118184_Morizane_ES_Batch1_dge.txt": "d26",
    "GSE118184_Takasato_iPS_day34_dge.txt": "d34",
    "GSE118184_Takasato_iPS_Batch3_dge.txt": "d26",
    "GSE118184_Takasato_iPS_BDNF.inhibitor_dge.txt": "d26",
    "GSE118184_Morizane_iPS_Batch3_dge.txt": "d26",
}
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)

# Cells whose name starts with "Day" carry their own time point in the name:
# take it from there, drop everything after the first "_" and turn "Day" into "d".
is_day_cell = adata.obs.index.str.startswith("Day")
adata.obs.loc[is_day_cell, "Age"] = adata.obs.index[is_day_cell]
adata.obs['Age'] = adata.obs['Age'].str.replace(r'_.*', '', regex=True)
adata.obs['Age'] = adata.obs['Age'].str.replace(r'Day', 'd', regex=True)


adata.write("raw/d10_1016_j_stem_2018_10_010/d10_1016_j_stem_2018_10_010.h5ad")

In [None]:
# Harmonize the gene set of the d10_1016_j_stem_2018_10_010 dataset.
# NOTE(review): input and output paths are identical, so the file is presumably
# rewritten in place — confirm memory_efficient_harmonize supports that.
memory_efficient_harmonize(
    input_file="raw/d10_1016_j_stem_2018_10_010/d10_1016_j_stem_2018_10_010.h5ad",
    output_file="raw/d10_1016_j_stem_2018_10_010/d10_1016_j_stem_2018_10_010.h5ad",
    harmonize_matrix=harmonize_matrix,
)

In [17]:
# 18
# Build one AnnData from the gzip-compressed count tables of GSE109718.

study = "d10_1016_j_stem_2018_04_022"
file_path = "raw/d10_1016_j_stem_2018_04_022/GSE109718_RAW/"

# Annotations shared by every cell of this study, in output column order
shared_obs = {
    "study": study,
    "source": "hiPSC-ESC",
    "diff_protocol": "Freedman",
    "sc_protocol": "Drop-seq",
    "sequencing": "Illumina_HiSeq_2000",
    "genome_build": "hg38",
    "Age": "d21",
    "type": "scRNAseq",
    "disease": "Control",
}

adatas = []
for names in os.listdir(file_path):
    # Tab-separated table; transposed so the file's columns become observations
    df = pd.read_csv(file_path + names, sep="\t", index_col=0, compression="gzip")
    adata = sc.AnnData(df.T)
    # Sample id is the file name up to the first underscore
    adata.obs["sample_id"] = names.split("_")[0]
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    for column, value in shared_obs.items():
        adata.obs[column] = value
    adatas.append(adata)

# Outer join keeps the union of genes across samples
adata = sc.concat(adatas, join='outer')

adata.write("raw/d10_1016_j_stem_2018_04_022/d10_1016_j_stem_2018_04_022.h5ad")

In [None]:
# Harmonize the gene set of the d10_1016_j_stem_2018_04_022 dataset.
# NOTE(review): input and output paths are identical, so the file is presumably
# rewritten in place — confirm memory_efficient_harmonize supports that.
memory_efficient_harmonize(
    input_file="raw/d10_1016_j_stem_2018_04_022/d10_1016_j_stem_2018_04_022.h5ad",
    output_file="raw/d10_1016_j_stem_2018_04_022/d10_1016_j_stem_2018_04_022.h5ad",
    harmonize_matrix=harmonize_matrix,
)

In [28]:
# 19
# Build one AnnData from the gzip-compressed count tables of GSE115986.

study = "d10_1172_jci_insight_122697"
file_path = "raw/d10_1172_jci_insight_122697/GSE115986_RAW/"

# Annotations shared by every cell of this study, in output column order.
# NOTE(review): no 'Age' column is set for this study — confirm this is intended.
shared_obs = {
    "study": study,
    "source": "ESC",
    "diff_protocol": "Freedman",
    "sc_protocol": "Drop-seq",
    "sequencing": "Illumina_HiSeq_2000",
    "genome_build": "hg38",
    "type": "scRNAseq",
    "disease": "Control",
}

adatas = []
for names in os.listdir(file_path):
    # Tab-separated table; transposed so the file's columns become observations
    df = pd.read_csv(file_path + names, sep="\t", index_col=0, compression="gzip")
    adata = sc.AnnData(df.T)
    # Sample id is the file name up to the first underscore
    adata.obs["sample_id"] = names.split("_")[0]
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    for column, value in shared_obs.items():
        adata.obs[column] = value
    adatas.append(adata)

# Outer join keeps the union of genes across samples
adata = sc.concat(adatas, join='outer')

adata.write("raw/d10_1172_jci_insight_122697/d10_1172_jci_insight_122697.h5ad")

In [22]:
# Harmonize the gene set of the d10_1172_jci_insight_122697 dataset.
# The output path differs from the input here (suffix "_harmonized").
memory_efficient_harmonize(
    input_file="raw/d10_1172_jci_insight_122697/d10_1172_jci_insight_122697.h5ad",
    output_file="raw/d10_1172_jci_insight_122697/d10_1172_jci_insight_122697_harmonized.h5ad",
    harmonize_matrix=harmonize_matrix,
)

In [31]:
# 20
# Build one AnnData from the 10x samples of study d10_1242_dev_172361.

basepath = "raw/d10_1242_dev_172361/GSE117211_RAW/"
study = "d10_1242_dev_172361"

# One prefix per sample, taken from the 10x matrix files only, so that stray
# directory entries (hidden files, notes) never become bogus prefixes.
# Sorting makes the sample order reproducible.
files = sorted({
    x.split("matrix.mtx")[0]
    for x in os.listdir(basepath)
    if x.endswith(("matrix.mtx", "matrix.mtx.gz"))
})

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        var_names="gene_symbols",
        prefix=prefix
    )
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    # Sample id is the prefix up to the first underscore
    adata.obs["sample_id"] = prefix.split("_")[0]
    adata.obs["study"] = study
    adata.obs['source'] = "hiPSC" #  iPSC
    adata.obs['diff_protocol'] = "Takasato_modified"
    adata.obs['sc_protocol'] = "10x_v2"
    adata.obs['sequencing'] = "Illumina_HiSeq_2500"
    adata.obs['genome_build'] = "hg38"
    adata.obs['Age'] = "d25"
    adata.obs['type'] = "scRNAseq"
    adata.obs["disease"] = "Control"
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

# Fail loudly instead of writing a stale `adata` left over from an earlier cell.
if not adatas:
    raise FileNotFoundError(f"No 10x matrix files found in {basepath}")

# Writing the loop variable alone would keep only the last sample, so all
# samples are merged first. A single sample is written untouched, which keeps
# its var annotations exactly as read.
adata = adatas[0] if len(adatas) == 1 else sc.concat(adatas, join='outer')

adata.write("raw/d10_1242_dev_172361/d10_1242_dev_172361.h5ad")

Sample GSM3273570_ is processed


In [None]:
# Harmonize the gene set of the d10_1242_dev_172361 dataset.
# NOTE(review): input and output paths are identical, so the file is presumably
# rewritten in place — confirm memory_efficient_harmonize supports that.
memory_efficient_harmonize(
    input_file="raw/d10_1242_dev_172361/d10_1242_dev_172361.h5ad",
    output_file="raw/d10_1242_dev_172361/d10_1242_dev_172361.h5ad",
    harmonize_matrix=harmonize_matrix,
)

In [34]:
# 21
# Build one AnnData from the gzip-compressed count tables of GSE131086.

study = "d10_1016_j_celrep_2020_108514"
file_path = "raw/d10_1016_j_celrep_2020_108514/GSE131086_RAW/"

# Annotations shared by every cell of this study, in output column order
shared_obs = {
    "study": study,
    "source": "hiPSC",
    "diff_protocol": "Takasato_Uchimura",
    "sc_protocol": "10X_5",
    "sequencing": "Illumina_NovaSeq_6000",
    "genome_build": "hg38",
    "Age": "d26",
    "type": "scRNAseq",
    "disease": "Control",
}

adatas = []
for names in os.listdir(file_path):
    # Tab-separated table; transposed so the file's columns become observations
    df = pd.read_csv(file_path + names, sep="\t", index_col=0, compression="gzip")
    adata = sc.AnnData(df.T)
    # Sample id is the file name up to the first underscore
    adata.obs["sample_id"] = names.split("_")[0]
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    for column, value in shared_obs.items():
        adata.obs[column] = value
    adatas.append(adata)

# Outer join keeps the union of genes across samples
adata = sc.concat(adatas, join='outer')

adata.write("raw/d10_1016_j_celrep_2020_108514/d10_1016_j_celrep_2020_108514.h5ad")

In [None]:
# Harmonize the gene set of the d10_1016_j_celrep_2020_108514 dataset.
# NOTE(review): input and output paths are identical, so the file is presumably
# rewritten in place — confirm memory_efficient_harmonize supports that.
memory_efficient_harmonize(
    input_file="raw/d10_1016_j_celrep_2020_108514/d10_1016_j_celrep_2020_108514.h5ad",
    output_file="raw/d10_1016_j_celrep_2020_108514/d10_1016_j_celrep_2020_108514.h5ad",
    harmonize_matrix=harmonize_matrix,
)

In [39]:
# 22
# Build one AnnData from the 10x samples of study d10_1186_s13073_022_01023_z.

basepath = "raw/d10_1186_s13073_022_01023_z/GSE165408_RAW/"
study = "d10_1186_s13073_022_01023_z"

# One prefix per sample, taken from the 10x matrix files only, so that stray
# directory entries (hidden files, notes) never become bogus prefixes.
# Sorting makes the sample order reproducible.
files = sorted({
    x.split("matrix.mtx")[0]
    for x in os.listdir(basepath)
    if x.endswith(("matrix.mtx", "matrix.mtx.gz"))
})

adatas = []

for prefix in files:
    adata = sc.read_10x_mtx(
        basepath,
        var_names="gene_symbols",
        prefix=prefix
    )
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    # Sample id is the prefix up to the first underscore
    adata.obs["sample_id"] = prefix.split("_")[0]
    adata.obs["study"] = study
    adata.obs['source'] = "hiPSC" #  iPSC
    adata.obs['diff_protocol'] = "Takasato_modified"
    adata.obs['sc_protocol'] = "10x_v2"
    adata.obs['sequencing'] = "Illumina_HiSeq_2500"
    adata.obs['genome_build'] = "hg38"
    adata.obs['Age'] = "d25"
    adata.obs['type'] = "scRNAseq"
    adata.obs["disease"] = "Control"
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

# Fail loudly instead of writing a stale `adata` left over from an earlier cell.
if not adatas:
    raise FileNotFoundError(f"No 10x matrix files found in {basepath}")

# Writing the loop variable alone would keep only the last sample, so all
# samples are merged first. A single sample is written untouched, which keeps
# its var annotations exactly as read.
adata = adatas[0] if len(adatas) == 1 else sc.concat(adatas, join='outer')

adata.write("raw/d10_1186_s13073_022_01023_z/d10_1186_s13073_022_01023_z.h5ad")

Sample GSM5032947_ is processed


In [None]:
# Harmonize the gene set of the d10_1186_s13073_022_01023_z dataset.
# NOTE(review): input and output paths are identical, so the file is presumably
# rewritten in place — confirm memory_efficient_harmonize supports that.
memory_efficient_harmonize(
    input_file="raw/d10_1186_s13073_022_01023_z/d10_1186_s13073_022_01023_z.h5ad",
    output_file="raw/d10_1186_s13073_022_01023_z/d10_1186_s13073_022_01023_z.h5ad",
    harmonize_matrix=harmonize_matrix,
)

In [None]:
# 23 LMRH
# Build one AnnData from the 10x samples in the LMRH folder.

basepath = "raw/dno_doi_kidney_organoid_LMRH/LMRH/"
study = "dno_doi_kidney_organoid_LMRH"

# One prefix per sample: the part of each file name before the first
# underscore, with the underscore re-appended.
files = {x.split("_")[0] + "_" for x in os.listdir("raw/dno_doi_kidney_organoid_LMRH/LMRH/")}

# Annotations shared by every cell of this study, in output column order
shared_obs = {
    "study": study,
    "source": "hiPSC",
    "diff_protocol": "Morizane_modified",
    "sc_protocol": "10x_3_v3.1",
    "sequencing": "Illumina_NovaSeq_6000",
    "genome_build": "hg38",
    "type": "scRNAseq",
    "disease": "Control",
}

adatas = []
for prefix in files:
    adata = sc.read_10x_mtx(basepath, var_names="gene_symbols", prefix=prefix)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    # Sample id is the prefix without its trailing underscore part
    adata.obs["sample_id"] = prefix.split("_")[0]
    for column, value in shared_obs.items():
        adata.obs[column] = value
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

# Outer join keeps the union of genes across samples
adata = sc.concat(adatas, join='outer')

# Age per sample id; ids missing from this table end up as NaN
mapping = {
    "2G9.R2": "d22",
    "2G9.R1": "d22",
    "12gneg": "d25",
    "12g.neg.j38": "d38",
    "05.4": "d25",
    "WIDGET1": "d25",
    "WIDGET2": "d25",
    "WIDGET3": "d25",
    "WIDGET.NT": "d25",
}
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)

adata.write("raw/dno_doi_kidney_organoid_LMRH/dno_doi_kidney_organoid_LMRH.h5ad")

In [13]:
# Harmonize the gene set of the dno_doi_kidney_organoid_LMRH dataset.
# NOTE(review): input and output paths are identical, so the file is presumably
# rewritten in place — confirm memory_efficient_harmonize supports that.
memory_efficient_harmonize(
    input_file="raw/dno_doi_kidney_organoid_LMRH/dno_doi_kidney_organoid_LMRH.h5ad",
    output_file="raw/dno_doi_kidney_organoid_LMRH/dno_doi_kidney_organoid_LMRH.h5ad",
    harmonize_matrix=harmonize_matrix,
)

In [None]:
# 24
# Build one AnnData from the 10x .h5 files of study d10_1073_pnas_2219699120.

basepath = "raw/d10_1073_pnas_2219699120/raw/"
study = "d10_1073_pnas_2219699120"

# Every entry of the raw folder is read as one sample
files = os.listdir("raw/d10_1073_pnas_2219699120/raw/")

# Annotations shared by every cell of this study, in output column order
shared_obs = {
    "study": study,
    "source": "hiPSC",
    "diff_protocol": "Takasato_modified",
    "sc_protocol": "10x_3_v3.1",
    "sequencing": "Illumina_NovaSeq_6000",
    "genome_build": "hg38",
    "type": "snRNAseq",
    "disease": "Control",
}

adatas = []
for prefix in files:
    adata = sc.read_10x_h5(basepath + prefix)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    # Sample id is the file name up to the first underscore
    adata.obs["sample_id"] = prefix.split("_")[0]
    for column, value in shared_obs.items():
        adata.obs[column] = value
    adatas.append(adata)
    print(f"Sample {prefix} is processed")

# Outer join keeps the union of genes across samples
adata = sc.concat(adatas, join='outer')

# Age per sample id; ids missing from this table end up as NaN
mapping = {
    "GSM6573624": "d26",
    "GSM6573626": "d26",
    "GSM6573628": "d26",
    "GSM6573630": "d19",
    "GSM6573634": "d16",
    "GSM6573636": "d12",
    "GSM6573638": "d12",
    "GSM6573640": "d7",
    "GSM6573642": "d7",
}
adata.obs["Age"] = adata.obs["sample_id"].map(mapping)


adata.write("raw/d10_1073_pnas_2219699120/d10_1073_pnas_2219699120.h5ad")

In [None]:
# Harmonize the gene set of the d10_1073_pnas_2219699120 dataset.
# NOTE(review): input and output paths are identical, so the file is presumably
# rewritten in place — confirm memory_efficient_harmonize supports that.
memory_efficient_harmonize(
    input_file="raw/d10_1073_pnas_2219699120/d10_1073_pnas_2219699120.h5ad",
    output_file="raw/d10_1073_pnas_2219699120/d10_1073_pnas_2219699120.h5ad",
    harmonize_matrix=harmonize_matrix,
)