In [1]:
import scanpy as sc
import hdf5plugin
import anndata
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import numpy as np
import pandas as pd
import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
VERSION = "3"

In [3]:
DATA_PATH = "./data/2_sanitized_h5ads"

In [4]:
def get_h5ads(directory):
    h5ad_name_lst = []
    for file in os.listdir(directory):
        if file.endswith(".h5ad"):
            # Construct the full file path
            file_path = os.path.join(directory, file)
            # Print the file name
            h5ad_name_lst.append(file)
    return h5ad_name_lst

In [5]:
adata_name_lst = get_h5ads(DATA_PATH)
adata_name_lst

['luad_gse97168.h5ad', 'luad_gse123902.h5ad', 'nsclc_gse99254.h5ad']

In [6]:
H5ADS = [f"{DATA_PATH}/{i}" for i in adata_name_lst]

MERGED_NAME = "lung_cancer_concat_data"
H5AD_CONCAT = f"./data/4_concatenated_h5ad/{MERGED_NAME}_v{VERSION}.h5ad"

In [7]:
H5ADS

['./data/2_sanitized_h5ads/luad_gse97168.h5ad',
 './data/2_sanitized_h5ads/luad_gse123902.h5ad',
 './data/2_sanitized_h5ads/nsclc_gse99254.h5ad']

In [8]:
adatas = [sc.read_h5ad(i) for i in H5ADS]
adatas

[AnnData object with n_obs × n_vars = 1275 × 10523
     obs: 'tissue', 'annotation_immune',
 AnnData object with n_obs × n_vars = 31608 × 19544
     obs: 'tissue', 'annotation_immune',
 AnnData object with n_obs × n_vars = 12345 × 21063
     obs: 'tissue', 'annotation_immune']

In [9]:
adata_merged = anndata.AnnData.concatenate(*adatas, batch_key='batch', join="inner")
adata_merged.obs.rename(columns={'annotation_immune': 'cell_type'}, inplace=True)
adata_merged

AnnData object with n_obs × n_vars = 45228 × 8687
    obs: 'tissue', 'cell_type', 'batch'

In [10]:
adata_merged.write_h5ad(
    H5AD_CONCAT,
    compression=hdf5plugin.FILTERS["zstd"],
    compression_opts=hdf5plugin.Zstd(clevel=5).filter_options
)