In [1]:
%load_ext autoreload
%autoreload 2

from nxfvars import nxfvars
import scanpy as sc
import numpy as np
import itertools
from tqdm import trange, tqdm
import scipy.sparse
import numpy.testing as npt
from scanpy_helpers.integration import (
    normalize_by_gene_length,
    sanitize_adata,
    validate_adata,
    add_doublet_annotation,
    undo_log_norm,
    remap_gene_symbols,
    drop_duplicated_genes,
    aggregate_duplicate_gene_symbols,
    merge_datasets,
    MANDATORY_COLS,
)
from threadpoolctl import threadpool_limits
from tqdm.contrib.concurrent import process_map
import mygene
from operator import and_
from functools import reduce
import pandas as pd
import anndata
import re
import os

In [2]:
out_dir = nxfvars.get("artifact_dir", "/local/scratch/sturm/")

In [3]:
threadpool_limits(int(nxfvars.get("cpus", "8")))

<threadpoolctl.threadpool_limits at 0x7f7ac00ea9a0>

In [4]:
sc.set_figure_params(figsize=(5, 5))

In [5]:
dataset_table = pd.read_csv(
    nxfvars.get("samplesheet", "../../tables/samplesheet_scrnaseq_preprocessing.csv")
)
dataset_path_annotated = nxfvars.get(
    "dataset_path_annotated",
    "../../data/20_integrate_scrnaseq_data/11_seed_annotations/artifacts/",
)
dataset_path = nxfvars.get(
    "dataset_path", "../../data/20_integrate_scrnaseq_data/02_qc_and_filtering/"
)

In [6]:
dataset_table

Unnamed: 0,id,input_adata,min_counts,max_counts,min_genes,max_genes,max_pct_mito,batch_key,run_solo
0,Adams_Kaminski_2020_COPD,data/10_public_datasets/Adams_Kaminski_2020_CO...,1000,35000,500,10000,20,sample,True
1,Chen_Zhang_2020_NSCLC,data/10_public_datasets/Chen_Zhang_2020_NSCLC/...,600,30000,250,10000,20,sample,True
2,Goveia_Carmeliet_2020_NSCLC,data/10_public_datasets/Goveia_Carmeliet_2020_...,600,30000,250,10000,20,sample,True
3,Guo_Zhang_2018_NSCLC,data/10_public_datasets/Guo_Zhang_2018_NSCLC/h...,20000,3000000,1000,20000,20,sample,False
4,Habermann_Kropski_2020_pulmonary-fibrosis,data/10_public_datasets/Habermann_Kropski_2020...,600,30000,200,10000,20,sample,True
5,He_Fan_2021_LUAD,data/10_public_datasets/Kim_Lee_2020_LUAD/h5ad...,1000,35000,300,10000,20,sample,True
6,Kim_Lee_2020_LUAD,data/10_public_datasets/He_Fan_2021_LUAD/h5ad_...,600,30000,250,10000,20,sample,True
7,Lambrechts_2018_LUAD_6149v1,data/10_public_datasets/Lambrechts_2018_LUAD/E...,600,30000,200,10000,15,sample,True
8,Lambrechts_2018_LUAD_6149v2,data/10_public_datasets/Lambrechts_2018_LUAD/E...,600,30000,250,10000,20,sample,True
9,Lambrechts_2018_LUAD_6653,data/10_public_datasets/Lambrechts_2018_LUAD/E...,1200,40000,250,10000,20,sample,True


In [7]:
datasets_annotated = ["Maynard_Bivona_2020_NSCLC", "Lambrechts_2018_LUAD_6653"]

In [8]:
datasets = {
    dataset_id: sc.read_h5ad(
        f"{dataset_id}.qc.h5ad"
        if dataset_path == "."
        else f"{dataset_path}/{dataset_id}/{dataset_id}.qc.h5ad"
    )
    for dataset_id in tqdm(dataset_table["id"])
}

100%|██████████| 22/22 [00:29<00:00,  1.33s/it]


In [9]:
# Set cell-types of unannotated datasets to "unknown" for scVI
for dataset_id in datasets:
    datasets[dataset_id].obs["cell_type"] = "unknown"

In [10]:
for dataset_id in datasets_annotated:
    tmp_adata = sc.read_h5ad(
        f"{dataset_path_annotated}/{dataset_id}_annotated.h5ad"
    )
    datasets[dataset_id].obs["cell_type"] = tmp_adata.obs["cell_type"]

### Dataset-specific filtering and metadata fixes

In [11]:
datasets["Adams_Kaminski_2020_COPD"].obs["origin"] = "normal"
datasets["Adams_Kaminski_2020_COPD"].obs["sex"] = "nan"
datasets["Adams_Kaminski_2020_COPD"] = datasets["Adams_Kaminski_2020_COPD"][
    datasets["Adams_Kaminski_2020_COPD"].obs["condition"] != "IPF", :
]

In [12]:
# No modifications necessary for Chen_Zhang

In [13]:
datasets["Goveia_Carmeliet_2020_NSCLC"] = datasets["Goveia_Carmeliet_2020_NSCLC"][
    datasets["Goveia_Carmeliet_2020_NSCLC"].obs["condition"] != "LLCC"
].copy()
datasets["Goveia_Carmeliet_2020_NSCLC"].obs["sex"] = "nan"

In [14]:
datasets["Guo_Zhang_2018_NSCLC"] = datasets["Guo_Zhang_2018_NSCLC"][
    datasets["Guo_Zhang_2018_NSCLC"].obs["tissue"] != "blood"
].copy()
datasets["Guo_Zhang_2018_NSCLC"] = normalize_by_gene_length(
    datasets["Guo_Zhang_2018_NSCLC"]
)
datasets["Guo_Zhang_2018_NSCLC"].obs["sex"] = "nan"

In [15]:
datasets["Habermann_Kropski_2020_pulmonary-fibrosis"].obs["sex"] = [
    {"M": "male", "F": "female", "Unknown": "nan"}[s]
    for s in datasets["Habermann_Kropski_2020_pulmonary-fibrosis"].obs["sex"]
]
datasets["Habermann_Kropski_2020_pulmonary-fibrosis"] = datasets[
    "Habermann_Kropski_2020_pulmonary-fibrosis"
][
    datasets["Habermann_Kropski_2020_pulmonary-fibrosis"].obs["condition"]
    == "healthy_control",
    :,
].copy()

In [16]:
# No modifications necessary for He_Fan

In [17]:
datasets["Maynard_Bivona_2020_NSCLC"] = normalize_by_gene_length(
    datasets["Maynard_Bivona_2020_NSCLC"]
)

In [18]:
datasets["Laughney_Massague_2020_NSCLC"].obs["sex"] = "nan"

In [19]:
datasets["Maier_Merad_2020_NSCLC"].obs["sex"] = "nan"

In [20]:
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["tissue"] = "lung"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["origin"] = "normal"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["condition"] = "healthy_control"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["sex"] = "nan"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].X.data = np.rint(
    datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].X.data
)

In [21]:
datasets["Mayr_Schiller_2020_pulmonary-fibrosis"].obs["sex"] = "nan"
datasets["Mayr_Schiller_2020_pulmonary-fibrosis"] = datasets[
    "Mayr_Schiller_2020_pulmonary-fibrosis"
][
    datasets["Mayr_Schiller_2020_pulmonary-fibrosis"].obs["condition"]
    == "healthy_control",
    :,
]

In [22]:
datasets["Reyfman_Misharin_2018_pulmonary-fibrosis"].obs["sex"] = "nan"
datasets["Reyfman_Misharin_2018_pulmonary-fibrosis"] = datasets[
    "Reyfman_Misharin_2018_pulmonary-fibrosis"
][
    datasets["Reyfman_Misharin_2018_pulmonary-fibrosis"].obs["condition"]
    == "healthy_control",
    :,
]

In [23]:
datasets["Travaglini_Krasnow_2020_Lung_SS2"] = datasets[
    "Travaglini_Krasnow_2020_Lung_SS2"
][datasets["Travaglini_Krasnow_2020_Lung_SS2"].obs["tissue"] == "lung", :]
datasets["Travaglini_Krasnow_2020_Lung_SS2"] = normalize_by_gene_length(
    datasets["Travaglini_Krasnow_2020_Lung_SS2"]
)

In [24]:
datasets["Zilionis_Klein_2019_NSCLC"] = datasets["Zilionis_Klein_2019_NSCLC"][
    datasets["Zilionis_Klein_2019_NSCLC"].obs["tissue"] == "lung", :
]
datasets["Zilionis_Klein_2019_NSCLC"].obs["sex"] = [
    {"M": "male", "F": "female", "Unknown": "nan"}[s]
    for s in datasets["Zilionis_Klein_2019_NSCLC"].obs["sex"]
]

Trying to set attribute `.obs` of view, copying.


### make patients unique across datasets

Except for the two Travaglini variants - they are the same patients profiled with different platforms

In [25]:
for dataset_id, adata in datasets.items():
    adata.obs["dataset"] = dataset_id
    adata.obs["patient"] = [
        f'{dataset.replace("_10x", "").replace("_SS2", "")}_{patient}'
        for dataset, patient in zip(adata.obs["dataset"], adata.obs["patient"])
    ]
    datasets[dataset_id] = adata

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


### Validate data

In [26]:
datasets["Lambrechts_2018_LUAD_6653"].X.data

array([ 1.,  1., 12., ...,  1., 22.,  1.], dtype=float32)

In [27]:
for dataset_id, adata in datasets.items():
    print(f"Validating {dataset_id}")
    sanitize_adata(adata)
    validate_adata(adata)

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical


Validating Adams_Kaminski_2020_COPD


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'origin' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical


Validating Chen_Zhang_2020_NSCLC


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical


Validating Goveia_Carmeliet_2020_NSCLC


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical


Validating Guo_Zhang_2018_NSCLC


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical


Validating Habermann_Kropski_2020_pulmonary-fibrosis


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical


Validating He_Fan_2021_LUAD


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical


Validating Kim_Lee_2020_LUAD


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical


Validating Lambrechts_2018_LUAD_6149v1
Validating Lambrechts_2018_LUAD_6149v2


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical


Validating Lambrechts_2018_LUAD_6653


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical


Validating Laughney_Massague_2020_NSCLC
Validating Madissoon_Meyer_2020_pulmonary-fibrosis


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'tissue' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'origin' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'condition' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_ca

Validating Maier_Merad_2020_NSCLC
Validating Maynard_Bivona_2020_NSCLC


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_cat

Validating Mayr_Schiller_2020_pulmonary-fibrosis
Validating Reyfman_Misharin_2018_pulmonary-fibrosis


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical


Validating Travaglini_Krasnow_2020_Lung_10x
Validating Travaglini_Krasnow_2020_Lung_SS2


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical


Validating UKIM-V


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical


Validating Vieira_Teichmann_2019_asthma
Validating Wu_Zhou_2021_NSCLC


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical


Validating Zilionis_Klein_2019_NSCLC


## Gene identifier remapping

Use precompiled, static table for gene symbol remapping, since querying MyGene.info requires
internet connection and is not guaranteed to be reproducible. 

In [28]:
# datasets_remapped = process_map(remap_gene_symbols, datasets.values(), max_workers=32)
# for dataset_id, dataset in zip(datasets.keys(), datasets_remapped):
#     datasets[dataset_id] = dataset

In [29]:
# gene_symbol_dict = pd.concat(
#     x.var["original_gene_symbol"]
#     .reset_index()
#     .rename(columns={"index": "gene_symbol", "original_gene_symbol": "alias"})
#     for x in datasets_remapped
# ).drop_duplicates().dropna()
# gene_symbol_dict.to_csv("../../tables/gene_symbol_dict.csv")

In [30]:
gene_symbol_df = pd.read_csv(
    nxfvars.get("gene_symbol_table", "../../tables/gene_symbol_dict.csv"),
    index_col=False,
)
gene_symbol_dict = {
    alias: symbol
    for alias, symbol in zip(gene_symbol_df["alias"], gene_symbol_df["gene_symbol"])
}

In [31]:
for dataset_id, tmp_dataset in datasets.items():
    tmp_dataset.var_names = [gene_symbol_dict.get(x, x) for x in tmp_dataset.var_names]

### aggregate duplicate gene symbols

In [32]:
for dataset_id, dataset in datasets.items():
    print(dataset_id)
    datasets[dataset_id] = aggregate_duplicate_gene_symbols(dataset)

Adams_Kaminski_2020_COPD


100%|██████████| 22/22 [00:00<00:00, 269.72it/s]


Chen_Zhang_2020_NSCLC


100%|██████████| 18/18 [00:00<00:00, 354.56it/s]


Goveia_Carmeliet_2020_NSCLC


100%|██████████| 22/22 [00:00<00:00, 501.99it/s]


Guo_Zhang_2018_NSCLC


100%|██████████| 18/18 [00:00<00:00, 128.25it/s]


Habermann_Kropski_2020_pulmonary-fibrosis


100%|██████████| 48/48 [00:00<00:00, 392.62it/s]


He_Fan_2021_LUAD


100%|██████████| 46/46 [00:00<00:00, 381.79it/s]


Kim_Lee_2020_LUAD


100%|██████████| 15/15 [00:00<00:00, 371.36it/s]


Lambrechts_2018_LUAD_6149v1


100%|██████████| 8/8 [00:00<00:00, 429.19it/s]


Lambrechts_2018_LUAD_6149v2


100%|██████████| 15/15 [00:00<00:00, 386.47it/s]


Lambrechts_2018_LUAD_6653


100%|██████████| 14/14 [00:00<00:00, 392.79it/s]


Laughney_Massague_2020_NSCLC


100%|██████████| 29/29 [00:00<00:00, 462.65it/s]


Madissoon_Meyer_2020_pulmonary-fibrosis


100%|██████████| 53/53 [00:00<00:00, 438.82it/s]


Maier_Merad_2020_NSCLC


100%|██████████| 26/26 [00:00<00:00, 508.72it/s]


Maynard_Bivona_2020_NSCLC


100%|██████████| 20/20 [00:00<00:00, 209.39it/s]


Mayr_Schiller_2020_pulmonary-fibrosis


100%|██████████| 784/784 [00:02<00:00, 344.40it/s]


Reyfman_Misharin_2018_pulmonary-fibrosis


100%|██████████| 57/57 [00:00<00:00, 422.29it/s]


Travaglini_Krasnow_2020_Lung_10x


100%|██████████| 43/43 [00:00<00:00, 532.09it/s]

Travaglini_Krasnow_2020_Lung_SS2



100%|██████████| 31/31 [00:00<00:00, 241.47it/s]


UKIM-V


100%|██████████| 14/14 [00:00<00:00, 349.76it/s]


Vieira_Teichmann_2019_asthma


100%|██████████| 33/33 [00:00<00:00, 596.03it/s]


Wu_Zhou_2021_NSCLC


100%|██████████| 21/21 [00:00<00:00, 330.68it/s]


Zilionis_Klein_2019_NSCLC


100%|██████████| 30/30 [00:00<00:00, 300.91it/s]


## Export all

In [33]:
obs_all = pd.concat([x.obs for x in datasets.values()], ignore_index=True).reset_index(
    drop=True
)
obs_all = (
    obs_all.loc[
        :,
        MANDATORY_COLS
        + [
            "accession",
            "sampleType",
            "platform",
            "age",
            "tobacco",
            "ethnicity",
            "processing_site",
            "Tissue origins",
            "histology",
            "smoking",
            "pathology",
            "EGFR",
            "tumor_stage",
            "geo_accession",
            "tissue_orig",
            "replicate",
            "race",
            "smoking_status",
            "driver_gene",
            "driver_mutation",
            "secondary_mutation",
            "Notes",
            "stage_at_diagnosis",
            "pathlogy_review",
            "biopsy_date",
            "sort_date",
            "biopsy_type",
            "biopsy_time_status",
            "early_treatment_status",
            "best_response_status",
            "biopsy_timing",
            "analysis",
            "treatment_history",
            "treatment_history_detail",
            "line_of_therapy",
            "treatment_type",
            "treatment",
            "percent_PFS_ref_values",
            "percent.PFS.reference.values",
            "infections",
            "early_bx_day",
            "treatment_start_date",
            "pfs_over_under",
            "pfs_day",
            "pfs_month",
            "date_of_death",
            "stageIII.IV_ca_dx_date",
            "ca_dx_OS",
            "region",
            "location",
            "label",
            "tumor_id",
            "tumor_type",
            "GEO_Sample",
            "biopsy_segment",
            "gsm",
            "characteristics_ch1.7.treatment received prior to surgery (1= treated; 0=untreated)",
        ],
    ]
    .join(dataset_table.set_index("id"), on="dataset")
    .drop_duplicates(ignore_index=False)
    .set_index("sample")
)
# Duplicated doesn't filter out two duplicated rows, don't ask why.
obs_all = obs_all.loc[~obs_all.index.duplicated(), :]

In [34]:
assert (
    obs_all.index.drop_duplicates().size == obs_all.shape[0]
), "The number of unique samples equals the number of rows"

In [35]:
merged_all = merge_datasets(datasets.values(), symbol_in_n_datasets=17)

In [36]:
merged_all.shape

(1124947, 17837)

In [37]:
merged_all.obs.drop_duplicates().reset_index(drop=True)

Unnamed: 0,sample,patient,tissue,origin,condition,dataset,sex,cell_type,batch
0,Adams_Kaminski_2020_COPD_001C,Adams_Kaminski_2020_COPD_001C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_001C
1,Adams_Kaminski_2020_COPD_002C,Adams_Kaminski_2020_COPD_002C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_002C
2,Adams_Kaminski_2020_COPD_003C,Adams_Kaminski_2020_COPD_003C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_003C
3,Adams_Kaminski_2020_COPD_022C-a,Adams_Kaminski_2020_COPD_222C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_022C-a
4,Adams_Kaminski_2020_COPD_022C-b,Adams_Kaminski_2020_COPD_222C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_022C-b
...,...,...,...,...,...,...,...,...,...
1293,Zilionis_Klein_2019_NSCLC_p2t1,Zilionis_Klein_2019_NSCLC_patient_2,lung,tumor_primary,LSCC,Zilionis_Klein_2019_NSCLC,female,unknown,Zilionis_Klein_2019_NSCLC_p2t1
1294,Zilionis_Klein_2019_NSCLC_p4t1,Zilionis_Klein_2019_NSCLC_patient_4,lung,tumor_primary,LUAD,Zilionis_Klein_2019_NSCLC,male,unknown,Zilionis_Klein_2019_NSCLC_p4t1
1295,Zilionis_Klein_2019_NSCLC_p1t3,Zilionis_Klein_2019_NSCLC_patient_1,lung,tumor_primary,LSCC,Zilionis_Klein_2019_NSCLC,male,unknown,Zilionis_Klein_2019_NSCLC_p1t3
1296,Zilionis_Klein_2019_NSCLC_p5t1,Zilionis_Klein_2019_NSCLC_patient_5,lung,tumor_primary,LUAD,Zilionis_Klein_2019_NSCLC,female,unknown,Zilionis_Klein_2019_NSCLC_p5t1


In [38]:
merged_all.write_h5ad(f"{out_dir}/merged_all.h5ad")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'tissue' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'origin' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'condition' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'batch' as categorical


In [39]:
# Some samples drop out due to the min cells threshold. Keep only the remaining samplese in the obs table.
obs_all.loc[merged_all.obs["sample"].unique(), :].to_csv(f"{out_dir}/obs_all.csv")