In [1]:
%load_ext autoreload
%autoreload 2

import scanpy as sc
import numpy as np
import itertools
from tqdm import trange
import scipy.sparse
import numpy.testing as npt
from scanpy_helpers.integration import (
    normalize_by_gene_length,
    sanitize_adata,
    validate_adata,
    add_doublet_annotation,
    undo_log_norm,
    remap_gene_symbols,
    drop_duplicated_genes,
    aggregate_duplicate_gene_symbols,
    merge_datasets
)
from threadpoolctl import threadpool_limits
from tqdm.contrib.concurrent import process_map
import mygene
from operator import and_
from functools import reduce
import pandas as pd
import anndata

In [2]:
# Cap the native thread pools managed by threadpoolctl at 8 threads.
# Called as a plain function (not as a context manager), so nothing in this
# notebook restores the previous limits afterwards.
threadpool_limits(8)

<threadpoolctl.threadpool_limits at 0x7f5108fd9220>

In [3]:
# Use a square 5 x 5 default figure size for all scanpy plots in this notebook.
sc.set_figure_params(figsize=(5, 5))

In [4]:
# Datasets that already carry a manual cell-type annotation. Their
# `cell_type` column is transferred onto the QC'd datasets further below.
annotated_paths = {
    "Maynard_Bivona_2020_NSCLC": "../../data/30_annotate_scrnaseq_data/maynard_annotated.h5ad",
    "Lambrechts_2018_LUAD_6653": "../../data/30_annotate_scrnaseq_data/lambrechts_annotated.h5ad",
}
annotated_datasets = {
    dataset_id: sc.read_h5ad(h5ad_path)
    for dataset_id, h5ad_path in annotated_paths.items()
}

In [5]:
# Input files, keyed by dataset id. Insertion order matters: it defines the
# order in which datasets are processed and merged below.
dataset_paths = {
    "Maynard_Bivona_2020_NSCLC": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Maynard_Bivona_2020_NSCLC/Maynard_Bivona_2020_NSCLC.qc.h5ad",
    "Lambrechts_2018_LUAD_6653": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Lambrechts_2018_LUAD_6653/Lambrechts_2018_LUAD_6653.qc.h5ad",
    "Adams_Kaminski_2020_COPD": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Adams_Kaminski_2020_COPD/Adams_Kaminski_2020_COPD.qc.h5ad",
    "Goveia_Carmeliet_2020_NSCLC": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Goveia_Carmeliet_2020_NSCLC/Goveia_Carmeliet_2020_NSCLC.qc.h5ad",
    "Guo_Zhang_2018_NSCLC": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Guo_Zhang_2018_NSCLC/Guo_Zhang_2018_NSCLC.qc.h5ad",
    "Lambrechts_2018_LUAD_6149": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Lambrechts_2018_LUAD_6149/Lambrechts_2018_LUAD_6149.qc.h5ad",
    "Laughney_Massague_2020_NSCLC": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Laughney_Massague_2020_NSCLC/Laughney_Massague_2020_NSCLC.qc.h5ad",
    "Lukassen_Eils_2020_LUAD": "../../data/10_public_datasets/Lukassen_Eils_2020_LUAD/h5ad_processed/lukassen20_lung_orig.processed.h5ad",
    "Madissoon_Meyer_2020_pulmonary-fibrosis": "../../data/10_public_datasets/Madissoon_Meyer_2020_pulmonary-fibrosis/h5ad_processed/madissoon19_lung.processed.h5ad",
    "Maier_Merad_2020_NSCLC": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/Maier_Merad_2020_NSCLC/Maier_Merad_2020_NSCLC.qc.h5ad",
    "Mayr_Schiller_2020_pulmonary-fibrosis": "../../data/10_public_datasets/Mayr_Schiller_2020_pulmonary-fibrosis/h5ad/integrated_human_dataset.h5ad",
    "Pircher_batch1_NSCLC": "../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/batch1_3patients/batch1_3patients.qc.h5ad",
}

# Load every dataset into memory as an AnnData object.
datasets = {
    dataset_id: sc.read_h5ad(h5ad_path)
    for dataset_id, h5ad_path in dataset_paths.items()
}


This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(


In [6]:
# Per-dataset CSV files with doublet calls (the `02_solo` step of the pipeline),
# keyed by the same dataset ids as `datasets`.
# NOTE(review): only the commented-out `add_doublet_annotation` cell below
# consumes this mapping in the visible notebook.
doublet_files = {
    "Adams_Kaminski_2020_COPD": "../../data/20_qc_norm_scrnaseq/02_solo/Adams_Kaminski_2020_COPD/Adams_Kaminski_2020_COPD.is_doublet.csv",
    "Goveia_Carmeliet_2020_NSCLC": "../../data/20_qc_norm_scrnaseq/02_solo/Goveia_Carmeliet_2020_NSCLC/Goveia_Carmeliet_2020_NSCLC.is_doublet.csv",
    # No doublet filtering for smartseq2
    #  "Guo_Zhang_2018_NSCLC": "../../data/20_qc_norm_scrnaseq/02_solo/Guo_Zhang_2018_NSCLC/Guo_Zhang_2018_NSCLC.is_doublet.csv",
    "Lambrechts_2018_LUAD_6149": "../../data/20_qc_norm_scrnaseq/02_solo/Lambrechts_2018_LUAD_6149/Lambrechts_2018_LUAD_6149.is_doublet.csv",
    "Laughney_Massague_2020_NSCLC": "../../data/20_qc_norm_scrnaseq/02_solo/Laughney_Massague_2020_NSCLC/Laughney_Massague_2020_NSCLC.is_doublet.csv",
    "Maier_Merad_2020_NSCLC": "../../data/20_qc_norm_scrnaseq/02_solo/Maier_Merad_2020_NSCLC/Maier_Merad_2020_NSCLC.is_doublet.csv",
    "Pircher_batch1_NSCLC": "../../data/20_qc_norm_scrnaseq/02_solo/batch1_3patients/batch1_3patients.is_doublet.csv",
}

### Add doublet information and filter datasets

In [7]:
# NOTE(review): the doublet-annotation step is disabled. With this line
# commented out, the doublet filter that follows only affects datasets whose
# `.obs` already contains an `is_doublet` column when loaded. The result name
# `datasets_vis` is also never read anywhere in the visible notebook.
# TODO: either re-enable and store the result back into `datasets`, or delete.
# datasets_vis = process_map(add_doublet_annotation, [datasets[k] for k in doublet_files], doublet_files.values(), doublet_files.keys(), max_workers=16)

In [13]:
# Keep only cells flagged as non-doublets. Datasets without an `is_doublet`
# column are left untouched.
# NOTE(review): the flag is compared against the *string* "False"; if the
# column is ever stored as a real boolean this mask selects nothing — confirm
# the dtype written by the doublet-annotation step.
for dataset_id in list(datasets):
    adata = datasets[dataset_id]
    if "is_doublet" not in adata.obs.columns:
        continue
    singlet_mask = adata.obs["is_doublet"] == "False"
    datasets[dataset_id] = adata[singlet_mask, :].copy()

### Dataset-specific filtering and metadata fixes

In [14]:
# Replace the Maynard dataset with its gene-length-normalized version.
maynard_id = "Maynard_Bivona_2020_NSCLC"
datasets[maynard_id] = normalize_by_gene_length(datasets[maynard_id])

In [15]:
# Adams/Kaminski: fill in missing metadata and drop the IPF samples.
# "nan" (string) is the placeholder used throughout this notebook when no sex
# information is set for a dataset.
adams_id = "Adams_Kaminski_2020_COPD"
adams = datasets[adams_id]
adams.obs["origin"] = "normal"
adams.obs["sex"] = "nan"

# Materialize the subset with `.copy()` like every other filter in this
# notebook. Storing the bare slice keeps an AnnData *view*, which is silently
# copied on the first later write to `.obs` ("Trying to set attribute `.obs`
# of view, copying.") and keeps the unfiltered parent object alive.
datasets[adams_id] = adams[adams.obs["condition"] != "IPF", :].copy()

In [16]:
# Goveia/Carmeliet: drop cells whose condition is "LLCC", then set the
# placeholder sex label.
goveia_id = "Goveia_Carmeliet_2020_NSCLC"
not_llcc = datasets[goveia_id].obs["condition"] != "LLCC"
datasets[goveia_id] = datasets[goveia_id][not_llcc].copy()
datasets[goveia_id].obs["sex"] = "nan"

In [17]:
# Guo/Zhang: remove blood-derived cells, normalize by gene length, and set the
# placeholder sex label.
guo_id = "Guo_Zhang_2018_NSCLC"
non_blood = datasets[guo_id].obs["tissue"] != "blood"
datasets[guo_id] = datasets[guo_id][non_blood].copy()
datasets[guo_id] = normalize_by_gene_length(datasets[guo_id])
datasets[guo_id].obs["sex"] = "nan"

In [18]:
# Placeholder sex label (string "nan"), as used for the other datasets in this
# notebook that get no real sex annotation.
datasets["Laughney_Massague_2020_NSCLC"].obs["sex"] = "nan"

In [19]:
# Lukassen/Eils: revert the log-normalization and spell out the sex labels.
# The return value of `undo_log_norm` is discarded, so it presumably modifies
# the object in place — TODO confirm.
# NOTE(review): this cell is not re-runnable; a second run raises KeyError
# because "male"/"female" are not keys of the lookup table.
lukassen = datasets["Lukassen_Eils_2020_LUAD"]
undo_log_norm(lukassen)

sex_labels = {"M": "male", "F": "female"}
lukassen.obs["sex"] = [sex_labels[code] for code in lukassen.obs["sex"]]

In [20]:
# Lukassen/Eils: derive the harmonized metadata columns. Both patient and
# sample are taken from `orig.ident`; the remaining columns are constants.
lukassen_obs = datasets["Lukassen_Eils_2020_LUAD"].obs
for column in ("patient", "sample"):
    lukassen_obs[column] = lukassen_obs["orig.ident"]

lukassen_constants = {"tissue": "lung", "origin": "normal", "condition": "LUAD"}
for column, value in lukassen_constants.items():
    lukassen_obs[column] = value

In [21]:
# Madissoon/Meyer: constant metadata columns plus rounding of the stored
# expression values to the nearest integer.
madissoon = datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"]
madissoon_constants = {
    "tissue": "lung",
    "origin": "normal",
    "condition": "healthy_control",
    "sex": "nan",
}
for column, value in madissoon_constants.items():
    madissoon.obs[column] = value

# `.X.data` is accessed directly, so `.X` must be a sparse matrix here.
madissoon.X.data = np.rint(madissoon.X.data)

In [22]:
# Placeholder sex label (string "nan"), as used for the other datasets in this
# notebook that get no real sex annotation.
datasets["Maier_Merad_2020_NSCLC"].obs["sex"] = "nan"

In [23]:
# Mayr/Schiller: translate the dataset's own metadata into the harmonized
# columns, then keep only cells that are not from fibrosis samples.
mayr_id = "Mayr_Schiller_2020_pulmonary-fibrosis"
mayr_obs = datasets[mayr_id].obs

sex_labels = {"M": "male", "F": "female"}
condition_labels = {
    "control donor": "healthy_control",
    "endstage lung fibrosis": "pulmonary_fibrosis",
}

# Dict lookups (rather than `.map`) so that an unexpected label fails loudly
# with a KeyError instead of silently turning into a missing value.
mayr_obs["sex"] = [sex_labels[code] for code in mayr_obs["Sex"]]
mayr_obs["condition"] = [
    condition_labels[status] for status in mayr_obs["health_status"]
]
for column in ("patient", "sample"):
    mayr_obs[column] = mayr_obs["patient_id"]
mayr_obs["tissue"] = "lung"
mayr_obs["origin"] = "normal"

not_fibrosis = datasets[mayr_id].obs["condition"] != "pulmonary_fibrosis"
datasets[mayr_id] = datasets[mayr_id][not_fibrosis, :].copy()

### Make patients unique across datasets

In [19]:
# Tag every cell with its dataset id and prefix the patient label with it, so
# that identical patient ids from different studies cannot collide after the
# merge.
for dataset_id, adata in datasets.items():
    adata.obs["dataset"] = dataset_id
    patient_prefix = f"{dataset_id}_"
    # Only add the prefix when it is not there yet: this keeps the cell
    # idempotent, so re-running it does not produce "<id>_<id>_<patient>".
    adata.obs["patient"] = [
        label if label.startswith(patient_prefix) else patient_prefix + label
        for label in map(str, adata.obs["patient"])
    ]

Trying to set attribute `.obs` of view, copying.


### Remove duplicated genes

In [20]:
# Collapse duplicated gene symbols within each dataset, replacing the stored
# object with the helper's return value.
for dataset_id in list(datasets):
    datasets[dataset_id] = aggregate_duplicate_gene_symbols(datasets[dataset_id])

### Validate data

In [21]:
# Sanitize and validate each dataset, announcing which one is being checked.
# Both helpers are called for their side effects; return values are ignored.
for dataset_id in datasets:
    print(f"Validating {dataset_id}")
    current = datasets[dataset_id]
    sanitize_adata(current)
    validate_adata(current)

... storing 'sample' as categorical


Validating Maynard_Bivona_2020_NSCLC


... storing 'patient' as categorical
... storing 'dataset' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical


Validating Lambrechts_2018_LUAD_6653
Validating Adams_Kaminski_2020_COPD


... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'origin' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical
... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Goveia_Carmeliet_2020_NSCLC


... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Guo_Zhang_2018_NSCLC


... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical


Validating Lambrechts_2018_LUAD_6149


... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Laughney_Massague_2020_NSCLC


... storing 'sex' as categorical
... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'tissue' as categorical
... storing 'origin' as categorical


Validating Lukassen_Eils_2020_LUAD


... storing 'condition' as categorical
... storing 'dataset' as categorical


Validating Madissoon_Meyer_2020_pulmonary-fibrosis


... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'tissue' as categorical
... storing 'origin' as categorical
... storing 'condition' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Maier_Merad_2020_NSCLC
Validating Mayr_Schiller_2020_pulmonary-fibrosis


... storing 'sex' as categorical
... storing 'condition' as categorical
... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'tissue' as categorical
... storing 'origin' as categorical
... storing 'dataset' as categorical
... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical


Validating Pircher_batch1_NSCLC


## Gene identifier remapping

In [22]:
# Remap gene symbols for all datasets in parallel worker processes.
# `process_map` returns results in input order, so they can be paired back
# with the dictionary keys positionally.
datasets_remapped = process_map(
    remap_gene_symbols,
    datasets.values(),
    max_workers=32,
)
# Only existing keys are overwritten, so the dict keeps its size and order.
datasets.update(zip(datasets, datasets_remapped))

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

querying 1-1000...querying 1-1000...done.
done.
querying 1001-2000...querying 1001-2000...done.
done.
querying 2001-3000...querying 2001-3000...done.
querying 3001-4000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 4001-5000...done.
querying 1-1000...querying 1-1000...querying 5001-6000...done.
querying 1-1000...querying 1-1000...querying 5001-6000...done.
done.
querying 1-1000...done.
querying 1001-2000...querying 1001-2000...querying 6001-7000...done.
done.
done.
querying 1001-2000...querying 1001-2000...
querying 6001-7000...done.done.
done.
querying 1-1000...querying 2001-3000...querying 1001-2000...querying 7001-8000...done.
done.
querying 1-1000...done.
querying 1-1000...querying 2001-3000...querying 2001-3000...querying 2001-3000...done.
querying 1001-2000...done.
done.
done.
querying 3001-4000...querying 7001-8000...querying 8001-9000...done.
done.
done.
done.
querying 1001-2000...done.
querying 3001-4000...querying 1001-2000...querying 3001-4000...quer

In [23]:
# Aggregate duplicated gene symbols a second time, now that the symbols have
# been remapped, printing each dataset id as a progress marker.
for dataset_id in list(datasets):
    print(dataset_id)
    datasets[dataset_id] = aggregate_duplicate_gene_symbols(datasets[dataset_id])

Maynard_Bivona_2020_NSCLC


100%|██████████| 21/21 [00:00<00:00, 237.10it/s]


Lambrechts_2018_LUAD_6653


100%|██████████| 41/41 [00:00<00:00, 476.75it/s]


Adams_Kaminski_2020_COPD


100%|██████████| 23/23 [00:00<00:00, 277.77it/s]


Goveia_Carmeliet_2020_NSCLC


100%|██████████| 23/23 [00:00<00:00, 544.04it/s]


Guo_Zhang_2018_NSCLC


100%|██████████| 18/18 [00:00<00:00, 237.46it/s]


Lambrechts_2018_LUAD_6149


100%|██████████| 44/44 [00:00<00:00, 476.31it/s]


Laughney_Massague_2020_NSCLC


100%|██████████| 8/8 [00:00<00:00, 574.61it/s]


Lukassen_Eils_2020_LUAD


100%|██████████| 107/107 [00:00<00:00, 385.91it/s]


Madissoon_Meyer_2020_pulmonary-fibrosis


100%|██████████| 44/44 [00:00<00:00, 465.57it/s]


Maier_Merad_2020_NSCLC


100%|██████████| 26/26 [00:00<00:00, 567.37it/s]


Mayr_Schiller_2020_pulmonary-fibrosis


100%|██████████| 881/881 [00:02<00:00, 334.90it/s]


Pircher_batch1_NSCLC


100%|██████████| 16/16 [00:00<00:00, 444.12it/s]


## Add cell type annotation

In [24]:
# Default every cell to "unknown"; the annotated datasets overwrite this below.
for dataset_id in datasets:
    datasets[dataset_id].obs["cell_type"] = "unknown"

In [25]:
# Copy the curated cell-type labels onto the matching cells of the Lambrechts
# dataset. The assignment is aligned on cell names.
lambrechts_annotated = annotated_datasets["Lambrechts_2018_LUAD_6653"]
lambrechts_cells = lambrechts_annotated.obs_names
datasets["Lambrechts_2018_LUAD_6653"].obs.loc[
    lambrechts_cells, "cell_type"
] = lambrechts_annotated.obs["cell_type"]

In [26]:
# Copy the curated cell-type labels onto the matching cells of the Maynard
# dataset. The assignment is aligned on cell names.
maynard_annotated = annotated_datasets["Maynard_Bivona_2020_NSCLC"]
maynard_cells = maynard_annotated.obs_names
datasets["Maynard_Bivona_2020_NSCLC"].obs.loc[
    maynard_cells, "cell_type"
] = maynard_annotated.obs["cell_type"]

## Export all

In [54]:
# Merge all 12 datasets into a single object.
# NOTE(review): `symbol_in_n_datasets=9` presumably keeps only gene symbols
# present in at least 9 datasets — confirm against `merge_datasets`.
merged_all = merge_datasets(datasets.values(), symbol_in_n_datasets=9)

In [55]:
# Sanity check: dimensions of the merged object (presumably cells x genes).
merged_all.shape

(551334, 18095)

In [56]:
# Persist the full merge as an h5ad file for the downstream integration steps.
merged_all.write_h5ad("../../data/50_integrate_scrnaseq_data/51_merge_all/merged_all.h5ad")

... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'tissue' as categorical
... storing 'origin' as categorical
... storing 'condition' as categorical
... storing 'dataset' as categorical
... storing 'sex' as categorical
... storing 'cell_type' as categorical
... storing 'batch' as categorical


## Export for NSCLC heterogeneity
 * only primary tumor samples, i.e. cells with `origin == "tumor_primary"` (no controls, no metastases)
 * all NSCLC subtypes

In [64]:
# Build the primary-tumor subset: for every dataset that contains at least one
# "tumor_primary" cell, keep only those cells (as an independent copy).
#
# Datasets listed in `excluded_from_heterogeneity` are skipped up front rather
# than subset, copied and then deleted. That avoids a needless copy and cannot
# raise KeyError if the dataset is absent or has no primary tumor cells.
# NOTE(review): the reason for excluding the Pircher batch is not documented
# in this notebook — add it here.
excluded_from_heterogeneity = {"Pircher_batch1_NSCLC"}
datasets_nsclc_heterogeneity = {
    dataset_id: dataset[dataset.obs["origin"] == "tumor_primary", :].copy()
    for dataset_id, dataset in datasets.items()
    if dataset_id not in excluded_from_heterogeneity
    and "tumor_primary" in dataset.obs["origin"].values
}

In [65]:
# Merge the primary-tumor subsets.
# NOTE(review): `symbol_in_n_datasets=5` presumably keeps only gene symbols
# present in at least 5 of the selected datasets — confirm against
# `merge_datasets`.
merged_nsclc_heterogeneity = merge_datasets(datasets_nsclc_heterogeneity.values(), symbol_in_n_datasets=5)

In [66]:
# Sanity check: dimensions of the primary-tumor merge (presumably cells x genes).
merged_nsclc_heterogeneity.shape

(86061, 17567)

In [67]:
# Persist the primary-tumor merge next to the full merge.
merged_nsclc_heterogeneity.write_h5ad("../../data/50_integrate_scrnaseq_data/51_merge_all/merged_nsclc_heterogeneity.h5ad")

... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'condition' as categorical
... storing 'dataset' as categorical
... storing 'sex' as categorical
... storing 'cell_type' as categorical
... storing 'batch' as categorical
