In [1]:
%load_ext autoreload
%autoreload 2

import scanpy as sc
import numpy as np
import itertools
from tqdm import trange, tqdm
import scipy.sparse
import numpy.testing as npt
from scanpy_helpers.integration import (
    normalize_by_gene_length,
    sanitize_adata,
    validate_adata,
    add_doublet_annotation,
    undo_log_norm,
    remap_gene_symbols,
    drop_duplicated_genes,
    aggregate_duplicate_gene_symbols,
    merge_datasets,
    MANDATORY_COLS,
)
from threadpoolctl import threadpool_limits
from tqdm.contrib.concurrent import process_map
import mygene
from operator import and_
from functools import reduce
import pandas as pd
import anndata
import re

In [2]:
threadpool_limits(8)

<threadpoolctl.threadpool_limits at 0x7f9c08424070>

In [3]:
sc.set_figure_params(figsize=(5, 5))

In [4]:
dataset_table = pd.read_csv("../../tables/samplesheet_scrnaseq_preprocessing.csv")

In [5]:
dataset_table

Unnamed: 0,id,input_adata,min_counts,max_counts,min_genes,max_genes,max_pct_mito,batch_key,run_solo
0,Adams_Kaminski_2020_COPD,data/10_public_datasets/Adams_Kaminski_2020_CO...,1000,35000,500,10000,20,sample,True
1,Chen_Zhang_2020_NSCLC,data/10_public_datasets/Chen_Zhang_2020_NSCLC/...,600,30000,250,10000,20,sample,True
2,Goveia_Carmeliet_2020_NSCLC,data/10_public_datasets/Goveia_Carmeliet_2020_...,600,30000,250,10000,20,sample,True
3,Guo_Zhang_2018_NSCLC,data/10_public_datasets/Guo_Zhang_2018_NSCLC/h...,20000,3000000,1000,20000,20,sample,False
4,Habermann_Kropski_2020_pulmonary-fibrosis,data/10_public_datasets/Habermann_Kropski_2020...,600,30000,200,10000,20,sample,True
5,He_Fan_2021_LUAD,data/10_public_datasets/Kim_Lee_2020_LUAD/h5ad...,1000,35000,300,10000,20,sample,True
6,Kim_Lee_2020_LUAD,data/10_public_datasets/He_Fan_2021_LUAD/h5ad_...,600,30000,250,10000,20,sample,True
7,Lambrechts_2018_LUAD_6149v1,data/10_public_datasets/Lambrechts_2018_LUAD/E...,600,30000,200,10000,15,sample,True
8,Lambrechts_2018_LUAD_6149v2,data/10_public_datasets/Lambrechts_2018_LUAD/E...,600,30000,250,10000,20,sample,True
9,Lambrechts_2018_LUAD_6653,data/10_public_datasets/Lambrechts_2018_LUAD/E...,1200,40000,250,10000,20,sample,True


In [6]:
datasets = {
    dataset_id: sc.read_h5ad(
        f"../../data/20_qc_norm_scrnaseq/01_qc_and_filtering/{dataset_id}/{dataset_id}.qc.h5ad"
    )
    for dataset_id in tqdm(dataset_table["id"])
}

100%|██████████| 22/22 [00:34<00:00,  1.55s/it]


### Dataset-specific filtering and metadata fixes

In [7]:
datasets["Adams_Kaminski_2020_COPD"].obs["origin"] = "normal"
datasets["Adams_Kaminski_2020_COPD"].obs["sex"] = "nan"
datasets["Adams_Kaminski_2020_COPD"] = datasets["Adams_Kaminski_2020_COPD"][
    datasets["Adams_Kaminski_2020_COPD"].obs["condition"] != "IPF", :
]

  res = method(*args, **kwargs)


In [8]:
# No modifications necessary for Chen_Zhang

In [9]:
datasets["Goveia_Carmeliet_2020_NSCLC"] = datasets["Goveia_Carmeliet_2020_NSCLC"][
    datasets["Goveia_Carmeliet_2020_NSCLC"].obs["condition"] != "LLCC"
].copy()
datasets["Goveia_Carmeliet_2020_NSCLC"].obs["sex"] = "nan"

In [10]:
datasets["Guo_Zhang_2018_NSCLC"] = datasets["Guo_Zhang_2018_NSCLC"][
    datasets["Guo_Zhang_2018_NSCLC"].obs["tissue"] != "blood"
].copy()
datasets["Guo_Zhang_2018_NSCLC"] = normalize_by_gene_length(
    datasets["Guo_Zhang_2018_NSCLC"]
)
datasets["Guo_Zhang_2018_NSCLC"].obs["sex"] = "nan"

In [11]:
datasets["Habermann_Kropski_2020_pulmonary-fibrosis"].obs["sex"] = [
    {"M": "male", "F": "female", "Unknown": "nan"}[s]
    for s in datasets["Habermann_Kropski_2020_pulmonary-fibrosis"].obs["sex"]
]
datasets["Habermann_Kropski_2020_pulmonary-fibrosis"] = datasets[
    "Habermann_Kropski_2020_pulmonary-fibrosis"
][
    datasets["Habermann_Kropski_2020_pulmonary-fibrosis"].obs["condition"]
    == "healthy_control",
    :,
].copy()

In [12]:
# No modifications necessary for He_Fan

In [13]:
datasets["Maynard_Bivona_2020_NSCLC"] = normalize_by_gene_length(
    datasets["Maynard_Bivona_2020_NSCLC"]
)

In [14]:
datasets["Laughney_Massague_2020_NSCLC"].obs["sex"] = "nan"

In [15]:
datasets["Maier_Merad_2020_NSCLC"].obs["sex"] = "nan"

In [16]:
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["tissue"] = "lung"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["origin"] = "normal"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["condition"] = "healthy_control"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].obs["sex"] = "nan"
datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].X.data = np.rint(
    datasets["Madissoon_Meyer_2020_pulmonary-fibrosis"].X.data
)

In [17]:
datasets["Mayr_Schiller_2020_pulmonary-fibrosis"].obs["sex"] = "nan"
datasets["Mayr_Schiller_2020_pulmonary-fibrosis"] = datasets[
    "Mayr_Schiller_2020_pulmonary-fibrosis"
][
    datasets["Mayr_Schiller_2020_pulmonary-fibrosis"].obs["condition"]
    == "healthy_control",
    :,
]

In [18]:
datasets["Reyfman_Misharin_2018_pulmonary-fibrosis"].obs["sex"] = "nan"
datasets["Reyfman_Misharin_2018_pulmonary-fibrosis"] = datasets[
    "Reyfman_Misharin_2018_pulmonary-fibrosis"
][
    datasets["Reyfman_Misharin_2018_pulmonary-fibrosis"].obs["condition"]
    == "healthy_control",
    :,
]

In [19]:
datasets["Travaglini_Krasnow_2020_Lung_SS2"] = datasets[
    "Travaglini_Krasnow_2020_Lung_SS2"
][datasets["Travaglini_Krasnow_2020_Lung_SS2"].obs["tissue"] == "lung", :]

In [20]:
datasets["Zilionis_Klein_2019_NSCLC"] = datasets["Zilionis_Klein_2019_NSCLC"][
    datasets["Zilionis_Klein_2019_NSCLC"].obs["tissue"] == "lung", :
]
datasets["Zilionis_Klein_2019_NSCLC"].obs["sex"] = [
    {"M": "male", "F": "female", "Unknown": "nan"}[s]
    for s in datasets["Zilionis_Klein_2019_NSCLC"].obs["sex"]
]

Trying to set attribute `.obs` of view, copying.


### make patients unique across datasets

Except for the two Travaglini variants - they are the same patients profiled with different platforms

In [21]:
for dataset_id, adata in datasets.items():
    adata.obs["dataset"] = dataset_id
    adata.obs["patient"] = [
        f'{dataset.replace("_10x", "").replace("_SS2", "")}_{patient}'
        for dataset, patient in zip(adata.obs["dataset"], adata.obs["patient"])
    ]
    datasets[dataset_id] = adata

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


### Validate data

In [22]:
for dataset_id, adata in datasets.items():
    print(f"Validating {dataset_id}")
    sanitize_adata(adata)
    validate_adata(adata)

Validating Adams_Kaminski_2020_COPD


... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'origin' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical
... storing 'dataset' as categorical


Validating Chen_Zhang_2020_NSCLC


... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Goveia_Carmeliet_2020_NSCLC


... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Guo_Zhang_2018_NSCLC


... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical


Validating Habermann_Kropski_2020_pulmonary-fibrosis
Validating He_Fan_2021_LUAD


... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical
... storing 'dataset' as categorical


Validating Kim_Lee_2020_LUAD


... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical


Validating Lambrechts_2018_LUAD_6149v1
Validating Lambrechts_2018_LUAD_6149v2


... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical


Validating Lambrechts_2018_LUAD_6653


... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Laughney_Massague_2020_NSCLC
Validating Madissoon_Meyer_2020_pulmonary-fibrosis


... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'tissue' as categorical
... storing 'origin' as categorical
... storing 'condition' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical


Validating Maier_Merad_2020_NSCLC
Validating Maynard_Bivona_2020_NSCLC


... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical
... storing 'sex' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical


Validating Mayr_Schiller_2020_pulmonary-fibrosis
Validating Reyfman_Misharin_2018_pulmonary-fibrosis


... storing 'patient' as categorical
... storing 'sex' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical


Validating Travaglini_Krasnow_2020_Lung_10x
Validating Travaglini_Krasnow_2020_Lung_SS2


... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical


Validating UKIM-V


... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'patient' as categorical
... storing 'sample' as categorical


Validating Vieira_Teichmann_2019_asthma
Validating Wu_Zhou_2021_NSCLC


... storing 'patient' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'patient' as categorical
... storing 'dataset' as categorical


Validating Zilionis_Klein_2019_NSCLC


## Gene identifier remapping

In [23]:
datasets_remapped = process_map(remap_gene_symbols, datasets.values(), max_workers=32)
for dataset_id, dataset in zip(datasets.keys(), datasets_remapped):
    datasets[dataset_id] = dataset

HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))

querying 1-1000...done.
querying 1-1000...querying 1001-2000...querying 1-1000...querying 1-1000...done.
querying 1001-2000...done.
done.
querying 1-1000...done.
querying 1001-2000...querying 2001-3000...querying 1001-2000...done.
done.
querying 2001-3000...done.
querying 1001-2000...done.
done.
querying 2001-3000...done.
querying 2001-3000...querying 3001-4000...done.
querying 3001-4000...querying 2001-3000...done.
done.
done.
querying 3001-4000...querying 4001-5000...querying 3001-4000...done.
done.
querying 4001-5000...querying 3001-4000...querying 1-1000...done.
done.
done.
done.
querying 4001-5000...done.
done.
querying 5001-6000...querying 4001-5000...querying 5001-6000...querying 1001-2000...querying 4001-5000...querying 1-1000...
done.querying 1-1000...querying 5001-6000...querying 1-1000...done.
done.
done.
done.
done.
querying 1-1000...querying 6001-7000...querying 6001-7000...querying 5001-6000...querying 5001-6000...querying 2001-3000...done.
done.
querying 1-1000...done.
d

### aggregate duplicate gene symbols

TODO: take care of duplicate genes that were made unique with "var_names_make_unique"

In [24]:
for dataset_id, dataset in datasets.items():
    print(dataset_id)
    datasets[dataset_id] = aggregate_duplicate_gene_symbols(dataset)

Adams_Kaminski_2020_COPD


100%|██████████| 28/28 [00:00<00:00, 273.52it/s]
  res = method(*args, **kwargs)


Chen_Zhang_2020_NSCLC


100%|██████████| 19/19 [00:00<00:00, 364.23it/s]


Goveia_Carmeliet_2020_NSCLC


100%|██████████| 22/22 [00:00<00:00, 501.60it/s]


Guo_Zhang_2018_NSCLC


100%|██████████| 20/20 [00:00<00:00, 275.15it/s]


Habermann_Kropski_2020_pulmonary-fibrosis


100%|██████████| 49/49 [00:00<00:00, 430.66it/s]


He_Fan_2021_LUAD


100%|██████████| 47/47 [00:00<00:00, 416.33it/s]


Kim_Lee_2020_LUAD


100%|██████████| 16/16 [00:00<00:00, 419.83it/s]


Lambrechts_2018_LUAD_6149v1


100%|██████████| 9/9 [00:00<00:00, 548.78it/s]


Lambrechts_2018_LUAD_6149v2


100%|██████████| 16/16 [00:00<00:00, 371.11it/s]


Lambrechts_2018_LUAD_6653


100%|██████████| 14/14 [00:00<00:00, 443.97it/s]


Laughney_Massague_2020_NSCLC


100%|██████████| 30/30 [00:00<00:00, 535.45it/s]


Madissoon_Meyer_2020_pulmonary-fibrosis


100%|██████████| 53/53 [00:00<00:00, 465.87it/s]


Maier_Merad_2020_NSCLC


100%|██████████| 27/27 [00:00<00:00, 592.00it/s]


Maynard_Bivona_2020_NSCLC


100%|██████████| 20/20 [00:00<00:00, 244.16it/s]


Mayr_Schiller_2020_pulmonary-fibrosis


100%|██████████| 785/785 [00:01<00:00, 419.48it/s]


Reyfman_Misharin_2018_pulmonary-fibrosis


100%|██████████| 58/58 [00:00<00:00, 478.31it/s]
100%|██████████| 44/44 [00:00<00:00, 582.54it/s]

Travaglini_Krasnow_2020_Lung_10x
Travaglini_Krasnow_2020_Lung_SS2



100%|██████████| 35/35 [00:00<00:00, 293.14it/s]


UKIM-V


100%|██████████| 15/15 [00:00<00:00, 455.59it/s]


Vieira_Teichmann_2019_asthma


100%|██████████| 31/31 [00:00<00:00, 680.01it/s]


Wu_Zhou_2021_NSCLC


100%|██████████| 21/21 [00:00<00:00, 404.72it/s]


Zilionis_Klein_2019_NSCLC


100%|██████████| 32/32 [00:00<00:00, 372.62it/s]


## add cell type annotation

In [25]:
for dataset in datasets.values():
    dataset.obs["cell_type"] = "unknown"

## Export all

In [26]:
obs_all = pd.concat([x.obs for x in datasets.values()]).reset_index(drop=True)

In [27]:
obs_all = obs_all.loc[
    :,
    MANDATORY_COLS
    + [
        "accession",
        "sampleType",
        "platform",
        "age",
        "tobacco",
        "ethnicity",
        "processing_site",
        "Tissue origins",
        "histology",
        "smoking",
        "pathology",
        "EGFR",
        "tumor_stage",
        "geo_accession",
        "tissue_orig",
        "replicate",
        "race",
        "smoking_status",
        "driver_gene",
        "driver_mutation",
        "secondary_mutation",
        "Notes",
        "stage_at_diagnosis",
        "pathlogy_review",
        "biopsy_date",
        "sort_date",
        "biopsy_type",
        "biopsy_time_status",
        "early_treatment_status",
        "best_response_status",
        "biopsy_timing",
        "analysis",
        "treatment_history",
        "treatment_history_detail",
        "line_of_therapy",
        "treatment_type",
        "treatment",
        "percent_PFS_ref_values",
        "percent.PFS.reference.values",
        "infections",
        "early_bx_day",
        "treatment_start_date",
        "pfs_over_under",
        "pfs_day",
        "pfs_month",
        "date_of_death",
        "stageIII.IV_ca_dx_date",
        "ca_dx_OS",
        "region",
        "location",
        "label",
        "tumor_id",
        "tumor_type",
        "GEO_Sample",
        "biopsy_segment",
        "gsm",
        "characteristics_ch1.7.treatment received prior to surgery (1= treated; 0=untreated)",
    ],
].drop_duplicates().set_index("sample")

In [28]:
obs_all.to_excel("../../data/50_integrate_scrnaseq_data/51_merge_all/obs_all.xlsx")

In [29]:
merged_all = merge_datasets(datasets.values(), symbol_in_n_datasets=17)

In [30]:
merged_all.shape

(1124947, 17833)

In [31]:
merged_all.obs.drop_duplicates().reset_index(drop=True)

Unnamed: 0,sample,patient,tissue,origin,condition,dataset,sex,cell_type,batch
0,Adams_Kaminski_2020_COPD_001C,Adams_Kaminski_2020_COPD_001C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_Adams_Kaminski_2020_C...
1,Adams_Kaminski_2020_COPD_002C,Adams_Kaminski_2020_COPD_002C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_Adams_Kaminski_2020_C...
2,Adams_Kaminski_2020_COPD_003C,Adams_Kaminski_2020_COPD_003C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_Adams_Kaminski_2020_C...
3,Adams_Kaminski_2020_COPD_022C-a,Adams_Kaminski_2020_COPD_222C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_Adams_Kaminski_2020_C...
4,Adams_Kaminski_2020_COPD_022C-b,Adams_Kaminski_2020_COPD_222C,lung,normal,healthy_control,Adams_Kaminski_2020_COPD,,unknown,Adams_Kaminski_2020_COPD_Adams_Kaminski_2020_C...
...,...,...,...,...,...,...,...,...,...
500,Zilionis_Klein_2019_NSCLC_p2t1,Zilionis_Klein_2019_NSCLC_patient_2,lung,tumor_primary,LSCC,Zilionis_Klein_2019_NSCLC,female,unknown,Zilionis_Klein_2019_NSCLC_Zilionis_Klein_2019_...
501,Zilionis_Klein_2019_NSCLC_p4t1,Zilionis_Klein_2019_NSCLC_patient_4,lung,tumor_primary,LUAD,Zilionis_Klein_2019_NSCLC,male,unknown,Zilionis_Klein_2019_NSCLC_Zilionis_Klein_2019_...
502,Zilionis_Klein_2019_NSCLC_p1t3,Zilionis_Klein_2019_NSCLC_patient_1,lung,tumor_primary,LSCC,Zilionis_Klein_2019_NSCLC,male,unknown,Zilionis_Klein_2019_NSCLC_Zilionis_Klein_2019_...
503,Zilionis_Klein_2019_NSCLC_p5t1,Zilionis_Klein_2019_NSCLC_patient_5,lung,tumor_primary,LUAD,Zilionis_Klein_2019_NSCLC,female,unknown,Zilionis_Klein_2019_NSCLC_Zilionis_Klein_2019_...


In [32]:
merged_all.write_h5ad(
    "../../data/50_integrate_scrnaseq_data/51_merge_all/merged_all.h5ad"
)

... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'tissue' as categorical
... storing 'origin' as categorical
... storing 'condition' as categorical
... storing 'dataset' as categorical
... storing 'sex' as categorical
... storing 'cell_type' as categorical
... storing 'batch' as categorical


## Export for NSCLC heterogeneity
 * only tumor samples (no controls, no metastases)
 * all NSCLC subtypes

In [33]:
# datasets_nsclc_heterogeneity = dict()
# for dataset_id, dataset in datasets.items():
#     if "tumor_primary" in dataset.obs["origin"].values:
#         datasets_nsclc_heterogeneity[dataset_id] = dataset[
#             dataset.obs["origin"] == "tumor_primary", :
#         ].copy()
# del datasets_nsclc_heterogeneity["Pircher_batch1_NSCLC"]

In [34]:
# merged_nsclc_heterogeneity = merge_datasets(
#     datasets_nsclc_heterogeneity.values(), symbol_in_n_datasets=5
# )

In [35]:
# merged_nsclc_heterogeneity.shape

In [36]:
# merged_nsclc_heterogeneity.write_h5ad(
#     "../../data/50_integrate_scrnaseq_data/51_merge_all/merged_nsclc_heterogeneity.h5ad"
# )