In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import scvi
import math
import matplotlib.pyplot as plt
import scipy.sparse
from glob import glob
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

%matplotlib inline

Global seed set to 0


In [3]:
# Scanpy log level -- errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Print the versions of the core analysis packages into the cell output.
sc.logging.print_header()
# Figure defaults applied to every plot drawn after this cell.
sc.settings.set_figure_params(
    dpi=80,
    facecolor='white',
)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.2 numpy==1.21.6 scipy==1.9.0 pandas==1.4.3 scikit-learn==1.0.2 statsmodels==0.13.2 python-igraph==0.9.9 pynndescent==0.5.6


In [4]:
# Single-cell AnnData object used throughout the notebook.
# NOTE(review): file name suggests raw counts after QC ("eb_raw.qc") -- confirm.
eb_h5ad_path = "/project2/gilad/jpopp/ebQTL/data/single_cell_objects/highpass/eb_raw.qc.h5ad"
adata = sc.read_h5ad(eb_h5ad_path)

In [5]:
# Per-cell label table, indexed by its "cell" column.
cellid_labels_path = "/project2/gilad/jpopp/ebQTL/data/fca/eb_cellid_labels.tsv"
cellid_annotations = pd.read_csv(cellid_labels_path, sep="\t", index_col="cell")
# Put the rows in the same order as the cells in `adata`; a cell that is
# missing from the table gets a row of NaN.
cellid_annotations = cellid_annotations.reindex(adata.obs.index)

Put cell type labels in a more convenient form

In [7]:
# The label table was reindexed to the cells in `adata`, so any cell absent
# from the table carries NaN here. Element-wise string handling would then fail
# with an opaque "'float' object has no attribute 'replace'"; fail early with a
# message that names the actual problem instead.
n_missing_labels = int(cellid_annotations['value'].isna().sum())
if n_missing_labels:
    raise ValueError(
        f"{n_missing_labels} cells have no entry in the cell type label table; "
        "cannot reformat their labels"
    )

# Replace spaces with hyphens so each label is a single token
# (e.g. "Acinar cells" -> "Acinar-cells").
cellid_annotations['value'] = cellid_annotations['value'].str.replace(' ', '-', regex=False)

In [9]:
# Store the reformatted label of every cell in the AnnData cell metadata.
cell_labels = cellid_annotations['value']
adata.obs['cellid_label'] = cell_labels

Remove all unannotated cells

In [10]:
# Boolean mask over cells: True for every label other than 'unassigned'.
has_cell_type = adata.obs['cellid_label'].ne('unassigned')
adata = adata[has_cell_type]

## Filter Samples

How many cells do we have for each individual in each cell type?

In [11]:
# Number of cells for every (cell type, donor) combination.
# The count column is named explicitly through `reset_index(name=...)`: renaming
# a column called `0` only works on pandas < 2.0, because newer versions name
# the value_counts result "count" and the rename would silently do nothing.
cell_counts = (
    adata.obs[['donor_id', 'cellid_label']]
    .groupby('cellid_label')
    .value_counts()
    .reset_index(name="n_cells_unfiltered")
    .rename(columns={'cellid_label': 'type'})
)
cell_counts

Unnamed: 0,type,donor_id,n_cells_unfiltered
0,Acinar-cells,NA18511,2034
1,Acinar-cells,NA19160,1614
2,Acinar-cells,NA19159,1614
3,Acinar-cells,NA19093,1522
4,Acinar-cells,NA18858,1245
...,...,...,...
1744,Vascular-endothelial-cells,NA19193,1
1745,Vascular-endothelial-cells,NA19209,0
1746,Vascular-endothelial-cells,NA18520,0
1747,Vascular-endothelial-cells,NA19101,0


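To get this into proper form, we need to:
- Reformat the labels: strip the `NA` prefix from the donor IDs and combine individual and cell type into a single sample identifier (`ind_type`)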

In [12]:
# Donor IDs without their "NA" prefix (e.g. "NA18511" -> "18511").
donor_number = cell_counts['donor_id'].astype(str).str.replace("NA", "", regex=False)

# Add the individual and the combined "<individual>_<cell type>" sample label,
# then keep only the summary columns, in this order.
cell_counts = cell_counts.assign(
    individual=donor_number,
    ind_type=donor_number + "_" + cell_counts['type'].astype(str),
)[['ind_type', 'individual', 'type', 'n_cells_unfiltered']]
cell_counts

Unnamed: 0,ind_type,individual,type,n_cells_unfiltered
0,18511_Acinar-cells,18511,Acinar-cells,2034
1,19160_Acinar-cells,19160,Acinar-cells,1614
2,19159_Acinar-cells,19159,Acinar-cells,1614
3,19093_Acinar-cells,19093,Acinar-cells,1522
4,18858_Acinar-cells,18858,Acinar-cells,1245
...,...,...,...,...
1744,19193_Vascular-endothelial-cells,19193,Vascular-endothelial-cells,1
1745,19209_Vascular-endothelial-cells,19209,Vascular-endothelial-cells,0
1746,18520_Vascular-endothelial-cells,18520,Vascular-endothelial-cells,0
1747,19101_Vascular-endothelial-cells,19101,Vascular-endothelial-cells,0


We'll drop any samples with fewer than 5 cells (for now they are only flagged in the `dropped` column)

In [13]:
# True for every sample (individual x cell type) that has fewer than 5 cells.
too_few_cells = cell_counts['n_cells_unfiltered'].lt(5)
cell_counts['dropped'] = too_few_cells

## Filter Cell Types

We will filter to samples with at least 5 cells, and cell types with over 25 individuals represented.

In [14]:
# Each row of `cell_counts` is one individual x cell type sample, so counting
# the rows per type among samples with at least 5 cells gives the number of
# individuals represented in that cell type.
# `reset_index(name=...)` names the count column explicitly; renaming a column
# called `0` only works on pandas < 2.0 (newer versions call it "count").
ind_counts = (
    cell_counts.loc[cell_counts['n_cells_unfiltered'] >= 5, ['type']]
    .value_counts()
    .reset_index(name="n_unfiltered")
)
# Keep cell types with more than 25 individuals represented.
ind_counts = ind_counts[ind_counts['n_unfiltered'] > 25]
ind_counts

Unnamed: 0,type,n_unfiltered
0,Acinar-cells,53
1,Ductal-cells,53
2,Retinal-cells,53
3,Metanephric-cells,53
4,Megakaryocytes,53
5,CNS-glia,53
6,CNS-neurons,53
7,Mesangial-cells,52
8,PNS-neurons,52
9,PNS-glia,52


In [15]:
# Write the per-cell-type individual counts as a tab-separated table, without the row index.
samples_per_celltype_path = "/project2/gilad/jpopp/ebQTL/data/benchmark_specificity_methods/eb_cellid/pseudobulk_tmm/samples_per_celltype.tsv"
ind_counts.to_csv(samples_per_celltype_path, sep="\t", index=False)

## Pseudobulk Aggregation

First, we can highlight which samples will be kept for QTL analysis

In [16]:
# Cell types that passed the individual-count filter.
cell_types_inc = ind_counts['type']
# A sample is kept when it is not flagged as dropped and its cell type is retained.
keep_sample = ~cell_counts['dropped'] & cell_counts['type'].isin(cell_types_inc)
samples_inc = cell_counts.loc[keep_sample, 'ind_type']

In [18]:
# Per-cell table carrying the donor, the cell type, the donor number without
# its "NA" prefix, and the "<individual>_<cell type>" sample label.
cell_subset = (
    adata.obs[['donor_id']]
    .copy()
    .assign(
        type=adata.obs['cellid_label'],
        ind=lambda df: df['donor_id'].astype(str).str.replace("NA", "", regex=False),
        sample=lambda df: df['ind'] + "_" + df['type'],
    )
)
# Keep only the cells that belong to a retained sample.
in_kept_sample = cell_subset['sample'].isin(samples_inc)
cell_subset = cell_subset[in_kept_sample]
cell_subset

Unnamed: 0_level_0,donor_id,type,ind,sample
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACCCACACGGCGTT-1,NA18870,Vascular-endothelial-cells,18870,18870_Vascular-endothelial-cells
AAACCCACAGAAGTGC-1,NA18870,Retinal-cells,18870,18870_Retinal-cells
AAACGAAAGGTATTGA-1,NA18870,CNS-neurons,18870,18870_CNS-neurons
AAACGAACACCCTCTA-1,NA19190,Retinal-cells,19190,19190_Retinal-cells
AAACGAACATGATAGA-1,NA18913,Ductal-cells,18913,18913_Ductal-cells
...,...,...,...,...
TTTGGAGCAACGGGTA-118,NA19160,PNS-glia,19160,19160_PNS-glia
TTTGGTTGTGAATGAT-118,NA19160,CNS-neurons,19160,19160_CNS-neurons
TTTGTTGAGGATATAC-118,NA18511,Acinar-cells,18511,18511_Acinar-cells
TTTGTTGGTTTGGAGG-118,NA18511,Acinar-cells,18511,18511_Acinar-cells


For pseudobulk aggregation, we're going to use the raw data

In [19]:
# Restrict the AnnData object to the cells that survived sample filtering.
kept_cells = cell_subset.index
adata = adata[kept_cells]

## Update summary tables

In [20]:
# Per-sample totals after filtering: the summed `total_counts` and the number of
# cells of every "<individual>_<cell type>" sample.
filtered_counts = (
    adata.obs[['donor_id', 'cellid_label', 'total_counts']]
    .assign(
        # Constant column; counting it per group yields the number of cells.
        n_cells_filtered=1,
        ind_type=lambda obs: (
            obs['donor_id'].astype(str).str.replace("NA", "", regex=False)
            + "_"
            + obs['cellid_label'].astype(str)
        ),
    )
    .groupby('ind_type')
    .agg({'total_counts': 'sum', 'n_cells_filtered': 'count'})
    .reset_index()
    .astype({'total_counts': 'int'})
)
filtered_counts

Unnamed: 0,ind_type,total_counts,n_cells_filtered
0,18486_Acinar-cells,10320028,392
1,18486_Adrenocortical-cells,1565897,52
2,18486_Bronchiolar-and-alveolar-epithelial-cells,80444,7
3,18486_CNS-glia,13564501,939
4,18486_CNS-neurons,13228075,1006
...,...,...,...
1408,19257_Squamous-epithelial-cells,2296582,67
1409,19257_Stellate-cells,675680,23
1410,19257_Stromal-cells,16309284,606
1411,19257_Ureteric-bud-cells,946577,30


In [21]:
# Join the post-filtering totals onto the unfiltered summary. Samples with no
# remaining cells have no match in `filtered_counts`; they get zeros. The
# `dropped` flag is then recomputed from the filtered cell counts and the table
# is sorted from the largest sample to the smallest.
cell_counts_filtered = (
    cell_counts
    .merge(filtered_counts, how='left', on='ind_type')
    .fillna({'total_counts': 0, 'n_cells_filtered': 0})
    .astype({'total_counts': 'int', 'n_cells_filtered': 'int'})
    .assign(dropped=lambda df: df['n_cells_filtered'] < 5)
    .sort_values(by="n_cells_filtered", ascending=False)
)

In [22]:
# Write the per-sample summary as a tab-separated table (row index included).
sample_summary_path = "/project2/gilad/jpopp/ebQTL/data/benchmark_specificity_methods/eb_cellid/pseudobulk_tmm/sample_summary.tsv"
cell_counts_filtered.to_csv(sample_summary_path, sep="\t")

In [23]:
# Number of individuals per cell type, counting only samples that still have
# at least 5 cells after filtering (one row of `cell_counts_filtered` per sample).
# `reset_index(name=...)` names the count column explicitly; renaming a column
# called `0` only works on pandas < 2.0 (newer versions call it "count").
ind_counts = (
    cell_counts_filtered.loc[cell_counts_filtered['n_cells_filtered'] >= 5, ['type']]
    .value_counts()
    .reset_index(name="n_filtered")
)
# Keep cell types with more than 25 individuals represented.
ind_counts = ind_counts[ind_counts['n_filtered'] > 25]
ind_counts

Unnamed: 0,type,n_filtered
0,Acinar-cells,53
1,Megakaryocytes,53
2,Retinal-cells,53
3,CNS-glia,53
4,CNS-neurons,53
5,Ductal-cells,53
6,Metanephric-cells,53
7,PNS-neurons,52
8,PNS-glia,52
9,Mesangial-cells,52


In [24]:
# Write the post-filtering individual counts per cell type as a tab-separated
# table, without the row index.
samples_per_celltype_path = "/project2/gilad/jpopp/ebQTL/data/benchmark_specificity_methods/eb_cellid/pseudobulk_tmm/samples_per_celltype.tsv"
ind_counts.to_csv(samples_per_celltype_path, sep="\t", index=False)

## Aggregation

### Aggregate raw counts

In [29]:
# Rebuild the per-cell sample labels from the (now filtered) AnnData object.
cell_subset = adata.obs[['donor_id']].copy()
# Assign the label column as a Series rather than a one-column DataFrame.
cell_subset['type'] = adata.obs['cellid_label']
cell_subset['ind'] = cell_subset['donor_id'].astype(str).str.replace("NA", "", regex=False)
cell_subset['sample'] = cell_subset['ind'] + "_" + cell_subset['type']

# Cells x samples indicator matrix with a single 1 per row (the cell's sample).
# OneHotEncoder returns a scipy sparse matrix by default. The explicit `sparse=`
# keyword is left out because it was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4.
onehot = OneHotEncoder().fit_transform(cell_subset[['sample']])
onehot

<650553x1413 sparse matrix of type '<class 'numpy.float64'>'
	with 650553 stored elements in Compressed Sparse Row format>

In [26]:
# Sum the counts over the cells of each sample: transposed expression matrix
# times the cells x samples indicator matrix.
# `@` is always a matrix product. `*` is a matrix product only for scipy.sparse
# *matrix* classes and would be element-wise for a dense ndarray or a sparse array.
# assumes adata.X is cells x genes, so the result is genes x samples -- TODO confirm
pseudobulk_sum = adata.X.transpose() @ onehot

In [27]:
# Column labels: the distinct sample names in pandas' category order.
# NOTE(review): this presumes the encoder that built `onehot` ordered its
# columns the same way -- confirm against the encoder's `categories_`.
sample_names = cell_subset['sample'].astype("category").cat.categories
# Wrap the sparse result in a DataFrame with genes as rows and samples as columns.
pseudobulk_sum = pd.DataFrame.sparse.from_spmatrix(
    data=pseudobulk_sum,
    index=adata.var_names,
    columns=sample_names,
)

In [28]:
# Write the pseudobulk table as tab-separated text; the row index is written
# under the header "gene".
pseudobulk_path = "/project2/gilad/jpopp/ebQTL/data/benchmark_specificity_methods/eb_cellid/eb_cellid.pseudobulk_tmm.tsv"
pseudobulk_sum.to_csv(pseudobulk_path, sep="\t", index_label="gene")