In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import scvi
import math
import matplotlib.pyplot as plt
import scipy.sparse
from glob import glob
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

%matplotlib inline

Global seed set to 0


In [2]:
# Scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Log the versions of the main dependencies alongside the notebook output.
sc.logging.print_header()
sc.settings.set_figure_params(facecolor="white", dpi=80)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.2 numpy==1.21.6 scipy==1.9.0 pandas==1.4.3 scikit-learn==1.0.2 statsmodels==0.13.2 python-igraph==0.9.9 pynndescent==0.5.6


First, we need raw counts for each cell on the trajectory

In [3]:
# Two inputs: the full dataset, and the lineage object whose cell names and
# `stage` annotation are used in the next cell to subset and label the cells.
raw_counts_path = "/project2/gilad/jpopp/ebQTL/data/single_cell_objects/highpass/eb_raw.qc.h5ad"
cm_lineage_path = "/project2/gilad/jpopp/ebQTL/data/trajectory_inference/cm_lineage/eb_cm_lineage.pseudotime.adata"

adata_full = sc.read_h5ad(raw_counts_path)
adata_cmlineage = sc.read_h5ad(cm_lineage_path)

In [4]:
# Keep only the cells that are present in the lineage object.
# Subsetting an AnnData returns a view; `.copy()` makes it a standalone object so
# that the `.obs` assignment below does not write to a view (which forces an
# implicit copy and emits a warning).
adata = adata_full[adata_cmlineage.obs_names].copy()
# The assignment aligns on the cell names shared by both objects.
adata.obs['stage'] = adata_cmlineage.obs['stage']

  adata.obs['stage'] = adata_cmlineage.obs['stage']
  next(self.gen)


In [6]:
# Drop the references to the two input objects; only `adata` is used from here on.
del adata_full, adata_cmlineage

## Filter Samples

How many cells do we have for each individual in each cell type?

In [8]:
# Number of cells per (stage, donor) combination.
# `reset_index(name=...)` names the count column explicitly instead of renaming a
# column labelled `0`: that label only exists when the value_counts Series is
# unnamed, so the rename would silently do nothing on pandas versions that name
# the Series.
cell_counts = (
    adata.obs[['donor_id', 'stage']]
    .groupby('stage')
    .value_counts()
    .reset_index(name="n_cells_unfiltered")
    .rename(columns={'stage': 'type'})
)
cell_counts

Unnamed: 0,type,donor_id,n_cells_unfiltered
0,IPSC,NA18858,14814
1,IPSC,NA18907,7298
2,IPSC,NA19153,4894
3,IPSC,NA19144,4431
4,IPSC,NA19127,2687
...,...,...,...
260,CM,NA19193,0
261,CM,NA19209,0
262,CM,NA19114,0
263,CM,NA18520,0


To get this into proper form, we need to:
- Reformat individual IDs (strip the `NA` prefix) and label each sample as `<individual>_<cell type>`

In [9]:
# Donor IDs with the "NA" prefix removed.
individual_ids = [donor.replace("NA", "") for donor in cell_counts['donor_id']]
cell_counts = cell_counts.assign(individual=individual_ids)
# Sample label: "<individual>_<cell type>".
cell_counts = cell_counts.assign(
    ind_type=cell_counts['individual'].astype(str) + "_" + cell_counts['type'].astype(str)
)
# Keep only the summary columns, with the sample label first.
cell_counts = cell_counts.loc[:, ['ind_type', 'individual', 'type', 'n_cells_unfiltered']]
cell_counts

Unnamed: 0,ind_type,individual,type,n_cells_unfiltered
0,18858_IPSC,18858,IPSC,14814
1,18907_IPSC,18907,IPSC,7298
2,19153_IPSC,19153,IPSC,4894
3,19144_IPSC,19144,IPSC,4431
4,19127_IPSC,19127,IPSC,2687
...,...,...,...,...
260,19193_CM,19193,CM,0
261,19209_CM,19209,CM,0
262,19114_CM,19114,CM,0
263,18520_CM,18520,CM,0


We'll drop any samples with fewer than 5 cells

In [10]:
# Flag samples (individual x cell type) that have fewer than 5 cells.
cell_counts['dropped'] = cell_counts['n_cells_unfiltered'].lt(5)

## Filter Cell Types

We will filter to samples with at least 5 cells, and cell types with over 25 individuals represented.

In [11]:
# Count, per cell type, the samples that have at least 5 cells.
# The count column is named via `reset_index(name=...)` rather than by renaming a
# column labelled `0`, which only exists when the value_counts Series is unnamed.
ind_counts = (
    cell_counts.loc[cell_counts['n_cells_unfiltered'] >= 5, ['type']]
    .value_counts()
    .reset_index(name="n_unfiltered")
)
# Keep cell types with more than 25 such samples.
ind_counts = ind_counts[ind_counts['n_unfiltered'] > 25]
ind_counts

Unnamed: 0,type,n_unfiltered
0,MESO,48
1,PROG,48
2,MESENDO,44
3,CM,42
4,IPSC,36


In [12]:
# NOTE: a later cell writes `ind_counts` to this same path again.
samples_per_celltype_path = "/project2/gilad/jpopp/ebQTL/data/static_qtl_calling/eb_cmstages/pseudobulk_tmm/samples_per_celltype.tsv"
ind_counts.to_csv(samples_per_celltype_path, index=False, sep="\t")

## Pseudobulk Aggregation

First, we can highlight which samples will be kept for QTL analysis

In [13]:
cell_types_inc = ind_counts['type']
# A sample is kept when it was not flagged as dropped and its cell type passed the filter.
is_kept = ~cell_counts['dropped'] & cell_counts['type'].isin(cell_types_inc)
samples_inc = cell_counts.loc[is_kept, 'ind_type']

In [14]:
# Per-cell table with the sample label ("<individual>_<cell type>") of every cell.
cell_subset = adata.obs[['donor_id']].copy()
# Assign the `stage` Series itself rather than a one-column DataFrame.
cell_subset['type'] = adata.obs['stage']
cell_subset['ind'] = cell_subset['donor_id'].astype(str).str.replace("NA", "", regex=False)
# Cast the cell type to str before concatenating, as done when `ind_type` is built
# for the summary tables, so the labels do not depend on the dtype of `stage`.
cell_subset['sample'] = cell_subset['ind'] + "_" + cell_subset['type'].astype(str)
# Keep only cells that belong to a retained sample.
cell_subset = cell_subset[cell_subset['sample'].isin(samples_inc)]
cell_subset

Unnamed: 0_level_0,donor_id,type,ind,sample
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACGAAAGTAGATCA-1,NA18913,MESENDO,18913,18913_MESENDO
AAACGAACATGATAGA-1,NA18913,MESENDO,18913,18913_MESENDO
AAACGAATCTCAGGCG-1,NA18913,IPSC,18913,18913_IPSC
AAAGGATTCTAAGCCA-1,NA18913,IPSC,18913,18913_IPSC
AAAGGTAAGTGGCGAT-1,NA18913,IPSC,18913,18913_IPSC
...,...,...,...,...
TTTGGAGGTTAAACCC-118,NA18858,MESENDO,18858,18858_MESENDO
TTTGGAGGTTGCAAGG-118,NA18511,IPSC,18511,18511_IPSC
TTTGGAGTCCGTTGGG-118,NA18858,IPSC,18858,18858_IPSC
TTTGGTTTCGAAGCCC-118,NA18511,IPSC,18511,18511_IPSC


For pseudobulk aggregation, we're going to use the raw data

In [15]:
# Restrict the AnnData object to the cells of the retained samples (all genes kept).
adata = adata[cell_subset.index, :]

## Update summary tables

In [16]:
# Per-sample totals after filtering: summed `total_counts` and number of cells.
filtered_counts = adata.obs[['donor_id', 'stage', 'total_counts']].copy()
# Sample label "<individual>_<stage>", with the "NA" prefix stripped from the donor ID.
filtered_counts['ind_type'] = [
    donor.replace("NA", "") + "_" + str(stage)
    for donor, stage in zip(filtered_counts['donor_id'], filtered_counts['stage'])
]
filtered_counts = (
    filtered_counts
    .assign(n_cells_filtered=1)  # one row per cell, so counting this column counts cells
    .groupby('ind_type')
    .agg({'total_counts': 'sum', 'n_cells_filtered': 'count'})
    .reset_index()
    .astype({'total_counts': 'int'})
)
filtered_counts

Unnamed: 0,ind_type,total_counts,n_cells_filtered
0,18486_CM,1773373,46
1,18486_IPSC,78713377,1178
2,18486_MESENDO,44684210,780
3,18486_MESO,7729227,192
4,18486_PROG,4198424,95
...,...,...,...
213,19225_PROG,4477989,89
214,19257_CM,1885507,39
215,19257_MESENDO,845438,18
216,19257_MESO,1979084,120


In [17]:
# Attach the post-filtering totals to the per-sample table. Samples with no cells
# left have no match in `filtered_counts`; their totals are filled with 0.
cell_counts_filtered = (
    cell_counts
    .merge(filtered_counts, how='left', on='ind_type')
    .fillna({'total_counts': 0, 'n_cells_filtered': 0})
    .astype({'total_counts': 'int', 'n_cells_filtered': 'int'})
    # Recompute the flag from the post-filtering cell counts.
    .assign(dropped=lambda df: df['n_cells_filtered'] < 5)
    .sort_values(by="n_cells_filtered", ascending=False)
)

In [18]:
# Tab-separated per-sample summary; the row index is written as the first column.
sample_summary_path = "/project2/gilad/jpopp/ebQTL/data/static_qtl_calling/eb_cmstages/pseudobulk_tmm/sample_summary.tsv"
cell_counts_filtered.to_csv(sample_summary_path, sep="\t")

In [19]:
# Count, per cell type, the samples that still have at least 5 cells after filtering.
# The count column is named via `reset_index(name=...)` rather than by renaming a
# column labelled `0`, which only exists when the value_counts Series is unnamed.
ind_counts = (
    cell_counts_filtered.loc[cell_counts_filtered['n_cells_filtered'] >= 5, ['type']]
    .value_counts()
    .reset_index(name="n_filtered")
)
# Keep cell types with more than 25 such samples.
ind_counts = ind_counts[ind_counts['n_filtered'] > 25]
ind_counts

Unnamed: 0,type,n_filtered
0,MESO,48
1,PROG,48
2,MESENDO,44
3,CM,42
4,IPSC,36


In [20]:
# NOTE: an earlier cell writes to this same path; this write replaces that file.
samples_per_celltype_path = "/project2/gilad/jpopp/ebQTL/data/static_qtl_calling/eb_cmstages/pseudobulk_tmm/samples_per_celltype.tsv"
ind_counts.to_csv(samples_per_celltype_path, index=False, sep="\t")

## Aggregation

### Aggregate raw counts

In [21]:
# Rebuild the per-cell sample labels from the filtered AnnData object.
cell_subset = adata.obs[['donor_id']].copy()
# Assign the `stage` Series itself rather than a one-column DataFrame.
cell_subset['type'] = adata.obs['stage']
cell_subset['ind'] = cell_subset['donor_id'].astype(str).str.replace("NA", "", regex=False)
# Cast the cell type to str so the concatenation does not depend on the dtype of `stage`.
cell_subset['sample'] = cell_subset['ind'] + "_" + cell_subset['type'].astype(str)
# Cells x samples indicator matrix. OneHotEncoder returns a sparse matrix by
# default, so the `sparse=True` keyword (deprecated, then removed, in later
# scikit-learn releases) is not needed.
onehot = OneHotEncoder().fit_transform(cell_subset[['sample']])
onehot

<83825x218 sparse matrix of type '<class 'numpy.float64'>'
	with 83825 stored elements in Compressed Sparse Row format>

In [22]:
# (genes x cells) @ (cells x samples): sums each gene's counts over the cells of each sample.
# `@` is always matrix multiplication, whereas `*` is a matrix product only for the
# scipy.sparse *matrix* classes and is element-wise for arrays.
pseudobulk_sum = adata.X.transpose() @ onehot

In [23]:
# Label the sparse result: rows are genes, columns are the sample labels.
# The columns use the category order of `sample`, which is assumed to match the
# column order produced by the one-hot encoding — TODO confirm.
sample_labels = cell_subset['sample'].astype("category").cat.categories
pseudobulk_sum = pd.DataFrame.sparse.from_spmatrix(
    pseudobulk_sum,
    index=adata.var_names,
    columns=sample_labels,
)

In [24]:
# Write the gene x sample table as TSV; the gene index goes into a column named "gene".
pseudobulk_path = "/project2/gilad/jpopp/ebQTL/data/static_qtl_calling/eb_cmstages/pseudobulk_tmm/eb_cmstages.pseudobulk_tmm.tsv"
pseudobulk_sum.to_csv(pseudobulk_path, index_label="gene", sep="\t")