In [1]:
import pandas as pd
import anndata as ad
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
import numpy as np
from scipy.stats import median_abs_deviation

# Global plotting defaults for scanpy and matplotlib.
sc.settings.set_figure_params(dpi=80, facecolor="white", frameon=False)
plt.rcParams.update({'figure.figsize': (4, 4)})

# Compound names matched against `sm_name` to build the `positive_ctr` flag.
controls2 = ['Dabrafenib', 'Belinostat']

# Root directory from which the input/output paths are built.
main_dir = '../'

def plot_CDF(data, title='', figsize=(4,4)):
    """Draw the empirical cumulative distribution of `data` and show it.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of values; it is sorted before plotting.
    title : str
        Title placed above the axes.
    figsize : tuple
        Figure size forwarded to `plt.subplots`.
    """
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    n_points = len(data)
    sorted_values = np.sort(data)
    # i-th smallest value gets cumulative probability i / n
    cumulative_prob = np.arange(1, n_points + 1) / n_points

    # Points only (no connecting line), one marker per observation
    ax.plot(sorted_values, cumulative_prob, marker='.', linestyle='none')
    ax.set(xlabel='Value', ylabel='Cumulative Probability', title=title)
    ax.grid(True)
    plt.show()


In [9]:
# Load the single-cell AnnData and use its 'counts' layer as the working matrix.
# `ad.read` is a deprecated alias that newer anndata releases no longer provide;
# `ad.read_h5ad` is the supported reader for .h5ad files.
sc_counts = ad.read_h5ad(f'{main_dir}/input/sc_counts.h5ad')
sc_counts.X = sc_counts.layers['counts']
sc_counts.var_names_make_unique()
# Flag cells whose `sm_name` is one of the compounds listed in `controls2`.
sc_counts.obs['positive_ctr'] = sc_counts.obs.sm_name.isin(controls2)



In [None]:
# merge T cells: collapse the three T-cell subtypes into a single 'T cells' label
CELL_TYPES = ['NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells', 'B cells', 'Myeloid cells']
T_cell_types = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']
cell_type_map = {cell_type: 'T cells' if cell_type in T_cell_types else cell_type for cell_type in CELL_TYPES}
# Map the merged label onto itself so that re-running this cell is a no-op.
# Without this entry a second run would turn every 'T cells' value into NaN,
# because `.map` yields NaN for keys missing from the dictionary.
cell_type_map['T cells'] = 'T cells'
# Labels outside CELL_TYPES (and 'T cells') still become NaN, as before.
sc_counts.obs['cell_type'] = sc_counts.obs['cell_type'].map(cell_type_map)

In [None]:
# Sanity check: list the distinct cell-type labels currently stored in `obs`.
sc_counts.obs['cell_type'].unique()

In [None]:
# Keep only the second '-'-separated token of each plate name, stored as a categorical.
sc_counts.obs['plate_name'] = (
    sc_counts.obs['plate_name']
    .astype(str)
    .map(lambda name: name.split('-')[1])
    .astype('category')
)
# Build one identifier per (plate, well, cell type) combination: "<plate>_<well>_<cell type>".
# It is stored as a categorical because the pseudobulk step groups on it.
sc_counts.obs['plate_well_cell_type'] = (
    sc_counts.obs['plate_name'].astype('str')
    + '_' + sc_counts.obs['well'].astype('str')
    + '_' + sc_counts.obs['cell_type'].astype('str')
).astype('category')

# QC + cell type annotation: original

In [None]:
# pbmc_markers = pd.read_csv(f"{main_dir}/output/decoupler_final_labels.csv")

In [4]:
if False:
    # Disabled: decoupler-based cell type annotation, kept for reference only.
    # sc.pp.filter_genes(adata, min_cells=100)
    # adata = adata[:,~adata.var.index.str.startswith('MT-')].copy()

    import decoupler as dc
    pbmc_markers = pd.read_csv(f"{main_dir}/output/decoupler_final_labels.csv")

    def run_dc(adata, labels, resolution, meta_name):
        """Annotate Leiden clusters with a cell type using decoupler ORA scores.

        Parameters
        ----------
        adata : AnnData
            Input object; it is copied, so the caller's object is not modified.
            Must carry a `leiden_<resolution>` column in `.obs`.
        labels : DataFrame
            Marker table passed as `net`, with columns 'cell_type' (source)
            and 'genesymbol' (target).
        resolution
            Suffix of the Leiden clustering column to annotate.
        meta_name
            Name of the `.obs` column that receives the annotation.

        Returns
        -------
        AnnData
            The copy, with `.obs[str(meta_name)]` filled in per cluster.
        """
        adata = adata.copy()
        cluster_key = 'leiden_' + str(resolution)

        ## Run ORA:
        dc.run_ora(
            mat=adata,
            net=labels,
            source='cell_type',
            target='genesymbol',
            min_n=3,
            verbose=True,
        )

        acts = dc.get_acts(adata, obsm_key='ora_estimate')
        # We need to remove inf and set them to the maximum value observed
        acts_v = acts.X.ravel()
        max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
        acts.X[~np.isfinite(acts.X)] = max_e

        # We can scale the obtained activities for better visualizations
        sc.pp.scale(acts)

        df = dc.rank_sources_groups(acts, groupby=cluster_key, reference='rest', method='wilcoxon')

        # First row per cluster is used as its label.
        # NOTE(review): assumes rank_sources_groups returns rows ordered best-first
        # within each group — confirm against the decoupler documentation.
        annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
        adata.obs[str(meta_name)] = [annotation_dict[clust] for clust in adata.obs[cluster_key]]

        return adata

    # adata = run_dc(adata, labels= pbmc_markers, resolution=2.0, meta_name="cell_type")

In [8]:
# `pbmc_markers` is otherwise only assigned inside the disabled `if False:` cell,
# so on a fresh kernel this cell would raise NameError. Load the marker table
# here so the display works under Restart & Run All.
pbmc_markers = pd.read_csv(f"{main_dir}/output/decoupler_final_labels.csv")
pbmc_markers

Unnamed: 0,genesymbol,canonical_marker,cell_type,germ_layer,human,human_sensitivity,human_specificity,mouse,mouse_sensitivity,mouse_specificity,ncbi_tax_id,organ,ubiquitiousness
0,LYZ,True,Myeloid cells,Mesoderm,True,0.442177,0.021347,False,0.000000,0.000000,9606.0,Immune system,0.008
1,FABP4,True,Myeloid cells,Mesoderm,True,0.034014,0.009195,True,0.023810,0.033263,9606.0,Immune system,0.028
2,ZBTB16,True,NK cells,Mesoderm,True,0.000000,0.012187,True,0.000000,0.000000,9606.0,Immune system,0.007
3,DPP4,True,NK cells,Mesoderm,True,0.000000,0.013470,True,0.000000,0.021432,9606.0,Immune system,0.020
4,DPP4,True,Myeloid cells,Mesoderm,True,0.000000,0.013793,True,0.428571,0.020202,9606.0,Immune system,0.020
...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,LDHB,True,T cells CD4+,,,,,,,,,,
264,TPT1,True,T cells CD4+,,,,,,,,,,
265,TRAC,True,T cells CD4+,,,,,,,,,,
266,CD3D,True,T cells CD4+,,,,,,,,,,


# QC

In [None]:
import sctk
# Run sctk's QC helper on `sc_counts`; the return value is not captured, so any
# result must be written onto the object itself.
# NOTE(review): presumably this adds the QC columns read by later cells
# (obs: n_counts, n_genes, percent_mito, percent_ribo, percent_hb; var: n_cells)
# — confirm against the sctk documentation.
sctk.calculate_qc(sc_counts)
sc_counts

In [None]:
# Total counts vs. detected genes per cell, coloured by cell type.
sc.set_figure_params(figsize=(6, 6))
p3 = sc.pl.scatter(
    sc_counts,
    x="n_counts",
    y="n_genes",
    color="cell_type",
    size=30,
    alpha=0.5,
)

In [None]:
# One panel per cell type: n_counts vs. n_genes, coloured by percent_mito.
sc.set_figure_params(figsize=(4,4))

cell_types = list(sc_counts.obs.cell_type.unique())
n_cols = 2
# Size the grid from the data instead of hard-coding 2x2, so that more than
# four labels no longer raise IndexError. Four labels still give a 2x2 grid
# with the original (14, 10) figure size.
n_rows = max(1, -(-len(cell_types) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)

for ax, cell_type in zip(axes.flat, cell_types):
    # Only the obs table is needed for plotting, so avoid slicing the whole AnnData.
    obs_subset = sc_counts.obs[sc_counts.obs.cell_type == cell_type]

    scatter = ax.scatter(
        obs_subset.n_counts,
        obs_subset.n_genes,
        c=obs_subset.percent_mito,
        s=20,
        alpha=.3,
        cmap='coolwarm',
    )
    ax.set_xlabel('n_counts')
    ax.set_ylabel('n_genes')
    # Shared, fixed axis limits keep the panels comparable.
    ax.set_xlim([0, 32000])
    ax.set_ylim([0, 6000])
    ax.set_title(cell_type)
    fig.colorbar(scatter, ax=ax, label='percent_mito')  # Add a color bar with label

# Hide any unused panels in the last row.
for unused_ax in axes.flat[len(cell_types):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Run sctk's per-cell QC on `sc_counts` (return value not captured).
# NOTE(review): presumably this writes the `cell_passed_qc` flag that is plotted
# further down — confirm against the sctk documentation.
sctk.cellwise_qc(sc_counts)

In [None]:
# Display sctk's `default_metric_params_df` for reference.
# NOTE(review): presumably the default thresholds used by the QC helpers — confirm.
sctk.default_metric_params_df

In [None]:
# Per-cell QC metrics: print the observed minimum and maximum of each one.
for qc_metric in ('n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'percent_hb'):
    metric_values = sc_counts.obs[qc_metric]
    print(f'{qc_metric}:', metric_values.min(), metric_values.max())

## Automatic filtering

In [None]:
# QC metrics passed to sctk's QC clustering helper.
metrics_list = ["log1p_n_counts", "log1p_n_genes", "percent_mito", "percent_ribo", "percent_hb"]
# NOTE(review): presumably this adds the `qc_cluster` labels and the `X_umap_qc`
# embedding plotted in the next cell — confirm against the sctk documentation.
sctk.generate_qc_clusters(sc_counts, metrics = metrics_list)
sc_counts

In [None]:
# Plot the "X_umap_qc" embedding twice: coloured by `qc_cluster` and by `log1p_n_counts`.
sc.pl.embedding(sc_counts, "X_umap_qc", color=["qc_cluster", "log1p_n_counts"], color_map="OrRd")


In [None]:
# Run sctk's cluster-level QC on `sc_counts` (return value not captured).
# NOTE(review): presumably this writes the `cluster_passed_qc` flag used in the
# next cell — confirm against the sctk documentation.
sctk.clusterwise_qc(sc_counts)
sc_counts

In [None]:
#this won't be necessary in scanpy 1.10.0, booleans will become directly plottable
# Until then, store integer copies of the boolean QC flags and colour by those.
qc_flag_columns = ['cell_passed_qc', 'cluster_passed_qc']
for flag in qc_flag_columns:
    sc_counts.obs[f"{flag}_int"] = sc_counts.obs[flag].astype(int)
sc.pl.embedding(
    sc_counts,
    "X_umap_qc",
    color=[f"{flag}_int" for flag in qc_flag_columns],
)


The discrepancy between the cluster-level outliers and the single-cell outliers is extreme, so we fall back to manual filtering.

## Manual filtering

### Cell wise

In [None]:
# Empirical CDFs of the QC metrics inspected for manual thresholds.
for qc_metric in ('n_counts', 'n_genes', 'percent_hb'):
    plot_CDF(sc_counts.obs[qc_metric], qc_metric)

In [None]:
# Per-cell QC metrics: observed (min, max), printed one metric per line.
cell_qc_metrics = ['n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'percent_hb']
for metric_name in cell_qc_metrics:
    print(f'{metric_name}:', sc_counts.obs[metric_name].min(), sc_counts.obs[metric_name].max())

In [None]:
# Boolean mask of cells whose `percent_hb` exceeds 0.2; the sum is the number of flagged cells.
# NOTE(review): the units of `percent_hb` (fraction vs. percent) are not visible
# here — confirm that 0.2 is the intended cutoff.
filter_percent_hb = sc_counts.obs.percent_hb>.2
filter_percent_hb.sum()


In [None]:
# Which plates do the cells flagged by `filter_percent_hb` come from?
sc_counts[filter_percent_hb,:].obs.plate_name.unique()


In [None]:
# Boolean mask of cells with fewer than 1000 in `n_genes`; the sum is the number of flagged cells.
filter_n_genes = sc_counts.obs.n_genes<1000
filter_n_genes.sum()

In [None]:
# Show the `sm_name` of each cell flagged by `filter_n_genes`.
sc_counts[filter_n_genes,:].obs.sm_name

### Gene wise

In [None]:
# for genes: observed minimum and maximum of the per-gene `n_cells` column in `.var`
print('n_cells:', sc_counts.var.n_cells.min(), sc_counts.var.n_cells.max())

In [None]:
# Keep a gene only if it is detected (non-zero) in at least `p_threshold` cells
# on EVERY plate.
p_threshold = 100
genes_tokeep_mask = np.ones(sc_counts.shape[1], dtype=bool)
for plate in sc_counts.obs['plate_name'].unique():
    mask = sc_counts.obs['plate_name'] == plate
    sc_counts_subset = sc_counts[mask,:]
    # Number of cells on this plate with a non-zero count, per gene
    n_cells_by_count = (sc_counts_subset.X!=0).sum(axis=0)

    # np.ravel flattens the (1, n_genes) result of the sum into a 1-D mask
    to_keep = np.ravel(n_cells_by_count>=p_threshold)

    # Combine boolean masks with a logical AND instead of arithmetic `*`
    genes_tokeep_mask = genes_tokeep_mask & to_keep
    # Report the running count AFTER this plate is applied; printing before the
    # update meant the final number of kept genes was never shown.
    print(plate, genes_tokeep_mask.sum())
    

## Actual filtering

In [None]:
# Drop the flagged cells (too few genes or high percent_hb) and the genes that
# failed the per-plate detection threshold.
# `.copy()` materializes the subset: slicing an AnnData returns a view that
# references the full parent object, and the pseudobulk step below reads from it.
sc_counts = sc_counts[(~filter_n_genes) & (~filter_percent_hb), genes_tokeep_mask].copy()


# Pseudobulking counts 

In [None]:
import anndata
import pandas as pd
import numpy as np
import sklearn
import scipy
import warnings

# Silence two specific, known warnings by message so the notebook output stays
# readable; all other warnings are still shown.
# 1) pandas FutureWarning about the `na_action` default of Categorical.map
warnings.filterwarnings('ignore', category=FutureWarning, message="The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated")
# 2) UserWarning about a missing 'c' colormapping argument
warnings.filterwarnings('ignore', category=UserWarning, message="No data for colormapping provided via 'c'")

import anndata as ad
import scanpy as sc

import matplotlib.pyplot as plt

import os, binascii
from scipy import sparse



In [None]:

def sum_by(adata: ad.AnnData, col: str) -> ad.AnnData:
    """Sum the rows of `adata.X` within each category of `adata.obs[col]`.

    Adapted from this forum post:
    https://discourse.scverse.org/t/group-sum-rows-based-on-jobs-feature/371/4

    Parameters
    ----------
    adata
        Input AnnData. `adata.obs[col]` must have a categorical dtype.
    col
        Name of the categorical `.obs` column to group by.

    Returns
    -------
    AnnData
        One row per category of `col` (in category order). `.X` holds the
        per-category sums and `.var` is taken from `adata`. `.obs` holds `col`
        plus every other obs column that has exactly one value per category,
        in the same order as in `adata.obs`; its index is reset to string
        positions.

    Raises
    ------
    AssertionError
        If `adata.obs[col]` is not categorical, or if joining the carried-over
        obs columns would change the row index.
    """
    # `pd.api.types.is_categorical_dtype` is deprecated; the isinstance check is
    # the replacement recommended by pandas.
    assert isinstance(adata.obs[col].dtype, pd.CategoricalDtype)

    # sum `.X` entries for each unique value in `col`
    cat = adata.obs[col].values

    # (n_categories x n_obs) indicator matrix: entry (k, i) is True when
    # observation i belongs to category k, so `indicator @ X` sums rows per group.
    indicator = sparse.coo_matrix(
        (
            np.broadcast_to(True, adata.n_obs),
            (cat.codes, np.arange(adata.n_obs))
        ),
        shape=(len(cat.categories), adata.n_obs),
    )

    sum_adata = ad.AnnData(
        indicator @ adata.X,
        var=adata.var,
        obs=pd.DataFrame(index=cat.categories),
    )

    # copy over `.obs` values that have a one-to-one-mapping with `.obs[col]`
    # Iterate in the original column order (de-duplicated) rather than through a
    # `set`, whose ordering varies between runs and made the output column order
    # non-deterministic.
    obs_cols = list(dict.fromkeys(c for c in adata.obs.columns if c != col))

    nunique_in_col = adata.obs[col].nunique()
    one_to_one_mapped_obs_cols = [
        other_col
        for other_col in obs_cols
        if len(adata.obs[[col, other_col]].drop_duplicates()) == nunique_in_col
    ]

    joining_df = adata.obs[[col] + one_to_one_mapped_obs_cols].drop_duplicates().set_index(col)
    joined_obs = sum_adata.obs.join(joining_df)
    # The join must not add, drop or reorder rows.
    assert (sum_adata.obs.index == joined_obs.index).all()
    sum_adata.obs = joined_obs
    sum_adata.obs.index.name = col
    sum_adata.obs = sum_adata.obs.reset_index()
    sum_adata.obs.index = sum_adata.obs.index.astype('str')

    return sum_adata

In [None]:
# actual bulking: one pseudobulk row per (plate, well, cell type) group
bulk_adata = sum_by(sc_counts, 'plate_well_cell_type')

# Number of cells behind each pseudobulk row. Align the counts to the bulk rows
# by group label instead of assigning `groupby(...).size().values` positionally,
# which relied on the implicit group order and on the `observed` default of
# groupby for categoricals.
cells_per_group = sc_counts.obs['plate_well_cell_type'].astype(str).value_counts()
bulk_adata.obs['cell_count'] = (
    bulk_adata.obs['plate_well_cell_type']
    .astype(str)
    .map(cells_per_group)
    .fillna(0)
    .astype('int64')
    .values
)

# Densify the summed matrix; skipped if it is already dense.
if sparse.issparse(bulk_adata.X):
    bulk_adata.X = bulk_adata.X.toarray()

# Fraction of exactly-zero entries in the pseudobulk matrix
print('ratio of missingness' , (bulk_adata.X==0).sum()/bulk_adata.X.size)
bulk_adata.var = bulk_adata.var.reset_index()

In [None]:
# Move the 'index' column back into the index of `.var`, modifying the frame in place.
# NOTE(review): this expects a column literally named 'index', i.e. the var index
# had no name when `reset_index()` was called; re-running the cell raises KeyError
# because the column no longer exists afterwards.
bulk_adata.var.set_index('index', inplace=True)
bulk_adata.var.head()

In [None]:
# Sanity check: distinct cell-type labels present in the pseudobulk obs table.
bulk_adata.obs.cell_type.unique()

In [None]:
# Save the pseudobulk AnnData to disk.
# NOTE(review): the `output/preprocess` directory is not created anywhere in
# this notebook — confirm that it exists before running.
bulk_adata.write(f'{main_dir}/output/preprocess/bulk_adata.h5ad')