# Overview
SNP annotations generated from these gene programs can be found [here](https://alkesgroup.broadinstitute.org/LDSCORE/Jagadeesh_Dey_sclinker/annotations/cell_type_programs/).

The mapping between the single-cell datasets and the SNP annotation folders is indicated by the keys of the `scdatasets` dictionary. Each SNP annotation folder contains a subfolder for each cell type identified in the dataset. Each cell type folder contains a subfolder for SNP annotations generated from the 100KB, ABC-U-Roadmap all and ABC-U-Roadmap enhancer-gene linking strategies.


In [None]:
import scanpy as sc
from collections import Counter
import numpy as np
import pandas as pd

In [None]:
# Root directory of the single-cell data; the dataset paths below are built
# by prefixing it.
datapath = '/ahg/regevdata/projects/scgwas/data/singlecell/'
# Directory that receives the per-dataset pval/logfold/score CSV matrices.
filedir = datapath + 'modules/healthy/celltypeenriched/'

In [None]:
def write_matrix(adata, filename, moduletype='celltype', outdir=None):
    """Write gene x cell-subset differential-expression matrices to CSV.

    For the cell-type label ``'cell_type'`` this reads
    ``adata.uns['cell_type_DE']`` and uses its ``'names'``, ``'pvals_adj'``,
    ``'logfoldchanges'`` and ``'scores'`` entries, each a structured array
    with one field per cell subset (presumably the output of scanpy's
    ``rank_genes_groups`` -- confirm against the caller). Three matrices are
    built with one row per distinct gene in ``adata.var_names`` and one
    column per cell subset, and written as ``<filename>_pval.csv``,
    ``<filename>_logfold.csv`` and ``<filename>_score.csv``.

    Column names are ``<subset>_L<suffix>`` where ``<suffix>`` is the last
    ``_``-separated token of the label, so ``'cell_type'`` yields
    ``<subset>_Ltype``.

    Entries with no matching ranked gene keep their initial value 0.0.
    NOTE(review): this also applies to adjusted p-values, where 0 cannot be
    told apart from a maximally significant result -- confirm downstream use.

    Parameters
    ----------
    adata : object exposing ``var_names``, ``obs.columns`` and ``uns``
    filename : str
        Prefix of the three output files.
    moduletype : str
        Currently unused; kept so existing callers keep working.
    outdir : str, optional
        Directory to write into. Defaults to the module-level ``filedir``.
    """
    import os  # function-scope import: the notebook's import cell lacks os

    if outdir is None:
        outdir = filedir

    # Row order = first occurrence in adata.var_names, duplicates dropped.
    # dict.fromkeys keeps insertion order, so the CSV row order is the same
    # on every run (list(set(...)) depended on string-hash randomisation).
    genes = list(dict.fromkeys(adata.var_names))
    gene2idx = {gene: i for i, gene in enumerate(genes)}

    pvalmtxs, logfoldmtxs, scoremtxs = [], [], []

    #ctlabels = ['annot_level_2', 'annot_level_3']
    ctlabels = ['cell_type']
    print(adata.obs.columns)
    print(ctlabels)
    for ctlabel in ctlabels:
        de = adata.uns[ctlabel + '_DE']
        # dtype.names is the ordered tuple of field (cell-subset) names.
        cellsubsets = list(de['names'].dtype.names)

        shape = (len(genes), len(cellsubsets))
        pvalmtx = np.zeros(shape)
        logfoldmtx = np.zeros(shape)
        scoremtx = np.zeros(shape)

        # Fill one column per cell subset. Walking the plain per-field arrays
        # avoids indexing a structured record for every (gene, subset) pair;
        # later ranks still overwrite earlier ones for a repeated gene name.
        for col, cell_subset in enumerate(cellsubsets):
            ranked = zip(de['names'][cell_subset],
                         de['pvals_adj'][cell_subset],
                         de['logfoldchanges'][cell_subset],
                         de['scores'][cell_subset])
            for gene, pval, logfold, score in ranked:
                row = gene2idx.get(gene)
                if row is None:
                    # ranked gene that is not in adata.var_names
                    continue
                pvalmtx[row, col] = pval
                logfoldmtx[row, col] = logfold
                scoremtx[row, col] = score

        # transform matrix to dataframe
        level = ctlabel.split('_')[-1]
        columns = ["%s_L%s" % (ct, level) for ct in cellsubsets]
        pvalmtxs.append(pd.DataFrame(pvalmtx, index=genes, columns=columns))
        logfoldmtxs.append(pd.DataFrame(logfoldmtx, index=genes, columns=columns))
        scoremtxs.append(pd.DataFrame(scoremtx, index=genes, columns=columns))

    # one file per statistic, columns of all labels side by side
    outputs = (('pval', pvalmtxs), ('logfold', logfoldmtxs), ('score', scoremtxs))
    for suffix, frames in outputs:
        path = os.path.join(outdir, "%s_%s.csv" % (filename, suffix))
        pd.concat(frames, axis=1).to_csv(path)

In [None]:
# Registry of the single-cell datasets, keyed by dataset name. Each value is
# (path to the .h5ad file, name of the obs column whose distinct values are
# counted as samples/patients).
scdatasets = {
    name: (datapath + relpath, sample_column)
    for name, relpath, sample_column in [
        ('zheng_pbmc', '/blood/pbmc/pbmc-processed-annotated.h5ad', 'sample_id'),
        ('krasnow_pbmc', '/lung/krasnow/pbmc-processed-annotated.h5ad', 'patient'),
        ('ICA_bonemarrow', '/blood/bonemarrow-processed-final.h5ad', 'donor_organism.provenance.document_id'),
        ('ICA_cordblood', '/blood/cordblood-processed-final.h5ad', 'donor_organism.provenance.document_id'),
        ('adipose', '/adipose/adipose-processed-final.h5ad', 'orig.ident'),
        ('alzheimers_brain', '/z_disease/alzheimers/healthy.h5ad', 'Subject'),
        ('brain', '/brain/allen/allenbraindata-processed.h5ad', 'external_donor_name_label'),
        ('idoamit_healthy', '/z_disease/covid/covid-healthy.h5ad', 'sample'),
        ('kropski_lung', '/lung/healthykropski-annotated.h5ad', 'Sample_Name'),
        ('msbrain', '/z_disease/multiplesclerosis/healthy-processed-annotated.h5ad', 'sample'),
        ('skin', '/skin/healthypublicskindata-processed.h5ad', 'sample'),
        ('liver', '/liver/liver-processed-final.h5ad', 'orig.ident'),
        ('kidney', '/kidney/kidneyadata-processed.h5ad', 'Experiment'),
        ('heart', '/heart/heart-processed.h5ad', 'biological.individual'),
        ('xavier_colon', '/colon/healthydata.h5ad', 'subject'),
    ]
}

In [None]:
# Per-dataset summary: number of cells, number of distinct sample/patient
# labels and number of distinct cell types, followed by the totals.
cell_counts = []
cell_subsets = []
patient_counts = []
for tissue, (filename, sample_label) in scdatasets.items():
    tissueadata = sc.read(filename)
    # compute each figure once and reuse it for printing and for the totals
    n_cells = tissueadata.shape[0]
    n_patients = len(set(tissueadata.obs[sample_label]))
    n_cell_types = len(set(tissueadata.obs['cell_type']))
    print(tissue, n_cells, n_patients, n_cell_types)
    cell_counts.append(n_cells)
    patient_counts.append(n_patients)
    cell_subsets.append(n_cell_types)
# Print every total explicitly: a notebook only echoes a cell's last bare
# expression, so the total cell count was previously computed and discarded.
print(sum(patient_counts))  # total samples/patients
print(sum(cell_counts))     # total cells
print(sum(cell_subsets))    # total cell types (summed per dataset)
#sc.pl.umap(tissueadata, color='annot_level_2', save=tissue+'.pdf')

In [None]:
# Empty dict intended to hold AnnData objects keyed by dataset name; the
# assignment that would fill it in the DE loop is currently commented out.
adatas = dict()

In [None]:
# For every dataset: rank genes per cell type with a Wilcoxon test, plot the
# top genes, and write the resulting matrices to CSV via write_matrix.
cell_counts = []
for tissue, (filename, sampleid) in scdatasets.items():
    print(tissue)
    tissueadata = sc.read(filename)
    cell_counts.append(tissueadata.shape[0])

    #for ctlabel in ['annot_level_2', 'annot_level_3']:
    for ctlabel in ['cell_type']:
        print(ctlabel)
        count_col = ctlabel + '_counts'
        de_key = ctlabel + '_DE'

        # tag every cell with the number of cells sharing its label
        counts = Counter(tissueadata.obs[ctlabel])
        tissueadata.obs[count_col] = [counts[ct] for ct in tissueadata.obs[ctlabel]]

        # the test runs on a copy restricted to labels with more than 10 cells
        large_enough = tissueadata.obs[count_col] > 10
        adata = tissueadata[large_enough].copy()

        # n_genes = number of variables, i.e. every gene is ranked
        n_genes = adata.shape[1]
        sc.tl.rank_genes_groups(
            adata,
            ctlabel,
            method='wilcoxon',
            use_raw=False,
            n_genes=n_genes,
            key_added=de_key,
        )
        sc.pl.rank_genes_groups(adata, key=de_key, n_genes=25, sharey=False)

        # store the result on the unfiltered object, which write_matrix receives
        tissueadata.uns[de_key] = adata.uns[de_key]
    #adatas[tissue] = tissueadata
    write_matrix(tissueadata, tissue)
    