In [None]:
import scanpy as sc
import pegasus as pg
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF

In [None]:
# Output directory for this notebook: the per-dataset CSVs
# (<dataset>_cellprograms.csv, <dataset>_geneprograms.csv,
# <dataset>_correlation_cellprograms.csv) are written here and read back
# from here by later cells.
outdir='/ahg/regevdata/projects/scgwas/data/singlecell/modules/nmf/healthy'

In [None]:
# Registry of the single-cell datasets processed by this notebook.
# Maps a short dataset key -> (path to the dataset's .h5ad file, label name).
# The key doubles as the file-name prefix of every output written for that dataset.
# The second tuple element is unpacked as `sample_label` by the loops in this
# notebook but is not used by any of the cells visible here; presumably it names
# the obs column identifying the sample/donor -- TODO confirm.
scdatasets = {
    'zheng_pbmc' : ('/ahg/regevdata/projects/scgwas/data/singlecell/blood/pbmc/pbmc-processed-annotated.h5ad', 'sample_id'),
    'ICA_bonemarrow' : ('/ahg/regevdata/projects/scgwas/data/singlecell/blood/bonemarrow-processed-final.h5ad', 'donor_organism.provenance.document_id'),
    'ICA_cordblood' : ('/ahg/regevdata/projects/scgwas/data/singlecell/blood/cordblood-processed-final.h5ad', 'donor_organism.provenance.document_id'),
    'krasnow_pbmc' : ('/ahg/regevdata/projects/scgwas/data/singlecell/lung/krasnow/pbmc-processed-annotated.h5ad', 'patient'),
    
    'allen_brain' : ('/ahg/regevdata/projects/scgwas/data/singlecell/brain/allen/allenbraindata-processed.h5ad', 'external_donor_name_label'),
    'alzheimers_brain' : ('/ahg/regevdata/projects/scgwas/data/singlecell/z_disease/alzheimers/healthy.h5ad', 'Subject'),
    'rowitch_brain': ('/ahg/regevdata/projects/scgwas/data/singlecell/z_disease/multiplesclerosis/healthy-processed-annotated.h5ad', 'sample'),

    'kropski_lung' : ('/ahg/regevdata/projects/scgwas/data/singlecell/lung/healthykropski-annotated.h5ad', 'Sample_Name'),
    'krasnow_lung' : ('/ahg/regevdata/projects/scgwas/data/singlecell/lung/krasnow/lung-processed-annotated.h5ad', 'patient'),

    'heart' : ('/ahg/regevdata/projects/scgwas/data/singlecell/heart/heart-processed.h5ad', 'biological.individual'),
    
    'skin':('/ahg/regevdata/projects/scgwas/data/singlecell/skin/healthypublicskindata-processed.h5ad', 'sample'),

    'xavier_colon' : ('/ahg/regevdata/projects/scgwas/data/singlecell/modules/nmf/healthy_colon.h5ad', 'subject'),
    'xavier_ens' : ('/ahg/regevdata/projects/scgwas/data/singlecell/ens/ensdata-processed-annotated.h5ad', 'Patient_ID'),
    
    'adipose' : ('/ahg/regevdata/projects/scgwas/data/singlecell/adipose/adipose-processed-final.h5ad', 'orig.ident'),
    'liver' : ('/ahg/regevdata/projects/scgwas/data/singlecell/liver/liver-processed-final.h5ad', 'orig.ident'),
    'kidney' : ('/ahg/regevdata/projects/scgwas/data/singlecell/kidney/kidneyadata-processed.h5ad', 'Experiment'),
    
    'idoamit_healthy' : ('/ahg/regevdata/projects/scgwas/data/singlecell/z_disease/covid/covid-healthy.h5ad', 'sample'),
}

In [None]:
# Fit one NMF decomposition per dataset and write both factor matrices to
# `outdir` as <dataset>_cellprograms.csv (cells x programs) and
# <dataset>_geneprograms.csv (genes x programs).
for tissue, (filename, sample_label) in scdatasets.items():
    print(tissue)
    tissueadata = sc.read(filename)
    print(tissueadata.shape)

    # Number of programs = distinct 'annot_level_2' labels (counted on the
    # full object, before the gene subset below) plus 10.
    num_celltypes = len(set(tissueadata.obs['annot_level_2']))
    model = NMF(
        n_components=num_celltypes + 10,
        init='random',
        random_state=0,
    )

    # Restrict to the genes flagged 'highly_variable' and factorise the
    # 'counts' layer after dividing it by its global maximum.
    tissueadata = tissueadata[:, tissueadata.var['highly_variable']]
    X = tissueadata.layers['counts']
    X = X / np.max(X)

    W = model.fit_transform(X)
    # Both factors share the same program labels: NMF_0 .. NMF_{k-1}.
    program_names = ['NMF_%d' % k for k in range(W.shape[1])]

    W = pd.DataFrame(W, index=tissueadata.obs_names, columns=program_names)
    W.to_csv(outdir + '/%s' % tissue + '_cellprograms.csv')

    H = pd.DataFrame(
        model.components_.T,
        index=tissueadata.var_names,
        columns=program_names,
    )
    H.to_csv(outdir + '/%s' % tissue + '_geneprograms.csv')

In [None]:
import scipy

In [None]:
# For every dataset, colour the UMAP by each saved NMF cell program and by
# the 'annot_level_2' annotation, saving both figures through scanpy's `save=`.
for tissue, (filename, sample_label) in scdatasets.items():
    print(tissue)
    tissueadata = sc.read(filename)

    # Read the cell-program scores from the shared `outdir`, as the other
    # cells of this notebook do, instead of repeating the directory literal.
    W = pd.read_csv(outdir + '/%s_cellprograms.csv'%tissue)
    W = W.set_index(W.columns[0])  # first CSV column holds the cell identifiers

    # One obs column per program; pandas aligns the values on the cell index.
    program_names = ['NMF_%d'%i for i in range(W.shape[1])]
    for name in program_names:
        tissueadata.obs[name] = W[name]

    sc.pl.umap(tissueadata, color=program_names, color_map='Reds', save=tissue+'_healthyprograms.pdf')
    sc.pl.umap(tissueadata, color=['annot_level_2'], ncols=1, save=tissue+'_healthy.pdf')

In [None]:
from scipy.sparse import rand as r1
from numpy.random import rand as r2

def _column_pearson(x, Y):
    """Pearson correlation of every column of `x` with every column of `Y`.

    Parameters
    ----------
    x : sparse or dense matrix, shape (n_rows, n_cols_x)
        Converted to a float64 CSC matrix internally.
    Y : array-like, shape (n_rows, n_cols_y)
        Dense values, paired with the rows of `x` by position.

    Returns
    -------
    numpy.ndarray, shape (n_cols_x, n_cols_y)
        Entry [g, k] is the correlation between x[:, g] and Y[:, k].
        Columns of `x` with no variance give NaN.
    """
    # float64 keeps sum(x^2) - n*mean^2 from cancelling badly on float32 input.
    x = scipy.sparse.csc_matrix(x, dtype=np.float64)
    nrow = x.shape[0]

    # Centre and unit-normalise each column of Y once.
    Yc = np.asarray(Y, dtype=np.float64)
    Yc = Yc - Yc.mean(axis=0)
    Ys = Yc / np.sqrt((Yc * Yc).sum(axis=0))

    # Norm of each centred column of x, computed without densifying x.
    xm = np.asarray(x.mean(axis=0)).ravel()
    sumsq = np.asarray(x.multiply(x).sum(axis=0)).ravel()
    var = sumsq - nrow * xm * xm
    var[var <= 0] = np.nan  # constant column: correlation undefined
    xs = np.sqrt(var)

    # Columns of Ys sum to zero, so sum((x - mean) * ys) == sum(x * ys):
    # the numerator only needs the stored entries of x.
    return np.asarray(x.T.dot(Ys)) / xs[:, None]


# For every dataset: plot the saved NMF cell programs on the UMAP, then write
# the correlation of every expressed gene with every program to
# <dataset>_correlation_cellprograms.csv (genes x programs).
for tissue, (filename, sample_label) in scdatasets.items():
    print(tissue)
    tissueadata = sc.read(filename)
    W = pd.read_csv(outdir + '/%s_cellprograms.csv'%tissue)
    W = W.set_index(W.columns[0])  # first CSV column holds the cell identifiers

    program_names = ['NMF_%d'%i for i in range(W.shape[1])]
    for name in program_names:
        tissueadata.obs[name] = W[name]
    sc.pl.umap(tissueadata, color=program_names+['annot_level_2'],
                   color_map='Reds', save=tissue+'_healthy.png')

    # The correlation pairs rows of W with rows of X by position, so the two
    # must at least have the same number of cells.
    if W.shape[0] != tissueadata.shape[0]:
        raise ValueError('%s: %d rows in cell programs but %d cells in the dataset'
                         % (tissue, W.shape[0], tissueadata.shape[0]))

    # Keep only genes whose total over cells is positive (flat 1-D boolean mask).
    expressed = np.asarray(tissueadata.X.sum(axis=0)).ravel() > 0
    tissueadata = tissueadata[:, expressed]
    x = scipy.sparse.csc_matrix(tissueadata.X)
    nrow,ncol = x.shape

    corr = pd.DataFrame(_column_pearson(x, W[program_names].values),
                        columns=W.columns, index=tissueadata.var_names)
    corr.to_csv(outdir + '/%s_correlation_cellprograms.csv'%tissue)


In [None]:
import seaborn as sns

In [None]:
# For every dataset: plot the rescaled NMF cell programs, summarise them per
# cell type, report programs specific to a single cell type, and print the top
# genes of each program under two rankings (correlation vs. NMF loading).
for tissue, (filename, sample_label) in scdatasets.items():
    adata = sc.read(filename)
    W = pd.read_csv(outdir + '/%s_cellprograms.csv'%tissue)
    print(tissue, W.shape)  # shape still includes the cell-identifier column
    W = W.set_index(W.columns[0])

    # Rescale each program by its 99th percentile and clip to [0, 1].
    # NOTE(review): a program whose 99th percentile is 0 divides by zero here.
    W = W/W.quantile(0.99, axis=0)
    W = W.clip(0,1)

    program_names = ['NMF_%d'%i for i in range(W.shape[1])]
    for name in program_names:
        adata.obs[name] = W[name]
    sc.pl.umap(adata, color=program_names, color_map='Reds')
    sc.pl.umap(adata, color=['annot_level_2', 'annot_level_3'], ncols=1)

    # Cells x programs AnnData, used for the per-cell-type matrix plot.
    topicdata = sc.AnnData(W.values)
    topicdata.obs_names = W.index
    topicdata.var_names = W.columns
    topicdata.obs['celltypes'] = adata.obs['annot_level_2']

    sc.pl.matrixplot(topicdata, var_names=topicdata.var_names, groupby='celltypes', standard_scale='var', )

    # Mean rescaled program score per 'annot_level_3' cell type.
    W['celltype'] = adata.obs['annot_level_3']
    W = W.groupby('celltype').agg('mean')

    # Number of cell types in which each program's mean score exceeds 0.3.
    nmf_specificity = (W > .3).sum(0)
    display(W)

    # nmf_specificity is labelled by program name ('NMF_*'), so look it up by
    # position through plain arrays instead of integer-indexing the Series.
    mean_scores = W.values
    n_celltypes_above = nmf_specificity.values
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            if mean_scores[i,j] > .65 and n_celltypes_above[j]==1:
                print(W.index[i], W.columns[j])

    # H2: gene loadings from the NMF fit; H: gene-program correlations.
    H2 = pd.read_csv(outdir + '/%s_geneprograms.csv'%tissue)
    H2 = H2.set_index(H2.columns[0])

    H = pd.read_csv(outdir + '/%s_correlation_cellprograms.csv'%tissue)
    H = H.set_index(H.columns[0])
    print(H.shape)
    for i in range(H.shape[1]):
        name = 'NMF_%d'%i
        # Rank the genes once per table and reuse the ranking below.
        genes_by_corr = H.sort_values(by=name, ascending=False).index
        genes_by_loading = H2.sort_values(by=name, ascending=False).index

        print(i)
        print(",".join(genes_by_corr[0:50]))
        print(",".join(genes_by_loading[0:50]))

        # Overlap between the top-500 genes of the two rankings.
        print(len(set(genes_by_loading[0:500]).intersection(genes_by_corr[0:500])))