In [1]:
## Test the Spectra Model and potential applications

# Prerequisites - Load Libraries

In [2]:
import scanpy as sc

In [3]:
import anndata as  ad

In [4]:
import pandas as pd

In [5]:
import random

In [6]:
import numpy as np

In [7]:
import random

In [8]:
import os

In [9]:
import decoupler as dc

In [10]:
ad.__version__

'0.8.0'

In [11]:
print(pd.__version__)

1.5.3


In [12]:
import scanpy as sc
from scipy.sparse import issparse

# Preqrequisites Configurations & Parameters

In [13]:
### Load the parameters that are set via the configuration files

In [14]:
### Load configurations file
global_configs = pd.read_csv('configurations/Data_Configs.csv', sep = ',')

In [15]:
global_configs

Unnamed: 0,parameter,value
0,data_path,/lustre/groups/epigenereg01/workspace/projects...
1,result_path,/lustre/groups/epigenereg01/workspace/projects...


In [16]:
data_path = global_configs['value'][global_configs['parameter'] == 'data_path']

In [17]:
data_path

0    /lustre/groups/epigenereg01/workspace/projects...
Name: value, dtype: object

In [18]:
result_path = global_configs['value'][global_configs['parameter'] == 'result_path']

In [19]:
result_path

1    /lustre/groups/epigenereg01/workspace/projects...
Name: value, dtype: object

In [20]:
## Loading the file containing the name of the single-cell dataset

In [21]:
sc_configs = pd.read_csv('configurations/01_Pre_Processing_SC_Data.csv', sep = ',')

In [22]:
sc_configs

Unnamed: 0,data_name,data_type
0,dcm_acm,h5seurat


In [23]:
sc_configs = sc_configs[sc_configs['data_name'] != '']

In [24]:
sc_dataset_names = pd.unique(sc_configs['data_name'])

In [25]:
sc_dataset_names

array(['dcm_acm'], dtype=object)

In [26]:
### Generate the result data directory if it does not exist yet

if not os.path.exists(result_path[1] + '01_results'):
    # Create the directory if it doesn't exist
    os.makedirs(result_path[1] + '01_results')


# Load data

## Anndata object

In [35]:
### Load single-cell datasets as anndata format; should contain the meta-columns: sample_id; cluster_id
### Raw Counts should be given in default assay

In [36]:
sc_data_list = []

In [37]:
for i in sc_dataset_names:
    source_text = data_path[0] + '/' + i + '.h5ad'
    adata = ad.read_h5ad(source_text)
    adata.obs['group_id'] = 'dummy'
    adata = adata.raw.to_adata()
    sc_data_list[i] = adata

TypeError: list indices must be integers or slices, not str

In [82]:
#data_path[0] + '/' + sc_dataset_names[0] + '.h5ad'

In [27]:
adata = ad.read_h5ad(data_path[0] + '/' + sc_dataset_names[0] + '.h5ad')

In [28]:
adata.obs['group_id'] = 'dummy'

In [29]:
adata = adata.raw.to_adata()

In [30]:
#adata.raw.X

In [31]:
adata

AnnData object with n_obs × n_vars = 881081 × 33145
    obs: 'Sample', 'donor_id', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes', 'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z', 'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned', 'self_reported_ethnicity_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'sex_ontology_term_id', 'assay_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'group_id'
    var: 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length'
    uns: 'cell_states_colors', 'cell_type_ontology_term_id_colors', 'citation', 'leiden', 'neighbors', 'pca', 'schema_reference', 'schema_version', 'title', 'umap'
    obsm: 'X_pca', 'X_umap'

In [32]:
adata.var_names = adata.var['feature_name']

AnnData expects .var.index to contain strings, but got values like:
    ['MIR1302-2HG', 'FAM138A', 'OR4F5', 'ENSG00000238009.6', 'ENSG00000239945.1']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")


In [33]:
sample_data = adata.obs[['donor_id', 'disease', 'sex', 'development_stage', 'self_reported_ethnicity', 'Primary.Genetic.Diagnosis']].drop_duplicates()
sample_data['sample_id'] = sample_data['donor_id']
sample_data.to_csv('/lustre/groups/epigenereg01/workspace/projects/jove/example_data/Prepared_Sample_Meta_Data.csv')

In [34]:
sample_data.groupby(['disease']).size()

disease
non-compaction cardiomyopathy                       1
dilated cardiomyopathy                             52
arrhythmogenic right ventricular cardiomyopathy     8
normal                                             18
dtype: int64

In [35]:
sample_data.groupby(['disease', 'Primary.Genetic.Diagnosis']).size()

disease                                          Primary.Genetic.Diagnosis
non-compaction cardiomyopathy                    BAG3                          0
                                                 DES                           0
                                                 DSP                           0
                                                 FKTN                          0
                                                 FLNC                          0
                                                 LMNA                          0
                                                 PKP2                          0
                                                 PLN                           0
                                                 PVneg                         0
                                                 RBM20                         0
                                                 TNNC1                         0
                                  

In [36]:
sampled_donors = sample_data['donor_id'][sample_data['Primary.Genetic.Diagnosis'].isin(['control', 'TTN', 'PVneg', 'LMNA', 'PKP2'])]

In [37]:
len(sampled_donors)

56

In [38]:
adata = adata[adata.obs['donor_id'].isin(sampled_donors)]

In [39]:
adata = adata[adata.obs['cell_type'] != 'unknown']

In [40]:
### choose genes from pathways

In [41]:
pathways = pd.read_csv("/lustre/groups/epigenereg01/workspace/projects/jove/example_data/Prepared_Pathway_Data.csv")

In [42]:
genes = pd.unique(pathways['gene'])

In [43]:
len(genes)

2012

In [44]:
random_numbers = random.sample(range(1, 33146), 5000)

In [45]:
# Convert the positions to boolean indices
#gene_indices = np.zeros(adata.n_vars, dtype=bool)
#gene_indices[random_numbers ] = True

In [46]:
#gene_indices

In [47]:
adata = adata[:, adata.var_names.isin(genes)]

In [48]:
#adata= adata[:, gene_indices]

In [49]:
adata

View of AnnData object with n_obs × n_vars = 635199 × 1980
    obs: 'Sample', 'donor_id', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes', 'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z', 'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned', 'self_reported_ethnicity_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'sex_ontology_term_id', 'assay_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'group_id'
    var: 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length'
    uns: 'cell_states_colors', 'cell_type_ontology_term_id_colors', 'citation', 'leiden', 'neighbors', 'pca', 'schema_reference', 'schema_version', 'title', 'umap'
    obsm: 'X_pca', 'X_umap'

In [50]:
pd.unique(adata.obs['Primary.Genetic.Diagnosis'])

['control', 'TTN', 'LMNA', 'PKP2', 'PVneg']
Categories (5, object): ['LMNA', 'PKP2', 'PVneg', 'TTN', 'control']

In [51]:
pd.unique(adata.obs['disease'])

['normal', 'dilated cardiomyopathy', 'arrhythmogenic right ventricular cardiomyopathy']
Categories (3, object): ['dilated cardiomyopathy', 'arrhythmogenic right ventricular cardiomyopathy', 'normal']

In [52]:
adata.obs['sample_id']  = adata.obs['donor_id']
adata.obs['cluster_id'] = adata.obs['cell_type']


  adata.obs['sample_id']  = adata.obs['donor_id']


In [53]:
 len(pd.unique(adata.obs['sample_id']))

56

## TBD add checks / amount of cells etc.

In [54]:
## Generate info about expression 

In [55]:
gene_expr_data = pd.DataFrame()

In [56]:
for i in pd.unique(adata.obs['cluster_id']):
    cell_type = i
    adata_subset = adata[adata.obs['cluster_id'] == cell_type]
    
    amount_cells = adata_subset.shape[0]
    ## Calcalte percentage of cells expressiong gene
    amount_cells_expressing_gene = (adata_subset.X > 0).sum(axis= 0)
    perc_cells_expressing_gene = (amount_cells_expressing_gene/ amount_cells) * 100
    
    data = {
    'perc_cells_expressing_gene': np.ravel(perc_cells_expressing_gene),
    'total_amount_cells_expressing_gene': np.ravel(amount_cells_expressing_gene),
    'gene': adata_subset.var_names,
    'cluster': cell_type
    }

    df = pd.DataFrame(data)
    
    ### Append data
    gene_expr_data = pd.concat([gene_expr_data, df])

In [57]:
gene_expr_data

Unnamed: 0,perc_cells_expressing_gene,total_amount_cells_expressing_gene,gene,cluster
0,1.314940,1711,ISG15,mural cell
1,0.006148,8,TNFRSF18,mural cell
2,0.082232,107,TNFRSF4,mural cell
3,5.278205,6868,UBE2J2,mural cell
4,2.396250,3118,ATAD3B,mural cell
...,...,...,...,...
1975,13.251962,304,PFKL,adipocyte
1976,1.525719,35,TRPM2,adipocyte
1977,38.927637,893,UBE2G2,adipocyte
1978,1.133391,26,ITGB2,adipocyte


In [58]:
i = sc_dataset_names[0]

In [59]:
i

'dcm_acm'

In [60]:
### Save file
gene_expr_data.to_csv(result_path[1] + '/01_results/01_' +  i +  '_Gene_Expr_per_Cell_Type' +  '.csv')

## Optional: generate pseudobulk

In [61]:

adata = dc.get_pseudobulk(
    adata,
    sample_col='sample_id',
    groups_col='cluster_id',
    #layer='counts',
    mode='mean',
    min_cells=0,  # Filter to remove samples by a minimum number of cells in a sample-group pair.
    min_counts=0)  # TBD: why raw 

    

In [62]:
adata.obs

Unnamed: 0,donor_id,Primary.Genetic.Diagnosis,Assigned,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,cell_type_ontology_term_id,sex_ontology_term_id,assay_ontology_term_id,organism_ontology_term_id,is_primary_data,...,disease,organism,sex,self_reported_ethnicity,development_stage,group_id,sample_id,cluster_id,psbulk_n_cells,psbulk_counts
D1_adipocyte,D1,control,True,HANCESTRO:0005,PATO:0000461,CL:0000136,PATO:0000383,EFO:0009899,NCBITaxon:9606,False,...,normal,Homo sapiens,female,European,sixth decade human stage,dummy,D1,adipocyte,68.0,14668.0
D2_adipocyte,D2,control,True,HANCESTRO:0005,PATO:0000461,CL:0000136,PATO:0000384,EFO:0009899,NCBITaxon:9606,False,...,normal,Homo sapiens,male,European,seventh decade human stage,dummy,D2,adipocyte,74.0,13888.0
D4_adipocyte,D4,control,True,HANCESTRO:0005,PATO:0000461,CL:0000136,PATO:0000383,EFO:0009899,NCBITaxon:9606,False,...,normal,Homo sapiens,female,European,eighth decade human stage,dummy,D4,adipocyte,150.0,21976.0
D5_adipocyte,D5,control,True,HANCESTRO:0005,PATO:0000461,CL:0000136,PATO:0000383,EFO:0009899,NCBITaxon:9606,False,...,normal,Homo sapiens,female,European,seventh decade human stage,dummy,D5,adipocyte,69.0,13549.0
D6_adipocyte,D6,control,True,HANCESTRO:0005,PATO:0000461,CL:0000136,PATO:0000384,EFO:0009899,NCBITaxon:9606,False,...,normal,Homo sapiens,male,European,eighth decade human stage,dummy,D6,adipocyte,37.0,8016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H67_myeloid cell,H67,control,True,HANCESTRO:0005,PATO:0000461,CL:0000763,PATO:0000384,EFO:0009922,NCBITaxon:9606,True,...,normal,Homo sapiens,male,European,fifth decade human stage,dummy,H67,myeloid cell,296.0,104245.0
H7_myeloid cell,H7,control,True,HANCESTRO:0005,PATO:0000461,CL:0000763,PATO:0000383,EFO:0009922,NCBITaxon:9606,False,...,normal,Homo sapiens,female,European,fifth decade human stage,dummy,H7,myeloid cell,533.0,150915.0
IC_H01_myeloid cell,IC_H01,TTN,True,HANCESTRO:0005,MONDO:0005021,CL:0000763,PATO:0000384,EFO:0009922,NCBITaxon:9606,True,...,dilated cardiomyopathy,Homo sapiens,male,European,fifth decade human stage,dummy,IC_H01,myeloid cell,279.0,103811.0
IC_H02_myeloid cell,IC_H02,TTN,True,HANCESTRO:0005,MONDO:0005021,CL:0000763,PATO:0000384,EFO:0009922,NCBITaxon:9606,True,...,dilated cardiomyopathy,Homo sapiens,male,European,adolescent stage,dummy,IC_H02,myeloid cell,533.0,154920.0


In [63]:
### Generate a dataframe to save

In [64]:
data = adata.to_df()

In [65]:
data

feature_name,A1BG,AAAS,AAMP,ABCA13,ABCE1,ABI1,ABI2,ABL1,ABL2,ACAA1,...,YES1,YPEL5,YWHAB,YWHAZ,ZAP70,ZBP1,ZBTB16,ZEB1,ZNRF1,ZNRF2
D1_adipocyte,0.000000,0.088235,0.147059,0.014706,0.029412,0.044118,0.132353,0.735294,0.308824,0.058824,...,0.176471,0.058824,0.044118,0.308824,0.0,0.000000,0.720588,0.514706,0.132353,0.117647
D2_adipocyte,0.000000,0.027027,0.094595,0.000000,0.067568,0.108108,0.175676,0.716216,0.432432,0.081081,...,0.135135,0.027027,0.121622,0.229730,0.0,0.000000,0.972973,0.283784,0.108108,0.094595
D4_adipocyte,0.006667,0.113333,0.100000,0.006667,0.066667,0.086667,0.146667,0.406667,0.273333,0.120000,...,0.033333,0.033333,0.066667,0.293333,0.0,0.000000,0.793333,0.366667,0.066667,0.053333
D5_adipocyte,0.000000,0.072464,0.086957,0.000000,0.072464,0.188406,0.246377,0.449275,0.507246,0.086957,...,0.115942,0.086957,0.028986,0.130435,0.0,0.000000,0.869565,0.405797,0.086957,0.057971
D6_adipocyte,0.027027,0.054054,0.162162,0.000000,0.162162,0.162162,0.054054,0.378378,0.324324,0.162162,...,0.054054,0.108108,0.054054,0.270270,0.0,0.000000,0.702703,0.297297,0.054054,0.108108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H67_myeloid cell,0.037162,0.020270,0.060811,0.000000,0.111486,0.422297,0.587838,0.905405,0.466216,0.118243,...,0.064189,0.054054,0.158784,0.395270,0.0,0.006757,1.050676,0.165541,0.054054,0.327703
H7_myeloid cell,0.018762,0.030019,0.039400,0.000000,0.046904,0.330206,0.288931,0.596623,0.407129,0.097561,...,0.105066,0.078799,0.135084,0.356473,0.0,0.007505,1.823640,0.073171,0.080675,0.288931
IC_H01_myeloid cell,0.014337,0.032258,0.060932,0.000000,0.075269,0.523297,0.182796,0.874552,1.691756,0.111111,...,0.154122,0.086022,0.225806,0.362007,0.0,0.003584,1.465950,0.060932,0.050179,0.419355
IC_H02_myeloid cell,0.028143,0.041276,0.041276,0.000000,0.045028,0.414634,0.409006,0.711069,0.292683,0.097561,...,0.106942,0.088180,0.200750,0.245779,0.0,0.005629,0.189493,0.048780,0.060038,0.382739


In [66]:
data.index

Index(['D1_adipocyte', 'D2_adipocyte', 'D4_adipocyte', 'D5_adipocyte',
       'D6_adipocyte', 'D7_adipocyte', 'DL2_adipocyte', 'DT4_adipocyte',
       'H01_adipocyte', 'H02_adipocyte',
       ...
       'H56_myeloid cell', 'H57_myeloid cell', 'H58_myeloid cell',
       'H59_myeloid cell', 'H6_myeloid cell', 'H67_myeloid cell',
       'H7_myeloid cell', 'IC_H01_myeloid cell', 'IC_H02_myeloid cell',
       'IC_H04_myeloid cell'],
      dtype='object', length=494)

In [67]:
data['feature'] = data.index

In [68]:
data

feature_name,A1BG,AAAS,AAMP,ABCA13,ABCE1,ABI1,ABI2,ABL1,ABL2,ACAA1,...,YPEL5,YWHAB,YWHAZ,ZAP70,ZBP1,ZBTB16,ZEB1,ZNRF1,ZNRF2,feature
D1_adipocyte,0.000000,0.088235,0.147059,0.014706,0.029412,0.044118,0.132353,0.735294,0.308824,0.058824,...,0.058824,0.044118,0.308824,0.0,0.000000,0.720588,0.514706,0.132353,0.117647,D1_adipocyte
D2_adipocyte,0.000000,0.027027,0.094595,0.000000,0.067568,0.108108,0.175676,0.716216,0.432432,0.081081,...,0.027027,0.121622,0.229730,0.0,0.000000,0.972973,0.283784,0.108108,0.094595,D2_adipocyte
D4_adipocyte,0.006667,0.113333,0.100000,0.006667,0.066667,0.086667,0.146667,0.406667,0.273333,0.120000,...,0.033333,0.066667,0.293333,0.0,0.000000,0.793333,0.366667,0.066667,0.053333,D4_adipocyte
D5_adipocyte,0.000000,0.072464,0.086957,0.000000,0.072464,0.188406,0.246377,0.449275,0.507246,0.086957,...,0.086957,0.028986,0.130435,0.0,0.000000,0.869565,0.405797,0.086957,0.057971,D5_adipocyte
D6_adipocyte,0.027027,0.054054,0.162162,0.000000,0.162162,0.162162,0.054054,0.378378,0.324324,0.162162,...,0.108108,0.054054,0.270270,0.0,0.000000,0.702703,0.297297,0.054054,0.108108,D6_adipocyte
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H67_myeloid cell,0.037162,0.020270,0.060811,0.000000,0.111486,0.422297,0.587838,0.905405,0.466216,0.118243,...,0.054054,0.158784,0.395270,0.0,0.006757,1.050676,0.165541,0.054054,0.327703,H67_myeloid cell
H7_myeloid cell,0.018762,0.030019,0.039400,0.000000,0.046904,0.330206,0.288931,0.596623,0.407129,0.097561,...,0.078799,0.135084,0.356473,0.0,0.007505,1.823640,0.073171,0.080675,0.288931,H7_myeloid cell
IC_H01_myeloid cell,0.014337,0.032258,0.060932,0.000000,0.075269,0.523297,0.182796,0.874552,1.691756,0.111111,...,0.086022,0.225806,0.362007,0.0,0.003584,1.465950,0.060932,0.050179,0.419355,IC_H01_myeloid cell
IC_H02_myeloid cell,0.028143,0.041276,0.041276,0.000000,0.045028,0.414634,0.409006,0.711069,0.292683,0.097561,...,0.088180,0.200750,0.245779,0.0,0.005629,0.189493,0.048780,0.060038,0.382739,IC_H02_myeloid cell


In [69]:
## Convert to long format

In [70]:
data_long = pd.melt(data, id_vars = ['feature'])

In [71]:
data_long['sample_id'] = data_long['feature'].str.extract(r'^([^_]*)')
data_long['type'] = data_long['feature'].str.extract(r'_(.*)')

In [72]:
data_long['variable'] = data_long['feature_name']
data_long['dataset'] = sc_dataset_names[0]

In [73]:
data_long = data_long[['sample_id', 'variable', 'value', 'dataset', 'type']]

In [74]:
result_path[1]

'/lustre/groups/epigenereg01/workspace/projects/jove/example_results/'

In [75]:
result_path[1] + '/01_results/01_' + i + 'Pseudobulk_Table' +'.csv'

'/lustre/groups/epigenereg01/workspace/projects/jove/example_results//01_results/01_dcm_acmPseudobulk_Table.csv'

In [76]:
data_long.to_csv(result_path[1] + '/01_results/01_' + i + 'Pseudobulk_Table' +'.csv')

In [77]:
len(pd.unique(data_long['sample_id']))

54