In [108]:
### Data Preparation for MOFA analysis in R (adjust some columns + delete some layers)

# Load Libraries

In [109]:
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scanorama
import os
import multiprocessing
import random
import time
import git
import sys
from datetime import date
from matplotlib.backends.backend_pdf import PdfPages

# Load Data

## Configurations

### Technical configurations

In [110]:
multiprocessing.cpu_count()  # total number of CPUs on the machine (not only those usable by this process)

64

In [111]:
len(os.sched_getaffinity(0)) # number of CPUs the current process is allowed to run on

24

In [112]:
sc.settings.n_jobs   # default number of CPUs to use for parallel computing

1

In [113]:
sc.settings.max_memory  # maximum memory to use in GB

15

In [114]:
# Seed Python's built-in `random` module so that any use of it is reproducible.
# NOTE(review): numpy's random generator is not seeded in this notebook.
random.seed(7)

In [115]:
ncore = '24'

In [116]:
random_state_var = 7

In [117]:
n_jobs_var = 24

In [118]:
# Apply the same thread limit (`ncore`, kept as a string because environment
# values must be strings) to each numerical backend's thread-count variable.
# NOTE(review): numpy/scanpy were already imported above; confirm that the
# backends still honour these variables when they are set after import.
for thread_env_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[thread_env_var] = ncore

In [119]:
# Print the versions of scanpy and the other loaded packages for reproducibility.
sc.logging.print_versions()
# Default look of scanpy figures: white background, figsize (8, 8).
sc.set_figure_params(facecolor="white", figsize=(8, 8))
# Verbosity 0 corresponds to `Verbosity.error` (see the value displayed in the next cell).
sc.settings.verbosity = 0

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
anndata     0.7.6
scanpy      1.8.1
sinfo       0.3.4
-----
PIL                 8.3.1
annoy               NA
anyio               NA
attr                21.2.0
babel               2.9.1
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
brotli              NA
certifi             2021.05.30
cffi                1.14.6
chardet             4.0.0
charset_normalizer  2.0.0
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
decorator           5.0.9
defusedxml       

In [120]:
sc.settings.verbosity

<Verbosity.error: 0>

In [121]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/anndata warnings raised by
# later cells (e.g. when assigning into obs); consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

### Parameters

In [122]:
data_path = '../data/current'

In [123]:
data_path

'../data/current'

In [124]:
variant = 'Singlets'

In [125]:
variants = [variant]

In [126]:
result_path = '../results/current'

In [127]:
### Dataset storing annotations

In [128]:
# Load the table of saved annotations and report when the file was last
# modified, so the provenance of the loaded data is visible in the output.
path = result_path + '/B-Analysis/saved_annotations.csv'
saved_annotations = pd.read_csv(path)
# Separator added: the message previously rendered as "Last modifiedMon Mar ...".
print('Last modified: ' + time.ctime(os.path.getmtime(path)))

Last modifiedMon Mar  4 14:49:42 2024


In [129]:
save_path = '_B5_' + variant + '_Annotations.pdf'

## Clustered and combined data from B3

In [130]:
anndata_dict = dict.fromkeys(variants)

In [131]:
anndata_dict_subset = dict.fromkeys(variants)

In [132]:
# All variants read the same B3 output file, so the path and its modification
# timestamp are resolved once instead of on every loop iteration.
dataset_path = data_path + '/analysis/B/B3_Integrated_Singlet_processed_rna.h5ad'
dataset_mtime = time.ctime(os.path.getmtime(dataset_path))

for key in anndata_dict:
    print(dataset_path)
    # Separator added: the message previously rendered as "Last modifiedMon Mar ...".
    print('Last modified: ' + dataset_mtime)

    adata_orig = sc.read_h5ad(dataset_path)
    anndata_dict[key] = adata_orig  # save anndata in dictionary

    # Record provenance of the loaded file inside the object itself.
    anndata_dict[key].uns['data_load_time'] = dataset_mtime  # last-modified timestamp of the loaded file
    anndata_dict[key].uns['data_load_name'] = dataset_path  # path of the loaded file

# NOTE: `key` stays bound to the last variant after this loop; the cells below
# rely on it when they index `anndata_dict[key]`.
    

../data/current/analysis/B/B3_Integrated_Singlet_processed_rna.h5ad
Last modifiedMon Mar  4 15:18:28 2024


In [133]:
### Short data check

In [134]:
# anndata_dict['V1_All']  # cells have been filtered only on singlets

In [135]:
anndata_dict[key].X.sum(axis=1) # data was normalized and log-transformed, scaled


array([-105.86096 ,    9.41081 , -116.85063 , ...,  249.01207 ,
        106.5417  ,   49.693195], dtype=float32)

In [136]:
anndata_dict[key].raw.X.sum(axis=1) # raw count values


matrix([[ 3947.],
        [ 5870.],
        [ 3888.],
        ...,
        [ 4079.],
        [17619.],
        [ 3231.]], dtype=float32)

In [137]:
anndata_dict[key].obsm['X_corected_Scanorama'].sum(axis = 1)

matrix([[21.065617],
        [25.584118],
        [21.224699],
        ...,
        [22.754984],
        [29.788113],
        [23.377083]], dtype=float32)

In [138]:
anndata_dict[key].obsm['X_scanorama_rb_mt'].sum(axis= 1)

array([ 0.10920271, -0.17586301,  0.15258163, ..., -0.56758383,
       -0.5215452 ,  0.21703739])

In [139]:
np.shape(anndata_dict[key].obsm)

(20,)

In [140]:
np.shape(anndata_dict[key].obsm['X_corected_Scanorama'])

(148275, 2000)

In [141]:
# sorted(anndata_dict[key].var.columns)

In [142]:
anndata_dict[key]

AnnData object with n_obs × n_vars = 148275 × 1393
    obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group_x', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA_50_10_neighbors_clus

In [143]:
anndata_dict[key].obsm['X_corected_Scanorama'].shape

(148275, 2000)

# Data Adaptations

In [144]:
anndata_dict[key].obs.columns

Index(['nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA',
       'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin',
       'HTO_classification', 'HTO_classification.global', 'hash.ID',
       'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern',
       'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name',
       'group_x', 'HTO_Doublet_Classification', 'doublet_score',
       'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1',
       'A5_scrublet_doublet_score_lib_0.1',
       'A5_scrublet_predicted_doublet_lib0.2',
       'A5_scrublet_doublet_score_lib_0.2',
       'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib',
       'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt',
       'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet',
       'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster',
       'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_c

## Adapt to remove rb & mt genes

In [145]:
# Restrict every dataset to the genes whose 'rb' and 'mt' flags in `.var` are
# both False, and store an independent copy of that subset.
for key in anndata_dict:
    gene_info = anndata_dict[key].var
    # Explicit comparison with False is kept so the mask is built exactly as before.
    keep_gene = (gene_info['rb'] == False) & (gene_info['mt'] == False)
    anndata_dict[key] = anndata_dict[key][:, keep_gene].copy()

In [146]:
sum(anndata_dict[key].var_names.str.startswith('RPS12'))

0

In [147]:
anndata_dict[key].var_names

Index(['HES4', 'ISG15', 'TNFRSF18', 'TNFRSF4', 'AL645728.1', 'MIB2', 'MMP23B',
       'PLCH2', 'UTS2', 'TNFRSF9',
       ...
       'SLC5A3', 'KCNE1', 'AP000692.2', 'HLCS', 'MX1', 'AIRE', 'COL6A2',
       'PCNT', 'DIP2A', 'S100B'],
      dtype='object', length=1392)

In [148]:
anndata_dict[key].raw

<anndata._core.raw.Raw at 0x7f54b5464850>

In [149]:
anndata_dict[key].layers

Layers with keys: A6_normalized, A6_normalized_log

In [150]:
anndata_dict[key].obsm['X_corected_Scanorama'].sum(axis = 1)

matrix([[21.065617],
        [25.584118],
        [21.224699],
        ...,
        [22.754984],
        [29.788113],
        [23.377083]], dtype=float32)

## Generate new columns

In [151]:
# For each dataset, derive two combined labels of the form "<value>_<measurement_x>":
# one from 'classification' and one from 'delta_ef_value_group'.
for key in anndata_dict:
    timepoint_label = anndata_dict[key].obs['measurement_x'].astype('string')
    for source_col in ('classification', 'delta_ef_value_group'):
        source_label = anndata_dict[key].obs[source_col].astype('string')
        anndata_dict[key].obs[source_col + '_measurement'] = source_label + '_' + timepoint_label

In [152]:
# Convert the two combined label columns from string to categorical dtype.
for key in anndata_dict:
    for combined_col in ('classification_measurement', 'delta_ef_value_group_measurement'):
        anndata_dict[key].obs[combined_col] = anndata_dict[key].obs[combined_col].astype('category')

# Save dataset for MOFA analysis in R

In [153]:
### Test saving reduced version for conversion

In [154]:
# NOTE: this binds a second name to the same AnnData object (no copy is made),
# so the deletions and column selections applied to `anndata_reduced` below
# also change `anndata_dict[key]`.
anndata_reduced = anndata_dict[key]

In [155]:
del anndata_reduced.layers

In [156]:
# del anndata_reduced.obs

In [157]:
#del anndata_reduced.obsm

In [158]:
del anndata_reduced.varm

In [159]:
del anndata_reduced.obsp

In [160]:
del anndata_reduced.uns

In [161]:
anndata_reduced

AnnData object with n_obs × n_vars = 148275 × 1392
    obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group_x', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA_50_10_neighbors_clus

In [162]:
# Keep only the listed cell-level annotation columns.
obs_columns_to_keep = [
    'nCount_HTO',
    'B2_Scanorama_Singlet_cluster',
    'B2_Scanorama_Singlet_rb_mt_cluster',
    'cell_type_Scanorama',
    'cluster_cell_type_Scanorama',
    'classification_measurement',
    'delta_ef_value_group_measurement',
    'm_y',
    'measurement_y',
    'delta_ef_value_group',
    'delta_ef_value',
    'sample_y',
    'age',
    'sex',
    'm_x',
    'classification',
    'group_y',
    'measurement_x',
    'sample_x',
    'library',
    'id',
    'read',
    'pattern',
    'sequence',
    'feature_type',
    'in_sample',
    'display_name',
    'group_x',
]
# Explicit copy: later cells assign columns into this frame, and assigning into
# a column-selection result is what triggers pandas' SettingWithCopyWarning.
anndata_reduced.obs = anndata_reduced.obs[obs_columns_to_keep].copy()

In [163]:
anndata_reduced.obs

Unnamed: 0,nCount_HTO,B2_Scanorama_Singlet_cluster,B2_Scanorama_Singlet_rb_mt_cluster,cell_type_Scanorama,cluster_cell_type_Scanorama,classification_measurement,delta_ef_value_group_measurement,m_y,measurement_y,delta_ef_value_group,...,sample_x,library,id,read,pattern,sequence,feature_type,in_sample,display_name,group_x
AAACCCACATACAGGG-1-L1,1574.0,8,8,B-cell,8_B-cell,acs_w_o_infection_TP1,x_greater_1_TP1,"M9 (14,3)",TP1,x_greater_1,...,9.1,L1,HTO_B0259,R2,5PNNNNNNNNNN(BC),CAGTAGTCACGGTCA,Antibody Capture,1.0,9.1,TP1
AAACCCACATGACTTG-1-L1,379.0,9,9,Monocytes - CD16_FCGR3A,9_Monocytes - CD16_FCGR3A,acs_w_o_infection_TP1,x_greater_1_TP1,"M9 (14,3)",TP1,x_greater_1,...,9.1,L1,HTO_B0259,R2,5PNNNNNNNNNN(BC),CAGTAGTCACGGTCA,Antibody Capture,1.0,9.1,TP1
AAACCCAGTCATCAGT-1-L1,421.0,5,5,T-cell-CD4,5_T-cell-CD4,acs_w_o_infection_TP1,x_smaller_1_TP1,"M7 (0,75)",TP1,x_smaller_1,...,7.1,L1,HTO_B0257,R2,5PNNNNNNNNNN(BC),TGTCTTTCCTGCCAG,Antibody Capture,1.0,7.1,TP1
AAACCCAGTGGTAATA-1-L1,478.0,0,0,T-cell-CD4,0_T-cell-CD4,acs_w_o_infection_TP1,x_greater_1_TP1,"M9 (14,3)",TP1,x_greater_1,...,9.1,L1,HTO_B0259,R2,5PNNNNNNNNNN(BC),CAGTAGTCACGGTCA,Antibody Capture,1.0,9.1,TP1
AAACCCATCATCACAG-1-L1,851.0,16,16,Plasma Blast,16_Plasma Blast,acs_w_o_infection_TP1,x_greater_1_TP1,"M9 (14,3)",TP1,x_greater_1,...,9.1,L1,HTO_B0259,R2,5PNNNNNNNNNN(BC),CAGTAGTCACGGTCA,Antibody Capture,1.0,9.1,TP1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCAGGTTCGC-1-L14,1666.0,10,10,B-cell,10_B-cell,acs_w_o_infection_TP4,x_smaller_1_TP4,"M26 (0,3)",TP4,x_smaller_1,...,26.4,L14,HTO_B0256,R2,5PNNNNNNNNNN(BC),GGTTGCCAGATGTCA,Antibody Capture,1.0,26.4,TP4
TTTGTTGGTACCTAGT-1-L14,810.0,0,0,T-cell-CD4,0_T-cell-CD4,acs_w_o_infection_TP4,x_smaller_1_TP4,"M22 (0,067)",TP4,x_smaller_1,...,22.4,L14,HTO_B0253,R2,5PNNNNNNNNNN(BC),TTCCGCCTCTCTTTG,Antibody Capture,1.0,22.4,TP4
TTTGTTGGTACGTTCA-1-L14,883.0,6,6,Monocytes - CD14,6_Monocytes - CD14,acs_w_o_infection_TP4,x_smaller_1_TP4,M23 (0),TP4,x_smaller_1,...,23.4,L14,HTO_B0254,R2,5PNNNNNNNNNN(BC),AGTAAGTTCAGCGTA,Antibody Capture,1.0,23.4,TP4
TTTGTTGGTGGACCAA-1-L14,1159.0,4,4,Monocytes - CD14,4_Monocytes - CD14,acs_w_o_infection_TP4,x_greater_1_TP4,"M28 (2,25)",TP4,x_greater_1,...,28.4,L14,HTO_B0257,R2,5PNNNNNNNNNN(BC),TGTCTTTCCTGCCAG,Antibody Capture,1.0,28.4,TP4


In [164]:
# Replace missing values in every categorical obs column with the explicit
# label "missing". Non-categorical columns are left untouched.
for col in anndata_reduced.obs.columns:
    column = anndata_reduced.obs[col]
    if not isinstance(column.dtype, pd.CategoricalDtype):
        continue
    # `add_categories` raises ValueError when the category already exists, so it
    # is only added when absent; this makes the cell safe to run more than once.
    if "missing" not in column.cat.categories:
        column = column.cat.add_categories("missing")
    anndata_reduced.obs[col] = column.fillna('missing')

In [165]:
obsm = anndata_reduced.obsm['X_corected_Scanorama'] 

In [166]:
obsm.sum(axis = 1)

matrix([[21.065617],
        [25.584118],
        [21.224699],
        ...,
        [22.754984],
        [29.788113],
        [23.377083]], dtype=float32)

In [167]:
# Build a separate AnnData whose X is the matrix stored under
# obsm['X_corected_Scanorama'], carrying the same obs annotations.
scanorama_matrix = anndata_reduced.obsm['X_corected_Scanorama']
anndata_scanorama = sc.AnnData(
    X=scanorama_matrix,
    obs=anndata_reduced.obs,
)

In [168]:
anndata_scanorama

AnnData object with n_obs × n_vars = 148275 × 2000
    obs: 'nCount_HTO', 'B2_Scanorama_Singlet_cluster', 'B2_Scanorama_Singlet_rb_mt_cluster', 'cell_type_Scanorama', 'cluster_cell_type_Scanorama', 'classification_measurement', 'delta_ef_value_group_measurement', 'm_y', 'measurement_y', 'delta_ef_value_group', 'delta_ef_value', 'sample_y', 'age', 'sex', 'm_x', 'classification', 'group_y', 'measurement_x', 'sample_x', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'in_sample', 'display_name', 'group_x'

In [169]:
anndata_scanorama.X.sum(axis = 1)

matrix([[21.065617],
        [25.584118],
        [21.224699],
        ...,
        [22.754984],
        [29.788113],
        [23.377083]], dtype=float32)

In [170]:
#anndata_scanorama.raw.X.sum(axis=1)

In [171]:
#anndata

In [172]:
del anndata_reduced.obsm

In [173]:
anndata_reduced

AnnData object with n_obs × n_vars = 148275 × 1392
    obs: 'nCount_HTO', 'B2_Scanorama_Singlet_cluster', 'B2_Scanorama_Singlet_rb_mt_cluster', 'cell_type_Scanorama', 'cluster_cell_type_Scanorama', 'classification_measurement', 'delta_ef_value_group_measurement', 'm_y', 'measurement_y', 'delta_ef_value_group', 'delta_ef_value', 'sample_y', 'age', 'sex', 'm_x', 'classification', 'group_y', 'measurement_x', 'sample_x', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'in_sample', 'display_name', 'group_x'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'rb', 'n_cells-L1', 'n_cells_by_counts-L1', 'mean_counts-L1', 'pct_dropout_by_counts-L1', 'total_counts-L1', 'highly_variable-L1', 'means-L1', 'dispersions-L1', 'dispersions_norm-L1', 'n_cells-L10', 'n_cells_by_counts-L10', 'mean_counts-L10', 'pct_dropout_by_counts-L10', 'total_counts-L10', 'highly_variable-L10', 'means-L10', 'dispersions-L10', 'dispersions_norm-L10', 'n_cells-L11', 'n_cells_by_counts-L11', 'mean_count

In [174]:
data_name = data_path + '/analysis/B/B6_DE_Integrated_Singlet_processed_rna.h5ad'

In [175]:
data_name_scano = data_path + '/analysis/B/B6_DE_Integrated_Singlet_processed_scanorama_rna.h5ad'

In [176]:
anndata_reduced

AnnData object with n_obs × n_vars = 148275 × 1392
    obs: 'nCount_HTO', 'B2_Scanorama_Singlet_cluster', 'B2_Scanorama_Singlet_rb_mt_cluster', 'cell_type_Scanorama', 'cluster_cell_type_Scanorama', 'classification_measurement', 'delta_ef_value_group_measurement', 'm_y', 'measurement_y', 'delta_ef_value_group', 'delta_ef_value', 'sample_y', 'age', 'sex', 'm_x', 'classification', 'group_y', 'measurement_x', 'sample_x', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'in_sample', 'display_name', 'group_x'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'rb', 'n_cells-L1', 'n_cells_by_counts-L1', 'mean_counts-L1', 'pct_dropout_by_counts-L1', 'total_counts-L1', 'highly_variable-L1', 'means-L1', 'dispersions-L1', 'dispersions_norm-L1', 'n_cells-L10', 'n_cells_by_counts-L10', 'mean_counts-L10', 'pct_dropout_by_counts-L10', 'total_counts-L10', 'highly_variable-L10', 'means-L10', 'dispersions-L10', 'dispersions_norm-L10', 'n_cells-L11', 'n_cells_by_counts-L11', 'mean_count

In [177]:
## Select obs columns to keep 

In [178]:
# Reduce obs to the columns kept in the exported file.
export_obs_columns = [
    'nCount_HTO',
    'B2_Scanorama_Singlet_rb_mt_cluster',
    'cell_type_Scanorama',
    'cluster_cell_type_Scanorama',
    'library',
    'in_sample',
    'display_name',
    'classification',
]
# Explicit copy: a new 'sample_id' column is assigned into this frame further
# down, and assigning into a column-selection result is what triggers pandas'
# SettingWithCopyWarning.
anndata_reduced.obs = anndata_reduced.obs[export_obs_columns].copy()

In [186]:
pd.unique(anndata_reduced.obs['display_name'])

['9.1', '7.1', '8.1', '4.1', '1.1', ..., '21.4', '28.4', '26.4', '4.4', '23.4']
Length: 119
Categories (120, object): ['1.1', '2', '2.1', '2.2', ..., '32', '33', '34', 'missing']

In [187]:
pd.unique(anndata_reduced.obs['classification'])

['acs_w_o_infection', 'acs_subacute', 'acs_w_infection', 'koronarsklerose', 'vollstaendiger_ausschluss', 'ccs']
Categories (7, object): ['acs_subacute', 'acs_w_infection', 'acs_w_o_infection', 'ccs', 'koronarsklerose', 'vollstaendiger_ausschluss', 'missing']

In [208]:
anndata_reduced.obs['sample_id'] = anndata_reduced.obs['display_name']

In [209]:
anndata_reduced.obs['sample_id'] = anndata_reduced.obs['sample_id'].astype('string')

In [210]:
#anndata_reduced.obs['sample_id'][anndata_reduced.obs['classification'].isin(['ccs', 'vollstaendiger_ausschluss', 'koronarsklerose'])].astype('string')

In [211]:
# Prefix each sample id with a letter that depends on the cell's classification:
# 'k' for the first group of classes, 'm' for the second. Cells whose
# classification is in neither list keep their id unchanged.
k_prefix_classes = ['ccs', 'vollstaendiger_ausschluss', 'koronarsklerose']
m_prefix_classes = ['acs_subacute', 'acs_w_infection', 'acs_w_o_infection']

# Row masks are computed once; the two class lists are disjoint, so the order
# of the two assignments does not matter.
gets_k_prefix = anndata_reduced.obs['classification'].isin(k_prefix_classes)
gets_m_prefix = anndata_reduced.obs['classification'].isin(m_prefix_classes)

# Use a single `.loc` assignment instead of chained indexing (`obs[col][mask] = ...`),
# which is not guaranteed to write back to the frame.
# NOTE: running this cell twice without recreating 'sample_id' prepends the prefix twice.
anndata_reduced.obs.loc[gets_k_prefix, 'sample_id'] = 'k' + anndata_reduced.obs.loc[gets_k_prefix, 'sample_id'].astype('string')
anndata_reduced.obs.loc[gets_m_prefix, 'sample_id'] = 'm' + anndata_reduced.obs.loc[gets_m_prefix, 'sample_id'].astype('string')

In [212]:
# Keep only the listed gene-level annotation columns.
var_columns_to_keep = [
    'gene_ids',
    'highly_variable',
    'means',
    'dispersions',
    'dispersions_norm',
    'mean',
    'std',
]
anndata_reduced.var = anndata_reduced.var.loc[:, var_columns_to_keep]

In [213]:
pd.unique(anndata_reduced.obs['sample_id'])

<StringArray>
[ 'm9.1',  'm7.1',  'm8.1',  'm4.1',  'm1.1',  'm2.1',  'm5.1', 'm13.1',
 'm15.1', 'm18.1',
 ...
   'k31',    'k6', 'm24.4', 'm20.4', 'm22.4', 'm21.4', 'm28.4', 'm26.4',
  'm4.4', 'm23.4']
Length: 119, dtype: string

In [214]:
#anndata_reduced.uns

In [215]:
anndata_reduced.write(data_name)

... storing 'sample_id' as categorical


In [227]:
data_name

'../data/current/analysis/B/B6_DE_Integrated_Singlet_processed_rna.h5ad'

In [216]:
anndata_scanorama.write(data_name_scano)

# Save dataset to share via zenodo

In [217]:
anndata_reduced.raw.X.sum(axis=1)  # contains the raw values

matrix([[ 3947.],
        [ 5870.],
        [ 3888.],
        ...,
        [ 4079.],
        [17619.],
        [ 3231.]], dtype=float32)

In [218]:
## Remove some columns from obs that are not needed

In [220]:
anndata_reduced

AnnData object with n_obs × n_vars = 148275 × 1392
    obs: 'nCount_HTO', 'B2_Scanorama_Singlet_rb_mt_cluster', 'cell_type_Scanorama', 'cluster_cell_type_Scanorama', 'library', 'in_sample', 'display_name', 'classification', 'sample_id'
    var: 'gene_ids', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'

In [221]:
pd.unique(anndata_reduced.obs['B2_Scanorama_Singlet_rb_mt_cluster'])

['8', '9', '5', '0', '16', ..., '10', '17', '12', '15', '18']
Length: 19
Categories (20, object): ['0', '1', '2', '3', ..., '16', '17', '18', 'missing']

In [226]:

pd.unique(anndata_reduced.obs['cluster_cell_type_Scanorama'])

['8_B-cell', '9_Monocytes - CD16_FCGR3A', '5_T-cell-CD4', '0_T-cell-CD4', '16_Plasma Blast', ..., '10_B-cell', '17_Progenitor', '12_Monocytes - CD16_FCGR3A', '15_Plasma Blast', '18_Megakaryocytes']
Length: 19
Categories (20, object): ['0_T-cell-CD4', '1_T-cell-CD8', '2_T-cell-CD4', '3_NK', ..., '16_Plasma Blast', '17_Progenitor', '18_Megakaryocytes', 'missing']

In [228]:
# Write the prepared dataset to the submission folder. The folder is created
# first so the write does not fail when it does not exist yet.
submission_path = result_path + '/Submission/Prepared_sc_Data.h5ad'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
anndata_reduced.write(submission_path)