In [64]:
# Helper Script: Load a Seurat object and convert it to AnnData as input for the analysis workflow

# Prerequisite - Load Libraries

In [65]:
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
from scipy.sparse import csr_matrix

In [66]:
import os

In [86]:
### Show where the scanpy package was imported from (useful for confirming which environment the kernel uses)
sc.__path__

['/opt/conda/envs/mofa_analysis/lib/python3.9/site-packages/scanpy']

# Prerequisites - Configurations & Parameters

In [67]:
### Load the parameters that are set via the configuration files

In [68]:
### Read the comma-separated configuration table; later cells look up its 'parameter' and 'value' columns
config_csv = 'configurations/Data_Configs.csv'
global_configs = pd.read_csv(config_csv, sep=',')

In [69]:
# Select the 'value' entries of the config rows whose parameter is 'data_path'.
# The result stays a pandas Series (not a plain string); later cells index into it.
is_data_path_row = global_configs['parameter'] == 'data_path'
data_path = global_configs.loc[is_data_path_row, 'value']

In [70]:
data_path

0    /home/icb/corinna.losert/projects/mofa_workflo...
Name: value, dtype: object

In [71]:
# Select the 'value' entries of the config rows whose parameter is 'result_path'.
# The result stays a pandas Series (not a plain string).
is_result_path_row = global_configs['parameter'] == 'result_path'
result_path = global_configs.loc[is_result_path_row, 'value']

In [72]:
result_path

1    /home/icb/corinna.losert/projects/mofa_workflo...
Name: value, dtype: object

In [73]:
### Data name of sc dataset

In [74]:
# List the regular files (sub-directories excluded) found in the data directory.
# `.iloc[0]` picks the first matching config value by position; the label lookup
# `data_path[0]` only works while the 'data_path' row is the first row of the config file.
data_dir = data_path.iloc[0]
files = [f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]

In [75]:
files

['Prepared_Neutrophil_Data.csv',
 'Prepared_Pathway_Data.csv',
 'Prepared_sc_Data_prod.h5ad',
 'output_file.h5ad',
 'Prepared_sc_Data.h5seurat',
 'Prepared_Sample_Meta_Data.csv',
 'Prepared_Clinical_Data.csv',
 'Prepared_Proteomic_Data.csv',
 'Prepared_Cytokine_Data.csv']

In [76]:
### Get only the seurat files to convert

In [77]:
# Keep only files whose name ends with the '.h5seurat' extension.
# A plain substring test would also pick up names such as 'x.h5seurat.bak',
# which the conversion loop cannot open.
h5seurat_files = [f for f in files if f.endswith('.h5seurat')]

In [78]:
# Strip only the trailing '.h5seurat' extension to obtain the dataset name.
# `str.replace` would also delete the text in the middle of a file name, so the
# path rebuilt from the dataset name later on would no longer match the file on disk.
# Names without the extension are passed through unchanged, so re-running this cell is harmless.
h5seurat_suffix = '.h5seurat'
h5seurat_files = [
    f[:-len(h5seurat_suffix)] if f.endswith(h5seurat_suffix) else f
    for f in h5seurat_files
]

In [79]:
h5seurat_files

['Prepared_sc_Data']

# Conversion

In [80]:
### Load single-cell datasets in seurat and convert to anndata

In [81]:
# Convert every .h5seurat file listed in `h5seurat_files` into an .h5ad file in the same directory.
#
# Layout that must exist in the .h5seurat file (see the inspection section below to check a file):
#   assays/RNA/counts/{data,indices,indptr}  - sparse count matrix, read with one row per cell
#   assays/RNA/meta.features/_index          - gene names
#   cell.names                               - cell barcodes
#   meta.data/<column>                       - per-cell metadata (optional)


def _as_text(values):
    """Decode HDF5 byte strings to str and leave every other value untouched."""
    return [v.decode('utf-8') if isinstance(v, bytes) else v for v in values]


def _read_obs_column(node):
    """Read one entry of the 'meta.data' group.

    Returns a list of values for a plain dataset, a pandas Categorical for a
    group holding 'levels' and 'values' datasets, or None for any other group.
    """
    if isinstance(node, h5py.Group):
        if 'levels' not in node or 'values' not in node:
            return None  # unknown nested layout; the caller skips this column
        levels = _as_text(node['levels'][:])
        # NOTE(review): assumes 'values' holds 1-based positions into 'levels'
        # (R factor convention) - confirm against the files being converted.
        codes = np.asarray(node['values'][:], dtype=np.int64) - 1
        codes[(codes < 0) | (codes >= len(levels))] = -1  # out-of-range codes become missing values
        return pd.Categorical.from_codes(codes, categories=levels)
    return _as_text(node[:])


# Positional lookup: independent of the row label the path has in the config table.
data_dir = data_path.iloc[0]

for i in h5seurat_files:
    file_path = os.path.join(data_dir, f'{i}.h5seurat')
    output_path = os.path.join(data_dir, f'{i}.h5ad')

    with h5py.File(file_path, 'r') as f:
        # Gene names and cell barcodes
        genes = _as_text(f['assays/RNA/meta.features/_index'][:])
        barcodes = _as_text(f['cell.names'][:])

        # Sparse count matrix
        counts_group = f['assays/RNA/counts']
        data_array = counts_group['data'][:]
        indices_array = counts_group['indices'][:]
        indptr_array = counts_group['indptr'][:]
        # The number of columns comes from the gene list: indices.max() + 1 is too small
        # when the last genes have no counts, and fails outright on an empty matrix.
        shape = (len(indptr_array) - 1, len(genes))
        data = csr_matrix((data_array, indices_array, indptr_array), shape=shape)

        # Per-cell metadata, if the file has any; '_index' repeats the barcodes and is not kept
        columns = {}
        if 'meta.data' in f:
            meta_group = f['meta.data']
            for key in meta_group.keys():
                if key == '_index':
                    continue
                column = _read_obs_column(meta_group[key])
                if column is not None:
                    columns[key] = column
        # Always a DataFrame indexed by barcode (with no columns when there is no metadata)
        metadata = pd.DataFrame(columns, index=barcodes)

    # Create the AnnData object and write it next to the input file
    adata = AnnData(X=data, obs=metadata, var=pd.DataFrame(index=genes))
    adata.write(output_path)

  adata = AnnData(X=data, var=pd.DataFrame(index=genes), obs=pd.DataFrame(index=barcodes))


In [82]:
# Number of cells per cluster. Counting the groups directly avoids building a full
# cluster-by-cluster table only to sum its rows.
adata.obs.groupby('cluster_id', observed=True).size()

cluster_id
ASDC                     8
B cell                9829
CD4 CTL               2346
CD4 Naive             6495
CD4 Proliferating      108
CD4 TCM              45102
CD4 TEM               6296
CD8 Naive             2612
CD8 Proliferating       13
CD8 TCM                525
CD8 TEM              10611
CD14 Mono            32727
CD16 Mono             4680
Doublet                 48
Eryth                    6
HSPC                   332
ILC                    931
MAIT                  1177
NK                   15699
NK Proliferating       168
NK_CD56bright          874
Plasmablast            703
Platelet               450
Treg                  2362
cDC1                    83
cDC2                  1314
dnT                    130
gdT                   1901
pDC                    745
dtype: int64

# Inspection of h5seurat file (use if structure is different from the one assumed above)

In [83]:
# Print the internal structure of every .h5seurat file listed in `h5seurat_files`

def print_hdf5_structure(name, obj):
    """Callback for ``visititems``: print one line describing a group or dataset.

    name: path of the object inside the HDF5 file.
    obj:  the h5py object itself; datasets are printed with their shape and dtype.
    Returns None, so the traversal continues through the whole file.
    """
    if isinstance(obj, h5py.Group):
        print(f"Group: {name}")
    elif isinstance(obj, h5py.Dataset):
        print(f"Dataset: {name}, shape: {obj.shape}, dtype: {obj.dtype}")


for i in h5seurat_files:
    # Positional lookup: independent of the row label the path has in the config table.
    file_path = os.path.join(data_path.iloc[0], f'{i}.h5seurat')
    with h5py.File(file_path, 'r') as f:
        # Walk every group and dataset in the file
        f.visititems(print_hdf5_structure)

Group: active.ident
Dataset: active.ident/levels, shape: (1,), dtype: object
Dataset: active.ident/values, shape: (148275,), dtype: int32
Group: assays
Group: assays/RNA
Group: assays/RNA/counts
Dataset: assays/RNA/counts/data, shape: (243903981,), dtype: float64
Dataset: assays/RNA/counts/indices, shape: (243903981,), dtype: int32
Dataset: assays/RNA/counts/indptr, shape: (148276,), dtype: int32
Group: assays/RNA/data
Dataset: assays/RNA/data/data, shape: (243903981,), dtype: float64
Dataset: assays/RNA/data/indices, shape: (243903981,), dtype: int32
Dataset: assays/RNA/data/indptr, shape: (148276,), dtype: int32
Dataset: assays/RNA/features, shape: (19221,), dtype: object
Group: assays/RNA/meta.features
Dataset: assays/RNA/meta.features/_index, shape: (19221,), dtype: object
Dataset: assays/RNA/meta.features/dispersions, shape: (19221,), dtype: float64
Dataset: assays/RNA/meta.features/dispersions-L1, shape: (19221,), dtype: float64
Dataset: assays/RNA/meta.features/dispersions-L10, 