In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.io import mmwrite
import h5py
import os, re

In [2]:
# Input .h5ad file; the cells below open it directly with h5py.
path = '/lustre/orion/syb111/proj-shared/Projects/scrna-seq/data/human/heart/healthy/sanger/raw/hca_heart_global_ctl200723_freeze.h5ad'

In [58]:
# Print every top-level HDF5 entry together with the names of its members.
# The member names found under 'obs' are also kept in `bc_meta`.
with h5py.File(path, 'r') as h5:
    for group_name in h5:
        members = list(h5[group_name])
        if group_name == 'obs':
            bc_meta = members
        print(f"{group_name}: {members}")

X: ['data', 'indices', 'indptr']
obs: ['NRP', 'Used', '__categories', '_index', 'age_group', 'cell_source', 'cell_states', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version']
obsm: ['X_pca', 'X_umap']
uns: ['cell_type_colors']
var: ['__categories', '_index', 'feature_types-Harvard-Nuclei', 'feature_types-Sanger-CD45', 'feature_types-Sanger-Cells', 'feature_types-Sanger-Nuclei', 'gene_ids-Harvard-Nuclei', 'gene_ids-Sanger-CD45', 'gene_ids-Sanger-Cells', 'gene_ids-Sanger-Nuclei']


In [59]:
# For each obs column of interest, print the number of distinct stored values
# followed by the stored array itself (integer codes, see the output below).
with h5py.File(path, 'r') as hf:
    for name in ['age_group', 'cell_type', 'cell_states', 'region', 'source']:
        # Read the dataset from disk once and reuse it for both the unique
        # count and the printout.
        codes = hf['obs'][name][()]
        print(f'({len(np.unique(codes))}) {name}: {codes}')

(7) age_group: [2 2 2 ... 4 4 4]
(13) cell_type: [10 11 11 ...  2  2  2]
(67) cell_states: [52 62 62 ... 12 16 15]
(6) region: [0 0 0 ... 0 0 0]
(3) source: [2 2 2 ... 0 0 0]


In [None]:
# NOTE(review): scratch sketch of a nested {column name: {integer code: label}}
# mapping; it is not assigned to anything and this cell was never executed.
{'age_group': {0: '40-45', 1: '45-50'}, 'cell_type': {}}

In [77]:
# Build `converter`, a nested lookup {column name: {integer code: label}},
# for the categorical obs columns that are exported later, then print every
# category list stored under obs/__categories for reference.
categorical_obs = ['age_group', 'cell_states', 'cell_type', 'region', 'source']

with h5py.File(path, 'r') as hf:
    categories = hf['obs']['__categories']
    # Select the categories by name rather than by position: the positional
    # indices previously used here (2, 4, 5, 8, 10) only pick these columns
    # as long as the iteration order and the set of members of the group
    # stay exactly the same.
    converter = {
        category: dict(enumerate(categories[category][()]))
        for category in categorical_obs
    }
    for i, thing in enumerate(categories):
        print(f'{i}: {categories[thing][()]}')

0: ['No' 'Yes']
1: ['No' 'Yes']
2: ['40-45' '45-50' '50-55' '55-60' '60-65' '65-70' '70-75']
3: ['Harvard-Nuclei' 'Sanger-CD45' 'Sanger-Cells' 'Sanger-Nuclei']
4: ['Adip1' 'Adip2' 'Adip3' 'Adip4' 'B_cells' 'CD4+T_cytox' 'CD4+T_tem'
 'CD8+T_cytox' 'CD8+T_tem' 'CD16+Mo' 'DOCK4+MØ1' 'DOCK4+MØ2' 'EC1_cap'
 'EC2_cap' 'EC3_cap' 'EC4_immune' 'EC5_art' 'EC6_ven' 'EC7_atria' 'EC8_ln'
 'EC9_FB-like' 'EC10_CMC-like' 'FB1' 'FB2' 'FB3' 'FB4' 'FB5' 'FB6' 'FB7'
 'IL17RA+Mo' 'LYVE1+MØ1' 'LYVE1+MØ2' 'LYVE1+MØ3' 'Mast' 'Meso' 'CD14+Mo'
 'Mo_pi' 'MØ_AgP' 'MØ_mod' 'NC1' 'NC2' 'NC3' 'NC4' 'NC5' 'NC6' 'NK' 'NKT'
 'NØ' 'PC1_vent' 'PC2_atria' 'PC3_str' 'PC4_CMC-like' 'SMC1_basic'
 'SMC2_art' 'aCM1' 'aCM2' 'aCM3' 'aCM4' 'aCM5' 'doublets' 'DC' 'nan'
 'vCM1' 'vCM2' 'vCM3' 'vCM4' 'vCM5']
5: ['Adipocytes' 'Atrial_Cardiomyocyte' 'Endothelial' 'Fibroblast' 'Lymphoid'
 'Mesothelial' 'Myeloid' 'Neuronal' 'NotAssigned' 'Pericytes'
 'Smooth_muscle_cells' 'Ventricular_Cardiomyocyte' 'doublets']
6: ['D1' 'D2' 'D3' 'D4' 'D

In [78]:
# Display the code -> label lookups to check them by eye.
converter

{'age_group': {0: '40-45',
  1: '45-50',
  2: '50-55',
  3: '55-60',
  4: '60-65',
  5: '65-70',
  6: '70-75'},
 'cell_states': {0: 'Adip1',
  1: 'Adip2',
  2: 'Adip3',
  3: 'Adip4',
  4: 'B_cells',
  5: 'CD4+T_cytox',
  6: 'CD4+T_tem',
  7: 'CD8+T_cytox',
  8: 'CD8+T_tem',
  9: 'CD16+Mo',
  10: 'DOCK4+MØ1',
  11: 'DOCK4+MØ2',
  12: 'EC1_cap',
  13: 'EC2_cap',
  14: 'EC3_cap',
  15: 'EC4_immune',
  16: 'EC5_art',
  17: 'EC6_ven',
  18: 'EC7_atria',
  19: 'EC8_ln',
  20: 'EC9_FB-like',
  21: 'EC10_CMC-like',
  22: 'FB1',
  23: 'FB2',
  24: 'FB3',
  25: 'FB4',
  26: 'FB5',
  27: 'FB6',
  28: 'FB7',
  29: 'IL17RA+Mo',
  30: 'LYVE1+MØ1',
  31: 'LYVE1+MØ2',
  32: 'LYVE1+MØ3',
  33: 'Mast',
  34: 'Meso',
  35: 'CD14+Mo',
  36: 'Mo_pi',
  37: 'MØ_AgP',
  38: 'MØ_mod',
  39: 'NC1',
  40: 'NC2',
  41: 'NC3',
  42: 'NC4',
  43: 'NC5',
  44: 'NC6',
  45: 'NK',
  46: 'NKT',
  47: 'NØ',
  48: 'PC1_vent',
  49: 'PC2_atria',
  50: 'PC3_str',
  51: 'PC4_CMC-like',
  52: 'SMC1_basic',
  53: 'SMC2_art',

In [None]:
# Positions of the columns of interest in the obs/__categories listing printed above:
# age_group: 2
# cell_type: 5
# cell_states: 4
# percent_mito:

In [79]:
# Load the count matrix, the barcode / gene identifiers and the selected
# barcode metadata from the .h5ad file.
# NOTE: relies on `converter` (code -> label lookups) built in an earlier cell.
with h5py.File(path, 'r') as f:
    # bc and gene ids
    bc_ids = f['obs']['_index'][()]
    hgnc_gene_ids = f['var']['_index'][()]
    # (the var group only holds per-source Ensembl id columns such as
    # 'gene_ids-Sanger-Cells'; there is no plain 'gene_ids' dataset to read)

    # gene x cell matrix
    data = f['X']['data'][()]
    indices = f['X']['indices'][()]
    indptr = f['X']['indptr'][()]
    # Pass the shape explicitly. Without it scipy infers the number of
    # columns from the largest stored index, so trailing genes without any
    # counts would be dropped and the matrix would no longer line up with
    # the gene list written to features.tsv.
    # Assumes X is stored as CSR with one row per barcode, as the original
    # construction did; a mismatch now raises instead of passing silently.
    sparse_mtx = csr_matrix(
        (data, indices, indptr),
        shape=(len(bc_ids), len(hgnc_gene_ids)),
    ).transpose()

    # cell types and other per-barcode annotations
    bc_meta_to_keep = ['age_group', 'cell_type', 'cell_states', 'region', 'source']

    bc_metadata_to_keep = []
    for meta in bc_meta_to_keep:
        codes = f['obs'][meta][()]
        # Series indexed by integer code; .loc makes the label-based lookup
        # explicit and maps every code to its text label in one step.
        labels = pd.Series(converter[meta])
        bc_metadata_to_keep.append(labels.loc[codes].to_numpy())

In [84]:
# number of barcodes read from obs/_index
n_barcodes = len(bc_ids)
print(n_barcodes)

486134


In [82]:
# Save the sparse count matrix in Matrix Market format (10X's .mtx format),
# using the file name 10X uses (matrix.mtx).
out_mtx_path = '/lustre/orion/syb111/proj-shared/Projects/scrna-seq/data/human/heart/healthy/sanger/raw/matrix.mtx'
mmwrite(target=out_mtx_path, a=sparse_mtx, field='integer')

In [85]:
# write the bcs and bc cell type metadata to file
# match 10x filename and formatting for Seurat read-in
bc_id_strings = bc_ids.astype(str)  # converted once, reused for both files

out_bc_path = '/lustre/orion/syb111/proj-shared/Projects/scrna-seq/data/human/heart/healthy/sanger/raw/barcodes.tsv'
out_bc_data = pd.DataFrame(bc_id_strings)
out_bc_data.to_csv(out_bc_path, sep='\t', index=False, header=False)

# filename doesn't matter for metadata
out_bc_meta_path = '/lustre/orion/syb111/proj-shared/Projects/scrna-seq/data/human/heart/healthy/sanger/meta/bc_meta.tsv'
# The metadata goes to a different directory than the input file, so make
# sure it exists before writing.
os.makedirs(os.path.dirname(out_bc_meta_path), exist_ok=True)
# Build the frame column-wise (one column per metadata field, one row per
# barcode) instead of creating a 5 x n_barcodes frame and transposing it.
out_bc_metadata = pd.DataFrame(
    dict(zip(bc_meta_to_keep, bc_metadata_to_keep)),
    index=bc_id_strings,
)
out_bc_metadata.to_csv(out_bc_meta_path, sep='\t')

In [87]:
# write the HGNC symbols to file using 10X filename and formatting
# (the earlier assignment to a /gpfs path was dead code: it was overwritten
# on the very next line and has been removed)
out_gene_path = '/lustre/orion/syb111/proj-shared/Projects/scrna-seq/data/human/heart/healthy/sanger/raw/features.tsv'
gene_symbols = hgnc_gene_ids.astype(str)
str_fill_col = np.repeat('Gene', len(hgnc_gene_ids))
# Three columns per gene: the symbol twice, then a constant 'Gene' filler.
# Built column-wise instead of as a 3 x n_genes frame that is transposed.
out_gene_data = pd.DataFrame({0: gene_symbols, 1: gene_symbols, 2: str_fill_col})
out_gene_data.to_csv(out_gene_path, sep='\t', index=False, header=False)