In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.io import mmwrite
import h5py
import os, re

In [2]:
# Location of the input .h5ad file; later cells open it read-only with h5py.
path = '/gpfs/alpine/syb105/proj-shared/Personal/jmerlet/projects/cell_atlasses/data/human/lister_lab/RNA-all_full-counts-and-downsampled-CPM.h5ad'

In [4]:
# Inspect the file layout: print every top-level HDF5 group together with
# the names of its members, and remember the member names of 'obs'.
with h5py.File(path, 'r') as f:
    for group_name, group in f.items():
        member_names = list(group)
        if group_name == 'obs':
            bc_meta = member_names
        print(f"{group_name}: {member_names}")

X: ['data', 'indices', 'indptr']
layers: ['ds_norm_cts']
obs: ['Astro_GFAP_dev-traj', 'Astro_SLC1A2_dev-traj', 'Brain Regions*', 'CCK_RELN_dev-traj', 'CCK_SORCS1_dev-traj', 'CCK_SYT6_dev-traj', 'Cause of Death', 'Collection_year', 'Date-of-Collection', 'ICD-10 Code', 'ICD-10 category', 'ID2_CSMD1_dev-traj', 'L2_CUX2_LAMP5_dev-traj', 'L3_CUX2_PRSS12_dev-traj', 'L4_RORB_LRRK1_dev-traj', 'L4_RORB_MET_dev-traj', 'L4_RORB_MME_dev-traj', 'L5-6_THEMIS_CNR1_dev-traj', 'L5-6_THEMIS_NTNG2_dev-traj', 'L5-6_TLE4_HTR2C_dev-traj', 'L5-6_TLE4_SCUBE1_dev-traj', 'L5-6_TLE4_SORCS1_dev-traj', 'LAMP5_CCK_dev-traj', 'LAMP5_NDNF_dev-traj', 'LAMP5_NOS1_dev-traj', 'Library Prep Date', 'Library Prep Lot', 'Micro_dev-traj', 'OPC_MBP_dev-traj', 'OPC_dev-traj', 'Oligo_dev-traj', 'Oxygen', 'PMI', 'PV_SCUBE3_dev-traj', 'PV_SST_dev-traj', 'PV_SULF1_dev-traj', 'PV_WFDC2_dev-traj', 'RL#', 'Race', 'SST_ADGRG6_dev-traj', 'SST_B3GAT2_dev-traj', 'SST_BRINP3_dev-traj', 'SST_CALB1_dev-traj', 'SST_NPY_dev-traj', 'SST_STK32A_

In [9]:
# Load the count matrix, the barcode / gene identifiers and a selection of
# per-barcode categorical metadata from the .h5ad file.
with h5py.File(path, 'r') as f:
    # bc and gene ids (read first so their lengths can pin the matrix shape)
    bc_ids = f['obs']['_index'][()]
    hgnc_gene_ids = f['var']['_index'][()]
    ens_gene_ids = f['var']['gene_ids'][()]

    # gene x cell matrix
    # NOTE(review): assumes X is stored as CSR components with one row per
    # barcode and one column per gene -- confirm against the file's attrs.
    data = f['X']['data'][()]
    indices = f['X']['indices'][()]
    indptr = f['X']['indptr'][()]
    # Pass the shape explicitly: without it scipy infers the column count
    # from the largest stored index, so trailing genes with no counts would
    # be dropped and the matrix would no longer line up with the gene list.
    sparse_mtx = csr_matrix(
        (data, indices, indptr),
        shape=(len(bc_ids), len(hgnc_gene_ids)),
    ).transpose()

    # cell types and other per-barcode annotations, stored as integer codes
    # plus a table of category labels
    bc_meta_to_keep = ['cell_type', 'major_clust', 'sub_clust', 'stage_id', 'Brain Regions*']
    bc_metadata_to_keep = []
    for meta in bc_meta_to_keep:
        codes = f['obs'][meta]['codes'][()]
        categories = f['obs'][meta]['categories'][()]
        # Labels may come back as bytes or as str depending on how the file
        # was written and on the h5py version; accept both.
        converter = pd.Series(
            [
                category.decode('utf-8') if isinstance(category, bytes) else str(category)
                for category in categories
            ],
            dtype=object,
        )
        # reindex maps each code to its label; a code with no matching
        # category (e.g. a negative code) becomes NaN instead of raising.
        metadata = converter.reindex(codes).to_numpy()
        bc_metadata_to_keep.append(metadata)

In [26]:
# Export sparse_mtx (built in an earlier cell) in Matrix Market format,
# i.e. the .mtx format used by 10X, under the 10X file name matrix.mtx.
out_mtx_path = '/gpfs/alpine/syb105/proj-shared/Personal/jmerlet/projects/cell_atlasses/data/human/lister_lab/matrix.mtx'
mmwrite(
    target=out_mtx_path,
    a=sparse_mtx,
    field='integer',
)

In [10]:
# Write the barcodes and the selected per-barcode metadata to disk.
# Convert the barcode ids to str once; both outputs use the same values.
bc_id_strs = bc_ids.astype(str)

# barcodes.tsv: one barcode per line, no header and no index column
# (matches the 10x filename and formatting for Seurat read-in)
out_bc_path = '/gpfs/alpine/syb105/proj-shared/Personal/jmerlet/projects/cell_atlasses/data/human/lister_lab/barcodes.tsv'
out_bc_data = pd.DataFrame(bc_id_strs)
out_bc_data.to_csv(out_bc_path, sep='\t', index=False, header=False)

# bc_meta.tsv: barcodes as the index, one column per metadata field
# (filename doesn't matter for metadata)
out_bc_meta_path = '/gpfs/alpine/syb105/proj-shared/Personal/jmerlet/projects/cell_atlasses/data/human/lister_lab/bc_meta.tsv'
# Build the frame column-wise. Passing the list of arrays as rows and then
# transposing would first create a frame with one column per barcode.
out_bc_metadata = pd.DataFrame(
    dict(zip(bc_meta_to_keep, bc_metadata_to_keep)),
    index=bc_id_strs,
)
out_bc_metadata.to_csv(out_bc_meta_path, sep='\t')

In [33]:
# Write the gene table using the 10X filename and formatting: three
# tab-separated columns (Ensembl gene id, HGNC symbol, constant 'Gene'),
# with no header and no index column.
out_gene_path = '/gpfs/alpine/syb105/proj-shared/Personal/jmerlet/projects/cell_atlasses/data/human/lister_lab/features.tsv'
str_fill_col = np.repeat('Gene', len(hgnc_gene_ids))
# Build the frame column-wise. Passing the three arrays as rows and then
# transposing would first create a frame with one column per gene.
# The integer column labels are never written because header=False.
out_gene_data = pd.DataFrame({
    0: ens_gene_ids.astype(str),
    1: hgnc_gene_ids.astype(str),
    2: str_fill_col,
})
out_gene_data.to_csv(out_gene_path, sep='\t', index=False, header=False)