In [1]:
import os
from glob import glob

import h5py
import numpy as np
import pandas as pd
import scipy
from scipy.io import mmread
from scipy.sparse import csr_matrix

In [3]:

# Example of the three files that make up one sample inside `file_folder`:
#   barcodes_loc = 'GSM7488053_2330-AB-1.barcodes.tsv.gz'
#   gn_loc       = 'GSM7488053_2330-AB-1.genes.tsv.gz'
#   mtx_loc      = 'GSM7488053_2330-AB-1.matrix.mtx.gz'

# Folder holding the raw *.tsv.gz / *.mtx.gz files; the trailing slash is part
# of the value because paths are built from it by concatenation.
file_folder = 'GSE234987_RAW/'

# Name of a single test output file.
out_file = 'test_10x.h5'

# Genome label recorded for every feature in the converted files.
genome_name = 'GRCh38'

def _to_bytes_array(values):
    """Encode an iterable of values as a NumPy byte-string ('S') array.

    Every value is passed through ``str`` and UTF-8 encoded. The resulting
    dtype is as wide as the longest entry, so no entry is truncated.
    """
    return np.array([str(value).encode('utf-8') for value in values], dtype='S')


def mtx2h5(file_folder, file_prefix, genome_name = 'GRCh38'):
    """Convert one barcodes/genes/matrix file triplet into an HDF5 file.

    Reads ``<file_prefix>barcodes.tsv.gz``, ``<file_prefix>genes.tsv.gz`` and
    ``<file_prefix>matrix.mtx.gz`` from ``file_folder`` and writes
    ``<file_prefix>h5`` into the current working directory. The output holds a
    ``matrix`` group (``barcodes``, ``data``, ``indices``, ``indptr``,
    ``shape``) with a ``features`` subgroup (``_all_tag_keys``, ``id``,
    ``name``, ``feature_type``, ``genome``). This layout appears intended to
    mirror the 10x Genomics feature-barcode HDF5 format -- confirm against the
    reader you plan to use.

    Parameters
    ----------
    file_folder : str
        Directory containing the three input files.
    file_prefix : str
        Common file-name prefix of the triplet. It is used verbatim, so it
        must include the trailing dot (e.g. ``'GSM7488053_2330-AB-1.'``).
    genome_name : str, default 'GRCh38'
        Genome label stored once per feature in ``features/genome``.

    Raises
    ------
    ValueError
        If the matrix shape matches neither (features, barcodes) nor
        (barcodes, features).
    """
    def input_path(suffix):
        # os.path.join works whether or not file_folder ends with a separator.
        return os.path.join(file_folder, file_prefix + suffix)

    # Load data
    barcodes = pd.read_csv(input_path('barcodes.tsv.gz'),
                           sep = '\t', header = None, compression = 'gzip')
    gene_name = pd.read_csv(input_path('genes.tsv.gz'),
                            sep = '\t', header = None, compression = 'gzip')
    mtx = mmread(input_path('matrix.mtx.gz'))

    n_features, n_barcodes = gene_name.shape[0], barcodes.shape[0]

    # The matrix must be oriented (features, barcodes). Transpose when it is
    # unambiguously the other way round, otherwise refuse to write a file
    # whose matrix disagrees with its barcode/feature lists.
    if mtx.shape != (n_features, n_barcodes):
        if mtx.shape == (n_barcodes, n_features):
            mtx = mtx.transpose()
        else:
            raise ValueError(
                'matrix shape %s does not match %d features x %d barcodes for prefix %r'
                % (mtx.shape, n_features, n_barcodes, file_prefix))

    # Convert to CSC: with a (features, barcodes) matrix, `indices` then holds
    # feature (row) indices and `indptr` has one entry per barcode (column).
    mtx = scipy.sparse.csc_matrix(mtx)

    # Build every dataset before opening the output file so that a failure
    # here does not leave a half-written .h5 behind.
    barcode_bytes = _to_bytes_array(barcodes[0])
    feature_ids = _to_bytes_array(gene_name[0])
    # Variable-width 'S' dtype: a fixed 'S16' would silently cut longer names.
    feature_names = _to_bytes_array(gene_name[1])
    if 2 in gene_name.columns:
        feature_types = _to_bytes_array(gene_name[2])
    else:
        # Two-column genes files carry no feature-type column; assume every
        # row is a gene-expression feature.
        feature_types = _to_bytes_array(['Gene Expression'] * n_features)
    genomes = np.array(np.repeat(genome_name, n_features), dtype = 'S')

    # Save the converted file as file_prefix + 'h5' (the prefix already ends with '.')
    with h5py.File(file_prefix + 'h5', 'w') as f:
        # Create matrix group
        matrix_grp = f.create_group('matrix')

        # Add datasets to matrix group
        matrix_grp.create_dataset('barcodes', data = barcode_bytes)
        matrix_grp.create_dataset('data', data = np.array(mtx.data, dtype = 'i4'))
        matrix_grp.create_dataset('indices', data = np.array(mtx.indices, dtype = 'i8'))
        matrix_grp.create_dataset('indptr', data = np.array(mtx.indptr, dtype = 'i8'))
        matrix_grp.create_dataset('shape', data = np.array(mtx.shape, dtype = 'i4'))

        # Create features group
        features_grp = matrix_grp.create_group('features')
        features_grp.create_dataset('_all_tag_keys', data = np.array(['genome'], dtype = 'S6'))
        features_grp.create_dataset('id', data = feature_ids)
        features_grp.create_dataset('name', data = feature_names)
        features_grp.create_dataset('feature_type', data = feature_types)
        features_grp.create_dataset('genome', data = genomes)
        # Add other datasets as needed, depending on your data

In [5]:
# Derive each sample's file prefix (everything before 'matrix' in the file
# name, e.g. 'GSM7488053_2330-AB-1.') from the matrix files in `file_folder`.
# os.path.basename strips the directory part with the separator of the running
# OS, so this works on POSIX as well as Windows; sorting makes the processing
# order deterministic because glob returns files in arbitrary order.
file_prefixes = sorted(
    os.path.basename(mtx_path).split('matrix')[0]
    for mtx_path in glob(file_folder + '*mtx.gz')
)
file_prefixes

['GSM7488053_2330-AB-1.',
 'GSM7488054_2330-AB-3.',
 'GSM7488055_4902-AB-1.',
 'GSM7488056_4902-AB-2.',
 'GSM7488057_2057-AB-3.',
 'GSM7488058_2057-AB-5.',
 'GSM7488059_2057-AB-6.']

In [6]:
# Convert every sample found above. The prefix is printed before each
# conversion so a failure can be traced to the sample being processed.
# `genome_name` comes from the configuration cell rather than a repeated literal.
for file_prefix in file_prefixes:
    print(file_prefix)
    mtx2h5(file_folder, file_prefix, genome_name = genome_name)

GSM7488053_2330-AB-1.
GSM7488054_2330-AB-3.
GSM7488055_4902-AB-1.
GSM7488056_4902-AB-2.
GSM7488057_2057-AB-3.
GSM7488058_2057-AB-5.
GSM7488059_2057-AB-6.
