In [1]:
import h5py
import xarray as xr
import pandas as pd

In [2]:
# Load every dataset of the HDF5 file fully into memory as NumPy arrays.
#
# * The file is opened explicitly read-only ('r'): older h5py releases fall
#   back to a read/write mode when no mode is given, which risks touching
#   the input file.
# * `Dataset.value` was deprecated in h5py 2.x and removed in h5py 3.0;
#   indexing with an empty tuple (`dset[()]`) is the supported way to read
#   a whole dataset and returns the same array.
# NOTE(review): h5py >= 3 returns variable-length strings as `bytes`; if
# 'bins' / 'meta' hold such strings they may need decoding before the
# string joins performed later -- confirm against the actual file.
with h5py.File('cell_4176_mCH.hdf5', 'r') as f:
    print(f.keys())
    bins = f['bins'][()]
    cov = f['cov'][()]
    mc = f['mc'][()]
    meta = f['meta'][()]

<KeysViewHDF5 ['bins', 'cov', 'mc', 'meta']>




In [3]:
# Wrap the raw arrays read from the HDF5 file in labelled DataFrames.
# `region_bed` gets one column per field of `bins`, `metadata` one column
# per field of `meta`; the column names are supplied here, not read from
# the file.
bin_columns = ['chrom', 'start', 'end']
meta_columns = [
    'source',
    'slice',
    'target',
    'gender',
    'cluster',
    'non-clonal reads',
]

region_bed = pd.DataFrame(data=bins, columns=bin_columns)
metadata = pd.DataFrame(data=meta, columns=meta_columns)

In [4]:
def _rs2_cell_ids(n_cells):
    """Return the row labels 'RS2_0' ... 'RS2_{n_cells - 1}'."""
    return [f'RS2_{i}' for i in range(n_cells)]


# One 'chrom-start-end' label per row of `region_bed`; used as the column
# labels of both count tables. The join requires the three fields to be
# strings already.
region_labels = region_bed.apply(lambda row: '-'.join(row), axis=1).values

# Count tables: rows are cells, columns are regions.
cov_df = pd.DataFrame(cov,
                      index=_rs2_cell_ids(cov.shape[0]),
                      columns=region_labels)
mc_df = pd.DataFrame(mc,
                     index=_rs2_cell_ids(mc.shape[0]),
                     columns=region_labels)

# The metadata rows are labelled with the same ids as the rows of `mc`.
metadata.index = _rs2_cell_ids(mc.shape[0])

In [5]:
# Re-key the columns of `cov_df` / `mc_df` from 'chrom-start-end' strings to
# the integer row labels of the chrom100k bin table of a reference MCDS file.

# NOTE(review): absolute, user-specific path -- consider moving it to a
# configurable location so the notebook runs on other machines.
sample_mcds_path = '/home/hanliu/project/mouse_rostral_brain/dataset/4H-180806.mcds'

# Open the reference dataset in a context manager so the underlying file is
# closed as soon as the three bin coordinates have been copied into pandas.
# `region_bed` is rebound here: one row per reference bin, with the columns
# in the order chrom, start, end.
with xr.open_dataset(sample_mcds_path) as sample_mcds:
    region_bed = pd.DataFrame([
        sample_mcds.coords['chrom100k_chrom'].to_pandas(),
        sample_mcds.coords['chrom100k_bin_start'].to_pandas(),
        sample_mcds.coords['chrom100k_bin_end'].to_pandas(),
    ]).T

# Build the 'chrom-start-end' key of every reference bin column-wise (same
# strings as joining each stringified row with '-'), then map key -> row label.
region_text = region_bed.astype(str)
region_keys = (region_text.iloc[:, 0] + '-'
               + region_text.iloc[:, 1] + '-'
               + region_text.iloc[:, 2])
region_to_int = dict(zip(region_keys, region_bed.index))

# Translate the string column labels; labels without a matching reference
# bin come back as NaN and must not be silently accepted.
int_columns = cov_df.columns.map(region_to_int)
unmatched = cov_df.columns[int_columns.isna()]
assert len(unmatched) == 0, (
    f'{len(unmatched)} region(s) not found in the reference bins, '
    f'e.g. {list(unmatched[:5])}'
)

# `mc_df` receives the labels derived from `cov_df`; this relies on both
# tables having identical column labels in identical order.
cov_df.columns = int_columns
mc_df.columns = int_columns

# Name both axes; these names become the dimension names when the tables
# are later converted to xarray objects.
cov_df.index.name = 'cell'
cov_df.columns.name = 'chrom100k'
mc_df.index.name = 'cell'
mc_df.columns.name = 'chrom100k'


In [13]:
# Stack the two count tables along a new 'count_type' dimension (mc first,
# cov second), label that dimension, and finally add a length-1 'mc_type'
# dimension holding the single entry 'CHN'.
mc_da = xr.DataArray(mc_df)
cov_da = xr.DataArray(cov_df)

mcds = (
    xr.concat([mc_da, cov_da], dim='count_type')
    .assign_coords(count_type=['mc', 'cov'])
    .expand_dims(mc_type=['CHN'])
)

In [26]:
# Wrap the count array in a Dataset under the variable name 'chrom100k_da'
# and write it to disk in netCDF format.
# Note: `mcds` is rebound from the DataArray to the Dataset, so this cell
# cannot simply be re-run without re-running the cell that builds the array.
raw_count_path = 'RS2_PT.4176cells.raw_count.mcds'
mcds = xr.Dataset(data_vars={'chrom100k_da': mcds})
mcds.to_netcdf(raw_count_path)

In [24]:
# Persist the per-cell metadata table to an HDF5 store under the key 'data'.
metadata.to_hdf(
    'RS2_PT.4176cells.meta.hdf',
    key='data',
)

In [27]:
# Bare expression: shows the notebook repr of `mcds` for a final visual check.
mcds