In [None]:
import h5py
import pandas as pd
import numpy as np
import json
import os
from os.path import join

In [None]:
# Target file that this notebook rewrites in place. Open it explicitly in
# read/write mode: h5py >= 3.0 defaults to read-only when no mode is given,
# which would make every later `del` / `create_dataset` on `f` fail.
# "r+" also requires the file to exist instead of silently creating an empty one.
f = h5py.File("Cistrome_DNase_1kb_average.multires.mv5", "r+")
# Reference file; opened read-only and only used as a source of values.
good_f = h5py.File("my_file_genome_wide_20180228.multires.mv5", "r")

In [None]:
def descend_obj(obj,sep='\t'):
    """
    Recursively print the contents of an HDF5 file or group.

    For every member of a group (or file) one line with the member name and
    its repr is printed, then the member itself is descended into with one
    more tab of indentation. For a dataset, each of its attributes is printed
    as a ``key : value`` line. Attributes attached to groups are not printed.

    Parameters
    ----------
    obj: h5py.File, h5py.Group or h5py.Dataset
      The node to start from. Any other type is silently ignored.
    sep: str
      Indentation prefix printed in front of each line at this depth.
    """
    # h5py.File is a subclass of h5py.Group, so a single isinstance check
    # covers both. The public class names are used instead of reaching into
    # the private h5py._hl package.
    if isinstance(obj, h5py.Group):
        for key in obj.keys():
            child = obj[key]
            print(sep,'-',key,':',child)
            descend_obj(child,sep=sep+'\t')
    elif isinstance(obj, h5py.Dataset):
        for key in obj.attrs.keys():
            print(sep+'\t','-',key,':',obj.attrs[key])

In [None]:
# Print the group/dataset tree of the target file, with dataset attributes.
descend_obj(f)

In [None]:
def tileset_info(f):
    """
    Return some information about this tileset that will
    help render it on the client.

    Parameters
    ----------
    f: h5py.File
      An already opened multires file (or any object exposing the same
      mapping / ``attrs`` layout). It is only read from and is not closed
      by this function.

    Returns
    -------
    tileset_info: {}
      A dictionary describing this dataset with the keys ``resolutions``,
      ``min_pos``, ``max_pos``, ``tile_size`` and ``shape``, plus
      ``row_infos`` when the lowest-resolution group carries that attribute.

    Raises
    ------
    KeyError
      If one of the expected groups, datasets or attributes is missing.
    IndexError
      If the file contains no resolutions.
    """
    # a sorted list of resolutions, lowest to highest
    # awkward to write because the numbers representing resolution
    # are datapoints / pixel so lower resolution is actually a higher
    # number
    resolutions = sorted((int(r) for r in f["resolutions"].keys()), reverse=True)

    # the "leftmost" datapoint position
    # an array because higlass can display multi-dimensional
    # data
    min_pos = [0]
    # the "rightmost" datapoint position: the summed chromosome lengths
    max_pos = [int(sum(f["chroms"]["length"][:]))]

    tile_size = int(f["info"].attrs["tile-size"])
    first_chrom = f["chroms"]["name"][0]

    # The numerically largest key, i.e. the lowest resolution.
    lowest_resolution = f["resolutions"][str(resolutions[0])]

    # Shape of a tile: the first chromosome's dataset shape with the
    # first axis replaced by the tile size.
    shape = list(lowest_resolution["values"][first_chrom].shape)
    shape[0] = tile_size

    tileset_info = {
        "resolutions": resolutions,
        "min_pos": min_pos,
        "max_pos": max_pos,
        "tile_size": tile_size,
        "shape": shape,
    }

    if "row_infos" in lowest_resolution.attrs:
        row_infos = lowest_resolution.attrs["row_infos"]
        # Entries may come back as bytes or as already-decoded str depending
        # on how the attribute was stored; only bytes need decoding.
        tileset_info["row_infos"] = [
            r.decode("utf8") if isinstance(r, bytes) else str(r)
            for r in row_infos
        ]

    return tileset_info

In [None]:
# Show the tileset info computed from the reference file `good_f`.
tileset_info(good_f)

In [None]:
# Take the first 22 entries (indices 0..21) of the reference file's chromosome
# name and length tables.
# NOTE(review): presumably chr1..chr22 in that order, since later cells treat
# the last of these entries as "chr22" — confirm against the reference file.
chrom_0_to_22_names, chrom_0_to_22_lengths = (
    good_f["chroms"][field][:22] for field in ("name", "length")
)

In [None]:
# Replace the top-level chromosome tables of the target file with the first
# 22 chromosomes taken from the reference file.
chroms_group = f["chroms"]
for field in ("name", "length"):
    # Only delete what is actually there. A membership test (instead of a
    # bare `except: pass`) lets real failures, such as a read-only file,
    # surface here rather than as a confusing error in create_dataset below.
    if field in chroms_group:
        del chroms_group[field]
chroms_group.create_dataset("name", data=chrom_0_to_22_names, dtype='S23')
chroms_group.create_dataset("length", data=chrom_0_to_22_lengths, dtype='int64')



In [None]:
# Names of the resolution groups in the target file (iterating an h5py group
# yields its member names); kept as strings because they are used as keys.
f_resolutions = list(f["resolutions"])
f_resolutions

In [None]:
# Sum of all chr22 values at the 1000 resolution in the target file.
# NOTE(review): presumably a baseline to compare against once the other
# chromosomes have been added — confirm.
f["resolutions"]['1000']["values"]["chr22"][()].sum()

In [None]:
# Give the "chroms" group of every resolution the chromosome name/length
# tables taken from the reference file (first 22 chromosomes).
for r in f_resolutions:
    res_chroms = f["resolutions"][r]["chroms"]
    for field in ("name", "length"):
        # Membership test instead of a bare `except: pass`, so that genuine
        # delete failures (e.g. a read-only file) are not silently hidden.
        if field in res_chroms:
            del res_chroms[field]
    res_chroms.create_dataset("name", data=chrom_0_to_22_names, dtype='S23')
    res_chroms.create_dataset("length", data=chrom_0_to_22_lengths, dtype='int64')

In [None]:
# For every resolution, (re)create an all-zero float32 dataset for each
# chromosome except the last one in `chrom_0_to_22_names`.
# NOTE(review): the last entry is presumably chr22, whose existing data must
# be kept — confirm the ordering of the reference file's chromosome table.
for r in f_resolutions:
    target_values = f["resolutions"][r]["values"]
    reference_values = good_f["resolutions"][r]["values"]
    # Loop-invariant: every new dataset gets the same second dimension as
    # the target file's existing chr22 dataset at this resolution.
    res_chr_height = target_values["chr22"].shape[1]
    for chr_name in chrom_0_to_22_names[:-1]:
        # The first dimension comes from the reference file's dataset for
        # the same chromosome and resolution.
        res_chr_width = reference_values[chr_name].shape[0]
        # Membership test instead of a bare `except: pass`, so that genuine
        # delete failures are not silently swallowed.
        if chr_name in target_values:
            del target_values[chr_name]
        target_values.create_dataset(chr_name, data=np.zeros((res_chr_width, res_chr_height), dtype='<f4'))

In [None]:
# Sum of all chr22 values at the 1000 resolution in the target file, the same
# expression evaluated earlier in the notebook.
# NOTE(review): presumably expected to be unchanged, since the zero-fill loop
# skips the last chromosome — confirm.
f["resolutions"]['1000']["values"]["chr22"][()].sum()

In [None]:
# Read the "row_infos" attribute stored on the 16384000 resolution group and
# display it.
# NOTE(review): presumably 16384000 is the numerically largest resolution key,
# which is the group tileset_info() reads "row_infos" from — confirm.
row_infos = f["resolutions"]["16384000"].attrs["row_infos"]
row_infos

In [None]:
# Convert every row label from tab-separated bytes into the UTF-8 bytes of a
# JSON object: the first field becomes "Cluster", the second "Cell Type".
# Any fields after the second are ignored.
row_infos_objs = [
    json.dumps({"Cluster": fields[0], "Cell Type": fields[1]}).encode()
    for fields in (r.decode('utf-8').split('\t') for r in row_infos)
]

In [None]:
# Overwrite the "row_infos" attribute with the JSON-encoded labels.
# After this write, re-reading `row_infos` from the file and re-running the
# conversion would operate on JSON strings, not the tab-separated originals.
f["resolutions"]["16384000"].attrs["row_infos"] = row_infos_objs

In [None]:
# Show the tileset info computed from the modified target file.
tileset_info(f)

In [None]:
# Close both HDF5 handles: the target file that was modified, and the
# reference file, which was opened at the top of the notebook and would
# otherwise stay open.
f.close()
good_f.close()