In [1]:
# Report the interpreter that runs this kernel. `!which python` only shows the
# first `python` on the shell PATH, which can differ from the kernel's interpreter.
import sys
sys.executable

/Users/hanqingliu/miniconda3/envs/omb/bin/python


In [2]:
import pandas as pd
import xarray as xr
import numpy as np
import pathlib
import warnings
import orjson
import msgpack

In [3]:
# Raw user inputs are read from `input_dir`; ingested files go to `output_dir`.
input_dir = pathlib.Path('../raw/')
output_dir = pathlib.Path('')

# File names used inside the ingested dataset dir.
COORDS_PATH = 'Coords.h5'
VARIABLE_PATH = 'Variables.h5'
PALETTE_PATH = 'Palette.msg'

# Bare names of two raw inputs (`continous_path` keeps its original spelling
# so that any code still referring to it keeps working).
continous_path = 'Continuous.HDF'
coords_path = 'Coords'

# Full paths of every raw input.
categorical_path = input_dir / 'Category.HDF'
continuous_path = input_dir / continous_path
coords_dir = input_dir / coords_path
gene_rate_path = input_dir / 'Gene.small.mcds'
palette_path = input_dir / 'Palette.msg'

In [4]:
"""
Ingest user input data to standard internal Dataset

Several main things
# Coords file
- The name of the file before first "." char will be the name of the coords set.
- No header
- First column must be cell id
- Second (x), third (y), and fourth (z, optional) columns are coordinates
- Each file contains only one set of coordinates; if there are multiple views, use multiple coords files.
- Coords are cast to np.float16

# Categorical / Continuous variables
- Header according to variable name
- First column must be cell id, all cell ids must exist in the coords
- Columns may have nan

# Palette
- Int id map to real str id
- Associated with continuous variable, categorical variable, and region values

# Region
    ## Gene
        - Int id map to real str id
        - Genome coords

"""
import pandas as pd
import numpy as np
import warnings
import msgpack


def read_msgpack(path):
    """Read the file at ``path`` in binary mode and return its msgpack-decoded content."""
    with open(path, 'rb') as handle:
        packed_bytes = handle.read()
    return msgpack.unpackb(packed_bytes)


def write_msgpack(path, data):
    """Serialize ``data`` with msgpack and write the bytes to ``path`` (overwriting it)."""
    packed_bytes = msgpack.packb(data)
    with open(path, 'wb') as handle:
        handle.write(packed_bytes)


"""
File names in ingested dataset dir
"""
PALETTE_PATH = 'Palette.msg'      # palette written by ingest_palette
VARIABLE_PATH = 'Variables.h5'    # variable table written by ingest_variables
CELL_ID_PATH = 'CellIDMap.msg'    # cell id -> int map written by ingest_cell_coords
COORDS_PATH = 'Coords.h5'         # coords tables written by ingest_cell_coords

# Default data types
CONTINUOUS_VAR_DTYPE = np.float32
COORDS_DTYPE = np.float16


def ingest_cell_coords(coords_dir, output_dir):
    """
    Load all coords tables, map every cell id to an internal int id, and save both.

    The int ids are assigned in first-seen order while walking the coords files
    sorted by path, so the same input directory yields the same map on every run.

    Parameters
    ----------
    coords_dir
        User input dir path (a ``pathlib.Path``). Every file matching ``*csv.gz``
        is read as a header-less table whose first column is the cell id,
        followed by 2 (x, y) or 3 (x, y, z) coordinate columns.
    output_dir
        Standard output dir path; ``COORDS_PATH`` and ``CELL_ID_PATH`` are written into it.

    Returns
    -------
    cell_to_int: dict
        cell to int map, use for all other data's cell id validation and conversion

    Raises
    ------
    NotImplementedError
        If a coords table does not have exactly 2 or 3 coordinate columns.
    ValueError
        If two coords files resolve to the same coords set name.
    """
    # load all the coords
    print('Loading cell coords')
    coords_dict = {}
    # sorted() makes the load order, and therefore the id assignment below, reproducible
    for path in sorted(coords_dir.glob('*csv.gz')):
        # the coords set name is everything before the first "." of the file name
        coord_name = path.name.split('.')[0]
        if coord_name in coords_dict:
            # without this check the later file would silently replace the earlier one
            raise ValueError(f'Coords name "{coord_name}" is provided by more than one file '
                             f'in {coords_dir}, e.g. {path.name}')
        coords_df = pd.read_csv(path, header=None, index_col=0).astype(COORDS_DTYPE)
        coords_df.index.name = 'cell'
        if coords_df.shape[1] == 2:
            coords_df.columns = ['x', 'y']
        elif coords_df.shape[1] == 3:
            coords_df.columns = ['x', 'y', 'z']
        else:
            raise NotImplementedError(f'Coords table right now only support 2D or 3D, '
                                      f'got a table with {coords_df.shape[1]} dims.')
        coords_dict[coord_name] = coords_df

    if not coords_dict:
        warnings.warn(f'No coords file matching "*csv.gz" found in {coords_dir}, '
                      f'the ingested dataset will contain no cells.')

    # generate cell_id map over the union of all cell ids; iterating the tables
    # (instead of a set) keeps the assigned ints independent of hash ordering
    cell_to_int = {}
    for coords_df in coords_dict.values():
        for cell_id in coords_df.index:
            cell_to_int.setdefault(cell_id, len(cell_to_int))
    print(f'Got a total of {len(cell_to_int)} unique cell ids from all coords files')

    # change all the coords table index into int
    print('Standardizing internal cell ids')
    for coords_df in coords_dict.values():
        coords_df.index = coords_df.index.map(cell_to_int)

    print('Saving coords and cell ids')
    with pd.HDFStore(output_dir / COORDS_PATH, 'w') as hdf:
        # warnings emitted while storing the tables are deliberately silenced
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for coord_name, coords_df in coords_dict.items():
                hdf[coord_name] = coords_df
    write_msgpack(output_dir / CELL_ID_PATH, cell_to_int)

    return cell_to_int


def _standardize_variable_index(variable_df, cell_to_int, kind):
    """Check that every cell id of ``variable_df`` is in ``cell_to_int``, then swap the index to int ids."""
    cell_not_in_coords = variable_df.index[~variable_df.index.isin(cell_to_int)]
    # explicit check instead of `assert`, which is removed when Python runs with -O
    if cell_not_in_coords.size != 0:
        # str() so that non-string cell ids can be listed in the message too
        error_str = ', '.join(map(str, cell_not_in_coords[:10]))
        raise KeyError(f'{cell_not_in_coords.size} cell ids found in {kind} variable table '
                       f'do not found in any coords table, e.g. {error_str}')
    variable_df.index = variable_df.index.map(cell_to_int)
    cells, num_vars = variable_df.shape
    print(f'Got {num_vars} {kind} variables for {cells} cells.')
    return variable_df


def ingest_variables(cell_to_int, output_dir, categorical_path=None, continuous_path=None):
    """
    Load the categorical / continuous variable tables, convert their cell ids, and save them.

    Parameters
    ----------
    cell_to_int
        cell to int map returned by ingest_cell_coords
    output_dir
        Standard output dir path; ``VARIABLE_PATH`` is written into it.
    categorical_path
        Optional HDF file, read with key ``'data'`` and cast to ``category``.
    continuous_path
        Optional HDF file, read with key ``'data'`` and cast to ``CONTINUOUS_VAR_DTYPE``.

    Returns
    -------
    total_variables: pd.DataFrame
        All provided variables concatenated column-wise, indexed by int cell id.
        When no table is provided, a column-less frame indexed by all int cell ids.

    Raises
    ------
    KeyError
        If a variable table contains cell ids that are not in ``cell_to_int``.
    """
    variables_to_cat = []

    if categorical_path is not None:
        categorical_df = pd.read_hdf(categorical_path, key='data').astype('category')
        variables_to_cat.append(
            _standardize_variable_index(categorical_df, cell_to_int, 'categorical'))

    if continuous_path is not None:
        continuous_df = pd.read_hdf(continuous_path, key='data').astype(CONTINUOUS_VAR_DTYPE)
        variables_to_cat.append(
            _standardize_variable_index(continuous_df, cell_to_int, 'continuous'))

    if len(variables_to_cat) != 0:
        total_variables = pd.concat(variables_to_cat, axis=1, sort=True)
    else:
        total_variables = pd.DataFrame([], index=list(cell_to_int.values()))

    total_variables.to_hdf(output_dir / VARIABLE_PATH, key='data', format="table")
    return total_variables


def ingest_palette(total_variables, output_dir, palette_path=None):
    """
    Validate the user palette against the categorical variables and save it.

    Parameters
    ----------
    total_variables
        Variable table returned by ingest_variables; only its ``category`` dtype
        columns are used for validation.
    output_dir
        Standard output dir path; ``PALETTE_PATH`` is written into it.
    palette_path
        Optional msgpack file holding the palette. When None, an empty palette is written.

    Raises
    ------
    KeyError
        If a palette key is not the name of a categorical variable.
    """
    if palette_path is None:
        # make an empty palette anyway, prevent file not found
        palette = {}
    else:
        palette = read_msgpack(palette_path)

        categorical_columns = total_variables.select_dtypes('category').columns
        for k in palette:
            if k not in categorical_columns:
                raise KeyError(f'{k} is not provided as a categorical variable, but exist in palette.')

            # TODO check if cate_data completely exist in palette?
            # TODO check values is hex? Standardize the color to certain format?

    write_msgpack(output_dir / PALETTE_PATH, palette)


In [5]:
# Coords go first: the returned id map is needed to validate the variable tables.
cell_to_int = ingest_cell_coords(coords_dir, output_dir)
total_variables = ingest_variables(
    cell_to_int,
    output_dir,
    categorical_path=categorical_path,
    continuous_path=continuous_path,
)


Loading cell coords
Got a total of 103982 unique cell ids from all coords files
Standardizing internal cell ids
Saving coords and cell ids
Got 16 categorical variables for 103982 cells.
Got 10 continuous variables for 103982 cells.


In [6]:
# Check the palette keys against the categorical variables and write the palette out.
ingest_palette(
    total_variables,
    output_dir,
    palette_path=palette_path,
)