In [31]:
from anndata import read_h5ad
import pyarrow as pa
import pandas as pd
import numpy as np
import json
import zarr
import os
from hurry.filesize import size
from numcodecs import Zlib

In [32]:
# Load both AnnData files up front (same order as before: out.h5ad first).
out, cluster_marker_genes = map(read_h5ad, ("out.h5ad", "cluster_marker_genes.h5ad"))

In [33]:
# Display the AnnData summary (dimensions plus obs/var/uns/obsm keys) of the marker-gene object.
cluster_marker_genes

AnnData object with n_obs × n_vars = 6010 × 9006 
    obs: 'n_genes', 'n_counts', 'leiden'
    var: 'n_cells'
    uns: 'leiden', 'leiden_colors', 'neighbors', 'rank_genes_groups', 'umap'
    obsm: 'X_pca', 'X_umap'

In [34]:
# Display the AnnData summary of the second object for comparison.
out

AnnData object with n_obs × n_vars = 6287 × 38032 

In [35]:
# Choose one of the two anndata objects for analysis.
# `gexp` is the object the following cells read the matrix and the obs/var labels from.
gexp = cluster_marker_genes

In [36]:
# Keep two forms of the expression data: the raw matrix and the same data
# as a labelled pandas DataFrame.
gexp_arr, gexp_df = gexp.X, gexp.to_df()

In [37]:
# Confirm the matrix dimensions (n_obs rows x n_vars columns).
gexp_arr.shape

(6010, 9006)

In [38]:
# Check the smallest expression value before re-scaling.
gexp_arr.min()

-1.0937375

In [39]:
# Re-scale the gene expression values between 0 and 255
# (min-max scaling: the smallest value maps to 0 and the largest to 255).
gexp_arr_min = gexp_arr.min()
gexp_arr_max = gexp_arr.max()
gexp_arr_range = gexp_arr_max - gexp_arr_min
# A constant matrix has a zero range; dividing by it would yield inf (or raise)
# and turn every scaled value into NaN (0 * inf), which breaks the integer
# conversions done when exporting. Use a zero ratio so such a matrix maps to 0.
gexp_arr_ratio = 255 / gexp_arr_range if gexp_arr_range > 0 else 0.0

# The result is still floating point; integer conversion happens at export time.
gexp_norm_arr = (gexp_arr - gexp_arr_min) * gexp_arr_ratio

In [40]:
# Wrap the scaled matrix in a DataFrame that reuses the row and column labels of gexp_df.
gexp_norm_df = pd.DataFrame(
    data=gexp_norm_arr,
    index=gexp_df.index.values.tolist(),
    columns=gexp_df.columns.values.tolist(),
)

In [11]:
# Try CSV
# Writes the scaled DataFrame as plain text with pandas' default to_csv settings.
gexp_norm_df.to_csv("cluster_marker_genes.csv")

In [12]:
# Try Arrow
# Convert the scaled DataFrame to an Arrow table, then write it out with a
# RecordBatchFileWriter that uses the table's own schema.
table = pa.Table.from_pandas(gexp_norm_df)

arrow_schema = table.schema
with pa.RecordBatchFileWriter("cluster_marker_genes.arrow", arrow_schema) as sink:
    sink.write(table)

In [145]:
# Try genes.json
"""
{
    "Gad2": {
        "max": 237,
        "cells": {
            "1": 0,
            "2": 3
        }
    },
    "Slc32a1": {
        "max": 123,
        "cells": {
            "1": 4,
            "2": 5
        }
    }
}
"""
# Build the mapping one gene (column) at a time. This avoids materialising a
# full nested dict of floats with to_dict() and then overwriting its entries
# while iterating over it. Values are truncated with int(), as before.
genes_json = {
    gene: {
        # Largest scaled value of this gene across all cells.
        "max": int(expr.max()),
        # Row label -> truncated scaled value for this gene.
        "cells": dict(zip(expr.index, map(int, expr.tolist()))),
    }
    for gene, expr in gexp_norm_df.items()
}

with open("cluster_marker_genes.genes.json", 'w') as f:
    json.dump(genes_json, f)

In [26]:
# Try clusters.json
# Layout written below: "rows" are the observation labels (gexp.obs, i.e. cells),
# "cols" are the variable labels (gexp.var, i.e. genes), and "matrix" holds one
# inner list per row of gexp_norm_arr (cells x genes).
"""
{
    "rows": [
        "1",
        "2"
    ],
    "cols": [
        "Gad2",
        "Slc32a1"
    ],
    "matrix": [
        [0, 4],
        [3, 5]
    ]
}
"""

clusters_json = {
    "rows": gexp.obs.index.values.tolist(),
    "cols": gexp.var.index.values.tolist(),
    # int() truncates each scaled float so the JSON contains plain integers.
    "matrix": [[int(value) for value in row] for row in gexp_norm_arr],
}

with open("cluster_marker_genes.clusters.json", 'w') as f:
    json.dump(clusters_json, f)

In [41]:
# Open the on-disk Zarr array in write mode, shaped like the scaled matrix,
# stored as uint8 and compressed with zlib level 1.
zarr_options = dict(
    mode='w',
    shape=gexp_norm_arr.shape,
    dtype='uint8',
    compressor=Zlib(level=1),
)
z = zarr.open('cluster_marker_genes.zarr', **zarr_options)

# data
z[:] = gexp_norm_arr
# Axis labels are stored as array attributes:
#   "obs" = observations: cells (rows)
#   "var" = variables: genes (columns)
for attr_key, axis_df in (("obs", gexp.obs), ("var", gexp.var)):
    z.attrs[attr_key] = axis_df.index.values.tolist()

In [42]:
# Show Zarr's summary of the array (dtype, chunk shape, compressor, raw vs. stored bytes).
z.info

0,1
Type,zarr.core.Array
Data type,uint8
Shape,"(6010, 9006)"
Chunk shape,"(752, 1126)"
Order,C
Read-only,False
Compressor,Zlib(level=1)
Store type,zarr.storage.DirectoryStore
No. bytes,54126060 (51.6M)
No. bytes stored,15537346 (14.8M)


In [43]:
def _path_size(path):
    """Return the size of `path` in bytes.

    A regular file reports its own size. A directory (such as a Zarr
    directory store) reports the sum of every file beneath it, walking
    subdirectories so nested chunk layouts are counted as well.
    """
    if os.path.isdir(path):
        return sum(
            os.path.getsize(os.path.join(root, name))
            for root, _dirs, names in os.walk(path)
            for name in names
        )
    return os.path.getsize(path)


# Collect one record per output format, then build the frame once.
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
size_records = []
for ext in ["zarr", "clusters.json", "arrow", "csv", "genes.json"]:
    file_name = f"cluster_marker_genes.{ext}"
    file_size = _path_size(file_name)
    size_records.append({"ext": ext, "size": file_size, "size_str": size(file_size)})

# Explicit columns keep the expected layout even if no record was collected.
size_df = pd.DataFrame(size_records, columns=["ext", "size", "size_str"])
size_df.set_index("ext")

Unnamed: 0_level_0,size,size_str
ext,Unnamed: 1_level_1,Unnamed: 2_level_1
zarr,15537346,14M
clusters.json,217422070,207M
arrow,220597802,210M
csv,528811342,504M
genes.json,1300036168,1G
