Merge pull request #81 from klarman-cell-observatory/rocherr

spatial read and write to zarr
lilab-bcb · Dec 21, 2021 · 43d7b0a · 43d7b0a
2 parents b353dc0 + bb3e867
commit 43d7b0a
Show file tree

Hide file tree

Showing 8 changed files with 256 additions and 65 deletions.
diff --git a/pegasusio/__init__.py b/pegasusio/__init__.py
@@ -14,7 +14,7 @@
 warnings.filterwarnings("ignore", category=FutureWarning, module='anndata')
 
 
-modalities = ['rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr', 'atac', 'cyto', 'nanostring']
+modalities = ['rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr', 'atac', 'cyto', 'nanostring', 'visium']
 
 from .decorators import timer, run_gc
 
@@ -23,6 +23,7 @@
 from .citeseq_data import CITESeqData
 from .cyto_data import CytoData
 from .nanostring_data import NanostringData
+from .spatial_data import SpatialData
 from .qc_utils import calc_qc_filters, apply_qc_filters, DictWithDefault
 from .multimodal_data import MultimodalData
 from .aggr_data import AggrData, _fillna

diff --git a/pegasusio/multimodal_data.py b/pegasusio/multimodal_data.py
@@ -1,7 +1,7 @@
 import gc
 import numpy as np
 import pandas as pd
-from scipy.sparse import csr_matrix, hstack, vstack
+from scipy.sparse import csr_matrix, hstack
 from typing import List, Dict, Union, Set, Tuple, Optional
 
 import logging
@@ -46,6 +46,17 @@ def update(self, data: "MultimodalData") -> None:
                 raise ValueError(f"Detected duplicated key '{key}'")
             self.data[key] = data.data[key]
 
+    # Check if the img field is there
+    @property
+    def img(self) -> Union[pd.DataFrame, None]:
+        return self._unidata.img if self._unidata is not None and hasattr(self._unidata, 'img') else None
+
+    # Set the img field if needed
+    @img.setter
+    def img(self, img: pd.DataFrame):
+        assert self._unidata is not None 
+        assert self._unidata.get_modality() ==  "visium", "data needs to be spatial"
+        self._unidata.img = img
 
     @property
     def obs(self) -> Union[pd.DataFrame, None]:
@@ -358,7 +369,6 @@ def filter_data(self,
         """
         Filter each "rna" modality UnimodalData in the focus_list separately using the set filtration parameters. Then for all other UnimodalData objects, select only barcodes that are in the union of selected barcodes from previously filtered UnimodalData objects.
         If focus_list is None, focus_list = [self._selected]
-
         Parameters
         ----------
         select_singlets: ``bool``, optional, default ``False``
@@ -588,4 +598,4 @@ def _clean_tmp(self) -> dict:
 
     def _addback_tmp(self, _tmp_multi) -> None:
         for key, _tmp_dict in _tmp_multi.items():
-            self.data[key]._addback_tmp(_tmp_dict)
+            self.data[key]._addback_tmp(_tmp_dict)
diff --git a/pegasusio/readwrite.py b/pegasusio/readwrite.py
@@ -17,19 +17,16 @@
 from .vdj_utils import load_10x_vdj_file
 from .cyto_utils import load_fcs_file
 from .nanostring_utils import load_nanostring_files
+from .spatial_utils import load_visium_folder
 
 
 def infer_file_type(input_file: Union[str, List[str]]) -> Tuple[str, str, str]:
     """ Infer file format from input_file name
-
     This function infer file type by inspecting the file name.
-
     Parameters
     ----------
-
     input_file : `str` or `List[str]`
         Input file name.
-
     Returns
     -------
     `str`
@@ -38,7 +35,6 @@ def infer_file_type(input_file: Union[str, List[str]]) -> Tuple[str, str, str]:
         The path covering all input files. Most time this is the same as input_file. But for HCA mtx and csv, this should be parent directory.
     `str`
         Type of the path, either 'file' or 'directory'.
-
     Note: The last two `str`s are mainly used for transfer to cloud
     """
     file_type = None
@@ -104,16 +100,13 @@ def read_input(
     select_modality: Set[str] = None,
 ) -> MultimodalData:
     """Load data into memory.
-
     This function is used to load input data into memory. Inputs can be in 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data) or 'nanostring' (Nanostring GeoMx spatial data) formats.
-
     Parameters
     ----------
-
     input_file : `str`
         Input file name.
     file_type : `str`, optional (default: None)
-        File type, choosing from 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data) or 'nanostring'. If None, inferred from input_file.
+        File type, choosing from 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data), 'nanostring' or 'visium'. If None, inferred from input_file.
     mode: `str`, optional (default: 'r')
         File open mode, options are 'r' or 'a'. If mode == 'a', file_type must be zarr and ngene/select_singlets cannot be set.
     genome : `str`, optional (default: None)
@@ -128,19 +121,15 @@ def read_input(
         Only select data with genomes in select_genome. Select_data, select_genome and select_modality are mutually exclusive.
     select_modality: `Set[str]`, optional (default: None)
         Only select data with modalities in select_modality. Select_data, select_genome and select_modality are mutually exclusive.
-
     Returns
     -------
-
     A MultimodalData object.
-
     Examples
     --------
     >>> data = io.read_input('example_10x.h5')
     >>> data = io.read_input('example.h5ad')
     >>> data = io.read_input('example_ADT.csv', genome = 'hashing_HTO', modality = 'hashing')
     """
-
     if is_list_like(input_file):
         input_file = [os.path.expanduser(os.path.expandvars(x)) for x in input_file]
     else:
@@ -171,6 +160,8 @@ def read_input(
             segment_file = input_file[1]
             annotation_file = input_file[2] if len(input_file) > 2 else None
             data = load_nanostring_files(input_matrix, segment_file, annotation_file = annotation_file, genome = genome)
+        elif file_type == 'visium':
+            data = load_visium_folder(input_file)
         elif file_type == "mtx":
             data = load_mtx_file(input_file, genome = genome, modality = modality)
         else:
@@ -198,27 +189,22 @@ def write_output(
     precision: int = 2
 ) -> None:
     """ Write data back to disk.
-
     This function is used to write data back to disk.
-
     Parameters
     ----------
-
     data : MutimodalData
         data to write back.
     output_file : `str`
-        output file name. Note that for mtx files, output_file specifies a directory. For scp format, file_type must be specified. 
+        output file name. Note that for mtx files, output_file specifies a directory. For scp format, file_type must be specified.
     file_type : `str`, optional (default: None)
         File type can be 'zarr' (as folder), 'zarr.zip' (as a ZIP file), 'h5ad', 'loom', 'mtx' or 'scp'. If file_type is None, it will be inferred based on output_file.
     is_sparse : `bool`, optional (default: True)
         Only used for writing out SCP-compatible files, if write expression as a sparse matrix.
     precision : `int`, optional (default: 2)
         Precision after decimal point for values in mtx and scp expression matrix.
-
     Returns
     -------
     `None`
-
     Examples
     --------
     >>> io.write_output(data, 'test.zarr')
@@ -262,4 +248,4 @@ def _infer_output_file_type(output_File: str) -> str:
         write_scp_file(data, output_file, is_sparse = is_sparse, precision = precision)
 
     data._addback_tmp(_tmp_multi)
-    logger.info(f"{file_type} file '{output_file}' is written.")
+    logger.info(f"{file_type} file '{output_file}' is written.")
diff --git a/pegasusio/spatial_data.py b/pegasusio/spatial_data.py
@@ -0,0 +1,62 @@
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+from typing import Dict, Optional, Union
+
+import logging
+
+from pegasusio.unimodal_data import UnimodalData
+
+logger = logging.getLogger(__name__)
+
+
+class SpatialData(UnimodalData):
+    """
+    Class to implement data structure to
+    manipulate spatial data with the spatial image (img) field
+    This class extends UnimodalData with additional
+    functions specific to the img field
+    """
+
+    def __init__(
+        self,
+        barcode_metadata: Optional[Union[dict, pd.DataFrame]] = None,
+        feature_metadata: Optional[Union[dict, pd.DataFrame]] = None,
+        matrices: Optional[Dict[str, csr_matrix]] = None,
+        metadata: Optional[dict] = None,
+        barcode_multiarrays: Optional[Dict[str, np.ndarray]] = None,
+        feature_multiarrays: Optional[Dict[str, np.ndarray]] = None,
+        barcode_multigraphs: Optional[Dict[str, csr_matrix]] = None,
+        feature_multigraphs: Optional[Dict[str, csr_matrix]] = None,
+        cur_matrix: str = "raw.data",
+        img=None,
+    ) -> None:
+        assert metadata["modality"] == "visium"
+        super().__init__(
+            barcode_metadata,
+            feature_metadata,
+            matrices,
+            metadata,
+            barcode_multiarrays,
+            feature_multiarrays,
+            barcode_multigraphs,
+            feature_multigraphs,
+            cur_matrix,
+        )
+        self._img = img
+
+    @property
+    def img(self) -> Optional[pd.DataFrame]:
+        return self._img
+
+    @img.setter
+    def img(self, img: pd.DataFrame):
+        self._img = img
+
+    def __repr__(self) -> str:
+        repr_str = super().__repr__()
+        key = "img"
+        fstr = self._gen_repr_str_for_attrs(key)
+        if fstr != "":
+            repr_str += f"\n    {key}: {fstr}"
+        return repr_str
diff --git a/pegasusio/spatial_utils.py b/pegasusio/spatial_utils.py
@@ -0,0 +1,109 @@
+import os
+
+import pandas as pd
+
+from pegasusio import MultimodalData, SpatialData
+from .hdf5_utils import load_10x_h5_file
+import json
+from PIL import Image
+
+
+def process_spatial_metadata(df):
+    df["in_tissue"] = df["in_tissue"].apply(lambda n: True if n == 1 else False)
+    df["barcodekey"] = df["barcodekey"].map(lambda s: s.split("-")[0])
+    df.set_index("barcodekey", inplace=True)
+
+
+def is_image(filename):
+    return filename.endswith((".png", ".jpg"))
+
+
+def load_visium_folder(input_path) -> MultimodalData:
+    """
+    Method to read the visium spatial data folder
+    into MultimodalData object that contains SpatialData
+    """
+    file_list = os.listdir(input_path)
+    sample_id = input_path.split("/")[-1]
+    # Load count matrix.
+    hdf5_filename = "raw_feature_bc_matrix.h5"
+    assert hdf5_filename in file_list, "Raw count hdf5 file is missing!"
+    rna_data = load_10x_h5_file(f"{input_path}/{hdf5_filename}")
+
+    # Load spatial metadata.
+    assert ("spatial" in file_list) and (
+        os.path.isdir(f"{input_path}/spatial")
+    ), "Spatial folder is missing!"
+    tissue_pos_csv = "spatial/tissue_positions_list.csv"
+    spatial_metadata = pd.read_csv(
+        f"{input_path}/{tissue_pos_csv}",
+        names=[
+            "barcodekey",
+            "in_tissue",
+            "array_row",
+            "array_col",
+            "pxl_col_in_fullres",
+            "pxl_row_in_fullres",
+        ],
+    )
+    process_spatial_metadata(spatial_metadata)
+
+    barcode_metadata = pd.concat([rna_data.obs, spatial_metadata], axis=1)
+    feature_metadata = rna_data.var
+
+    matrices = {"raw.data": rna_data.X}
+    metadata = {"genome": rna_data.get_genome(), "modality": "visium"}
+
+    #  Store “pxl_col_in_fullres” and ”pxl_row_in_fullres” as a 2D array,
+    # which is the spatial location info of each cell in the dataset.
+    obsm = spatial_metadata[["pxl_col_in_fullres", "pxl_row_in_fullres"]]
+    barcode_multiarrays = {"spatial_coordinates": obsm.to_numpy()}
+
+    #  Store all the other spatial info of cells, i.e. “in_tissue”, “array_row”, and “array_col”
+    obs = spatial_metadata[["in_tissue", "array_row", "array_col"]]
+    barcode_metadata = obs
+
+    # Store image metadata as a Pandas DataFrame, with the following structure:
+    img = pd.DataFrame()
+    spatial_path = f"{input_path}/spatial"
+
+    with open(f"{spatial_path}/scalefactors_json.json") as fp:
+        scale_factors = json.load(fp)
+
+    arr = os.listdir(spatial_path)
+    for png in arr:
+        if not is_image(png):
+            continue
+        if "hires" in png:
+            with Image.open(f"{spatial_path}/{png}") as data:
+                data.load()
+            dict = {
+                "sample_id": sample_id,
+                "image_id": "hires",
+                "data": data,
+                "scaleFactor": scale_factors["tissue_hires_scalef"],
+            }
+            img = img.append(dict, ignore_index=True)
+        elif "lowres" in png:
+            with Image.open(f"{spatial_path}/{png}") as data:
+                data.load()
+            dict = {
+                "sample_id": sample_id,
+                "image_id": "lowres",
+                "data": data,
+                "scaleFactor": scale_factors["tissue_lowres_scalef"],
+            }
+            img = img.append(dict, ignore_index=True)
+
+    assert not img.empty, "the image data frame is empty"
+    spdata = SpatialData(
+        barcode_metadata,
+        feature_metadata,
+        matrices,
+        metadata,
+        barcode_multiarrays=barcode_multiarrays,
+        img=img,
+    )
+    data = MultimodalData(spdata)
+
+    return data