Skip to content

Commit

Permalink
Merge pull request #81 from klarman-cell-observatory/rocherr
Browse files Browse the repository at this point in the history
spatial read and write to zarr
  • Loading branch information
yihming committed Dec 21, 2021
2 parents b353dc0 + bb3e867 commit 43d7b0a
Show file tree
Hide file tree
Showing 8 changed files with 256 additions and 65 deletions.
3 changes: 2 additions & 1 deletion pegasusio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
warnings.filterwarnings("ignore", category=FutureWarning, module='anndata')


modalities = ['rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr', 'atac', 'cyto', 'nanostring']
modalities = ['rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr', 'atac', 'cyto', 'nanostring', 'visium']

from .decorators import timer, run_gc

Expand All @@ -23,6 +23,7 @@
from .citeseq_data import CITESeqData
from .cyto_data import CytoData
from .nanostring_data import NanostringData
from .spatial_data import SpatialData
from .qc_utils import calc_qc_filters, apply_qc_filters, DictWithDefault
from .multimodal_data import MultimodalData
from .aggr_data import AggrData, _fillna
Expand Down
16 changes: 13 additions & 3 deletions pegasusio/multimodal_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import gc
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from scipy.sparse import csr_matrix, hstack
from typing import List, Dict, Union, Set, Tuple, Optional

import logging
Expand Down Expand Up @@ -46,6 +46,17 @@ def update(self, data: "MultimodalData") -> None:
raise ValueError(f"Detected duplicated key '{key}'")
self.data[key] = data.data[key]

# Check if the img field is there
@property
def img(self) -> Union[pd.DataFrame, None]:
return self._unidata.img if self._unidata is not None and hasattr(self._unidata, 'img') else None

# Set the img field if needed
@img.setter
def img(self, img: pd.DataFrame):
assert self._unidata is not None
assert self._unidata.get_modality() == "visium", "data needs to be spatial"
self._unidata.img = img

@property
def obs(self) -> Union[pd.DataFrame, None]:
Expand Down Expand Up @@ -358,7 +369,6 @@ def filter_data(self,
"""
Filter each "rna" modality UnimodalData in the focus_list separately using the set filtration parameters. Then for all other UnimodalData objects, select only barcodes that are in the union of selected barcodes from previously filtered UnimodalData objects.
If focus_list is None, focus_list = [self._selected]
Parameters
----------
select_singlets: ``bool``, optional, default ``False``
Expand Down Expand Up @@ -588,4 +598,4 @@ def _clean_tmp(self) -> dict:

def _addback_tmp(self, _tmp_multi) -> None:
for key, _tmp_dict in _tmp_multi.items():
self.data[key]._addback_tmp(_tmp_dict)
self.data[key]._addback_tmp(_tmp_dict)
26 changes: 6 additions & 20 deletions pegasusio/readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,16 @@
from .vdj_utils import load_10x_vdj_file
from .cyto_utils import load_fcs_file
from .nanostring_utils import load_nanostring_files
from .spatial_utils import load_visium_folder


def infer_file_type(input_file: Union[str, List[str]]) -> Tuple[str, str, str]:
""" Infer file format from input_file name
This function infer file type by inspecting the file name.
Parameters
----------
input_file : `str` or `List[str]`
Input file name.
Returns
-------
`str`
Expand All @@ -38,7 +35,6 @@ def infer_file_type(input_file: Union[str, List[str]]) -> Tuple[str, str, str]:
The path covering all input files. Most time this is the same as input_file. But for HCA mtx and csv, this should be parent directory.
`str`
Type of the path, either 'file' or 'directory'.
Note: The last two `str`s are mainly used for transfer to cloud
"""
file_type = None
Expand Down Expand Up @@ -104,16 +100,13 @@ def read_input(
select_modality: Set[str] = None,
) -> MultimodalData:
"""Load data into memory.
This function is used to load input data into memory. Inputs can be in 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data) or 'nanostring' (Nanostring GeoMx spatial data) formats.
Parameters
----------
input_file : `str`
Input file name.
file_type : `str`, optional (default: None)
File type, choosing from 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data) or 'nanostring'. If None, inferred from input_file.
File type, choosing from 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data), 'nanostring' or 'visium'. If None, inferred from input_file.
mode: `str`, optional (default: 'r')
File open mode, options are 'r' or 'a'. If mode == 'a', file_type must be zarr and ngene/select_singlets cannot be set.
genome : `str`, optional (default: None)
Expand All @@ -128,19 +121,15 @@ def read_input(
Only select data with genomes in select_genome. Select_data, select_genome and select_modality are mutually exclusive.
select_modality: `Set[str]`, optional (default: None)
Only select data with modalities in select_modality. Select_data, select_genome and select_modality are mutually exclusive.
Returns
-------
A MultimodalData object.
Examples
--------
>>> data = io.read_input('example_10x.h5')
>>> data = io.read_input('example.h5ad')
>>> data = io.read_input('example_ADT.csv', genome = 'hashing_HTO', modality = 'hashing')
"""

if is_list_like(input_file):
input_file = [os.path.expanduser(os.path.expandvars(x)) for x in input_file]
else:
Expand Down Expand Up @@ -171,6 +160,8 @@ def read_input(
segment_file = input_file[1]
annotation_file = input_file[2] if len(input_file) > 2 else None
data = load_nanostring_files(input_matrix, segment_file, annotation_file = annotation_file, genome = genome)
elif file_type == 'visium':
data = load_visium_folder(input_file)
elif file_type == "mtx":
data = load_mtx_file(input_file, genome = genome, modality = modality)
else:
Expand Down Expand Up @@ -198,27 +189,22 @@ def write_output(
precision: int = 2
) -> None:
""" Write data back to disk.
This function is used to write data back to disk.
Parameters
----------
data : MutimodalData
data to write back.
output_file : `str`
output file name. Note that for mtx files, output_file specifies a directory. For scp format, file_type must be specified.
output file name. Note that for mtx files, output_file specifies a directory. For scp format, file_type must be specified.
file_type : `str`, optional (default: None)
File type can be 'zarr' (as folder), 'zarr.zip' (as a ZIP file), 'h5ad', 'loom', 'mtx' or 'scp'. If file_type is None, it will be inferred based on output_file.
is_sparse : `bool`, optional (default: True)
Only used for writing out SCP-compatible files, if write expression as a sparse matrix.
precision : `int`, optional (default: 2)
Precision after decimal point for values in mtx and scp expression matrix.
Returns
-------
`None`
Examples
--------
>>> io.write_output(data, 'test.zarr')
Expand Down Expand Up @@ -262,4 +248,4 @@ def _infer_output_file_type(output_File: str) -> str:
write_scp_file(data, output_file, is_sparse = is_sparse, precision = precision)

data._addback_tmp(_tmp_multi)
logger.info(f"{file_type} file '{output_file}' is written.")
logger.info(f"{file_type} file '{output_file}' is written.")
62 changes: 62 additions & 0 deletions pegasusio/spatial_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from typing import Dict, Optional, Union

import logging

from pegasusio.unimodal_data import UnimodalData

logger = logging.getLogger(__name__)


class SpatialData(UnimodalData):
"""
Class to implement data structure to
manipulate spatial data with the spatial image (img) field
This class extends UnimodalData with additional
functions specific to the img field
"""

def __init__(
self,
barcode_metadata: Optional[Union[dict, pd.DataFrame]] = None,
feature_metadata: Optional[Union[dict, pd.DataFrame]] = None,
matrices: Optional[Dict[str, csr_matrix]] = None,
metadata: Optional[dict] = None,
barcode_multiarrays: Optional[Dict[str, np.ndarray]] = None,
feature_multiarrays: Optional[Dict[str, np.ndarray]] = None,
barcode_multigraphs: Optional[Dict[str, csr_matrix]] = None,
feature_multigraphs: Optional[Dict[str, csr_matrix]] = None,
cur_matrix: str = "raw.data",
img=None,
) -> None:
assert metadata["modality"] == "visium"
super().__init__(
barcode_metadata,
feature_metadata,
matrices,
metadata,
barcode_multiarrays,
feature_multiarrays,
barcode_multigraphs,
feature_multigraphs,
cur_matrix,
)
self._img = img

@property
def img(self) -> Optional[pd.DataFrame]:
return self._img

@img.setter
def img(self, img: pd.DataFrame):
self._img = img

def __repr__(self) -> str:
repr_str = super().__repr__()
key = "img"
fstr = self._gen_repr_str_for_attrs(key)
if fstr != "":
repr_str += f"\n {key}: {fstr}"
return repr_str
109 changes: 109 additions & 0 deletions pegasusio/spatial_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import os

import pandas as pd

from pegasusio import MultimodalData, SpatialData
from .hdf5_utils import load_10x_h5_file
import json
from PIL import Image


def process_spatial_metadata(df):
df["in_tissue"] = df["in_tissue"].apply(lambda n: True if n == 1 else False)
df["barcodekey"] = df["barcodekey"].map(lambda s: s.split("-")[0])
df.set_index("barcodekey", inplace=True)


def is_image(filename):
return filename.endswith((".png", ".jpg"))


def load_visium_folder(input_path) -> MultimodalData:
"""
Method to read the visium spatial data folder
into MultimodalData object that contains SpatialData
"""
file_list = os.listdir(input_path)
sample_id = input_path.split("/")[-1]
# Load count matrix.
hdf5_filename = "raw_feature_bc_matrix.h5"
assert hdf5_filename in file_list, "Raw count hdf5 file is missing!"
rna_data = load_10x_h5_file(f"{input_path}/{hdf5_filename}")

# Load spatial metadata.
assert ("spatial" in file_list) and (
os.path.isdir(f"{input_path}/spatial")
), "Spatial folder is missing!"
tissue_pos_csv = "spatial/tissue_positions_list.csv"
spatial_metadata = pd.read_csv(
f"{input_path}/{tissue_pos_csv}",
names=[
"barcodekey",
"in_tissue",
"array_row",
"array_col",
"pxl_col_in_fullres",
"pxl_row_in_fullres",
],
)
process_spatial_metadata(spatial_metadata)

barcode_metadata = pd.concat([rna_data.obs, spatial_metadata], axis=1)
feature_metadata = rna_data.var

matrices = {"raw.data": rna_data.X}
metadata = {"genome": rna_data.get_genome(), "modality": "visium"}

# Store “pxl_col_in_fullres” and ”pxl_row_in_fullres” as a 2D array,
# which is the spatial location info of each cell in the dataset.
obsm = spatial_metadata[["pxl_col_in_fullres", "pxl_row_in_fullres"]]
barcode_multiarrays = {"spatial_coordinates": obsm.to_numpy()}

# Store all the other spatial info of cells, i.e. “in_tissue”, “array_row”, and “array_col”
obs = spatial_metadata[["in_tissue", "array_row", "array_col"]]
barcode_metadata = obs

# Store image metadata as a Pandas DataFrame, with the following structure:
img = pd.DataFrame()
spatial_path = f"{input_path}/spatial"

with open(f"{spatial_path}/scalefactors_json.json") as fp:
scale_factors = json.load(fp)

arr = os.listdir(spatial_path)
for png in arr:
if not is_image(png):
continue
if "hires" in png:
with Image.open(f"{spatial_path}/{png}") as data:
data.load()
dict = {
"sample_id": sample_id,
"image_id": "hires",
"data": data,
"scaleFactor": scale_factors["tissue_hires_scalef"],
}
img = img.append(dict, ignore_index=True)
elif "lowres" in png:
with Image.open(f"{spatial_path}/{png}") as data:
data.load()
dict = {
"sample_id": sample_id,
"image_id": "lowres",
"data": data,
"scaleFactor": scale_factors["tissue_lowres_scalef"],
}
img = img.append(dict, ignore_index=True)

assert not img.empty, "the image data frame is empty"
spdata = SpatialData(
barcode_metadata,
feature_metadata,
matrices,
metadata,
barcode_multiarrays=barcode_multiarrays,
img=img,
)
data = MultimodalData(spdata)

return data

0 comments on commit 43d7b0a

Please sign in to comment.