Merge pull request #84 from klarman-cell-observatory/yiming

Improve 10X Visium data loading
lilab-bcb · Jan 9, 2022 · 6c82c8c · 6c82c8c
2 parents 8e2e597 + 2331c10
commit 6c82c8c
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 21 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ ext_modules/*.so
 __pycache__
 build/
 dist/
+.eggs/
 pegasusio.egg-info/
 pegasusio/cylib/*.so
 .ipynb_checkpoints

diff --git a/pegasusio/spatial_data.py b/pegasusio/spatial_data.py
@@ -29,7 +29,7 @@ def __init__(
         barcode_multigraphs: Optional[Dict[str, csr_matrix]] = None,
         feature_multigraphs: Optional[Dict[str, csr_matrix]] = None,
         cur_matrix: str = "X",
-        img: Optional[pd.DataFrame] = None,
+        image_metadata: Optional[pd.DataFrame] = None,
     ) -> None:
         assert metadata["modality"] == "visium"
         super().__init__(
@@ -43,15 +43,15 @@ def __init__(
             feature_multigraphs,
             cur_matrix,
         )
-        self._img = img
+        self.image_metadata = image_metadata
 
     @property
     def img(self) -> Optional[pd.DataFrame]:
-        return self._img
+        return self.image_metadata
 
     @img.setter
     def img(self, img: pd.DataFrame):
-        self._img = img
+        self.image_metadata = img
 
     def __repr__(self) -> str:
         repr_str = super().__repr__()

diff --git a/pegasusio/spatial_utils.py b/pegasusio/spatial_utils.py
@@ -22,7 +22,7 @@ def load_visium_folder(input_path) -> MultimodalData:
     file_list = os.listdir(input_path)
     sample_id = input_path.split("/")[-1]
     # Load count matrix.
-    hdf5_filename = "raw_feature_bc_matrix.h5"
+    hdf5_filename = "filtered_feature_bc_matrix.h5"
     assert hdf5_filename in file_list, "Raw count hdf5 file is missing!"
     rna_data = load_10x_h5_file(f"{input_path}/{hdf5_filename}")
 
@@ -44,35 +44,33 @@ def load_visium_folder(input_path) -> MultimodalData:
     )
     process_spatial_metadata(spatial_metadata)
 
-    barcode_metadata = pd.concat([rna_data.obs, spatial_metadata], axis=1)
+    barcode_metadata = rna_data.obs.join(spatial_metadata, how='left')
     feature_metadata = rna_data.var
 
     matrices = {"X": rna_data.X}
     metadata = {"genome": rna_data.get_genome(), "modality": "visium"}
 
     #  Store “pxl_col_in_fullres” and “pxl_row_in_fullres” as a 2D array,
     # which is the spatial location info of each cell in the dataset.
-    obsm = spatial_metadata[["pxl_col_in_fullres", "pxl_row_in_fullres"]]
-    barcode_multiarrays = {"spatial_coordinates": obsm.to_numpy()}
-
-    #  Store all the other spatial info of cells, i.e. “in_tissue”, “array_row”, and “array_col”
-    obs = spatial_metadata[["in_tissue", "array_row", "array_col"]]
-    barcode_metadata = obs
+    spatial_coords = barcode_metadata[['pxl_row_in_fullres', 'pxl_col_in_fullres']]
+    barcode_multiarrays = {"X_spatial": spatial_coords.to_numpy()}
+    barcode_metadata.drop(columns=['pxl_row_in_fullres', 'pxl_col_in_fullres'], inplace=True)
 
     # Store image metadata as a Pandas DataFrame, with the following structure:
-    img = pd.DataFrame()
+    image_metadata = pd.DataFrame()
     spatial_path = f"{input_path}/spatial"
 
     with open(f"{spatial_path}/scalefactors_json.json") as fp:
         scale_factors = json.load(fp)
 
-    def get_image_data(filepath, sample_id, image_id, scaleFactor):
+    def get_image_data(filepath, sample_id, image_id, scaleFactor, spot_diameter_fullres):
         data = Image.open(filepath)
         dict = {
             "sample_id": sample_id,
             "image_id": image_id,
             "data": data,
-            "scaleFactor": scaleFactor,
+            "scale_factor": scaleFactor,
+            "spot_diameter": spot_diameter_fullres * scaleFactor,
         }
         return dict
 
@@ -84,18 +82,19 @@ def get_image_data(filepath, sample_id, image_id, scaleFactor):
                 filepath,
                 sample_id,
                 res_tag,
-                scale_factors[f"tissue_{res_tag}_scalef"]
+                scale_factors[f"tissue_{res_tag}_scalef"],
+                scale_factors["spot_diameter_fullres"]
             )
-            img = img.append(image_item, ignore_index=True)
+            image_metadata = image_metadata.append(image_item, ignore_index=True)
 
-    assert not img.empty, "the image data frame is empty"
+    assert not image_metadata.empty, "the image data frame is empty"
     spdata = SpatialData(
         barcode_metadata,
         feature_metadata,
         matrices,
         metadata,
         barcode_multiarrays=barcode_multiarrays,
-        img=img,
+        image_metadata=image_metadata,
     )
     data = MultimodalData(spdata)
 

diff --git a/pegasusio/zarr_utils.py b/pegasusio/zarr_utils.py
@@ -254,7 +254,7 @@ def read_unimodal_data(self, group: zarr.Group) -> UnimodalData:
             else dict(),
         )
         if isinstance (unidata, SpatialData):
-            unidata.img = self.read_dataframe(group["img"]) if "img" in group else dict()
+            unidata.image_metadata = self.read_dataframe(group["image_metadata"]) if "image_metadata" in group else dict()
 
         if group.attrs.get("_cur_matrix", None) is not None:
             unidata.select_matrix(group.attrs["_cur_matrix"])
@@ -443,7 +443,7 @@ def write_unimodal_data(self, parent: zarr.Group, name: str, data: UnimodalData,
         self.write_dataframe(group, 'feature_metadata', data.feature_metadata)
 
         if hasattr(data, 'img'):
-            self.write_dataframe(group, 'img', data.img)
+            self.write_dataframe(group, 'image_metadata', data.image_metadata)
 
         if overwrite or data.matrices.is_dirty():
             self.write_mapping(group, 'matrices', data.matrices, overwrite = overwrite)