Merge pull request #17 from klarman-cell-observatory/boli

Updated
lilab-bcb · Jun 8, 2020 · cf767ec · cf767ec
2 parents aabe4e3 + de061a5
commit cf767ec
Show file tree

Hide file tree

Showing 6 changed files with 130 additions and 35 deletions.
diff --git a/pegasusio/__init__.py b/pegasusio/__init__.py
@@ -22,7 +22,7 @@
 from .vdj_data import VDJData
 from .citeseq_data import CITESeqData
 from .cyto_data import CytoData
-from .qc_utils import apply_qc_filters
+from .qc_utils import calc_qc_filters, apply_qc_filters
 from .multimodal_data import MultimodalData
 from .aggr_data import AggrData
 from .readwrite import infer_file_type, read_input, write_output

diff --git a/pegasusio/commands/AggregateMatrix.py b/pegasusio/commands/AggregateMatrix.py
@@ -19,6 +19,8 @@ class AggregateMatrix(Base):
   --attributes <attributes>                Specify a comma-separated list of outputted attributes. These attributes should be column names in the csv file.
   --default-reference <reference>          If sample count matrix is in either DGE, mtx, csv, tsv or loom format and there is no Reference column in the csv_file, use <reference> as the reference.
   --select-only-singlets                   If we have demultiplexed data, turning on this option will make pegasusio only include barcodes that are predicted as singlets.
+  --remap-singlets <remap_string>          Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
+  --subset-singlets <subset_string>        If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
   --min-genes <number>                     Only keep cells with at least <number> of genes.
   --max-genes <number>                     Only keep cells with less than <number> of genes. 
   --min-umis <number>                      Only keep cells with at least <number> of UMIs.
@@ -43,6 +45,8 @@ def execute(self):
             default_ref=self.args["--default-reference"],
             append_sample_name=not self.args["--no-append-sample-name"],
             select_singlets=self.args["--select-only-singlets"],
+            remap_string=self.args["--remap-singlets"],
+            subset_string=self.args["--subset-singlets"],
             min_genes=self.convert_to_int(self.args["--min-genes"]),
             max_genes=self.convert_to_int(self.args["--max-genes"]),
             min_umis=self.convert_to_int(self.args["--min-umis"]),
@@ -51,4 +55,3 @@ def execute(self):
             percent_mito=self.convert_to_float(self.args["--percent-mito"])
         )
         write_output(data, self.args["<output_name>"] + ".zarr.zip")
-
diff --git a/pegasusio/data_aggregation.py b/pegasusio/data_aggregation.py
@@ -62,6 +62,8 @@ def aggregate_matrices(
     default_ref: str = None,
     append_sample_name: bool = True,
     select_singlets: bool = False,
+    remap_string: str = None,
+    subset_string: str = None,
     min_genes: int = None,
     max_genes: int = None,
     min_umis: int = None,
@@ -90,6 +92,10 @@ def aggregate_matrices(
         By default, append sample_name to each channel. Turn this option off if each channel has distinct barcodes.
     select_singlets : `bool`, optional (default: False)
         If we have demultiplexed data, turning on this option will make pegasus only include barcodes that are predicted as singlets.
+    remap_string: ``str``, optional, default ``None``
+        Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
+    subset_string: ``str``, optional, default ``None``
+        If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
     min_genes: ``int``, optional, default: None
        Only keep cells with at least ``min_genes`` genes.
     max_genes: ``int``, optional, default: None
@@ -175,7 +181,7 @@ def aggregate_matrices(
         if row["Sample"] != curr_sample:
             if curr_data is not None:
                 curr_data._propogate_genome()
-                curr_data.filter_data(select_singlets = select_singlets, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
+                curr_data.filter_data(select_singlets = select_singlets, remap_string = remap_string, subset_string = subset_string, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
                 curr_data._update_barcode_metadata_info(curr_row, attributes, append_sample_name)
                 aggrData.add_data(curr_data)
             curr_data = data
@@ -188,7 +194,7 @@ def aggregate_matrices(
 
     if curr_data is not None:
         curr_data._propogate_genome()
-        curr_data.filter_data(select_singlets = select_singlets, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
+        curr_data.filter_data(select_singlets = select_singlets, remap_string = remap_string, subset_string = subset_string, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
         curr_data._update_barcode_metadata_info(curr_row, attributes, append_sample_name)
         aggrData.add_data(curr_data)
 

diff --git a/pegasusio/multimodal_data.py b/pegasusio/multimodal_data.py
@@ -233,10 +233,14 @@ def select_data(self, key: str) -> None:
         self._unidata = self.data[self._selected]
 
 
-    def current_data(self) -> str:
+    def current_key(self) -> str:
         return self._selected
 
 
+    def current_data(self) -> UnimodalData:
+        return self._unidata
+
+
     def get_data(self, key: str = None, genome: str = None, modality: str = None, keep_list: bool = False) -> Union[UnimodalData, List[UnimodalData]]:
         """ get UnimodalData or list of UnimodalData based on MultimodalData key or genome or modality; accept negation '~' before genome or modality
             keep_list = True will return a list even with one data point.
@@ -291,34 +295,75 @@ def drop_data(self, key: str) -> UnimodalData:
         return self.data.pop(key)
 
 
-    def filter_data(self, select_singlets: bool = False, min_genes: int = None, max_genes: int = None, min_umis: int = None, max_umis: int = None, mito_prefix: str = None, percent_mito: float = None) -> None:
+    def filter_data(self, 
+        select_singlets: bool = False,
+        remap_string: str = None,
+        subset_string: str = None,
+        min_genes: int = None,
+        max_genes: int = None,
+        min_umis: int = None,
+        max_umis: int = None,
+        mito_prefix: str = None,
+        percent_mito: float = None,
+        focus_list: List[str] = None
+    ) -> None:
         """
-        Filter all UnimodalData with modality == "rna". For all other modalities, select barcodes as Union of the selected "rna" barcodes.
+        Filter each "rna" modality UnimodalData with key in union list separately. Then for all other modalities, select only the union of selected barcodes in focus_list as barcodes.
+        If focus_list is None and self._selected's modality is "rna", focus_list = [self._selected]
 
         Parameters
         ----------
         select_singlets: ``bool``, optional, default ``False``
             If select only singlets.
+        remap_string: ``str``, optional, default ``None``
+            Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
+        subset_string: ``str``, optional, default ``None``
+            If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
         min_genes: ``int``, optional, default: None
-           Only keep cells with at least ``min_genes`` genes.
+            Only keep cells with at least ``min_genes`` genes.
         max_genes: ``int``, optional, default: None
-           Only keep cells with less than ``max_genes`` genes.
+            Only keep cells with less than ``max_genes`` genes.
         min_umis: ``int``, optional, default: None
-           Only keep cells with at least ``min_umis`` UMIs.
+            Only keep cells with at least ``min_umis`` UMIs.
         max_umis: ``int``, optional, default: None
-           Only keep cells with less than ``max_umis`` UMIs.
+            Only keep cells with less than ``max_umis`` UMIs.
         mito_prefix: ``str``, optional, default: None
-           Prefix for mitochondrial genes.
+            Prefix for mitochondrial genes. For example, GRCh38:MT-,mm10:mt-.
         percent_mito: ``float``, optional, default: None
-           Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts. Only when both mito_prefix and percent_mito set, the mitochondrial filter will be triggered.
+            Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts. Only when both mito_prefix and percent_mito set, the mitochondrial filter will be triggered.
+        focus_list: ``List[str]``, optional, default None
+            Filter each UnimodalData with key in focus_list separately using the filtration criteria.
         """
         selected_barcodes = None
-        for unidata in self.get_data(modality = "rna", keep_list = True):
-            apply_qc_filters(unidata, select_singlets = select_singlets, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
-            selected_barcodes = unidata.obs_names if selected_barcodes is None else selected_barcodes.union(unidata.obs_names)
-
-        if selected_barcodes is not None:
-            for unidata in self.get_data(modality = "~rna", keep_list = True):
+
+        if focus_list is None:
+            focus_list = [self._selected]
+        focus_set = set(focus_list)
+
+        mito_dict = {}
+        default_mito = None
+        if mito_prefix is not None:
+            fields = mito_prefix.split(',')
+            if len(fields) == 1 and fields[0].find(':') < 0:
+                default_mito = fields[0]
+            else:
+                for field in fields:
+                    genome, mito_pref = field.split(':')
+                    mito_dict[genome] = mito_pref
+
+        unselected = []
+        for key, unidata in self.data.items():
+            if (key in focus_set) and (unidata.get_modality() == "rna"):
+                mito_prefix = mito_dict.get(unidata.get_genome(), default_mito)
+                if "passed_qc" not in unidata.obs:
+                    calc_qc_filters(unidata, select_singlets = select_singlets, remap_string = remap_string, subset_string = subset_string, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
+                apply_qc_filters(unidata)
+                selected_barcodes = unidata.obs_names if selected_barcodes is None else selected_barcodes.union(unidata.obs_names)
+            else:
+                unselected.append(unidata)
+
+        if (selected_barcodes is not None) and len(unselected) > 0:
+            for unidata in unselected:
                 selected = unidata.obs_names.isin(selected_barcodes)
                 prior_n = unidata.shape[0]
                 unidata._inplace_subset_obs(selected)

diff --git a/pegasusio/qc_utils.py b/pegasusio/qc_utils.py
@@ -7,24 +7,30 @@
 
 
 
-def apply_qc_filters(
+def calc_qc_filters(
     unidata: UnimodalData,
     select_singlets: bool = False,
+    remap_string: str = None,
+    subset_string: str = None,
     min_genes: int = None,
     max_genes: int = None,
     min_umis: int = None,
     max_umis: int = None,
     mito_prefix: str = None,
     percent_mito: float = None
 ) -> None:
-    """Generate Quality Control (QC) metrics and filter dataset based on the QCs.
+    """Calculate Quality Control (QC) metrics and mark barcodes based on the combination of QC metrics.
 
     Parameters
     ----------
-    data: ``UnimodalData``
+    unidata: ``UnimodalData``
        Unimodal data matrix with rows for cells and columns for genes.
     select_singlets: ``bool``, optional, default ``False``
         If select only singlets.
+    remap_string: ``str``, optional, default ``None``
+        Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
+    subset_string: ``str``, optional, default ``None``
+        If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
     min_genes: ``int``, optional, default: None
        Only keep cells with at least ``min_genes`` genes.
     max_genes: ``int``, optional, default: None
@@ -47,19 +53,45 @@ def apply_qc_filters(
         * ``n_genes``: Total number of genes for each cell.
         * ``n_counts``: Total number of counts for each cell.
         * ``percent_mito``: Percent of mitochondrial genes for each cell.
+        * ``passed_qc``: Boolean type indicating if a cell passes the QC process based on the QC metrics.
         * ``demux_type``: this column might be deleted if select_singlets is on.
 
     Examples
     --------
-    >>> apply_qc_filters(unidata, min_umis = 500, select_singlets = True)
+    >>> calc_qc_filters(unidata, min_umis = 500, select_singlets = True)
     """
     assert unidata.uns["modality"] == "rna"
 
     filters = []
 
     if select_singlets and ("demux_type" in unidata.obs):
-        filters.append(unidata.obs["demux_type"] == "singlet")
-        unidata.obs.drop(columns="demux_type", inplace=True)
+        if remap_string is not None:
+            if "assignment" not in unidata.obs:
+                raise ValueError("No assignment field detected!")
+            unidata.obs["assignment.orig"] = unidata.obs["assignment"]
+
+            remap = {}
+            tokens = remap_string.split(";")
+            for token in tokens:
+                new_key, old_str = token.split(":")
+                old_keys = old_str.split(",")
+                for key in old_keys:
+                    remap[key] = new_key
+
+            unidata.obs["assignment"] = pd.Categorical(unidata.obs["assignment"].apply(lambda x: remap[x] if x in remap else x))
+            logger.info("Singlets are remapped.")
+
+
+        if subset_string is None:
+            filters.append(unidata.obs["demux_type"] == "singlet")
+        else:
+            if "assignment" not in data.obs:
+                raise ValueError("No assignment field detected!")
+
+            subset = np.array(subset_string.split(","))
+            filters.append(np.isin(data.obs["assignment"], subset))
+
+        unidata.uns["__del_demux_type"] = True
 
     min_cond = min_genes is not None
     max_cond = max_genes is not None
@@ -80,15 +112,7 @@ def apply_qc_filters(
         if max_cond:
             filters.append(unidata.obs["n_counts"] < max_umis)
         if calc_mito:
-            mito_prefixes = mito_prefix.split(",")
-
-            def _startswith(name):
-                for prefix in mito_prefixes:
-                    if name.startswith(prefix):
-                        return True
-                return False
-
-            mito_genes = unidata.var_names.map(_startswith).values.nonzero()[0]
+            mito_genes = unidata.var_names.map(lambda x: x.startswith(mito_prefix)).values.nonzero()[0]
 
             unidata.obs["percent_mito"] = (
                 unidata.X[:, mito_genes].sum(axis=1).A1
@@ -99,6 +123,19 @@ def _startswith(name):
 
     if len(filters) > 0:
         selected = np.logical_and.reduce(filters)
+        unidata.obs["passed_qc"] = selected
+
+
+def apply_qc_filters(unidata: UnimodalData):
+    """ Apply QC filters to filter out low quality cells """
+    if "passed_qc" in unidata.obs:
         prior_n = unidata.shape[0]
-        unidata._inplace_subset_obs(selected)
+        unidata._inplace_subset_obs(unidata.obs["passed_qc"])
+
+        cols = ["passed_qc"]
+        if unidata.uns.get("__del_demux_type", False):
+            cols.append("demux_type")
+            del unidata.uns["__del_demux_type"]
+
+        unidata.obs.drop(columns=cols, inplace=True)
         logger.info(f"After filtration, {unidata.shape[0]} out of {prior_n} cell barcodes are kept in UnimodalData object {unidata.get_uid()}.")
diff --git a/pegasusio/unimodal_data.py b/pegasusio/unimodal_data.py
@@ -312,6 +312,8 @@ def _inplace_subset_obs(self, index: List[bool]) -> None:
         """ Subset barcode_metadata inplace """
         if isinstance(index, pd.Series):
             index = index.values
+        if index.sum() == self._shape[0]:
+            return None
         self.barcode_metadata = self.barcode_metadata.loc[index].copy(deep = False)
         for key in list(self.matrices):
             self.matrices[key] = self.matrices[key][index, :]
@@ -328,6 +330,8 @@ def _inplace_subset_var(self, index: List[bool]) -> None:
         """ Subset feature_metadata inplace """
         if isinstance(index, pd.Series):
             index = index.values
+        if index.sum() == self._shape[1]:
+            return None
         self.feature_metadata = self.feature_metadata.loc[index].copy(deep = False)
         for key in list(self.matrices):
             self.matrices[key] = self.matrices[key][:, index]