Skip to content

Commit

Permalink
Merge pull request #17 from klarman-cell-observatory/boli
Browse files Browse the repository at this point in the history
Updated
  • Loading branch information
bli25 committed Jun 8, 2020
2 parents aabe4e3 + de061a5 commit cf767ec
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 35 deletions.
2 changes: 1 addition & 1 deletion pegasusio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from .vdj_data import VDJData
from .citeseq_data import CITESeqData
from .cyto_data import CytoData
from .qc_utils import apply_qc_filters
from .qc_utils import calc_qc_filters, apply_qc_filters
from .multimodal_data import MultimodalData
from .aggr_data import AggrData
from .readwrite import infer_file_type, read_input, write_output
Expand Down
5 changes: 4 additions & 1 deletion pegasusio/commands/AggregateMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class AggregateMatrix(Base):
--attributes <attributes> Specify a comma-separated list of outputted attributes. These attributes should be column names in the csv file.
--default-reference <reference> If sample count matrix is in either DGE, mtx, csv, tsv or loom format and there is no Reference column in the csv_file, use <reference> as the reference.
--select-only-singlets If we have demultiplexed data, turning on this option will make pegasusio only include barcodes that are predicted as singlets.
--remap-singlets <remap_string> Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
--subset-singlets <subset_string> If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
--min-genes <number> Only keep cells with at least <number> of genes.
--max-genes <number> Only keep cells with less than <number> of genes.
--min-umis <number> Only keep cells with at least <number> of UMIs.
Expand All @@ -43,6 +45,8 @@ def execute(self):
default_ref=self.args["--default-reference"],
append_sample_name=not self.args["--no-append-sample-name"],
select_singlets=self.args["--select-only-singlets"],
remap_string=self.args["--remap-singlets"],
subset_string=self.args["--subset-singlets"],
min_genes=self.convert_to_int(self.args["--min-genes"]),
max_genes=self.convert_to_int(self.args["--max-genes"]),
min_umis=self.convert_to_int(self.args["--min-umis"]),
Expand All @@ -51,4 +55,3 @@ def execute(self):
percent_mito=self.convert_to_float(self.args["--percent-mito"])
)
write_output(data, self.args["<output_name>"] + ".zarr.zip")

10 changes: 8 additions & 2 deletions pegasusio/data_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ def aggregate_matrices(
default_ref: str = None,
append_sample_name: bool = True,
select_singlets: bool = False,
remap_string: str = None,
subset_string: str = None,
min_genes: int = None,
max_genes: int = None,
min_umis: int = None,
Expand Down Expand Up @@ -90,6 +92,10 @@ def aggregate_matrices(
By default, append sample_name to each channel. Turn this option off if each channel has distinct barcodes.
select_singlets : `bool`, optional (default: False)
If we have demultiplexed data, turning on this option will make pegasus only include barcodes that are predicted as singlets.
remap_string: ``str``, optional, default ``None``
Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
subset_string: ``str``, optional, default ``None``
If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
min_genes: ``int``, optional, default: None
Only keep cells with at least ``min_genes`` genes.
max_genes: ``int``, optional, default: None
Expand Down Expand Up @@ -175,7 +181,7 @@ def aggregate_matrices(
if row["Sample"] != curr_sample:
if curr_data is not None:
curr_data._propogate_genome()
curr_data.filter_data(select_singlets = select_singlets, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
curr_data.filter_data(select_singlets = select_singlets, remap_string = remap_string, subset_string = subset_string, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
curr_data._update_barcode_metadata_info(curr_row, attributes, append_sample_name)
aggrData.add_data(curr_data)
curr_data = data
Expand All @@ -188,7 +194,7 @@ def aggregate_matrices(

if curr_data is not None:
curr_data._propogate_genome()
curr_data.filter_data(select_singlets = select_singlets, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
curr_data.filter_data(select_singlets = select_singlets, remap_string = remap_string, subset_string = subset_string, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
curr_data._update_barcode_metadata_info(curr_row, attributes, append_sample_name)
aggrData.add_data(curr_data)

Expand Down
75 changes: 60 additions & 15 deletions pegasusio/multimodal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,14 @@ def select_data(self, key: str) -> None:
self._unidata = self.data[self._selected]


def current_data(self) -> str:
def current_key(self) -> str:
return self._selected


def current_data(self) -> UnimodalData:
return self._unidata


def get_data(self, key: str = None, genome: str = None, modality: str = None, keep_list: bool = False) -> Union[UnimodalData, List[UnimodalData]]:
""" get UnimodalData or list of UnimodalData based on MultimodalData key or genome or modality; accept negation '~' before genome or modality
keep_list = True will return a list even with one data point.
Expand Down Expand Up @@ -291,34 +295,75 @@ def drop_data(self, key: str) -> UnimodalData:
return self.data.pop(key)


def filter_data(self, select_singlets: bool = False, min_genes: int = None, max_genes: int = None, min_umis: int = None, max_umis: int = None, mito_prefix: str = None, percent_mito: float = None) -> None:
def filter_data(self,
select_singlets: bool = False,
remap_string: str = None,
subset_string: str = None,
min_genes: int = None,
max_genes: int = None,
min_umis: int = None,
max_umis: int = None,
mito_prefix: str = None,
percent_mito: float = None,
focus_list: List[str] = None
) -> None:
"""
Filter all UnimodalData with modality == "rna". For all other modalities, select barcodes as Union of the selected "rna" barcodes.
Filter each "rna" modality UnimodalData with key in union list separately. Then for all other modalities, select only the union of selected barcodes in focus_list as barcodes.
If focus_list is None and self._selected's modality is "rna", focus_list = [self._selected]
Parameters
----------
select_singlets: ``bool``, optional, default ``False``
If select only singlets.
remap_string: ``str``, optional, default ``None``
Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
subset_string: ``str``, optional, default ``None``
If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
min_genes: ``int``, optional, default: None
Only keep cells with at least ``min_genes`` genes.
Only keep cells with at least ``min_genes`` genes.
max_genes: ``int``, optional, default: None
Only keep cells with less than ``max_genes`` genes.
Only keep cells with less than ``max_genes`` genes.
min_umis: ``int``, optional, default: None
Only keep cells with at least ``min_umis`` UMIs.
Only keep cells with at least ``min_umis`` UMIs.
max_umis: ``int``, optional, default: None
Only keep cells with less than ``max_umis`` UMIs.
Only keep cells with less than ``max_umis`` UMIs.
mito_prefix: ``str``, optional, default: None
Prefix for mitochondrial genes.
Prefix for mitochondrial genes. For example, GRCh38:MT-,mm10:mt-.
percent_mito: ``float``, optional, default: None
Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts. Only when both mito_prefix and percent_mito set, the mitochondrial filter will be triggered.
Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts. Only when both mito_prefix and percent_mito set, the mitochondrial filter will be triggered.
focus_list: ``List[str]``, optional, default None
Filter each UnimodalData with key in focus_list separately using the filtration criteria.
"""
selected_barcodes = None
for unidata in self.get_data(modality = "rna", keep_list = True):
apply_qc_filters(unidata, select_singlets = select_singlets, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
selected_barcodes = unidata.obs_names if selected_barcodes is None else selected_barcodes.union(unidata.obs_names)

if selected_barcodes is not None:
for unidata in self.get_data(modality = "~rna", keep_list = True):

if focus_list is None:
focus_list = [self._selected]
focus_set = set(focus_list)

mito_dict = {}
default_mito = None
if mito_prefix is not None:
fields = mito_prefix.split(',')
if len(fields) == 1 and fields[0].find(':') < 0:
default_mito = fields[0]
else:
for field in fields:
genome, mito_pref = field.split(':')
mito_dict[genome] = mito_pref

unselected = []
for key, unidata in self.data.items():
if (key in focus_set) and (unidata.get_modality() == "rna"):
mito_prefix = mito_dict.get(unidata.get_genome(), default_mito)
if "passed_qc" not in unidata.obs:
calc_qc_filters(unidata, select_singlets = select_singlets, remap_string = remap_string, subset_string = subset_string, min_genes = min_genes, max_genes = max_genes, min_umis = min_umis, max_umis = max_umis, mito_prefix = mito_prefix, percent_mito = percent_mito)
apply_qc_filters(unidata)
selected_barcodes = unidata.obs_names if selected_barcodes is None else selected_barcodes.union(unidata.obs_names)
else:
unselected.append(unidata)

if (selected_barcodes is not None) and len(unselected) > 0:
for unidata in unselected:
selected = unidata.obs_names.isin(selected_barcodes)
prior_n = unidata.shape[0]
unidata._inplace_subset_obs(selected)
Expand Down
69 changes: 53 additions & 16 deletions pegasusio/qc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,30 @@



def apply_qc_filters(
def calc_qc_filters(
unidata: UnimodalData,
select_singlets: bool = False,
remap_string: str = None,
subset_string: str = None,
min_genes: int = None,
max_genes: int = None,
min_umis: int = None,
max_umis: int = None,
mito_prefix: str = None,
percent_mito: float = None
) -> None:
"""Generate Quality Control (QC) metrics and filter dataset based on the QCs.
"""Calculate Quality Control (QC) metrics and mark barcodes based on the combination of QC metrics.
Parameters
----------
data: ``UnimodalData``
unidata: ``UnimodalData``
Unimodal data matrix with rows for cells and columns for genes.
select_singlets: ``bool``, optional, default ``False``
If select only singlets.
remap_string: ``str``, optional, default ``None``
Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
subset_string: ``str``, optional, default ``None``
If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sampe 1 and 3 using "sample1,sample3".
min_genes: ``int``, optional, default: None
Only keep cells with at least ``min_genes`` genes.
max_genes: ``int``, optional, default: None
Expand All @@ -47,19 +53,45 @@ def apply_qc_filters(
* ``n_genes``: Total number of genes for each cell.
* ``n_counts``: Total number of counts for each cell.
* ``percent_mito``: Percent of mitochondrial genes for each cell.
* ``passed_qc``: Boolean type indicating if a cell passes the QC process based on the QC metrics.
* ``demux_type``: this column might be deleted if select_singlets is on.
Examples
--------
>>> apply_qc_filters(unidata, min_umis = 500, select_singlets = True)
>>> calc_qc_filters(unidata, min_umis = 500, select_singlets = True)
"""
assert unidata.uns["modality"] == "rna"

filters = []

if select_singlets and ("demux_type" in unidata.obs):
filters.append(unidata.obs["demux_type"] == "singlet")
unidata.obs.drop(columns="demux_type", inplace=True)
if remap_string is not None:
if "assignment" not in unidata.obs:
raise ValueError("No assignment field detected!")
unidata.obs["assignment.orig"] = unidata.obs["assignment"]

remap = {}
tokens = remap_string.split(";")
for token in tokens:
new_key, old_str = token.split(":")
old_keys = old_str.split(",")
for key in old_keys:
remap[key] = new_key

unidata.obs["assignment"] = pd.Categorical(unidata.obs["assignment"].apply(lambda x: remap[x] if x in remap else x))
logger.info("Singlets are remapped.")


if subset_string is None:
filters.append(unidata.obs["demux_type"] == "singlet")
else:
if "assignment" not in data.obs:
raise ValueError("No assignment field detected!")

subset = np.array(subset_string.split(","))
filters.append(np.isin(data.obs["assignment"], subset))

unidata.uns["__del_demux_type"] = True

min_cond = min_genes is not None
max_cond = max_genes is not None
Expand All @@ -80,15 +112,7 @@ def apply_qc_filters(
if max_cond:
filters.append(unidata.obs["n_counts"] < max_umis)
if calc_mito:
mito_prefixes = mito_prefix.split(",")

def _startswith(name):
for prefix in mito_prefixes:
if name.startswith(prefix):
return True
return False

mito_genes = unidata.var_names.map(_startswith).values.nonzero()[0]
mito_genes = unidata.var_names.map(lambda x: x.startswith(mito_prefix)).values.nonzero()[0]

unidata.obs["percent_mito"] = (
unidata.X[:, mito_genes].sum(axis=1).A1
Expand All @@ -99,6 +123,19 @@ def _startswith(name):

if len(filters) > 0:
selected = np.logical_and.reduce(filters)
unidata.obs["passed_qc"] = selected


def apply_qc_filters(unidata: UnimodalData):
""" Apply QC filters to filter out low quality cells """
if "passed_qc" in unidata.obs:
prior_n = unidata.shape[0]
unidata._inplace_subset_obs(selected)
unidata._inplace_subset_obs(unidata.obs["passed_qc"])

cols = ["passed_qc"]
if unidata.uns.get("__del_demux_type", False):
cols.append("demux_type")
del unidata.uns["__del_demux_type"]

unidata.obs.drop(columns=cols, inplace=True)
logger.info(f"After filtration, {unidata.shape[0]} out of {prior_n} cell barcodes are kept in UnimodalData object {unidata.get_uid()}.")
4 changes: 4 additions & 0 deletions pegasusio/unimodal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ def _inplace_subset_obs(self, index: List[bool]) -> None:
""" Subset barcode_metadata inplace """
if isinstance(index, pd.Series):
index = index.values
if index.sum() == self._shape[0]:
return None
self.barcode_metadata = self.barcode_metadata.loc[index].copy(deep = False)
for key in list(self.matrices):
self.matrices[key] = self.matrices[key][index, :]
Expand All @@ -328,6 +330,8 @@ def _inplace_subset_var(self, index: List[bool]) -> None:
""" Subset feature_metadata inplace """
if isinstance(index, pd.Series):
index = index.values
if index.sum() == self._shape[1]:
return None
self.feature_metadata = self.feature_metadata.loc[index].copy(deep = False)
for key in list(self.matrices):
self.matrices[key] = self.matrices[key][:, index]
Expand Down

0 comments on commit cf767ec

Please sign in to comment.