From 0db82c8f2bf4b09b6b954151db79da1d23a1d8cc Mon Sep 17 00:00:00 2001 From: Bo Li Date: Mon, 16 May 2022 15:33:56 -0400 Subject: [PATCH 1/5] Added ways to infer the number of PCs --- pegasus/__init__.py | 2 + pegasus/plotting/__init__.py | 1 + pegasus/plotting/plot_library.py | 68 +++++++++++++++++++++++++++++- pegasus/tools/__init__.py | 1 + pegasus/tools/doublet_detection.py | 4 +- pegasus/tools/nearest_neighbors.py | 10 ++++- pegasus/tools/utils.py | 46 +++++++++++++++++++- 7 files changed, 125 insertions(+), 7 deletions(-) diff --git a/pegasus/__init__.py b/pegasus/__init__.py index a8cdd5a5..b1339753 100644 --- a/pegasus/__init__.py +++ b/pegasus/__init__.py @@ -75,6 +75,7 @@ run_scvi, train_scarches_scanvi, predict_scarches_scanvi, + largest_variance_from_random_matrix, ) from .annotate_cluster import infer_cell_types, annotate, infer_cluster_names from .misc import search_genes, search_de_genes, find_outlier_clusters @@ -94,6 +95,7 @@ ridgeplot, wordcloud, plot_gsea, + elbowplot, ) from . import pseudo diff --git a/pegasus/plotting/__init__.py b/pegasus/plotting/__init__.py index 07e755f8..43e53ff5 100644 --- a/pegasus/plotting/__init__.py +++ b/pegasus/plotting/__init__.py @@ -14,4 +14,5 @@ ridgeplot, wordcloud, plot_gsea, + elbowplot, ) diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py index 7b0431b6..82aa7331 100644 --- a/pegasus/plotting/plot_library.py +++ b/pegasus/plotting/plot_library.py @@ -17,7 +17,7 @@ import logging logger = logging.getLogger(__name__) -from pegasus.tools import X_from_rep, slicing +from pegasus.tools import X_from_rep, slicing, largest_variance_from_random_matrix from .plot_utils import ( _transform_basis, _get_nrows_and_ncols, @@ -44,6 +44,7 @@ def scatter( fix_corners: Optional[bool] = True, alpha: Optional[Union[float, List[float]]] = 1.0, legend_loc: Optional[Union[str, List[str]]] = "right margin", + legend_fontsize: Optional[Union[int, List[int]]] = 10, legend_ncol: Optional[str] = None, palettes: Optional[Union[str, List[str]]] = None, cmaps: Optional[Union[str, List[str]]] = "YlOrRd", @@ -85,6 +86,8 @@ def scatter( Alpha value for blending, from 0.0 (transparent) to 1.0 (opaque). If this is a list, the length must match attrs, which means we set a separate alpha value for each attribute. legend_loc: ``str`` or ``List[str]``, optional, default: ``right margin`` Legend location. Can be either "right margin" or "on data". If a list is provided, set 'legend_loc' for each attribute in 'attrs' separately. + legend_fontsize: ``int`` or ``List[int]``, optional, default: ``10`` + Legend fontsize. If a list is provided, set 'legend_fontsize' for each attribute in 'attrs' separately. legend_ncol: ``str``, optional, default: None Only applicable if legend_loc == "right margin". Set number of columns used to show legends. 
palettes: ``str`` or ``List[str]``, optional, default: None
@@ -164,7 +167,9 @@ def scatter(
 
     if not is_list_like(legend_loc):
         legend_loc = [legend_loc] * nattrs
-    legend_fontsize = [5 if x == "on data" else 10 for x in legend_loc]
+
+    if not is_list_like(legend_fontsize):
+        legend_fontsize = [legend_fontsize] * nattrs
 
     palettes = DictWithDefault(palettes)
     cmaps = DictWithDefault(cmaps)
@@ -2177,3 +2182,62 @@ def plot_gsea(
         axes[1].set_xlabel('-log10(q-value)')
 
     return fig if return_fig else None
+
+
+def elbowplot(
+    data: Union[MultimodalData, UnimodalData],
+    rep: str = "pca",
+    pval: str = "0.05",
+    panel_size: Optional[Tuple[float, float]] = (6, 4),
+    return_fig: Optional[bool] = False,
+    dpi: Optional[float] = 300.0,
+    **kwargs,
+) -> Union[plt.Figure, None]:
+    """Generate an elbow plot and suggest the number of components to select, based on random matrix theory (see utils.largest_variance_from_random_matrix).
+
+    Parameters
+    ----------
+
+    data : ``UnimodalData`` or ``MultimodalData`` object.
+        The main data object.
+    rep: ``str``, optional, default: ``pca``
+        Representation to consider, either "pca" or "tsvd".
+    pval: ``str``, optional (default: "0.05").
+        P value cutoff on the null distribution (random matrix), choosing from "0.01" and "0.05".
+    panel_size: `tuple`, optional (default: `(6, 4)`)
+        The plot size (width, height) in inches.
+    return_fig: ``bool``, optional, default: ``False``
+        Return a ``Figure`` object if ``True``; return ``None`` otherwise.
+    dpi: ``float``, optional, default: ``300.0``
+        The resolution in dots per inch.
+
+    Returns
+    -------
+
+    `Figure` object
+        A ``matplotlib.figure.Figure`` object containing the elbow plot if ``return_fig == True``.
+
+    Update ``data.uns``:
+
+        * ``{rep}_ncomps``: Recommended number of components to pick.
+
+    Examples
+    --------
+    >>> fig = pg.elbowplot(data, dpi = 500)
+    """
+    assert rep in data.uns
+    thre = largest_variance_from_random_matrix(data.shape[0], data.var[data.uns[f"{rep}_features"]].sum(), pval)
+    ncomps = (data.uns[rep]["variance"] > thre).sum()
+    data.uns[f"{rep}_ncomps"] = ncomps
+    logger.info(f"Selecting {ncomps} components is recommended!")
+
+    fig, ax = _get_subplot_layouts(panel_size=panel_size, dpi=dpi)
+    ax.scatter(range(1, data.uns[rep]["variance"].size + 1), data.uns[rep]["variance"], s=8, c='k')
+    ax.set_yscale('log')
+    ax.set_xlabel(rep.upper())
+    ax.set_ylabel("Variance")
+    ax.axvline(x = ncomps + 0.5, ls = "--", c = "r", linewidth=1)
+
+    return fig if return_fig else None
diff --git a/pegasus/tools/__init__.py b/pegasus/tools/__init__.py
index 3680fe54..f362ebab 100644
--- a/pegasus/tools/__init__.py
+++ b/pegasus/tools/__init__.py
@@ -15,6 +15,7 @@
     predefined_signatures,
     predefined_pathways,
     load_signatures_from_file,
+    largest_variance_from_random_matrix,
 )
 
 from .preprocessing import (
diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index 6de5dbde..f463afb5 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -518,7 +518,7 @@ def infer_doublets(
         Random seed for reproducing results.
 
     plot_hist: ``str``, optional, default: ``sample``
-        If not None, plot diagnostic histograms using ``plot_hist`` as the prefix. If `channel_attr` is None, ``plot_hist.dbl.png`` is generated; Otherwise, ``plot_hist.channel_name.dbl.png`` files are generated. Each figure consists of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets.
+        If not None, plot diagnostic histograms using ``plot_hist`` as the prefix. If `channel_attr` is None, ``plot_hist.dbl.png`` is generated; Otherwise, ``plot_hist.channel_name.dbl.png`` files are generated. Each figure consists of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets. Each plot contains two dashed lines. The red dashed line represents the theoretical cutoff (calculated based on number of cells and the 10x doublet table) and the black dashed line represents the cutoff inferred from the data.
 
     manual_correction: ``str``, optional, default: ``None``
         Use human guidance to correct the doublet threshold for certain channels. This is a string representing a comma-separated list. Each item in the list represents one sample; the sample name and correction guide are separated using ':'. The only correction guide supported is 'peak', which means cut at the center of the peak. If only one sample is available, use '' as the sample name.
@@ -528,7 +528,7 @@ def infer_doublets(
     ``None``
 
     Update ``data.obs``:
-        * ``data.obs['pred_dbl_type']``: Predicted singlet/doublet types.
+        * ``data.obs['pred_dbl']``: Predicted singlet/doublet types.
 
         * ``data.uns['pred_dbl_cluster']``: Only generated if 'clust_attr' is not None. This is a dataframe with two columns, 'Cluster' and 'Qval'. Only clusters with significantly more doublets than expected will be recorded here.
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index 1c981e81..fdc16109 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -110,6 +110,7 @@ def get_neighbors(
     data: MultimodalData,
     K: int = 100,
     rep: str = "pca",
+    n_comps: int = None,
     n_jobs: int = -1,
     random_state: int = 0,
     full_speed: bool = False,
@@ -127,6 +128,8 @@
         Number of neighbors, including the data point itself.
     rep : `str`, optional (default: 'pca')
         Representation used to calculate kNN. If `None` use data.X
+    n_comps: `int`, optional (default: None)
+        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
     n_jobs : `int`, optional (default: -1)
         Number of threads to use. -1 refers to using all physical CPU cores.
     random_state: `int`, optional (default: 0)
@@ -158,7 +161,7 @@
         logger.info("Found cached kNN results, no calculation is required.")
     else:
         indices, distances = calculate_nearest_neighbors(
-            X_from_rep(data, rep),
+            X_from_rep(data, rep, n_comps),
             K=K,
             n_jobs=eff_n_jobs(n_jobs),
             random_state=random_state,
@@ -228,6 +231,7 @@ def neighbors(
     data: MultimodalData,
     K: int = 100,
     rep: "str" = "pca",
+    n_comps: int = None,
     n_jobs: int = -1,
     random_state: int = 0,
     full_speed: bool = False,
@@ -250,6 +254,9 @@
     rep: ``str``, optional, default: ``"pca"``
         Embedding representation used to calculate kNN. If ``None``, use ``data.X``; otherwise, keyword ``'X_' + rep`` must exist in ``data.obsm``.
 
+    n_comps: `int`, optional (default: None)
+        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
+
     n_jobs: ``int``, optional, default: ``-1``
         Number of threads to use. If ``-1``, use all physical CPU cores.
@@ -289,6 +296,7 @@
         data,
         K=K,
         rep=rep,
+        n_comps=n_comps,
         n_jobs=n_jobs,
         random_state=random_state,
         full_speed=full_speed,
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index 625eb266..71da4b40 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -29,7 +29,7 @@ def update_rep(rep: str) -> str:
     return rep if rep is not None else "mat"
 
 
-def X_from_rep(data: AnnData, rep: str) -> np.array:
+def X_from_rep(data: AnnData, rep: str, n_comps: int = None) -> np.array:
     """
     If rep is not mat, first check if X_rep is in data.obsm. If not, raise an error.
     If rep is None, return data.X as a numpy array
@@ -38,7 +38,12 @@
         rep_key = "X_" + rep
         if rep_key not in data.obsm.keys():
             raise ValueError("Cannot find {0} matrix. Please run {0} first".format(rep))
-        return data.obsm[rep_key]
+        if n_comps is None:
+            return data.obsm[rep_key]
+        else:
+            assert n_comps > 0
+            n_dim = min(n_comps, data.obsm[rep_key].shape[1])
+            return data.obsm[rep_key][:, 0:n_dim]
     else:
         return data.X if not issparse(data.X) else data.X.toarray()
 
@@ -209,3 +214,40 @@ def load_signatures_from_file(input_file: str) -> Dict[str, List[str]]:
             signatures[items[0]] = list(set(items[2:]))
     logger.info(f"Loaded signatures from GMT file {input_file}.")
     return signatures
+
+
+def largest_variance_from_random_matrix(
+    ncells: int,
+    nfeatures: int,
+    pval: str = "0.05",
+) -> float:
+    """ Select the largest variance from a randomly generated matrix. See [Johnstone 2001](https://projecteuclid.org/journals/annals-of-statistics/volume-29/issue-2/On-the-distribution-of-the-largest-eigenvalue-in-principal/10.1214/aos/1009210544.full) and [Shekhar et al. 2022](https://elifesciences.org/articles/73809) for more details.
+
+    Parameters
+    ----------
+    ncells: ``int``.
+        Number of cells.
+
+    nfeatures: ``int``.
+        Number of features (e.g. highly variable genes)
+
+    pval: ``str``, optional (default: "0.05").
+        P value cutoff on the null distribution (random matrix), choosing from "0.01" and "0.05".
+
+    Returns
+    -------
+    ``float``, the largest variance from a random matrix at pval.
+
+    Examples
+    --------
+    >>> pg.largest_variance_from_random_matrix(10000, 2000)
+    """
+    quantiles = {"0.01": 2.023335, "0.05": 0.9792895} # quantiles from the Tracy-Widom distribution of order 1.
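+    # Note: 0.9792895 and 2.023335 are approximately the 95th and 99th percentiles of the
+    # Tracy-Widom distribution of order 1 (pval "0.05" and "0.01", respectively). Below, mu and
+    # sigma are Johnstone's centering and scaling constants for the largest eigenvalue of a
+    # white-noise Wishart matrix; dividing by (ncells - 1) converts that eigenvalue into the
+    # variance scale reported for principal components.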
+ assert pval in ["0.01", "0.05"] + val1 = (ncells - 1) ** 0.5 + val2 = nfeatures ** 0.5 + mu = (val1 + val2) ** 2 + sigma = (val1 + val2) * (1.0 / val1 + 1.0 / val2) ** (1.0 / 3.0) + res = (quantiles[pval] * sigma + mu) / (ncells - 1) + + return res From 64df536c434169f8f35a5d278aa07eee7db52fca Mon Sep 17 00:00:00 2001 From: Bo Li Date: Thu, 2 Jun 2022 01:01:21 -0400 Subject: [PATCH 2/5] Fixed a bug in plot gsea --- pegasus/plotting/plot_library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py index 82aa7331..dac8a2c8 100644 --- a/pegasus/plotting/plot_library.py +++ b/pegasus/plotting/plot_library.py @@ -2174,7 +2174,7 @@ def plot_gsea( df['NES Abs'] = np.abs(df['NES']) df['pathway'] = df['pathway'].map(lambda x: ' '.join(x.split('_'))) - fig, axes = _get_subplot_layouts(panel_size=panel_size, nrows=2, dpi=dpi, left=0.6, hspace=0.2) + fig, axes = _get_subplot_layouts(panel_size=panel_size, nrows=2, dpi=dpi, left=0.6, hspace=0.2, sharey=False) df_up = df.loc[df['NES']>0] _make_one_gsea_plot(df_up, axes[0], color='red') df_dn = df.loc[df['NES']<0] From e4adffb5d3684d973374257475e46951ed137615 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Mon, 4 Jul 2022 11:41:02 -0400 Subject: [PATCH 3/5] Added n_comps option for harmony and visualization tools; updated scatter to plot multiple bases and components --- pegasus/__init__.py | 1 + .../human_lung_cell_markers.json | 2 +- pegasus/plotting/plot_library.py | 119 ++++++++++++------ pegasus/tools/__init__.py | 1 + pegasus/tools/batch_correction.py | 6 +- pegasus/tools/preprocessing.py | 62 ++++++++- pegasus/tools/visualization.py | 19 ++- 7 files changed, 165 insertions(+), 45 deletions(-) diff --git a/pegasus/__init__.py b/pegasus/__init__.py index b1339753..7cde9a92 100644 --- a/pegasus/__init__.py +++ b/pegasus/__init__.py @@ -30,6 +30,7 @@ filter_data, identify_robust_genes, log_norm, + arcsinh_transform, select_features, pca, pc_transform, diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json index 985f77f5..4e18b1ad 100644 --- a/pegasus/annotate_cluster/human_lung_cell_markers.json +++ b/pegasus/annotate_cluster/human_lung_cell_markers.json @@ -268,7 +268,7 @@ } ], "subtypes" : { - "title" : "SMC subtype markers", + "title" : "Fibro/Myofib subtype markers", "cell_types" : [ { "name" : "Adventitial fibroblast", diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py index dac8a2c8..84f791ce 100644 --- a/pegasus/plotting/plot_library.py +++ b/pegasus/plotting/plot_library.py @@ -36,8 +36,9 @@ def scatter( data: Union[MultimodalData, UnimodalData, anndata.AnnData], - attrs: Union[str, List[str]] = None, - basis: Optional[str] = "umap", + attrs: Optional[Union[str, List[str]]] = None, + basis: Optional[Union[str, List[str]]] = "umap", + components: Optional[Union[Tuple[int, int], List[Tuple[int, int]]]] = (1, 2), matkey: Optional[str] = None, restrictions: Optional[Union[str, List[str]]] = None, show_background: Optional[bool] = False, @@ -72,8 +73,10 @@ def scatter( Use current selected modality in data. attrs: ``str`` or ``List[str]``, default: None Color scatter plots by attrs. Each attribute in attrs can be one key in data.obs, data.var_names (e.g. one gene) or data.obsm (attribute has the format of 'obsm_key@component', like 'X_pca@0'). If one attribute is categorical, a palette will be used to color each category separately. Otherwise, a color map will be used. 
If no attributes are provided, plot the basis for all data.
-    basis: ``str``, optional, default: ``umap``
-        Basis to be used to generate scatter plots. Can be either 'umap', 'tsne', 'fitsne', 'fle', 'net_tsne', 'net_fitsne', 'net_umap' or 'net_fle'.
+    basis: ``str`` or ``List[str]``, optional, default: ``umap``
+        Basis to be used to generate scatter plots. Can be either 'pca', 'diffmap', 'umap', 'tsne', 'fitsne', 'fle', 'net_tsne', 'net_fitsne', 'net_umap' or 'net_fle'. If `basis` is a list, each element in `attrs` will be plotted for each basis in `basis`.
+    components: ``Tuple[int, int]`` or ``List[Tuple[int, int]]``, optional, default: ``(1, 2)``
+        Components in basis to be used. Defaults to the first two components. If `components` is a list, for each element in `attrs` and each `basis`, every listed component pair will be plotted.
     matkey: ``str``, optional, default: None
         If matkey is set, select matrix with matkey as keyword in the current modality. Only works for MultimodalData or UnimodalData objects.
     restrictions: ``str`` or ``List[str]``, optional, default: None
@@ -134,14 +137,6 @@ def scatter(
     >>> pg.scatter(data, attrs=['louvain_labels', 'Channel'], basis='fitsne')
     >>> pg.scatter(data, attrs=['CD14', 'TRAC'], basis='umap')
     """
-    if attrs is None:
-        attrs = ['_all'] # default, plot all points
-        if palettes is None:
-            palettes = '_all:slategrey'
-    elif not is_list_like(attrs):
-        attrs = [attrs]
-    nattrs = len(attrs)
-
     if not isinstance(data, anndata.AnnData):
         cur_matkey = data.current_matrix()
@@ -149,17 +144,33 @@ def scatter(
         assert not isinstance(data, anndata.AnnData)
         data.select_matrix(matkey)
 
-    x = data.obsm[f"X_{basis}"][:, 0]
-    y = data.obsm[f"X_{basis}"][:, 1]
-    # four corners of the plot
-    corners = np.array(np.meshgrid([x.min(), x.max()], [y.min(), y.max()])).T.reshape(-1, 2)
+    if attrs is None:
+        attrs = ['_all'] # default, plot all points
+        if palettes is None:
+            palettes = '_all:slategrey'
+    elif not is_list_like(attrs):
+        attrs = [attrs]
+    if isinstance(basis, str):
+        basis = [basis]
+    if isinstance(components, tuple):
+        components = [components]
 
-    basis = _transform_basis(basis)
-    global_marker_size = _get_marker_size(x.size) if marker_size is None else marker_size
-    nrows, ncols = _get_nrows_and_ncols(nattrs, nrows, ncols)
-    fig, axes = _get_subplot_layouts(nrows=nrows, ncols=ncols, panel_size=panel_size, dpi=dpi, left=left, bottom=bottom, wspace=wspace, hspace=hspace, squeeze=False)
+    # check validity for basis and components
+    max_comp = max(max([x[0] for x in components]), max([x[1] for x in components]))
+    for basis_key in basis:
+        rep = f"X_{basis_key}"
+        if rep not in data.obsm:
+            raise KeyError(f"Basis {basis_key} does not exist!")
+        if data.obsm[rep].shape[1] < max_comp:
+            raise KeyError(f"Basis {basis_key} only has {data.obsm[rep].shape[1]} components, less than max component {max_comp} specified in components!")
+
+    nattrs = len(attrs)
+    nbasis = len(basis)
+    ncomps = len(components)
+    nfigs = nattrs * nbasis * ncomps
+    share_xy = (nbasis == 1) and (ncomps == 1)
 
     if not is_list_like(alpha):
@@ -176,16 +187,42 @@ def scatter(
     restr_obj = RestrictionParser(restrictions)
     restr_obj.calc_default(data)
 
+    global_marker_size = None
+
+    nrows, ncols = _get_nrows_and_ncols(nfigs, nrows, ncols)
+    fig, axes = _get_subplot_layouts(nrows=nrows, ncols=ncols, panel_size=panel_size, dpi=dpi, left=left, bottom=bottom, wspace=wspace, hspace=hspace, squeeze=False, sharex=share_xy, sharey=share_xy)
 
     for i in range(nrows):
         for j in range(ncols):
             ax = axes[i, j]
ax.grid(False) ax.set_xticks([]) ax.set_yticks([]) + if i * ncols + j >= nfigs: + ax.set_frame_on(False) - if i * ncols + j < nattrs: - pos = i * ncols + j - attr = attrs[pos] + offset_start_ = 0 + offset_inc_ = nbasis * ncomps + for basis_key in basis: + basis_ = _transform_basis(basis_key) + for comp_key in components: + x = data.obsm[f"X_{basis_key}"][:, comp_key[0]-1] + y = data.obsm[f"X_{basis_key}"][:, comp_key[1]-1] + + # four corners of the plot + corners = np.array(np.meshgrid([x.min(), x.max()], [y.min(), y.max()])).T.reshape(-1, 2) + + if global_marker_size == None: + global_marker_size = _get_marker_size(x.size) if marker_size is None else marker_size + + x_label = f"{basis_}{comp_key[0]}" + y_label = f"{basis_}{comp_key[1]}" + + pos = offset_start_ + for attr_id, attr in enumerate(attrs): + i = pos // ncols + j = pos % ncols + ax = axes[i, j] if attr == '_all': # if default values = pd.Categorical.from_codes(np.zeros(data.shape[0], dtype=int), categories=['cell']) @@ -224,7 +261,7 @@ def scatter( c=values[selected], s=local_marker_size, marker=".", - alpha=alpha[pos], + alpha=alpha[attr_id], edgecolors="none", cmap=cmap, vmin=vmin, @@ -237,7 +274,7 @@ def scatter( y[selected] * scale_factor, c=values[selected], s=local_marker_size, - alpha=alpha[pos], + alpha=alpha[attr_id], edgecolors="none", cmap=cmap, vmin=vmin, @@ -250,7 +287,6 @@ def scatter( rect = [left + width * (1.0 + 0.05), bottom, width * 0.1, height] ax_colorbar = fig.add_axes(rect) fig.colorbar(img, cax=ax_colorbar) - else: # Categorical attribute labels, with_background = _generate_categories(values, restr_obj.get_satisfied(data, attr)) @@ -266,10 +302,10 @@ def scatter( for k, cat in enumerate(labels.categories): idx = labels == cat if idx.sum() > 0: - scatter_kwargs = {"alpha": alpha[pos], "edgecolors": "none", "rasterized": True} + scatter_kwargs = {"alpha": alpha[attr_id], "edgecolors": "none", "rasterized": True} if cat != "": - if (legend_loc[pos] != "on data") and (scale_factor is None): + if (legend_loc[attr_id] != "on data") and (scale_factor is None): scatter_kwargs["label"] = cat else: text_list.append((np.median(x[idx]), np.median(y[idx]), cat)) @@ -298,7 +334,7 @@ def scatter( _plot_corners(ax, corners, local_marker_size) if attr != '_all': - if legend_loc[pos] == "right margin": + if legend_loc[attr_id] == "right margin": if scale_factor is not None: for k, cat in enumerate(labels.categories): ax.scatter([], [], c=palette[k], label=cat) @@ -306,27 +342,30 @@ def scatter( loc="center left", bbox_to_anchor=(1, 0.5), frameon=False, - fontsize=legend_fontsize[pos], + fontsize=legend_fontsize[attr_id], ncol=_get_legend_ncol(label_size, legend_ncol), ) for handle in legend.legendHandles: handle.set_sizes([300.0 if scale_factor is None else 100.0]) - elif legend_loc[pos] == "on data": + elif legend_loc[attr_id] == "on data": texts = [] for px, py, txt in text_list: - texts.append(ax.text(px, py, txt, fontsize=legend_fontsize[pos], fontweight = "bold", ha = "center", va = "center")) + texts.append(ax.text(px, py, txt, fontsize=legend_fontsize[attr_id], fontweight = "bold", ha = "center", va = "center")) # from adjustText import adjust_text # adjust_text(texts, arrowprops=dict(arrowstyle='-', color='k', lw=0.5)) if attr != '_all': ax.set_title(attr) - else: - ax.set_frame_on(False) - if i == nrows - 1: - ax.set_xlabel(f"{basis}1") - if j == 0: - ax.set_ylabel(f"{basis}2") + if (share_xy and (i + 1) * ncols + j >= nfigs) or (not share_xy): + ax.set_xlabel(x_label) + + if (share_xy and j == 0) or (not 
share_xy):
+                ax.set_ylabel(y_label)
+
+                pos += offset_inc_
+
+        offset_start_ += 1
 
     # Reset current matrix if needed.
     if not isinstance(data, anndata.AnnData):
@@ -2228,7 +2267,9 @@ def elbowplot(
     >>> fig = pg.elbowplot(data, dpi = 500)
     """
     assert rep in data.uns
-    thre = largest_variance_from_random_matrix(data.shape[0], data.var[data.uns[f"{rep}_features"]].sum(), pval)
+    repf = data.uns[f"{rep}_features"]
+    nfeatures = data.var[repf].sum() if repf != None else data.shape[1]
+    thre = largest_variance_from_random_matrix(data.shape[0], nfeatures, pval)
     ncomps = (data.uns[rep]["variance"] > thre).sum()
     data.uns[f"{rep}_ncomps"] = ncomps
     logger.info(f"Selecting {ncomps} components is recommended!")
diff --git a/pegasus/tools/__init__.py b/pegasus/tools/__init__.py
index f362ebab..1cf8366e 100644
--- a/pegasus/tools/__init__.py
+++ b/pegasus/tools/__init__.py
@@ -25,6 +25,7 @@
     identify_robust_genes,
     _run_filter_data,
     log_norm,
+    arcsinh_transform,
     select_features,
     pca,
     pc_transform,
diff --git a/pegasus/tools/batch_correction.py b/pegasus/tools/batch_correction.py
index 977a0024..b4521c51 100644
--- a/pegasus/tools/batch_correction.py
+++ b/pegasus/tools/batch_correction.py
@@ -18,6 +18,7 @@ def run_harmony(
     data: Union[MultimodalData, UnimodalData],
     batch: str = "Channel",
     rep: str = "pca",
+    n_comps: int = None,
     n_jobs: int = -1,
     n_clusters: int = None,
     random_state: int = 0,
@@ -39,6 +40,9 @@
     rep: ``str``, optional, default: ``"pca"``.
         Which representation to use as input of Harmony, default is PCA.
 
+    n_comps: `int`, optional (default: None)
+        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
+
     n_jobs : ``int``, optional, default: ``-1``.
         Number of threads to use in Harmony. ``-1`` refers to using all physical CPU cores.
@@ -81,7 +85,7 @@
     logger.info("Start integration using Harmony.")
     out_rep = rep + "_harmony"
     data.obsm["X_" + out_rep] = harmonize(
-        X_from_rep(data, rep),
+        X_from_rep(data, rep, n_comps),
         data.obs,
         batch,
         n_clusters = n_clusters,
diff --git a/pegasus/tools/preprocessing.py b/pegasus/tools/preprocessing.py
index 46f78f87..aa825800 100644
--- a/pegasus/tools/preprocessing.py
+++ b/pegasus/tools/preprocessing.py
@@ -340,6 +340,66 @@ def log_norm(
     data.uns["norm_count"] = norm_count
 
 
+@timer(logger=logger)
+def arcsinh_transform(data: Union[MultimodalData, UnimodalData],
+    cofactor: float = 5.0,
+    jitter = False,
+    random_state = 0,
+    backup_matrix: str = "raw.X",
+) -> None:
+    """Conduct arcsinh transform on the current matrix.
+
+    If jitter == True, jittering by adding a randomized value in U([-0.5, 0.5)). This will also make the matrix dense. Mimic Cytobank.
+
+    Parameters
+    ----------
+    cofactor: ``float``, optional, default: ``5.0``
+        Cofactor used in cytobank, arcsinh(x / cofactor).
+
+    jitter: ``bool``, optional, default: ``False``
+        Add an 'arcsinh.jitter' matrix in dense format, jittering by adding a randomized value in U([-0.5, 0.5)).
+
+    random_state: ``int``, optional, default: ``0``
+        Random seed for generating jitters.
+
+    backup_matrix: ``str``, optional, default: ``raw.X``.
+        The key name of the backup count matrix, usually the raw counts.
+
+    Returns
+    -------
+    ``None``
+
+    Update ``data.X`` with the count matrix after arcsinh transformation. In addition, back up the original count matrix as ``backup_matrix``.
+
+    In case of rerunning the transformation while ``backup_matrix`` already exists, use ``backup_matrix`` instead of ``data.X`` as the source.
+
+    Examples
+    --------
+    >>> pg.arcsinh_transform(data)
+    """
+    if isinstance(data, MultimodalData):
+        data = data.current_data()
+
+    assert data.get_modality() in {"rna"}
+
+    if backup_matrix not in data.list_keys():
+        data.add_matrix(backup_matrix, data.X)
+        data.X = data.X.astype(np.float32) # force copy
+    else:
+        # The case of rerunning arcsinh_transform. Use backup matrix as source.
+        data.X = data.get_matrix(backup_matrix).astype(np.float32) # force copy
+        logger.warning("Rerun arcsinh transformation. Use the raw counts in backup instead.")
+
+    if not jitter:
+        data.X.data = np.arcsinh(data.X.data / cofactor, dtype = np.float32)
+    else:
+        np.random.seed(random_state)
+        jitters = np.random.uniform(low = -0.5, high = 0.5, size = data.X.shape)
+        signal = np.add(data.X.toarray(), jitters, dtype = np.float32)
+        signal = np.arcsinh(signal / cofactor, dtype = np.float32)
+        data.X = signal
+
+
 @run_gc
 def select_features(
     data: Union[MultimodalData, UnimodalData],
@@ -428,7 +488,7 @@
         Number of Principal Components to get.
 
     features: ``str``, optional, default: ``"highly_variable_features"``.
-        Keyword in ``data.var`` to specify features used for PCA.
+        Keyword in ``data.var`` to specify features used for PCA. If ``None``, all features will be selected.
 
     standardize: ``bool``, optional, default: ``True``.
         Whether to scale the data to unit variance and zero mean.
diff --git a/pegasus/tools/visualization.py b/pegasus/tools/visualization.py
index e6b9ded1..e8cf808a 100644
--- a/pegasus/tools/visualization.py
+++ b/pegasus/tools/visualization.py
@@ -161,6 +161,7 @@ def calc_force_directed_layout(
 def tsne(
     data: MultimodalData,
     rep: str = "pca",
+    rep_ncomps: int = None,
     n_jobs: int = -1,
     n_components: int = 2,
     perplexity: float = 30,
@@ -183,6 +184,9 @@
     rep: ``str``, optional, default: ``"pca"``
         Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.
+
+    rep_ncomps: `int`, optional (default: None)
+        Number of components to be used in `rep`. If rep_ncomps == None, use all components; otherwise, use the minimum of rep_ncomps and rep's dimensions.
 
     n_jobs: ``int``, optional, default: ``-1``
         Number of threads to use. If ``-1``, use all physical CPU cores.
@@ -222,7 +226,7 @@
     rep = update_rep(rep)
     n_jobs = eff_n_jobs(n_jobs)
-    X = X_from_rep(data, rep).astype(np.float64)
+    X = X_from_rep(data, rep, rep_ncomps).astype(np.float64)
 
     if learning_rate == "auto":
         learning_rate = max(X.shape[0] / early_exaggeration, 200.0)
@@ -261,6 +265,7 @@ def umap(
     data: MultimodalData,
     rep: str = "pca",
+    rep_ncomps: int = None,
     n_components: int = 2,
     n_neighbors: int = 15,
     min_dist: float = 0.5,
@@ -288,6 +293,9 @@
     rep: ``str``, optional, default: ``"pca"``
         Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.
 
+    rep_ncomps: `int`, optional (default: None)
+        Number of components to be used in `rep`. If rep_ncomps == None, use all components; otherwise, use the minimum of rep_ncomps and rep's dimensions.
+
     n_components: ``int``, optional, default: ``2``
         Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.
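For intuition about the cofactor, a minimal numeric sketch of the transform added above (plain NumPy; printed values are rounded):

import numpy as np

counts = np.array([0.0, 5.0, 50.0, 500.0])
cofactor = 5.0
# arcsinh(x / cofactor) is roughly linear for x << cofactor and logarithmic for x >> cofactor,
# compressing large values the way log1p does while staying defined at zero without a pseudocount
print(np.arcsinh(counts / cofactor))  # -> [0.    0.881 2.998 5.298]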
@@ -344,7 +352,7 @@ def umap( >>> pg.umap(data) """ rep = update_rep(rep) - X = X_from_rep(data, rep) + X = X_from_rep(data, rep, rep_ncomps) if data.shape[0] < n_neighbors: logger.warning(f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}.") @@ -378,6 +386,7 @@ def fle( file_name: str = None, n_jobs: int = -1, rep: str = "diffmap", + rep_ncomps: int = None, K: int = 50, full_speed: bool = False, target_change_per_node: float = 2.0, @@ -410,6 +419,9 @@ def fle( rep: ``str``, optional, default: ``"diffmap"`` Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``. + rep_ncomps: `int`, optional (default: None) + Number of components to be used in `rep`. If rep_ncomps == None, use all components; otherwise, use the minimum of rep_ncomps and rep's dimensions. + K: ``int``, optional, default: ``50`` Number of nearest neighbors to be considered during the computation. @@ -460,6 +472,7 @@ def fle( data, K=K, rep=rep, + n_comps=rep_ncomps, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed, @@ -467,7 +480,7 @@ def fle( key = f"X_{out_basis}" data.obsm[key] = calc_force_directed_layout( - W_from_rep(data, rep), + W_from_rep(data, rep, rep_ncomps), file_name, n_jobs, target_change_per_node, From ad8bf21f5e0ce5f8415fe42011e6d828295d1060 Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Tue, 5 Jul 2022 11:27:09 -0400 Subject: [PATCH 4/5] update W_from_rep function interface --- pegasus/tools/utils.py | 16 ++++++++++------ pegasus/tools/visualization.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py index 71da4b40..00bf100c 100644 --- a/pegasus/tools/utils.py +++ b/pegasus/tools/utils.py @@ -2,7 +2,7 @@ import pandas as pd from pandas.api.types import is_categorical_dtype from scipy.sparse import issparse, csr_matrix -from typing import Union, List, Tuple, Dict +from typing import Union, List, Tuple, Dict, Optional from anndata import AnnData from pegasusio import UnimodalData, MultimodalData @@ -48,14 +48,18 @@ def X_from_rep(data: AnnData, rep: str, n_comps: int = None) -> np.array: return data.X if not issparse(data.X) else data.X.toarray() -def W_from_rep(data: AnnData, rep: str) -> csr_matrix: +def W_from_rep( + data: Union[MultimodalData, UnimodalData, AnnData], + rep: str, + n_comps: Optional[int] = None, +) -> csr_matrix: """ Return affinity matrix W based on representation rep. """ rep_key = "W_" + rep if rep_key not in data.obsp: raise ValueError("Affinity matrix does not exist. 
Please run neighbors first!")
-    return data.obsp[rep_key]
+    return data.obsp[rep_key] if n_comps is None else data.obsp[rep_key][:, 0:n_comps]
 
 
 # slicing is not designed to work at extracting one element, convert to dense matrix
@@ -198,7 +202,7 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
     ribosomal_genes_human=pkg_resources.resource_filename("pegasus", "data_files/ribosomal_genes_human.gmt"),
     ribosomal_genes_mouse=pkg_resources.resource_filename("pegasus", "data_files/ribosomal_genes_mouse.gmt"),
     apoptosis_human=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_human.gmt"),
-    apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"), 
+    apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
 )
 
 predefined_pathways = dict(
@@ -221,7 +225,7 @@ def largest_variance_from_random_matrix(
     nfeatures: int,
     pval: str = "0.05",
 ) -> float:
-    """ Select the largest variance from a randomly generated matrix. See [Johnstone 2001](https://projecteuclid.org/journals/annals-of-statistics/volume-29/issue-2/On-the-distribution-of-the-largest-eigenvalue-in-principal/10.1214/aos/1009210544.full) and [Shekhar et al. 2022](https://elifesciences.org/articles/73809) for more details. 
+    """ Select the largest variance from a randomly generated matrix. See [Johnstone 2001](https://projecteuclid.org/journals/annals-of-statistics/volume-29/issue-2/On-the-distribution-of-the-largest-eigenvalue-in-principal/10.1214/aos/1009210544.full) and [Shekhar et al. 2022](https://elifesciences.org/articles/73809) for more details.
 
     Parameters
     ----------
     ncells: ``int``.
         Number of cells.
@@ -245,7 +249,7 @@ def largest_variance_from_random_matrix(
     quantiles = {"0.01": 2.023335, "0.05": 0.9792895} # quantiles from the Tracy-Widom distribution of order 1.
     assert pval in ["0.01", "0.05"]
     val1 = (ncells - 1) ** 0.5
-    val2 = nfeatures ** 0.5 
+    val2 = nfeatures ** 0.5
     mu = (val1 + val2) ** 2
     sigma = (val1 + val2) * (1.0 / val1 + 1.0 / val2) ** (1.0 / 3.0)
     res = (quantiles[pval] * sigma + mu) / (ncells - 1)
diff --git a/pegasus/tools/visualization.py b/pegasus/tools/visualization.py
index e8cf808a..66945ecd 100644
--- a/pegasus/tools/visualization.py
+++ b/pegasus/tools/visualization.py
@@ -184,7 +184,7 @@ def tsne(
     rep: ``str``, optional, default: ``"pca"``
         Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.
-    
+
     rep_ncomps: `int`, optional (default: None)
         Number of components to be used in `rep`. If rep_ncomps == None, use all components; otherwise, use the minimum of rep_ncomps and rep's dimensions.
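As a worked example of the threshold formula quoted above, evaluating it for the docstring's own case (ncells=10000, nfeatures=2000, pval="0.05"):

ncells, nfeatures = 10000, 2000
tw_q = 0.9792895                                  # Tracy-Widom order-1 quantile for pval "0.05"
val1 = (ncells - 1) ** 0.5                        # ~99.995
val2 = nfeatures ** 0.5                           # ~44.721
mu = (val1 + val2) ** 2                           # ~20942.8
sigma = (val1 + val2) * (1.0 / val1 + 1.0 / val2) ** (1.0 / 3.0)  # ~46.1
print((tw_q * sigma + mu) / (ncells - 1))         # ~2.099

So for a 10,000-cell by 2,000-feature matrix, only principal components whose variance exceeds roughly 2.1 rise above random-matrix noise.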
From 54db5cb061908ec9f3a130182bd0aab422ab3cce Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Tue, 5 Jul 2022 11:36:42 -0400 Subject: [PATCH 5/5] reset W_from_rep definition; correct function call in fle --- pegasus/tools/utils.py | 5 ++--- pegasus/tools/visualization.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py index 00bf100c..c7a26fd6 100644 --- a/pegasus/tools/utils.py +++ b/pegasus/tools/utils.py @@ -2,7 +2,7 @@ import pandas as pd from pandas.api.types import is_categorical_dtype from scipy.sparse import issparse, csr_matrix -from typing import Union, List, Tuple, Dict, Optional +from typing import Union, List, Tuple, Dict from anndata import AnnData from pegasusio import UnimodalData, MultimodalData @@ -51,7 +51,6 @@ def X_from_rep(data: AnnData, rep: str, n_comps: int = None) -> np.array: def W_from_rep( data: Union[MultimodalData, UnimodalData, AnnData], rep: str, - n_comps: Optional[int] = None, ) -> csr_matrix: """ Return affinity matrix W based on representation rep. @@ -59,7 +58,7 @@ def W_from_rep( rep_key = "W_" + rep if rep_key not in data.obsp: raise ValueError("Affinity matrix does not exist. Please run neighbors first!") - return data.obsp[rep_key] if n_comps is None else data.obsp[rep_key][:, 0:n_comps] + return data.obsp[rep_key] # slicing is not designed to work at extracting one element, convert to dense matrix diff --git a/pegasus/tools/visualization.py b/pegasus/tools/visualization.py index 66945ecd..b4d39bb8 100644 --- a/pegasus/tools/visualization.py +++ b/pegasus/tools/visualization.py @@ -480,7 +480,7 @@ def fle( key = f"X_{out_basis}" data.obsm[key] = calc_force_directed_layout( - W_from_rep(data, rep, rep_ncomps), + W_from_rep(data, rep), file_name, n_jobs, target_change_per_node,
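Taken together, a minimal end-to-end sketch of how the pieces added in this series are meant to compose (the input path is hypothetical, and the exact preprocessing a real dataset needs may differ):

import pegasus as pg

data = pg.read_input("sample.zarr.zip")          # hypothetical input file
pg.qc_metrics(data)
pg.filter_data(data)
pg.identify_robust_genes(data)
pg.log_norm(data)
pg.select_features(data)
pg.pca(data)

pg.elbowplot(data)                               # writes data.uns["pca_ncomps"] (patch 1)
ncomps = data.uns["pca_ncomps"]

pg.neighbors(data, rep="pca", n_comps=ncomps)    # kNN on the leading PCs only (patch 1)
pg.louvain(data, rep="pca")
pg.umap(data, rep_ncomps=ncomps)                 # same cap for the embedding (patch 3)

# multi-basis / multi-component scatter (patch 3); X_umap has only two components,
# so component pairs beyond (1, 2) are requested for the PCA basis alone
pg.scatter(data, attrs="louvain_labels", basis=["pca", "umap"])
pg.scatter(data, attrs="louvain_labels", basis="pca", components=[(1, 2), (2, 3)])

For multi-batch data, ``pg.run_harmony(data, n_comps=ncomps)`` (extended in patch 3) would slot in between ``pca`` and ``neighbors``, with the returned ``"pca_harmony"`` representation used downstream. Per patch 5, the component cap applies when the kNN graph is built, not to the affinity matrix ``W`` consumed by ``fle``.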