From b7b1868ba85c38a4accd9fae85dddfd4b59f478c Mon Sep 17 00:00:00 2001 From: John Vivian Date: Thu, 30 May 2019 13:48:39 -0700 Subject: [PATCH] Change ANOVA to percent of total genes (resolves #56) --- gene_outlier_detection/lib.py | 13 ++++--------- tests/test_gene_outlier_detection.py | 3 +-- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/gene_outlier_detection/lib.py b/gene_outlier_detection/lib.py index a52e8a4..99c9fdb 100644 --- a/gene_outlier_detection/lib.py +++ b/gene_outlier_detection/lib.py @@ -94,7 +94,7 @@ def anova_distances( df: pd.DataFrame, genes: List[str], group: str = "tissue", - n_genes=2000, + percent_genes=0.10, ): """ Calculates distance to each group via pairwise distance using top N ANOVA genes @@ -104,19 +104,14 @@ def anova_distances( df: background dataset genes: genes to use for pairwise distance group: Column to use as class discriminator - n_genes: Number of ANOVA genes to use + percent_genes: Percent of ANOVA genes to use for pairwise distance Returns: DataFrame of pairwise distances """ click.echo(f"Ranking background datasets by {group} via ANOVA") - if n_genes >= len(genes): - click.secho( - f"# of ANOVA genes {n_genes} greater than {len(genes)}", fg="yellow" - ) - skb_genes = genes - else: - skb_genes = select_k_best_genes(df, genes, n=n_genes) + n_genes = int(percent_genes * len(genes)) + skb_genes = select_k_best_genes(df, genes, n=n_genes) dist = pairwise_distances(np.array(sample[skb_genes]).reshape(1, -1), df[skb_genes]) dist = pd.DataFrame([dist.ravel(), df["tissue"]]).T dist.columns = ["Distance", "Group"] diff --git a/tests/test_gene_outlier_detection.py b/tests/test_gene_outlier_detection.py index 8767d4c..1fe8af0 100644 --- a/tests/test_gene_outlier_detection.py +++ b/tests/test_gene_outlier_detection.py @@ -106,9 +106,8 @@ def test_anova_distances(load_data): sample, df, genes = load_data dist = anova_distances(sample, df, genes) - print(dist) assert list(dist.Group) == ["Thyroid", "Brain"] - assert [int(x) for x in dist.MedianDistance] == [57, 131] + assert [int(x) for x in dist.MedianDistance] == [63, 142] def test_run_model(model_output):