Remove alpha to make model convex

- Move model error to _run_info - clean up some messages / comments ( - Fix traceplot bug when nbg == 1 - black style in toil workflow
jvivian · May 5, 2019 · 8c983c5 · 8c983c5
1 parent 8a72431
commit 8c983c5
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 35 deletions.
diff --git a/gene_outlier_detection/lib.py b/gene_outlier_detection/lib.py
@@ -110,10 +110,11 @@ def pca_distances(
         DataFrame of pairwise distances
     """
     # Check n_components value which must be between 0 and min(n_samples, n_features)
+    click.echo("Ranking background datasets by {group}")
     n_samples, n_features = df.shape
     if n_components >= min(n_samples, n_features):
         n_components = min(n_samples, n_features)
-        click.secho(f"Number of components changed to {n_components}", fg="yellow")
+        click.secho(f"Number of PCA components changed to {n_components}", fg="yellow")
 
     # Concatenate sample to background
     concat = df.append(sample)
@@ -210,15 +211,14 @@ def run_model(
 
     click.echo("Building model")
     with pm.Model() as model:
-        # Linear model priors
-        a = pm.Normal("a", mu=0, sd=1)
+        # Convex model priors
         b = [1] if len(classes) == 1 else pm.Dirichlet("b", a=np.ones(len(classes)))
         # Model error
         eps = pm.InverseGamma("eps", 1, 1)
 
-        # Linear model declaration
+        # Convex model declaration
         for gene in tqdm(training_genes):
-            mu = a
+            mu = 0
             for i, dataset in enumerate(classes):
                 name = f"{gene}={dataset}"
                 m, nu, lambd = ys[name]
@@ -245,7 +245,7 @@ def calculate_weights(groups: List[str], trace) -> pd.DataFrame:
     """
     class_col = []
     for c in groups:
-        class_col.extend([c for _ in range(len(trace["a"]))])
+        class_col.extend([c for _ in range(len(trace["eps"]))])
 
     weight_by_class = pd.DataFrame(
         {
@@ -306,7 +306,7 @@ def _gene_ppc(trace, gene: str) -> np.array:
         Random variates representing PPC of the gene
     """
     y_gene = [x for x in trace.varnames if x.startswith(f"{gene}=")]
-    b = trace["a"]
+    b = 0
     if "b" in trace.varnames:
         for i, y_name in enumerate(y_gene):
             b += trace["b"][:, i] * trace[y_name]
@@ -377,21 +377,11 @@ def save_traceplot(trace, out_dir: str, b: bool = True) -> None:
     """
     import pymc3 as pm
 
-    if b:
-        fig, axarr = plt.subplots(3, 2, figsize=(10, 5))
-        varnames = ["a", "b", "eps"]
-    else:
-        fig, axarr = plt.subplots(2, 2, figsize=(10, 5))
-        varnames = ["a", "eps"]
-    pm.traceplot(trace, varnames=varnames, ax=axarr)
+    varnames = ["b", "eps"] if b else ["eps"]
+    pm.traceplot(trace, varnames=varnames)
     traceplot_out = os.path.join(out_dir, "traceplot.png")
+    fig = plt.gcf()
     fig.savefig(traceplot_out)
-    # Save values
-    trace_vals = {}
-    trace_vals["median"] = [np.median(trace["a"]), np.median(trace["eps"])]
-    trace_vals["std"] = [np.std(trace["a"]), np.std(trace["eps"])]
-    trace_df = pd.DataFrame(trace_vals, index=["a", "eps"])
-    trace_df.to_csv(os.path.join(out_dir, "_model_params.tsv"), sep="\t")
 
 
 def save_weights(trace, groups: List[str], out_dir: str) -> None:

diff --git a/gene_outlier_detection/main.py b/gene_outlier_detection/main.py
@@ -5,6 +5,7 @@
 from argparse import Namespace
 
 import click
+import numpy as np
 import pandas as pd
 import scipy.stats as st
 
@@ -44,7 +45,7 @@ def iter_run(opts: Namespace):
     opts.ranks = pca_distances(opts.sample, opts.df, opts.genes, opts.group)
     ranks_out = os.path.join(opts.out_dir, "ranks.tsv")
     opts.ranks.to_csv(ranks_out, sep="\t")
-    opts.n_bg = opts.n_bg if opts.n_bg < len(opts.ranks) else len(opts.ranks)
+    opts.n_bg = min(opts.n_bg, len(opts.ranks))
 
     # Parse training genes
     if opts.gene_list is None:
@@ -71,7 +72,7 @@ def iter_run(opts: Namespace):
     for i in range(1, opts.n_bg + 1):
         if opts.disable_iter:
             click.secho(
-                f"Performing one run with {i} backgrounds due to disable-iter flag",
+                f"Performing one run with {opts.n_bg} backgrounds due to `disable-iter` flag",
                 fg="red",
             )
             i = opts.n_bg
@@ -117,10 +118,15 @@ def iter_run(opts: Namespace):
     # Output run command and run time
     with open(os.path.join(opts.out_dir, "_run_info.tsv"), "w") as f:
         for k in vars(opts):
-            if k in ["sample", "genes", "df"]:
+            if k in ["sample", "genes", "df", "ranks", "base_genes"]:
                 continue
             f.write(f"{k}\t{getattr(opts, k)}\n")
-        f.write(f"Runtime\t{runtime} {unit}")
+        f.write(f"Runtime\t{runtime} {unit}\n")
+        # Add model error
+        err_med = np.median(trace["eps"])
+        err_std = np.std(trace["eps"])
+        f.write(f"epsilon_median\t{err_med}\n")
+        f.write(f"epsilon_std\t{err_std}\n")
 
     # Traceplot - if there is only one background then b = 1 instead of a Dirichlet RV
     b = True if opts.n_bg > 1 else False

diff --git a/tests/test_gene_outlier_detection.py b/tests/test_gene_outlier_detection.py
@@ -188,7 +188,7 @@ def test_pickle_model(tmpdir, model_output):
 def test_meta_runner(datadir, parameters):
     from gene_outlier_detection.meta_runner import cli
 
-    parameters.extend(["--num-training-genes", "10", "-m", "10", "-nbg", "2"])
+    parameters.extend(["--num-training-genes", "10", "-m", "10", "-nbg", "1"])
     runner = CliRunner()
     result = runner.invoke(cli, parameters, catch_exceptions=False)
     assert result.exit_code == 0
@@ -206,7 +206,7 @@ def test_gene_list_and_disable_iter(datadir, parameters):
             "-m",
             "11",
             "-nbg",
-            "1",
+            "2",
         ]
     )
     runner = CliRunner()
@@ -235,7 +235,6 @@ def test_save_traceplot(tmpdir, model_output):
     _, t = model_output
     save_traceplot(t, tmpdir)
     assert os.path.exists(os.path.join(tmpdir, "traceplot.png"))
-    assert os.path.exists(os.path.join(tmpdir, "_model_params.tsv"))
 
 
 def test_save_weights(tmpdir, load_data, model_output):

diff --git a/toil/toil-outlier-detection.py b/toil/toil-outlier-detection.py
@@ -19,7 +19,7 @@ def workflow(job, samples, args):
 
 
 def run_outlier_model(
-        job, sample_info, sample_id, background_id, gene_id, args, cores=2, memory="5G"
+    job, sample_info, sample_id, background_id, gene_id, args, cores=2, memory="5G"
 ):
     # Unpack sample information and add sample specific options
     name, sample_opts = sample_info
@@ -104,7 +104,7 @@ def cli():
         required=True,
         type=str,
         help="Samples by Genes matrix with metadata columns first (including a group column that "
-             "discriminates samples by some category) (csv/tsv/hd5)",
+        "discriminates samples by some category) (csv/tsv/hd5)",
     )
     parser.add_argument(
         "--manifest",
@@ -139,8 +139,8 @@ def cli():
         default=100,
         type=int,
         help="Maximum number of genes to run. I.e. if a gene list is input, how many additional"
-             "genes to add via SelectKBest. Useful for improving beta coefficients"
-             "if gene list does not contain enough tissue-specific genes.",
+        "genes to add via SelectKBest. Useful for improving beta coefficients"
+        "if gene list does not contain enough tissue-specific genes.",
     )
     parser.add_argument(
         "--num-training-genes",
@@ -202,7 +202,7 @@ def partitions(l, partition_size):
     :param int partition_size: Size of partitions
     """
     for i in xrange(0, len(l), partition_size):
-        yield l[i: i + partition_size]
+        yield l[i : i + partition_size]
 
 
 def parse_manifest(manifest_path):
@@ -218,17 +218,21 @@ def parse_manifest(manifest_path):
     :return:
         Sample information with type: List[Tuple[str, dict]]
     """
-    df = pd.read_csv(manifest_path, sep='\t')
+    df = pd.read_csv(manifest_path, sep="\t")
 
     # If manifest is single-column, then return samples and no options
     if len(df.columns) == 1:
-        samples = [(x.strip(), None) for x in open(manifest_path, 'r').readlines() if not x.isspace()]
+        samples = [
+            (x.strip(), None)
+            for x in open(manifest_path, "r").readlines()
+            if not x.isspace()
+        ]
         return samples
 
     # Otherwise return samples and option dictionary
     else:
         samples = []
-        df = pd.read_csv(manifest_path, sep='\t')
+        df = pd.read_csv(manifest_path, sep="\t")
         sample_ids = list(df.iloc[:, 0])
         df = df.drop(df.columns[0], axis=1)
         for i, row in df.iterrows():