Skip to content

Commit

Permalink
Remove alpha to make model convex
Browse files Browse the repository at this point in the history
- Move model error to _run_info
- clean up some messages / comments (
- Fix traceplot bug when nbg == 1
- black style in toil workflow
  • Loading branch information
jvivian committed May 5, 2019
1 parent 8a72431 commit 8c983c5
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 35 deletions.
30 changes: 10 additions & 20 deletions gene_outlier_detection/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,11 @@ def pca_distances(
DataFrame of pairwise distances
"""
# Check n_components value which must be between 0 and min(n_samples, n_features)
click.echo("Ranking background datasets by {group}")
n_samples, n_features = df.shape
if n_components >= min(n_samples, n_features):
n_components = min(n_samples, n_features)
click.secho(f"Number of components changed to {n_components}", fg="yellow")
click.secho(f"Number of PCA components changed to {n_components}", fg="yellow")

# Concatenate sample to background
concat = df.append(sample)
Expand Down Expand Up @@ -210,15 +211,14 @@ def run_model(

click.echo("Building model")
with pm.Model() as model:
# Linear model priors
a = pm.Normal("a", mu=0, sd=1)
# Convex model priors
b = [1] if len(classes) == 1 else pm.Dirichlet("b", a=np.ones(len(classes)))
# Model error
eps = pm.InverseGamma("eps", 1, 1)

# Linear model declaration
# Convex model declaration
for gene in tqdm(training_genes):
mu = a
mu = 0
for i, dataset in enumerate(classes):
name = f"{gene}={dataset}"
m, nu, lambd = ys[name]
Expand All @@ -245,7 +245,7 @@ def calculate_weights(groups: List[str], trace) -> pd.DataFrame:
"""
class_col = []
for c in groups:
class_col.extend([c for _ in range(len(trace["a"]))])
class_col.extend([c for _ in range(len(trace["eps"]))])

weight_by_class = pd.DataFrame(
{
Expand Down Expand Up @@ -306,7 +306,7 @@ def _gene_ppc(trace, gene: str) -> np.array:
Random variates representing PPC of the gene
"""
y_gene = [x for x in trace.varnames if x.startswith(f"{gene}=")]
b = trace["a"]
b = 0
if "b" in trace.varnames:
for i, y_name in enumerate(y_gene):
b += trace["b"][:, i] * trace[y_name]
Expand Down Expand Up @@ -377,21 +377,11 @@ def save_traceplot(trace, out_dir: str, b: bool = True) -> None:
"""
import pymc3 as pm

if b:
fig, axarr = plt.subplots(3, 2, figsize=(10, 5))
varnames = ["a", "b", "eps"]
else:
fig, axarr = plt.subplots(2, 2, figsize=(10, 5))
varnames = ["a", "eps"]
pm.traceplot(trace, varnames=varnames, ax=axarr)
varnames = ["b", "eps"] if b else ["eps"]
pm.traceplot(trace, varnames=varnames)
traceplot_out = os.path.join(out_dir, "traceplot.png")
fig = plt.gcf()
fig.savefig(traceplot_out)
# Save values
trace_vals = {}
trace_vals["median"] = [np.median(trace["a"]), np.median(trace["eps"])]
trace_vals["std"] = [np.std(trace["a"]), np.std(trace["eps"])]
trace_df = pd.DataFrame(trace_vals, index=["a", "eps"])
trace_df.to_csv(os.path.join(out_dir, "_model_params.tsv"), sep="\t")


def save_weights(trace, groups: List[str], out_dir: str) -> None:
Expand Down
14 changes: 10 additions & 4 deletions gene_outlier_detection/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from argparse import Namespace

import click
import numpy as np
import pandas as pd
import scipy.stats as st

Expand Down Expand Up @@ -44,7 +45,7 @@ def iter_run(opts: Namespace):
opts.ranks = pca_distances(opts.sample, opts.df, opts.genes, opts.group)
ranks_out = os.path.join(opts.out_dir, "ranks.tsv")
opts.ranks.to_csv(ranks_out, sep="\t")
opts.n_bg = opts.n_bg if opts.n_bg < len(opts.ranks) else len(opts.ranks)
opts.n_bg = min(opts.n_bg, len(opts.ranks))

# Parse training genes
if opts.gene_list is None:
Expand All @@ -71,7 +72,7 @@ def iter_run(opts: Namespace):
for i in range(1, opts.n_bg + 1):
if opts.disable_iter:
click.secho(
f"Performing one run with {i} backgrounds due to disable-iter flag",
f"Performing one run with {opts.n_bg} backgrounds due to `disable-iter` flag",
fg="red",
)
i = opts.n_bg
Expand Down Expand Up @@ -117,10 +118,15 @@ def iter_run(opts: Namespace):
# Output run command and run time
with open(os.path.join(opts.out_dir, "_run_info.tsv"), "w") as f:
for k in vars(opts):
if k in ["sample", "genes", "df"]:
if k in ["sample", "genes", "df", "ranks", "base_genes"]:
continue
f.write(f"{k}\t{getattr(opts, k)}\n")
f.write(f"Runtime\t{runtime} {unit}")
f.write(f"Runtime\t{runtime} {unit}\n")
# Add model error
err_med = np.median(trace["eps"])
err_std = np.std(trace["eps"])
f.write(f"epsilon_median\t{err_med}\n")
f.write(f"epsilon_std\t{err_std}\n")

# Traceplot - if there is only one background then b = 1 instead of a Dirichlet RV
b = True if opts.n_bg > 1 else False
Expand Down
5 changes: 2 additions & 3 deletions tests/test_gene_outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def test_pickle_model(tmpdir, model_output):
def test_meta_runner(datadir, parameters):
from gene_outlier_detection.meta_runner import cli

parameters.extend(["--num-training-genes", "10", "-m", "10", "-nbg", "2"])
parameters.extend(["--num-training-genes", "10", "-m", "10", "-nbg", "1"])
runner = CliRunner()
result = runner.invoke(cli, parameters, catch_exceptions=False)
assert result.exit_code == 0
Expand All @@ -206,7 +206,7 @@ def test_gene_list_and_disable_iter(datadir, parameters):
"-m",
"11",
"-nbg",
"1",
"2",
]
)
runner = CliRunner()
Expand Down Expand Up @@ -235,7 +235,6 @@ def test_save_traceplot(tmpdir, model_output):
_, t = model_output
save_traceplot(t, tmpdir)
assert os.path.exists(os.path.join(tmpdir, "traceplot.png"))
assert os.path.exists(os.path.join(tmpdir, "_model_params.tsv"))


def test_save_weights(tmpdir, load_data, model_output):
Expand Down
20 changes: 12 additions & 8 deletions toil/toil-outlier-detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def workflow(job, samples, args):


def run_outlier_model(
job, sample_info, sample_id, background_id, gene_id, args, cores=2, memory="5G"
job, sample_info, sample_id, background_id, gene_id, args, cores=2, memory="5G"
):
# Unpack sample information and add sample specific options
name, sample_opts = sample_info
Expand Down Expand Up @@ -104,7 +104,7 @@ def cli():
required=True,
type=str,
help="Samples by Genes matrix with metadata columns first (including a group column that "
"discriminates samples by some category) (csv/tsv/hd5)",
"discriminates samples by some category) (csv/tsv/hd5)",
)
parser.add_argument(
"--manifest",
Expand Down Expand Up @@ -139,8 +139,8 @@ def cli():
default=100,
type=int,
help="Maximum number of genes to run. I.e. if a gene list is input, how many additional"
"genes to add via SelectKBest. Useful for improving beta coefficients"
"if gene list does not contain enough tissue-specific genes.",
"genes to add via SelectKBest. Useful for improving beta coefficients"
"if gene list does not contain enough tissue-specific genes.",
)
parser.add_argument(
"--num-training-genes",
Expand Down Expand Up @@ -202,7 +202,7 @@ def partitions(l, partition_size):
:param int partition_size: Size of partitions
"""
for i in xrange(0, len(l), partition_size):
yield l[i: i + partition_size]
yield l[i : i + partition_size]


def parse_manifest(manifest_path):
Expand All @@ -218,17 +218,21 @@ def parse_manifest(manifest_path):
:return:
Sample information with type: List[Tuple[str, dict]]
"""
df = pd.read_csv(manifest_path, sep='\t')
df = pd.read_csv(manifest_path, sep="\t")

# If manifest is single-column, then return samples and no options
if len(df.columns) == 1:
samples = [(x.strip(), None) for x in open(manifest_path, 'r').readlines() if not x.isspace()]
samples = [
(x.strip(), None)
for x in open(manifest_path, "r").readlines()
if not x.isspace()
]
return samples

# Otherwise return samples and option dictionary
else:
samples = []
df = pd.read_csv(manifest_path, sep='\t')
df = pd.read_csv(manifest_path, sep="\t")
sample_ids = list(df.iloc[:, 0])
df = df.drop(df.columns[0], axis=1)
for i, row in df.iterrows():
Expand Down

0 comments on commit 8c983c5

Please sign in to comment.