Skip to content

Commit

Permalink
Merge pull request #112 from hammerlab/p_star
Browse files Browse the repository at this point in the history
Add p-value indicator
  • Loading branch information
tavinathanson committed Aug 11, 2016
2 parents 4a45617 + 0bfdc82 commit 606c57f
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 31 deletions.
40 changes: 28 additions & 12 deletions cohorts/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from .varcode_utils import (filter_variants, filter_effects,
filter_neoantigens, filter_polyphen)
from .variant_filters import no_filter
from .styling import set_styling
from . import variant_filters

class InvalidDataError(ValueError):
Expand Down Expand Up @@ -307,6 +308,8 @@ def __init__(self,
if print_provenance:
pprint.pprint(self.summarize_data_sources())

set_styling()

def verify_id_uniqueness(self):
patient_ids = set([patient.id for patient in self])
if len(patient_ids) != len(self):
Expand Down Expand Up @@ -1089,31 +1092,35 @@ def plot_benefit(self, on, col=None, benefit_col="benefit", label="Response", ax
ax=ax,
**kwargs)

def plot_boolean(self,
on,
boolean_col,
def plot_boolean(self,
on,
boolean_col,
plot_col=None,
boolean_label=None,
boolean_value_map={},
col=None,
order=None,
col=None,
order=None,
ax=None,
alternative="two-sided",
alternative="two-sided",
**kwargs):
"""Plot a comparison of `boolean_col` in the cohort on a given variable via
`on` or `col`.
If the variable (through `on` or `col`) is binary this will compare
odds-ratios and perform a Fisher's exact test.
If the variable is numeric, this will compare the distributions through
a Mann-Whitney test and plot the distributions with box-strip plot
Parameters
----------
on : str or function
on : str or function or list or dict
See `cohort.load.as_dataframe`
plot_col : str, optional
If on has many columns, this is the one whose values we are plotting.
If on has a single column, this is unnecessary.
boolean_col : str
Column name of boolean column to plot or compare against
Column name of boolean column to plot or compare against.
boolean_label : None, optional
Label to give boolean column in the plot
boolean_value_map : dict, optional
Expand All @@ -1132,7 +1139,15 @@ def plot_boolean(self,
(Test statistic, p-value): (float, float)
"""
plot_col, df = self.as_dataframe(on, col, **kwargs)
cols, df = self.as_dataframe(on, col, **kwargs)
if type(cols) == str:
if plot_col is not None:
raise ValueError("plot_col is specified when it isn't ndeeded.")
plot_col = cols
elif type(cols) == list:
if plot_col is None:
raise ValueError("plot_col must be specified when multiple `on`s are present.")

df = filter_not_null(df, boolean_col)
df = filter_not_null(df, plot_col)

Expand All @@ -1153,6 +1168,7 @@ def plot_boolean(self,
condition1=boolean_col,
condition2=plot_col,
alternative=alternative,
order=order,
ax=ax)
else:
results = mann_whitney_plot(
Expand Down
107 changes: 88 additions & 19 deletions cohorts/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,52 @@
from sklearn.metrics import roc_curve
from .model import bootstrap_auc

def stripboxplot(x, y, data, ax=None, **kwargs):
def vertical_percent(plot, percent=0.1):
"""
Using the size of the y axis, return a fraction of that size.
"""
plot_bottom, plot_top = plot.get_ylim()
return percent * (plot_top - plot_bottom)

def as_numeric(text):
try:
return float(text)
except:
return None

def hide_negative_y_ticks(plot):
y_ticks = plot.get_yticks()
plot.set_yticks([tick for tick in y_ticks if as_numeric(tick) is not None and as_numeric(tick) >= 0])

def only_percentage_ticks(plot):
"""
Only show ticks from 0.0 to 1.0.
"""
hide_negative_y_ticks(plot)
y_ticks = plot.get_yticks()
less_1_ticks = [tick for tick in y_ticks if as_numeric(tick) is not None and as_numeric(tick) <= 1]
if 1.0 not in less_1_ticks:
less_1_ticks.append(1.0)
plot.set_yticks(less_1_ticks)

def add_significance_indicator(plot, col_a=0, col_b=1, significant=False):
"""
Add a p-value significance indicator.
"""
plot_bottom, plot_top = plot.get_ylim()
# Give the plot a little room for the significance indicator
line_height = vertical_percent(plot, 0.1)
# Add some extra spacing below the indicator
plot_top = plot_top + line_height
# Add some extra spacing above the indicator
plot.set_ylim(top=plot_top + line_height * 2)
color = "black"
line_top = plot_top + line_height
plot.plot([col_a, col_a, col_b, col_b], [plot_top, line_top, line_top, plot_top], lw=1.5, color=color)
indicator = "*" if significant else "ns"
plot.text((col_a + col_b) * 0.5, line_top, indicator, ha="center", va="bottom", color=color)

def stripboxplot(x, y, data, ax=None, significant=None, **kwargs):
"""
Overlay a stripplot on top of a boxplot.
"""
Expand All @@ -35,7 +80,7 @@ def stripboxplot(x, y, data, ax=None, **kwargs):
**kwargs
)

return sb.stripplot(
plot = sb.stripplot(
x=x,
y=y,
data=data,
Expand All @@ -45,6 +90,13 @@ def stripboxplot(x, y, data, ax=None, **kwargs):
**kwargs
)

if data[y].min() >= 0:
hide_negative_y_ticks(plot)
if significant is not None:
add_significance_indicator(plot=plot, significant=significant)

return plot

def sided_str_from_alternative(alternative, condition):
if alternative is None:
raise ValueError("Must pick an alternative")
Expand All @@ -54,9 +106,15 @@ def sided_str_from_alternative(alternative, condition):
op_str = ">" if alternative == "greater" else "<"
return "one-sided: %s %s not %s" % (condition, op_str, condition)

FishersExactResults = namedtuple("FishersExactResults", ["oddsratio", "pvalue", "sided_str", "plot"])
class FishersExactResults(namedtuple("FishersExactResults", ["oddsratio", "pvalue", "sided_str", "plot"])):
def __str__(self):
return "FishersExactResults(oddsratio=%s, pvalue=%s, sided_str='%s')" % (
self.oddsratio, self.pvalue, self.sided_str)

def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-sided"):
def __repr__(self):
return self.__str__()

def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-sided", **kwargs):
"""
Perform a Fisher's exact test to compare to binary columns
Expand All @@ -82,11 +140,15 @@ def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-s
x=condition1,
y=condition2,
ax=ax,
data=data
data=data,
**kwargs
)

count_table = pd.crosstab(data[condition1], data[condition2])
print(count_table)
oddsratio, pvalue = fisher_exact(count_table, alternative=alternative)
only_percentage_ticks(plot)
add_significance_indicator(plot=plot, significant=pvalue <= 0.05)
if alternative != "two-sided":
raise ValueError("We need to better understand the one-sided Fisher's Exact test")
sided_str = "two-sided"
Expand All @@ -96,13 +158,19 @@ def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-s
sided_str=sided_str,
plot=plot)

MannWhitneyResults = namedtuple("MannWhitneyResults", ["U", "pvalue", "sided_str", "with_condition_series", "without_condition_series", "plot"])
class MannWhitneyResults(namedtuple("MannWhitneyResults", ["U", "pvalue", "sided_str", "with_condition_series", "without_condition_series", "plot"])):
def __str__(self):
return "MannWhitneyResults(U=%s, pvalue=%s, sided_str='%s')" % (
self.U, self.pvalue, self.sided_str)

def mann_whitney_plot(data,
def __repr__(self):
return self.__str__()

def mann_whitney_plot(data,
condition,
distribution,
distribution,
ax=None,
condition_value=None,
condition_value=None,
alternative="two-sided",
skip_plot=False,
**kwargs):
Expand Down Expand Up @@ -134,16 +202,6 @@ def mann_whitney_plot(data,
skip_plot:
Calculate the test statistic and p-value, but don't plot.
"""
plot = None
if not skip_plot:
plot = stripboxplot(
x=condition,
y=distribution,
data=data,
ax=ax,
**kwargs
)

if condition_value:
condition_mask = data[condition] == condition_value
else:
Expand All @@ -154,6 +212,17 @@ def mann_whitney_plot(data,
alternative=alternative
)

plot = None
if not skip_plot:
plot = stripboxplot(
x=condition,
y=distribution,
data=data,
ax=ax,
significant=pvalue <= 0.05,
**kwargs
)

sided_str = sided_str_from_alternative(alternative, condition)
print("Mann-Whitney test: U={}, p-value={} ({})".format(U, pvalue, sided_str))
return MannWhitneyResults(U=U,
Expand Down
51 changes: 51 additions & 0 deletions cohorts/random_cohort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import pandas as pd
from numpy.random import choice, randint, seed

from . import Cohort, Patient

def random_cohort(size, cache_dir, seed_val=1234):
seed(seed_val)
d = {}
d["id"] = [str(id) for id in range(size)]
d["age"] = choice([10, 15, 28, 32, 59, 62, 64, 66, 68], size)
d["OS"] = [os + randint(10) for os in choice([10, 100, 500, 1000], size)]
# Note: these values are not currently consistent with each other.
d["PFS"] = [int(os * 0.6) for os in d["OS"]]
d["benefit"] = choice([False, True], size)
d["random"] = [randint(100) for i in range(size)]
d["random_boolean"] = choice([False, True], size)
d["benefit_correlate"] = [randint(50) if benefit else randint(20) for benefit in d["benefit"]]
d["benefit_correlate_boolean"] = [True if corr > 10 else False for corr in d["benefit_correlate"]]
d["deceased"] = choice([False, True], size)
d["progressed_or_deceased"] = choice([False, True], size)
df = pd.DataFrame(d)
patients = []
for i, row in df.iterrows():
patient = Patient(
id=row["id"],
os=row["OS"],
pfs=row["PFS"],
benefit=row["benefit"],
deceased=row["deceased"],
progressed_or_deceased=row["progressed_or_deceased"],
additional_data=row)
patients.append(patient)
return Cohort(
patients=patients,
cache_dir=cache_dir)
36 changes: 36 additions & 0 deletions cohorts/styling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import matplotlib as mpl
import matplotlib.colors as colors
import seaborn as sb
import numpy as np

def set_styling():
sb.set_style("white")
red = colors.hex2color("#bb3f3f")
blue = colors.hex2color("#5a86ad")
deep_colors = sb.color_palette("deep")
green = deep_colors[1]
custom_palette = [red, blue, green]
custom_palette.extend(deep_colors[3:])
sb.set_palette(custom_palette)
mpl.rcParams.update({"figure.figsize": np.array([6, 6]),
"font.size": 16,
"axes.labelsize": 16,
"axes.labelweight": "bold",
"xtick.labelsize": 16,
"ytick.labelsize": 16})

0 comments on commit 606c57f

Please sign in to comment.