Merge pull request #112 from hammerlab/p_star

Add p-value indicator
hammerlab · Aug 11, 2016 · 606c57f · 606c57f
2 parents 4a45617 + 0bfdc82
commit 606c57f
Show file tree

Hide file tree

Showing 4 changed files with 203 additions and 31 deletions.
diff --git a/cohorts/load.py b/cohorts/load.py
@@ -47,6 +47,7 @@
 from .varcode_utils import (filter_variants, filter_effects,
                             filter_neoantigens, filter_polyphen)
 from .variant_filters import no_filter
+from .styling import set_styling
 from . import variant_filters
 
 class InvalidDataError(ValueError):
@@ -307,6 +308,8 @@ def __init__(self,
         if print_provenance:
             pprint.pprint(self.summarize_data_sources())
 
+        set_styling()
+
     def verify_id_uniqueness(self):
         patient_ids = set([patient.id for patient in self])
         if len(patient_ids) != len(self):
@@ -1089,31 +1092,35 @@ def plot_benefit(self, on, col=None, benefit_col="benefit", label="Response", ax
                                  ax=ax,
                                  **kwargs)
 
-    def plot_boolean(self, 
-                     on, 
-                     boolean_col, 
+    def plot_boolean(self,
+                     on,
+                     boolean_col,
+                     plot_col=None,
                      boolean_label=None,
                      boolean_value_map={},
-                     col=None, 
-                     order=None, 
+                     col=None,
+                     order=None,
                      ax=None,
-                     alternative="two-sided", 
+                     alternative="two-sided",
                      **kwargs):
         """Plot a comparison of `boolean_col` in the cohort on a given variable via
         `on` or `col`.
-        
+
         If the variable (through `on` or `col`) is binary this will compare
         odds-ratios and perform a Fisher's exact test.
-        
+
         If the variable is numeric, this will compare the distributions through
         a Mann-Whitney test and plot the distributions with box-strip plot
-        
+
         Parameters
         ----------
-        on : str or function
+        on : str or function or list or dict
             See `cohort.load.as_dataframe`
+        plot_col : str, optional
+            If on has many columns, this is the one whose values we are plotting.
+            If on has a single column, this is unnecessary.
         boolean_col : str
-            Column name of boolean column to plot or compare against
+            Column name of boolean column to plot or compare against.
         boolean_label : None, optional
             Label to give boolean column in the plot
         boolean_value_map : dict, optional
@@ -1132,7 +1139,15 @@ def plot_boolean(self,
         (Test statistic, p-value): (float, float)
 
         """
-        plot_col, df = self.as_dataframe(on, col, **kwargs)
+        cols, df = self.as_dataframe(on, col, **kwargs)
+        if type(cols) == str:
+            if plot_col is not None:
+                raise ValueError("plot_col is specified when it isn't ndeeded.")
+            plot_col = cols
+        elif type(cols) == list:
+            if plot_col is None:
+                raise ValueError("plot_col must be specified when multiple `on`s are present.")
+
         df = filter_not_null(df, boolean_col)
         df = filter_not_null(df, plot_col)
 
@@ -1153,6 +1168,7 @@ def plot_boolean(self,
                 condition1=boolean_col,
                 condition2=plot_col,
                 alternative=alternative,
+                order=order,
                 ax=ax)
         else:
             results = mann_whitney_plot(

diff --git a/cohorts/plot.py b/cohorts/plot.py
@@ -22,7 +22,52 @@
 from sklearn.metrics import roc_curve
 from .model import bootstrap_auc
 
-def stripboxplot(x, y, data, ax=None, **kwargs):
+def vertical_percent(plot, percent=0.1):
+    """
+    Using the size of the y axis, return a fraction of that size.
+    """
+    plot_bottom, plot_top = plot.get_ylim()
+    return percent * (plot_top - plot_bottom)
+
+def as_numeric(text):
+    try:
+        return float(text)
+    except:
+        return None
+
+def hide_negative_y_ticks(plot):
+    y_ticks = plot.get_yticks()
+    plot.set_yticks([tick for tick in y_ticks if as_numeric(tick) is not None and as_numeric(tick) >= 0])
+
+def only_percentage_ticks(plot):
+    """
+    Only show ticks from 0.0 to 1.0.
+    """
+    hide_negative_y_ticks(plot)
+    y_ticks = plot.get_yticks()
+    less_1_ticks = [tick for tick in y_ticks if as_numeric(tick) is not None and as_numeric(tick) <= 1]
+    if 1.0 not in less_1_ticks:
+        less_1_ticks.append(1.0)
+    plot.set_yticks(less_1_ticks)
+
+def add_significance_indicator(plot, col_a=0, col_b=1, significant=False):
+    """
+    Add a p-value significance indicator.
+    """
+    plot_bottom, plot_top = plot.get_ylim()
+    # Give the plot a little room for the significance indicator
+    line_height = vertical_percent(plot, 0.1)
+    # Add some extra spacing below the indicator
+    plot_top = plot_top + line_height
+    # Add some extra spacing above the indicator
+    plot.set_ylim(top=plot_top + line_height * 2)
+    color = "black"
+    line_top = plot_top + line_height
+    plot.plot([col_a, col_a, col_b, col_b], [plot_top, line_top, line_top, plot_top], lw=1.5, color=color)
+    indicator = "*" if significant else "ns"
+    plot.text((col_a + col_b) * 0.5, line_top, indicator, ha="center", va="bottom", color=color)
+
+def stripboxplot(x, y, data, ax=None, significant=None, **kwargs):
     """
     Overlay a stripplot on top of a boxplot.
     """
@@ -35,7 +80,7 @@ def stripboxplot(x, y, data, ax=None, **kwargs):
         **kwargs
     )
 
-    return sb.stripplot(
+    plot = sb.stripplot(
         x=x,
         y=y,
         data=data,
@@ -45,6 +90,13 @@ def stripboxplot(x, y, data, ax=None, **kwargs):
         **kwargs
     )
 
+    if data[y].min() >= 0:
+        hide_negative_y_ticks(plot)
+    if significant is not None:
+        add_significance_indicator(plot=plot, significant=significant)
+
+    return plot
+
 def sided_str_from_alternative(alternative, condition):
     if alternative is None:
         raise ValueError("Must pick an alternative")
@@ -54,9 +106,15 @@ def sided_str_from_alternative(alternative, condition):
     op_str = ">" if alternative == "greater" else "<"
     return "one-sided: %s %s not %s" % (condition, op_str, condition)
 
-FishersExactResults = namedtuple("FishersExactResults", ["oddsratio", "pvalue", "sided_str", "plot"])
+class FishersExactResults(namedtuple("FishersExactResults", ["oddsratio", "pvalue", "sided_str", "plot"])):
+    def __str__(self):
+        return "FishersExactResults(oddsratio=%s, pvalue=%s, sided_str='%s')" % (
+            self.oddsratio, self.pvalue, self.sided_str)
 
-def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-sided"):
+    def __repr__(self):
+        return self.__str__()
+
+def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-sided", **kwargs):
     """
     Perform a Fisher's exact test to compare to binary columns
 
@@ -82,11 +140,15 @@ def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-s
         x=condition1,
         y=condition2,
         ax=ax,
-        data=data
+        data=data,
+        **kwargs
     )
+
     count_table = pd.crosstab(data[condition1], data[condition2])
     print(count_table)
     oddsratio, pvalue = fisher_exact(count_table, alternative=alternative)
+    only_percentage_ticks(plot)
+    add_significance_indicator(plot=plot, significant=pvalue <= 0.05)
     if alternative != "two-sided":
         raise ValueError("We need to better understand the one-sided Fisher's Exact test")
     sided_str = "two-sided"
@@ -96,13 +158,19 @@ def fishers_exact_plot(data, condition1, condition2, ax=None, alternative="two-s
                                sided_str=sided_str,
                                plot=plot)
 
-MannWhitneyResults = namedtuple("MannWhitneyResults", ["U", "pvalue", "sided_str", "with_condition_series", "without_condition_series", "plot"])
+class MannWhitneyResults(namedtuple("MannWhitneyResults", ["U", "pvalue", "sided_str", "with_condition_series", "without_condition_series", "plot"])):
+    def __str__(self):
+        return "MannWhitneyResults(U=%s, pvalue=%s, sided_str='%s')" % (
+            self.U, self.pvalue, self.sided_str)
 
-def mann_whitney_plot(data, 
+    def __repr__(self):
+        return self.__str__()
+
+def mann_whitney_plot(data,
                       condition,
-                      distribution, 
+                      distribution,
                       ax=None,
-                      condition_value=None, 
+                      condition_value=None,
                       alternative="two-sided",
                       skip_plot=False,
                       **kwargs):
@@ -134,16 +202,6 @@ def mann_whitney_plot(data,
     skip_plot:
         Calculate the test statistic and p-value, but don't plot.
     """
-    plot = None
-    if not skip_plot:
-        plot = stripboxplot(
-            x=condition,
-            y=distribution,
-            data=data,
-            ax=ax,
-            **kwargs
-        )
-
     if condition_value:
         condition_mask = data[condition] == condition_value
     else:
@@ -154,6 +212,17 @@ def mann_whitney_plot(data,
         alternative=alternative
     )
 
+    plot = None
+    if not skip_plot:
+        plot = stripboxplot(
+            x=condition,
+            y=distribution,
+            data=data,
+            ax=ax,
+            significant=pvalue <= 0.05,
+            **kwargs
+        )
+
     sided_str = sided_str_from_alternative(alternative, condition)
     print("Mann-Whitney test: U={}, p-value={} ({})".format(U, pvalue, sided_str))
     return MannWhitneyResults(U=U,

diff --git a/cohorts/random_cohort.py b/cohorts/random_cohort.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import pandas as pd
+from numpy.random import choice, randint, seed
+
+from . import Cohort, Patient
+
+def random_cohort(size, cache_dir, seed_val=1234):
+    seed(seed_val)
+    d = {}
+    d["id"] = [str(id) for id in range(size)]
+    d["age"] = choice([10, 15, 28, 32, 59, 62, 64, 66, 68], size)
+    d["OS"] = [os + randint(10) for os in choice([10, 100, 500, 1000], size)]
+    # Note: these values are not currently consistent with each other.
+    d["PFS"] = [int(os * 0.6) for os in d["OS"]]
+    d["benefit"] = choice([False, True], size)
+    d["random"] = [randint(100) for i in range(size)]
+    d["random_boolean"] = choice([False, True], size)
+    d["benefit_correlate"] = [randint(50) if benefit else randint(20) for benefit in d["benefit"]]
+    d["benefit_correlate_boolean"] = [True if corr > 10 else False for corr in d["benefit_correlate"]]
+    d["deceased"] = choice([False, True], size)
+    d["progressed_or_deceased"] = choice([False, True], size)
+    df = pd.DataFrame(d)
+    patients = []
+    for i, row in df.iterrows():
+         patient = Patient(
+             id=row["id"],
+             os=row["OS"],
+             pfs=row["PFS"],
+             benefit=row["benefit"],
+             deceased=row["deceased"],
+             progressed_or_deceased=row["progressed_or_deceased"],
+             additional_data=row)
+         patients.append(patient)
+    return Cohort(
+        patients=patients,
+        cache_dir=cache_dir)
diff --git a/cohorts/styling.py b/cohorts/styling.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import matplotlib as mpl
+import matplotlib.colors as colors
+import seaborn as sb
+import numpy as np
+
+def set_styling():
+    sb.set_style("white")
+    red = colors.hex2color("#bb3f3f")
+    blue = colors.hex2color("#5a86ad")
+    deep_colors = sb.color_palette("deep")
+    green = deep_colors[1]
+    custom_palette = [red, blue, green]
+    custom_palette.extend(deep_colors[3:])
+    sb.set_palette(custom_palette)
+    mpl.rcParams.update({"figure.figsize": np.array([6, 6]),
+                         "font.size": 16,
+                         "axes.labelsize": 16,
+                         "axes.labelweight": "bold",
+                         "xtick.labelsize": 16,
+                         "ytick.labelsize": 16})