hammerlab · tavinathanson · May 3, 2016 · May 1, 2016 · Apr 24, 2016 · Apr 24, 2016
diff --git a/README.md b/README.md
@@ -20,41 +20,52 @@ Usage Examples
 --------------
 
 ```python
+patient_1 = Patient(
+    id="patient_1",
+    os=70,
+    pfs=24,
+    deceased=True,
+    progressed=True,
+    benefit=False)
+patient_2 = Patient(
+    id="patient_2",
+    os=100,
+    pfs=50,
+    deceased=False,
+    progressed=True,
+    benefit=False
+)
 cohort = Cohort(
-    data_dir="/my/input/data",
-    cache_dir="/where/cohorts/results/get/saved",
-    sample_ids=["sample_1", "sample_2"],
-    clinical_dataframe=pandas_dataframe_with_clinical_data,
-    clinical_dataframe_id_col="sample_id_in_dataframe",
-    os_col="Overall Survival",
-    pfs_col="Progression-Free Survival",
-    deceased_col="deceased",
-    progressed_or_deceased_col="progressed_or_deceased"
+    patients=[patient_1, patient_2],
+    cache_dir="/where/cohorts/results/get/saved"
 )
 
 cohort.plot_survival(how="os")
 ```
 
 ```python
-def mutect_snv_file_format_func(sample_id, normal_bam_id, tumor_bam_id):
-    return "Mutect-%d-normal=%s.bam-tumor=%s.bam-merged.vcf" % (
-        sample_id, normal_bam_id, tumor_bam_id)
-
-def strelka_snv_file_format_func(...):
+sample_1_tumor = Sample(
+    id="sample_1_tumor",
+    bam_path_dna="/path/to/dna/bam",
+    bam_path_rna="/path/to/rna/bam"
+)
+patient_1 = Patient(
+    id="patient_1",
     ...
-
+    snv_vcf_paths=["/where/my/mutect/vcfs/live",
+                   "/where/my/strelka/vcfs/live"],
+    indel_vcf_paths=[...],
+    tumor_sample=sample_1_tumor,
+    ...
+)
 cohort = Cohort(
     ...
-    benefit_col="patient_durable_benefit",
-    snv_file_format_funcs=[
-        mutect_snv_file_format_func,
-        strelka_snv_file_format_func
-    ]
+    patients=[patient_1]
 )
 
 # Comparison plot of missense mutation counts between benefit and no-benefit patients
 cohort.plot_benefit(on=missense_snv_count)
 
 # Raw missense mutations counts
-missense_snv_col, updated_dataframe = missense_snv_count(cohort)
+missense_snv_col, dataframe = missense_snv_count(cohort)
 ```
diff --git a/cohorts/collection.py b/cohorts/collection.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+class Collection(object):
+    """
+    Thin wrapper around a sequence of elements that adds compact and
+    truncated string representations, and delegates `len`, iteration and
+    indexing to the wrapped sequence.
+    """
+    def __init__(self, elements):
+        # `elements` is sliced in `to_string`, so it needs to support `len`,
+        # iteration, indexing and slicing (e.g. a list).
+        self.elements = elements
+
+    def short_string(self):
+        """
+        Compact string representation which doesn't print any of the
+        collection elements.
+        """
+        return "<%s with %d elements>" % (
+            self.__class__.__name__,
+            len(self))
+
+    def to_string(self, limit=None):
+        """
+        Create a string representation of this collection, showing up to
+        `limit` items.
+
+        The first line is `short_string()`; each shown element follows on
+        its own line, and a final "... and N more" line reports how many
+        elements were left out. With `limit=None` every element is shown.
+        """
+        header = self.short_string()
+        if len(self) == 0:
+            return header
+        shown = self.elements[:limit]
+        lines = [header]
+        # The 1-tuple keeps "%s" formatting correct when an element is
+        # itself a tuple.
+        lines.extend("  -- %s" % (element,) for element in shown)
+        # Count what the slice actually left out instead of `len - limit`, so
+        # the summary is right for any slice bound, and emit it as its own
+        # line so that `limit=0` does not produce a blank line.
+        n_hidden = len(self) - len(shown)
+        if n_hidden > 0:
+            lines.append("  ... and %d more" % n_hidden)
+        return "\n".join(lines)
+
+    def __str__(self):
+        # Cap the default rendering at 50 elements so that printing a large
+        # collection stays readable.
+        return self.to_string(limit=50)
+
+    def __repr__(self):
+        return str(self)
+
+    def __len__(self):
+        return len(self.elements)
+
+    def __iter__(self):
+        return iter(self.elements)
+
+    def __getitem__(self, idx):
+        return self.elements[idx]
diff --git a/cohorts/count.py b/cohorts/count.py
@@ -21,48 +21,48 @@
 from varcode.effects import Substitution
 
 def snv_count(cohort, **kwargs):
+    """
+    Count the variants loaded for each patient in the cohort.
+
+    `kwargs` are forwarded to `cohort.load_variants`. A patient ID that is
+    missing from the loaded variants maps to NaN, which `count` then drops.
+
+    Returns the `(count_col, dataframe)` pair produced by `count`, with
+    `count_col` set to "snv_count".
+    """
-    sample_variants = cohort.load_variants(**kwargs)
-    def count_func(sample):
-        if sample in sample_variants:
-            return len(sample_variants[sample])
+    patient_variants = cohort.load_variants(**kwargs)
+    def count_func(patient_id):
+        if patient_id in patient_variants:
+            return len(patient_variants[patient_id])
         return np.nan
     return count(cohort, count_func, count_col="snv_count")
 
 def nonsynonymous_snv_count(cohort, **kwargs):
+    """
+    Count the nonsynonymous effects loaded for each patient in the cohort.
+
+    `kwargs` are forwarded to `cohort.load_effects`, which is always called
+    with `only_nonsynonymous=True`. A patient ID that is missing from the
+    loaded effects maps to NaN, which `count` then drops.
+
+    Returns the `(count_col, dataframe)` pair produced by `count`, with
+    `count_col` set to "nonsynonymous_snv_count".
+    """
-    sample_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
-    def count_func(sample):
-        if sample in sample_nonsynonymous_effects:
-            return len(sample_nonsynonymous_effects[sample])
+    patient_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
+    def count_func(patient_id):
+        if patient_id in patient_nonsynonymous_effects:
+            return len(patient_nonsynonymous_effects[patient_id])
         return np.nan
     return count(cohort, count_func, count_col="nonsynonymous_snv_count")
 
 def missense_snv_count(cohort, **kwargs):
+    """
+    Count, for each patient, the nonsynonymous effects whose type is exactly
+    `Substitution`.
+
+    `kwargs` are forwarded to `cohort.load_effects`, which is always called
+    with `only_nonsynonymous=True`. The filter uses `type(effect) ==
+    Substitution`, so subclasses of `Substitution` are not counted. A patient
+    ID that is missing from the loaded effects maps to NaN, which `count`
+    then drops.
+
+    Returns the `(count_col, dataframe)` pair produced by `count`, with
+    `count_col` set to "missense_snv_count".
+    """
-    sample_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
-    sample_missense_effects = dict(
-        [(sample,
+    patient_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
+    patient_missense_effects = dict(
+        [(patient_id,
           EffectCollection(
               [effect for effect in effects if type(effect) == Substitution]))
-         for (sample, effects) in sample_nonsynonymous_effects.items()])
-    def count_func(sample):
-        if sample in sample_missense_effects:
-            return len(sample_missense_effects[sample])
+         for (patient_id, effects) in patient_nonsynonymous_effects.items()])
+    def count_func(patient_id):
+        if patient_id in patient_missense_effects:
+            return len(patient_missense_effects[patient_id])
         return np.nan
     return count(cohort, count_func, count_col="missense_snv_count")
 
 def neoantigen_count(cohort, **kwargs):
+    """
+    Count the neoantigen rows loaded for each patient in the cohort.
+
+    `kwargs` are forwarded to `cohort.load_neoantigens`. The result is
+    indexed by a "patient_id" column and filtered with a boolean mask, so it
+    is presumably a pandas DataFrame with one row per neoantigen (confirm
+    against `load_neoantigens`). A patient ID that does not occur in that
+    column maps to NaN, which `count` then drops.
+
+    Returns the `(count_col, dataframe)` pair produced by `count`, with
+    `count_col` set to "neoantigen_count".
+    """
-    sample_neoantigens = cohort.load_neoantigens(**kwargs)
-    def count_func(sample):
-        if sample in sample_neoantigens["sample_id"].unique():
-            return len(sample_neoantigens[sample_neoantigens["sample_id"] == sample])
+    patient_neoantigens = cohort.load_neoantigens(**kwargs)
+    def count_func(patient_id):
+        if patient_id in patient_neoantigens["patient_id"].unique():
+            return len(patient_neoantigens[patient_neoantigens["patient_id"] == patient_id])
         return np.nan
     return count(cohort, count_func, count_col="neoantigen_count")
 
 def count(cohort, count_func, count_col):
+    """
+    Add a per-patient count column to the cohort's dataframe.
+
+    Parameters
+    ----------
+    cohort
+        Object providing `as_dataframe()`; the returned frame must have a
+        "patient_id" column.
+    count_func
+        Callable mapping a patient ID to its count, or to a null value (e.g.
+        NaN) when no count is available for that patient.
+    count_col
+        Name of the column that receives the counts.
+
+    Returns `(count_col, df)`, where `df` only keeps the rows whose count is
+    not null. When rows are dropped, a message with the number of dropped
+    rows is printed.
+
+    NOTE(review): the column is assigned on the frame returned by
+    `as_dataframe()` without copying it first; this assumes `as_dataframe()`
+    returns a fresh frame on every call -- confirm.
+    """
-    df = cohort.clinical_dataframe.copy()
-    df[count_col] = df[cohort.clinical_dataframe_id_col].map(count_func)
+    df = cohort.as_dataframe()
+    df[count_col] = df["patient_id"].map(count_func)
     original_len = len(df)
     df = df[~df[count_col].isnull()]
     updated_len = len(df)
     if updated_len < original_len:
-        print("Missing count for %d samples: from %d to %d" % (original_len - updated_len, original_len, updated_len))
+        print("Missing count for %d patients: from %d to %d" % (original_len - updated_len, original_len, updated_len))
     return count_col, df