Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor cohorts #14

Merged
merged 19 commits into from
May 3, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
53 changes: 32 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,41 +20,52 @@ Usage Examples
--------------

```python
patient_1 = Patient(
id="patient_1",
os=70,
pfs=24,
deceased=True,
progressed=True,
benefit=False)
patient_2 = Patient(
id="patient_2",
os=100,
pfs=50,
deceased=False,
progressed=True,
benefit=False)
)
cohort = Cohort(
data_dir="/my/input/data",
cache_dir="/where/cohorts/results/get/saved",
sample_ids=["sample_1", "sample_2"],
clinical_dataframe=pandas_dataframe_with_clinical_data,
clinical_dataframe_id_col="sample_id_in_dataframe",
os_col="Overall Survival",
pfs_col="Progression-Free Survival",
deceased_col="deceased",
progressed_or_deceased_col="progressed_or_deceased"
patients=[patient_1, patient_2],
cache_dir="/where/cohorts/results/get/saved"
)

cohort.plot_survival(how="os")
```

```python
def mutect_snv_file_format_func(sample_id, normal_bam_id, tumor_bam_id):
return "Mutect-%d-normal=%s.bam-tumor=%s.bam-merged.vcf" % (
sample_id, normal_bam_id, tumor_bam_id)

def strelka_snv_file_format_func(...):
sample_1_tumor = Sample(
id="sample_1_tumor",
bam_path_dna="/path/to/dna/bam",
bam_path_rna="/path/to/rna/bam"
)
patient_1 = Patient(
id="patient_1",
...

snv_vcf_paths=["/where/my/mutect/vcfs/live",
"/where/my/strelka/vcfs/live"]
indel_vcfs_paths=[...],
tumor_sample=sample_1_tumor,
...
)
cohort = Cohort(
...
benefit_col="patient_durable_benefit",
snv_file_format_funcs=[
mutect_snv_file_format_func,
strelka_snv_file_format_func
]
patients=[patient_1]
)

# Comparison plot of missense mutation counts between benefit and no-benefit patients
cohort.plot_benefit(on=missense_snv_count)

# Raw missense mutations counts
missense_snv_col, updated_dataframe = missense_snv_count(cohort)
missense_snv_col, dataframe = missense_snv_count(cohort)
```
61 changes: 61 additions & 0 deletions cohorts/collection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

class Collection(object):
    """
    Thin wrapper around a sequence of elements which adds compact and
    truncated string representations on top of the usual container
    protocol (`len`, iteration and indexing).
    """

    def __init__(self, elements):
        """
        Parameters
        ----------
        elements : sequence
            Items held by this collection. The object is stored as-is (not
            copied); it must support `len`, iteration, indexing and slicing,
            since `to_string` slices it.
        """
        self.elements = elements

    def short_string(self):
        """
        Compact string representation which doesn't print any of the
        collection elements.
        """
        return "<%s with %d elements>" % (
            self.__class__.__name__,
            len(self))

    def to_string(self, limit=None):
        """
        Create a string representation of this collection, showing up to
        `limit` items.

        Parameters
        ----------
        limit : int or None
            Maximum number of elements to print. `None` prints every element.

        Returns
        -------
        str
            The `short_string` header alone for an empty collection;
            otherwise the header followed by one line per displayed element
            and, when elements were omitted, a trailing line giving the
            number left out.
        """
        header = self.short_string()
        if len(self) == 0:
            return header
        # Slicing with limit=None yields every element.
        element_lines = [
            " -- %s" % (element,)
            for element in self.elements[:limit]
        ]
        contents = "\n".join(element_lines)

        if limit is not None and len(self.elements) > limit:
            contents += "\n ... and %d more" % (len(self) - limit)
        return "%s\n%s" % (header, contents)

    def __str__(self):
        # Cap the default rendering so printing a large collection stays
        # readable.
        return self.to_string(limit=50)

    def __repr__(self):
        return str(self)

    def __len__(self):
        return len(self.elements)

    def __iter__(self):
        return iter(self.elements)

    def __getitem__(self, idx):
        return self.elements[idx]
44 changes: 22 additions & 22 deletions cohorts/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,48 +21,48 @@
from varcode.effects import Substitution

def snv_count(cohort, **kwargs):
sample_variants = cohort.load_variants(**kwargs)
def count_func(sample):
if sample in sample_variants:
return len(sample_variants[sample])
patient_variants = cohort.load_variants(**kwargs)
def count_func(patient_id):
if patient_id in patient_variants:
return len(patient_variants[patient_id])
return np.nan
return count(cohort, count_func, count_col="snv_count")

def nonsynonymous_snv_count(cohort, **kwargs):
sample_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
def count_func(sample):
if sample in sample_nonsynonymous_effects:
return len(sample_nonsynonymous_effects[sample])
patient_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
def count_func(patient_id):
if patient_id in patient_nonsynonymous_effects:
return len(patient_nonsynonymous_effects[patient_id])
return np.nan
return count(cohort, count_func, count_col="nonsynonymous_snv_count")

def missense_snv_count(cohort, **kwargs):
sample_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
sample_missense_effects = dict(
[(sample,
patient_nonsynonymous_effects = cohort.load_effects(only_nonsynonymous=True, **kwargs)
patient_missense_effects = dict(
[(patient_id,
EffectCollection(
[effect for effect in effects if type(effect) == Substitution]))
for (sample, effects) in sample_nonsynonymous_effects.items()])
def count_func(sample):
if sample in sample_missense_effects:
return len(sample_missense_effects[sample])
for (patient_id, effects) in patient_nonsynonymous_effects.items()])
def count_func(patient_id):
if patient_id in patient_missense_effects:
return len(patient_missense_effects[patient_id])
return np.nan
return count(cohort, count_func, count_col="missense_snv_count")

def neoantigen_count(cohort, **kwargs):
sample_neoantigens = cohort.load_neoantigens(**kwargs)
def count_func(sample):
if sample in sample_neoantigens["sample_id"].unique():
return len(sample_neoantigens[sample_neoantigens["sample_id"] == sample])
patient_neoantigens = cohort.load_neoantigens(**kwargs)
def count_func(patient_id):
if patient_id in patient_neoantigens["patient_id"].unique():
return len(patient_neoantigens[patient_neoantigens["patient_id"] == patient_id])
return np.nan
return count(cohort, count_func, count_col="neoantigen_count")

def count(cohort, count_func, count_col):
df = cohort.clinical_dataframe.copy()
df[count_col] = df[cohort.clinical_dataframe_id_col].map(count_func)
df = cohort.as_dataframe()
df[count_col] = df["patient_id"].map(count_func)
original_len = len(df)
df = df[~df[count_col].isnull()]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we warn or alert when these are null?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a change here, but I might prefer getting the dataframe with the NA values. Most numpy/pandas/seaborn functions are good about handling them and you don't lose the other info on the patient. Can be handled elsewhere.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree this should be handled better; I'll file an issue?

updated_len = len(df)
if updated_len < original_len:
print("Missing count for %d samples: from %d to %d" % (original_len - updated_len, original_len, updated_len))
print("Missing count for %d patients: from %d to %d" % (original_len - updated_len, original_len, updated_len))
return count_col, df