-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor cohorts #14
Refactor cohorts #14
Changes from all commits
3d6cd68
476d25f
1c5dbb3
6f4f516
a015578
052b028
95e1287
03a12e6
064d6c8
f79bd59
cbb1f5a
265585e
42b6ad4
41dec1a
426b355
4d39cac
672e58b
feea216
96117d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# Copyright (c) 2016. Mount Sinai School of Medicine | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
class Collection(object):
    """
    Thin wrapper around a sequence of elements which adds compact string
    representations. `len`, iteration and indexing are all delegated
    directly to `self.elements`.
    """

    def __init__(self, elements):
        """
        Parameters
        ----------
        elements : sequence
            Stored as-is (not copied). The methods below call `len`,
            `iter`, indexing and slicing on it.
        """
        self.elements = elements

    def short_string(self):
        """
        Compact string representation which doesn't print any of the
        collection elements.
        """
        return "<%s with %d elements>" % (
            self.__class__.__name__,
            len(self))

    def to_string(self, limit=None):
        """
        Create a string representation of this collection, showing up to
        `limit` items.

        Parameters
        ----------
        limit : int or None
            Maximum number of elements to print. `None` prints every
            element. Negative values are treated as 0 so that the slice
            below never counts from the end of the sequence.

        Returns
        -------
        str
            The short header, followed by one line per shown element and,
            when elements were left out, a trailing line saying how many.
        """
        header = self.short_string()
        total = len(self)
        if total == 0:
            return header

        if limit is not None:
            limit = max(limit, 0)

        # Collect whole lines and join once; this avoids an empty line
        # between the header and the trailer when no elements are shown.
        lines = [header]
        lines.extend(
            " -- %s" % (element,)
            for element in self.elements[:limit])

        if limit is not None and total > limit:
            lines.append(" ... and %d more" % (total - limit))
        return "\n".join(lines)

    def __str__(self):
        # Cap the printed elements so huge collections stay readable.
        return self.to_string(limit=50)

    def __repr__(self):
        return str(self)

    def __len__(self):
        return len(self.elements)

    def __iter__(self):
        return iter(self.elements)

    def __getitem__(self, idx):
        return self.elements[idx]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,48 +21,48 @@ | |
from varcode.effects import Substitution | ||
|
||
def snv_count(cohort, **kwargs):
    """
    Count the variants loaded for each patient of `cohort`.

    Keyword arguments are forwarded to `cohort.load_variants`. A patient
    with no entry in the loaded variants gets NaN instead of a count.
    Returns the result of `count` for the "snv_count" column.
    """
    variants_by_patient = cohort.load_variants(**kwargs)

    def num_variants(patient_id):
        if patient_id not in variants_by_patient:
            return np.nan
        return len(variants_by_patient[patient_id])

    return count(cohort, num_variants, count_col="snv_count")
|
||
def nonsynonymous_snv_count(cohort, **kwargs):
    """
    Count the nonsynonymous effects loaded for each patient of `cohort`.

    Keyword arguments are forwarded to `cohort.load_effects`, which is
    always called with `only_nonsynonymous=True`. A patient with no entry
    in the loaded effects gets NaN instead of a count. Returns the result
    of `count` for the "nonsynonymous_snv_count" column.
    """
    effects_by_patient = cohort.load_effects(only_nonsynonymous=True, **kwargs)

    def num_effects(patient_id):
        if patient_id not in effects_by_patient:
            return np.nan
        return len(effects_by_patient[patient_id])

    return count(cohort, num_effects, count_col="nonsynonymous_snv_count")
|
||
def missense_snv_count(cohort, **kwargs):
    """
    Count the missense effects loaded for each patient of `cohort`.

    Nonsynonymous effects are loaded through `cohort.load_effects`
    (keyword arguments are forwarded) and narrowed to those whose type is
    exactly `Substitution`. A patient with no entry in the loaded effects
    gets NaN instead of a count. Returns the result of `count` for the
    "missense_snv_count" column.
    """
    nonsynonymous_by_patient = cohort.load_effects(only_nonsynonymous=True, **kwargs)

    missense_by_patient = {}
    for patient_id, effects in nonsynonymous_by_patient.items():
        # Exact type comparison: subclasses of Substitution are not kept.
        substitutions = [
            effect for effect in effects if type(effect) == Substitution]
        missense_by_patient[patient_id] = EffectCollection(substitutions)

    def num_missense(patient_id):
        if patient_id not in missense_by_patient:
            return np.nan
        return len(missense_by_patient[patient_id])

    return count(cohort, num_missense, count_col="missense_snv_count")
|
||
def neoantigen_count(cohort, **kwargs):
    """
    Count the neoantigen rows loaded for each patient of `cohort`.

    Keyword arguments are forwarded to `cohort.load_neoantigens`. A
    patient with no rows in the loaded neoantigens gets NaN instead of a
    count. Returns the result of `count` for the "neoantigen_count"
    column.
    """
    # NOTE(review): assumes load_neoantigens returns a pandas DataFrame
    # with a "patient_id" column -- confirm against the cohort class.
    patient_neoantigens = cohort.load_neoantigens(**kwargs)

    # Tally rows per patient once up front, rather than recomputing the
    # unique ids and a full boolean mask for every patient looked up.
    rows_per_patient = patient_neoantigens["patient_id"].value_counts().to_dict()

    def count_func(patient_id):
        return rows_per_patient.get(patient_id, np.nan)

    return count(cohort, count_func, count_col="neoantigen_count")
|
||
def count(cohort, count_func, count_col): | ||
df = cohort.clinical_dataframe.copy() | ||
df[count_col] = df[cohort.clinical_dataframe_id_col].map(count_func) | ||
df = cohort.as_dataframe() | ||
df[count_col] = df["patient_id"].map(count_func) | ||
original_len = len(df) | ||
df = df[~df[count_col].isnull()] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not a change here, but I might prefer getting the dataframe with the NA values. Most numpy/pandas/seaborn functions are good about handling them and you don't lose the other info on the patient. Can be handled elsewhere. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I agree this should be handled better; I'll file an issue? |
||
updated_len = len(df) | ||
if updated_len < original_len: | ||
print("Missing count for %d samples: from %d to %d" % (original_len - updated_len, original_len, updated_len)) | ||
print("Missing count for %d patients: from %d to %d" % (original_len - updated_len, original_len, updated_len)) | ||
return count_col, df |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we warn or alert when these are null?