-
Notifications
You must be signed in to change notification settings - Fork 0
/
candidates.py
executable file
·87 lines (71 loc) · 3.58 KB
/
candidates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
import sys
import numpy
if len(sys.argv) != 2:
print('usage: %s tab_separated_table')
exit(1)
tsv_f = open(sys.argv[1].strip(), 'r')
# Return True or False if the supplied dict shows a difference in repeat
# unit length that is consistent with a lineage
# Tuples in this function are (repeat unit length, number of supporting reads)
def is_candidate(info, n_single=2, min_supp_for_allele=5):
# Some utility functions
def pair(sample):
return (info['zygosity.' + sample], info['support.freq.' + sample])
def is_hom(x): return x[0] == 'hom'
def is_het(x): return x[0] == 'het'
def is_supported_het(x, supp=min_supp_for_allele):
"""Determine number of repeat unit lengths that are supported by at
least `supp` reads. If more than 2 repeat unit lengths are supported,
then consider `x` a supported heterozygous call."""
if x[0] is None:
return False
freqs = map(tuple, [ f.split(':') for f in x[1].split(' ') ])
supported_freqs = filter(lambda f: int(f[1]) >= supp, freqs)
return len(supported_freqs) >= 2
# Zip up zygosity and support calls into a tuple for easier processing
heart_bulk = pair('heart_bulk')
neurons = map(pair, samples[1:])
# Look for sites that are homozygous in bulk heart DNA but heterozygous
# in at least `n_single` single neuron samples.
# Also filter het sites by support frequencies. E.g., we may see:
# heart_bulk 8:14 neuronA 8:30 9:1 neuronB 8:22 7:2
# This meets our hom and het criterion, but there are very few
# supporting reads for the alternate alleles.
# Some things to consider:
# 1. alternate alleles that are very close in repeat unit length
# are likely false positives due to polymerase stutter
# 2. alternate alleles that appear in multiple independent single
# cell neurons are less likely to be caused by stutter.
supp_het_neurons = filter(is_supported_het, neurons)
return is_hom(heart_bulk) and len(supp_het_neurons) >= n_single
# headers should be in the table, but the format should be roughly:
# chr start end (zygosity, support)*N
# where there are N pairs of zygosity and support columns. Zygosity
# can be one of NA, hom, het or nocall. Support is NA or a list of
# x:y values, where x is the repeat unit length and y is the number
# of reads supporting that repeat unit length.
# We always want chr, start, end.
relevant_col_idxs = numpy.array([0,1,2], dtype=numpy.int)
colnames = numpy.array([])
samples = [ 'heart_bulk', 'cortex_bulk', 'neurons_100batch', 'neuron_2', 'neuron_3', 'neuron_51', 'neuron_6']
for line in tsv_f:
fields = numpy.array(map(str.strip, line.split('\t')))
# Handle the header: determine which column indexes contain the data
# we care about.
if fields[0] == 'chr':
for idx in range(0, len(fields)):
for s in samples:
if fields[idx].find(s) != -1:
relevant_col_idxs = numpy.append(relevant_col_idxs, [idx])
colnames = fields[relevant_col_idxs]
print('\t'.join(colnames))
continue
# If we get here, the header should've already been parsed and we should
# have a nonempty relevant_col_idxs. Grab only the relevant fields and
# translate any NA values to None.
fields = map(lambda x: x if x != 'NA' else None, fields[relevant_col_idxs])
info = dict(zip(colnames, fields))
if is_candidate(info):
# Translate None values back to 'NA'
print('\t'.join(map(lambda x: 'NA' if x is None else x, fields)))