forked from biopython/biopython
-
Notifications
You must be signed in to change notification settings - Fork 0
/
thresholds.py
113 lines (102 loc) · 4.28 KB
/
thresholds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Copyright 2008 by Norbert Dojer. All rights reserved.
# Adapted by Bartek Wilczynski.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Approximate calculation of appropriate thresholds for motif finding
"""
class ScoreDistribution(object):
""" Class representing approximate score distribution for a given motif.
Utilizes a dynamic programming approch to calculate the distribution of
scores with a predefined precision. Provides a number of methods for calculating
thresholds for motif occurences.
"""
def __init__(self, motif=None, precision=10**3, pssm=None, background=None):
if pssm is None:
self.min_score = min(0.0, motif.min_score())
self.interval = max(0.0, motif.max_score())-self.min_score
self.n_points = precision * motif.length
self.ic=motif.ic()
else:
self.min_score = min(0.0, pssm.min)
self.interval = max(0.0, pssm.max)-self.min_score
self.n_points = precision * pssm.length
self.ic = pssm.mean(background)
self.step = self.interval/(self.n_points-1)
self.mo_density = [0.0]*self.n_points
self.mo_density[-self._index_diff(self.min_score)] = 1.0
self.bg_density = [0.0]*self.n_points
self.bg_density[-self._index_diff(self.min_score)] = 1.0
if pssm is None:
for lo, mo in zip(motif.log_odds(), motif.pwm()):
self.modify(lo, mo, motif.background)
else:
for position in range(pssm.length):
mo_new=[0.0]*self.n_points
bg_new=[0.0]*self.n_points
lo = pssm[:, position]
for letter, score in lo.items():
bg = background[letter]
mo = pow(2, pssm[letter, position]) * bg
d=self._index_diff(score)
for i in range(self.n_points):
mo_new[self._add(i, d)]+=self.mo_density[i]*mo
bg_new[self._add(i, d)]+=self.bg_density[i]*bg
self.mo_density=mo_new
self.bg_density=bg_new
def _index_diff(self,x,y=0.0):
return int((x-y+0.5*self.step)//self.step)
def _add(self, i, j):
return max(0, min(self.n_points-1, i+j))
def modify(self, scores, mo_probs, bg_probs):
mo_new=[0.0]*self.n_points
bg_new=[0.0]*self.n_points
for k, v in scores.items():
d=self._index_diff(v)
for i in range(self.n_points):
mo_new[self._add(i, d)]+=self.mo_density[i]*mo_probs[k]
bg_new[self._add(i, d)]+=self.bg_density[i]*bg_probs[k]
self.mo_density=mo_new
self.bg_density=bg_new
def threshold_fpr(self, fpr):
"""
Approximate the log-odds threshold which makes the type I error (false positive rate).
"""
i=self.n_points
prob=0.0
while prob<fpr:
i-=1
prob+=self.bg_density[i]
return self.min_score+i*self.step
def threshold_fnr(self, fnr):
"""
Approximate the log-odds threshold which makes the type II error (false negative rate).
"""
i=-1
prob=0.0
while prob<fnr:
i+=1
prob+=self.mo_density[i]
return self.min_score+i*self.step
def threshold_balanced(self,rate_proportion=1.0,return_rate=False):
"""
Approximate the log-odds threshold which makes FNR equal to FPR times rate_proportion
"""
i=self.n_points
fpr=0.0
fnr=1.0
while fpr*rate_proportion<fnr:
i-=1
fpr+=self.bg_density[i]
fnr-=self.mo_density[i]
if return_rate:
return self.min_score+i*self.step, fpr
else:
return self.min_score+i*self.step
def threshold_patser(self):
"""Threshold selection mimicking the behaviour of patser (Hertz, Stormo 1999) software.
It selects such a threshold that the log(fpr)=-ic(M)
note: the actual patser software uses natural logarithms instead of log_2, so the numbers
are not directly comparable.
"""
return self.threshold_fpr(fpr=2**-self.ic)