-
Notifications
You must be signed in to change notification settings - Fork 0
/
Stats.py
173 lines (135 loc) · 4.51 KB
/
Stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# Copyright 2013 by Kamil Koziara. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""
Module containing functions used for statistical reasoning about the ontology
data.
"""
from Bio._py3k import range
from math import log, exp
_g = 7
_p = [0.99999999999980993, 676.5203681218851, -1259.1392167224028,
771.32342877765313, -176.61502916214059, 12.507343278686905,
-0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7]
def lngamma(z):
"""
Lanchos approximation of log((z-1)!)
Reference: http://en.wikipedia.org/wiki/Lanczos_approximation
"""
z -= 1
x = _p[0]
for i in range(1, _g+2):
x += _p[i]/(z+i)
t = z + _g + 0.5
return 0.9189385332046727 + (z + 0.5) * log(t) - t + log(x) # log(sqrt(2*pi)) = 0.9189385332046727
def lnfactorial(n):
"""
Returns logarithm of factorial of n.
"""
return 0 if n < 1 else lngamma(n + 1)
def lncombination(n, k):
"""
Returns logarithm of number of k-element sets that can be drawn from
n-element set without replacement.
"""
return lnfactorial(n) - lnfactorial(k) - lnfactorial(n - k)
def hypergeometric_probability(k, n, K, N):
"""
Returns probability of k successes in n draws without replacement from
a finite population of size N containing a maximum of K successes.
"""
return exp(lncombination(K, k) + lncombination(N - K, n - k) - lncombination(N, n))
def hypergeometric_two_tail_test(k, n, K, N):
"""
Returns probability of getting k or less probable number of successes in
hypergeometric probability distribution.
"""
um = min(n, K)
lm = max(0, n + K - N)
eps = 1e-15
if um == lm:
return 1.0
prob = hypergeometric_probability(k, n, K, N)
two_tail = 0
for i in range(lm, um + 1):
p = hypergeometric_probability(i, n, K, N)
if p < prob + eps:
two_tail += p
two_tail = two_tail if two_tail < 1.0 else 1.0
return two_tail
def hypergeometric_test(k, n, K, N):
"""
Returns probability of getting k or more successes in
hypergeometric probability distribution.
"""
um = min(n, K)
one_tail = hypergeometric_probability(k, n, K, N)
for i in range(k, um + 1):
one_tail += hypergeometric_probability(i, n, K, N)
return one_tail if one_tail < 1.0 else 1.0
def bonferroni_correction(pvals):
"""
Bonferroni correction.
Reference: http://en.wikipedia.org/wiki/Bonferroni_correction
"""
n = len(pvals)
return [min(x * n , 1.0) for x in pvals]
def bh_fdr_correction(pvals):
"""
Benjamin-Hochberg FDR correction.
Reference: http://en.wikipedia.org/wiki/False_discovery_rate
"""
n = len(pvals)
k = 1
cr = [1.0] * n
mx = 0.0
for pos, pval in sorted(enumerate(pvals), key = lambda x : x[1]):
mx = max(pval * n / k, mx)
if mx < 1.0:
cr[pos] = mx
k += 1
return cr
def kolmogorov_smirnov_rank_test(gene_set, gene_list, gene_corr, p):
"""
Rank test used in GSEA method. It measures dispersion of genes from
gene_set over a gene_list. Every gene from gene_list has its weight
specified by gene_corr. p is a parameter changing weights importance.
Reference: http://www.pnas.org/content/102/43/15545.full
"""
cval = 0
Dn = 0
Nr = 0
N = len(gene_list)
Nh = 0
# Adjust correlations taking accoriding to p parameter
if p != 1:
adj_corr = [pow(abs(x), p) for x in gene_corr]
else:
adj_corr = [abs(x) for x in gene_corr]
for i in range(N):
if gene_list[i] in gene_set:
Nr += adj_corr[i]
Nh += 1
if N == Nh:
miss_pen = 1.
else:
miss_pen = float(1) / (N - Nh)
stat_plot = N * [None]
for i in range(N):
if gene_list[i] in gene_set:
cval += adj_corr[i] / Nr
else:
cval -= miss_pen
stat_plot[i] = cval
if abs(cval) > abs(Dn):
Dn = cval
return (Dn, stat_plot)
def mean(arr):
l = float(len(arr))
return sum(arr) / l if l > .0 else .0
corrections = { "bonferroni" : bonferroni_correction,
"bh_fdr" : bh_fdr_correction }
corrections_labels = { "bonferroni" : "Bonferroni correction",
"bh_fdr" : "FDR correction",
"fdr" : "FDR correction" }