forked from PMBio/limix
-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
234 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
from __future__ import division | ||
|
||
import scipy.spatial | ||
from joblib import Parallel, cpu_count, delayed | ||
from numpy import (asarray, ascontiguousarray, double, einsum, isfinite, | ||
logical_not, minimum, newaxis, sqrt, unique, zeros) | ||
from scipy.spatial import _distance_wrap | ||
from tqdm import tqdm | ||
|
||
|
||
def _row_norms(X): | ||
norms = einsum('ij,ij->i', X, X, dtype=double) | ||
return sqrt(norms, out=norms) | ||
|
||
|
||
def _sq_pearson(X): | ||
m = X.shape[0] | ||
dm = zeros((m * (m - 1)) // 2, dtype=double) | ||
|
||
X2 = X - X.mean(1)[:, newaxis] | ||
X2 = ascontiguousarray(X2) | ||
norms = _row_norms(X2) | ||
_distance_wrap.pdist_cosine_wrap(X2, dm, norms) | ||
return (-dm + 1)**2 | ||
|
||
|
||
def _pdist_threshold(mark, dist, thr): | ||
mark[:] = False | ||
size = len(mark) | ||
|
||
l = 0 | ||
for i in range(0, size - 1): | ||
if mark[i]: | ||
l += size - (i + 1) | ||
continue | ||
|
||
for j in range(i + 1, size): | ||
if dist[l] > thr: | ||
mark[j] = True | ||
l += 1 | ||
|
||
|
||
def func(x, excls, threshold): | ||
dist = _sq_pearson(x) | ||
e = zeros(x.shape[0], dtype=bool) | ||
_pdist_threshold(e, dist, threshold) | ||
excls |= e | ||
|
||
|
||
def indep_pairwise(X, window_size, step_size, threshold, verbose=True): | ||
r""" | ||
Determine pair-wise independent variants. | ||
Independent variants are defined via squared Pearson correlations between | ||
pairs of variants inside a sliding window. | ||
Parameters | ||
---------- | ||
X : array_like | ||
Sample by variants matrix. | ||
window_size : int | ||
Number of variants inside each window. | ||
step_size : int | ||
Number of variants the sliding window skips. | ||
threshold : float | ||
Squared Pearson correlation threshold for independence. | ||
verbose : bool | ||
`True` for progress information; `False` otherwise. | ||
Returns | ||
------- | ||
ok : boolean array defining independent variants | ||
Examples | ||
-------- | ||
.. doctest:: | ||
>>> from numpy.random import RandomState | ||
>>> from limix.stats import indep_pairwise | ||
>>> | ||
>>> random = RandomState(0) | ||
>>> X = random.randn(10, 20) | ||
>>> | ||
>>> indep_pairwise(X, 4, 2, 0.5, verbose=False) | ||
array([ True, True, False, True, True, True, True, True, True, | ||
True, True, True, True, True, True, True, True, True, | ||
True, True], dtype=bool) | ||
""" | ||
left = 0 | ||
excls = zeros(X.shape[1], dtype=bool) | ||
excl = zeros(window_size, dtype=bool) | ||
|
||
assert step_size <= window_size | ||
|
||
n = (X.shape[1] + step_size) // step_size | ||
|
||
steps = list(range(n)) | ||
cc = max(1, cpu_count()) | ||
|
||
with tqdm(total=n, desc='Indep. pairwise', disable=not verbose) as pbar: | ||
|
||
while len(steps) > 0: | ||
i = 0 | ||
right = 0 | ||
delayeds = [] | ||
while i < len(steps): | ||
|
||
step = steps[i] | ||
left = step * step_size | ||
if left < right: | ||
i += 1 | ||
continue | ||
|
||
del steps[i] | ||
right = min(left + window_size, X.shape[1]) | ||
x = ascontiguousarray(X[:, left:right].T) | ||
|
||
delayeds.append(delayed(func)(x, excls[left:right], threshold)) | ||
if len(delayeds) == cc: | ||
Parallel( | ||
n_jobs=min(len(delayeds), cc), | ||
backend='threading')(delayeds) | ||
pbar.update(len(delayeds)) | ||
delayeds = [] | ||
|
||
if len(delayeds) == 0: | ||
continue | ||
|
||
Parallel( | ||
n_jobs=min(len(delayeds), cc), backend='threading')(delayeds) | ||
pbar.update(len(delayeds)) | ||
|
||
return logical_not(excls) | ||
|
||
|
||
def _check_encoding(X): | ||
u = unique(X) | ||
u = u[isfinite(u)] | ||
if len(u) > 3: | ||
return False | ||
return all([i in set([0, 1, 2]) for i in u]) | ||
|
||
|
||
def maf(X): | ||
r"""Compute minor allele frequencies. | ||
It assumes that `X` encodes 0, 1, and 2 representing the number | ||
of alleles. | ||
Args: | ||
X (array_like): Genotype matrix. | ||
Returns: | ||
array_like: minor allele frequencies. | ||
Examples | ||
-------- | ||
.. doctest:: | ||
>>> from numpy.random import RandomState | ||
>>> from limix.stats import maf | ||
>>> | ||
>>> random = RandomState(0) | ||
>>> X = random.randint(0, 3, size=(100, 10)) | ||
>>> | ||
>>> print(maf(X)) | ||
[ 0.49 0.49 0.445 0.495 0.5 0.45 0.48 0.48 0.47 0.435] | ||
""" | ||
ok = _check_encoding(X) | ||
if not ok: | ||
raise ValueError("It assumes that X encodes 0, 1, and 2 only.") | ||
s0 = X.sum(0) | ||
s0 = s0 / (2 * X.shape[0]) | ||
s1 = 1 - s0 | ||
return minimum(s0, s1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from __future__ import division | ||
|
||
from dask.array import from_array | ||
from numpy import logical_not | ||
from numpy.random import RandomState | ||
from numpy.testing import assert_allclose, assert_equal | ||
|
||
from limix.stats import indep_pairwise, maf | ||
|
||
|
||
def test_preprocess_indep_pairwise(): | ||
random = RandomState(0) | ||
|
||
X = random.randn(3, 100) | ||
|
||
head = [True, True, False, True, False] | ||
tail = [True, True, False, False] | ||
|
||
assert_equal(indep_pairwise(X, 4, 2, 0.5, verbose=False)[:5], head) | ||
assert_equal(indep_pairwise(X, 4, 2, 0.5, verbose=False)[-4:], tail) | ||
|
||
X = from_array(X, chunks=(2, 10)) | ||
assert_equal(indep_pairwise(X, 4, 2, 0.5, verbose=False)[:5], head) | ||
assert_equal(indep_pairwise(X, 4, 2, 0.5, verbose=False)[-4:], tail) | ||
|
||
|
||
def test_preprocess_maf(): | ||
random = RandomState(0) | ||
|
||
X = random.randint(0, 3, size=(100, 10)) | ||
assert_allclose( | ||
maf(X), [0.49, 0.49, 0.445, 0.495, 0.5, 0.45, 0.48, 0.48, 0.47, 0.435]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters