stats doc ok

limix · Apr 16, 2019 · da3ab79 · da3ab79
1 parent e48c089
commit da3ab79
Show file tree

Hide file tree

Showing 11 changed files with 185 additions and 164 deletions.
diff --git a/doc/Makefile b/doc/Makefile
@@ -16,7 +16,7 @@ help:
 
 clean: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-	rm -rf _generated/
+	rm -rf _generated/ ; rm -rf api/
 
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).

diff --git a/doc/api.rst b/doc/api.rst
@@ -45,7 +45,6 @@ Statistics
     limix.stats.allele_frequency
     limix.stats.compute_dosage
     limix.stats.confusion_matrix
-    limix.stats.effsizes_se
     limix.stats.empirical_pvalues
     limix.stats.linear_kinship
     limix.stats.lrt_pvalues

diff --git a/doc/api/limix.stats.effsizes_se.rst b/doc/api/limix.stats.effsizes_se.rst
diff --git a/doc/stats.rst b/doc/stats.rst
@@ -2,8 +2,8 @@
 Statistics
 **********
 
-.. autofunction:: limix.stats.Chi2Mixture
-    :noindex:
+Alleles
+=======
 
 .. autofunction:: limix.stats.allele_expectation
     :noindex:
@@ -14,23 +14,40 @@ Statistics
 .. autofunction:: limix.stats.compute_dosage
     :noindex:
 
-.. autofunction:: limix.stats.confusion_matrix
-    :noindex:
+Chi-square mixture
+==================
 
-.. autofunction:: limix.stats.effsizes_se
+.. autofunction:: limix.stats.Chi2Mixture
     :noindex:
 
-.. autofunction:: limix.stats.empirical_pvalues
+Ground truth evaluation
+=======================
+
+.. autofunction:: limix.stats.confusion_matrix
     :noindex:
 
+Kinship
+=======
+
 .. autofunction:: limix.stats.linear_kinship
     :noindex:
 
+Likelihood-ratio test
+=====================
+
 .. autofunction:: limix.stats.lrt_pvalues
     :noindex:
 
-.. autofunction:: limix.stats.multipletests
-    :noindex:
+Principal component analysis
+============================
 
 .. autofunction:: limix.stats.pca
     :noindex:
+
+P-value correction
+==================
+
+.. autofunction:: limix.stats.multipletests
+    :noindex:
+.. autofunction:: limix.stats.empirical_pvalues
+    :noindex:
diff --git a/limix/stats/__init__.py b/limix/stats/__init__.py
@@ -5,7 +5,7 @@
 from ._chi2 import Chi2Mixture
 from ._confusion import confusion_matrix
 from ._kinship import linear_kinship
-from ._lrt import effsizes_se, lrt_pvalues
+from ._lrt import lrt_pvalues
 from ._allele import allele_frequency, compute_dosage, allele_expectation
 from ._pvalue import multipletests, empirical_pvalues
 from ._pca import pca
@@ -16,7 +16,6 @@
     "allele_frequency",
     "compute_dosage",
     "confusion_matrix",
-    "effsizes_se",
     "empirical_pvalues",
     "linear_kinship",
     "lrt_pvalues",

diff --git a/limix/stats/_allele.py b/limix/stats/_allele.py
@@ -1,73 +1,80 @@
 from .._bits.dask import array_shape_reveal
 
 
-def allele_frequency(expec):
-    r"""Compute allele frequency from its expectation.
+def allele_frequency(X):
+    """
+    Compute allele frequency from its expectation.
+
+    ``X`` is a matrix whose last dimension correspond to the different set of
+    chromosomes. The first dimension represent different variants and the second
+    dimension represent the different samples.
 
     Parameters
     ----------
-    expec : array_like
+    X : array_like
         Allele expectations encoded as a variants-by-samples-by-alleles matrix.
 
     Returns
     -------
-    :class:`numpy.ndarray`
+    frequenct : ndarray
         Allele frequencies encoded as a variants-by-alleles matrix.
     """
-    ploidy = expec.shape[-1]
-    if expec.ndim < 3:
+    ploidy = X.shape[-1]
+    if X.ndim < 3:
         n = 1
     else:
-        n = expec.shape[1]
-    return expec.sum(-2) / (ploidy * n)
+        n = X.shape[1]
+    return X.sum(-2) / (ploidy * n)
 
 
-def compute_dosage(expec, alt=None):
-    r"""Compute dosage from allele expectation.
+def compute_dosage(X, alt=None):
+    """
+    Compute dosage from allele expectation.
 
     Parameters
     ----------
-    expec : array_like
+    X : array_like
         Allele expectations encoded as a variants-by-samples-by-alleles matrix.
     ref : array_like
         Allele reference of each locus. The allele having the minor allele frequency
-        for the provided ``expec`` is used as the reference if `None`. Defaults to
+        for the provided ``X`` is used as the reference if `None`. Defaults to
         `None`.
 
     Returns
     -------
-    :class:`numpy.ndarray`
+    dosage : ndarray
         Dosage encoded as a variants-by-samples matrix.
 
-    Examples
-    --------
+    Example
+    -------
 
     .. doctest::
 
-    >>> from bgen_reader import read_bgen, allele_expectation, example_files
-    >>> from bgen_reader import compute_dosage
-    >>>
-    >>> with example_files("example.32bits.bgen") as filepath:
-    ...     bgen = read_bgen(filepath, verbose=False)
-    ...     variant_idx = 2
-    ...     e = allele_expectation(bgen, variant_idx)
-    ...     dosage = compute_dosage(e)
-    ...     print(dosage[:5])
-    [0.01550294 0.99383543 1.97933958 0.99560547 1.97879027]
-    """
+        >>> from bgen_reader import read_bgen, allele_expectation, example_files
+        >>> from bgen_reader import compute_dosage
+        >>>
+        >>> with example_files("example.32bits.bgen") as filepath:
+        ...     bgen = read_bgen(filepath, verbose=False)
+        ...     variant_idx = 2
+        ...     e = allele_expectation(bgen, variant_idx)
+        ...     dosage = compute_dosage(e)
+        ...     print(dosage[:5])
+        [0.01550294 0.99383543 1.97933958 0.99560547 1.97879027]
+        """
     from numpy import asarray
 
     if alt is None:
-        return expec[..., -1]
+        return X[..., -1]
     try:
-        return expec[alt, :, alt]
+        return X[alt, :, alt]
     except NotImplementedError:
         alt = asarray(alt, int)
-        return asarray(expec, float)[alt, :, alt]
+        return asarray(X, float)[alt, :, alt]
 
 
 def allele_expectation(p, nalleles, ploidy):
-    r"""Allele expectation.
+    """
+    Allele expectation.
 
     Compute the expectation of each allele from the given probabilities.
     It accepts three shapes of matrices:
@@ -86,60 +93,59 @@ def allele_expectation(p, nalleles, ploidy):
 
     Returns
     -------
-    :class:`numpy.ndarray`
+    expectation : ndarray
         Last dimension will contain the expectation of each allele.
 
     Examples
     --------
 
     .. doctest::
 
-
-    >>> from texttable import Texttable
-    >>> from bgen_reader import read_bgen, allele_expectation, example_files
-    >>>
-    >>> sampleid = "sample_005"
-    >>> rsid = "RSID_6"
-    >>>
-    >>> with example_files("example.32bits.bgen") as filepath:
-    ...     bgen = read_bgen(filepath, verbose=False)
-    ...
-    ...     locus = bgen["variants"].query("rsid == '{}'".format(rsid)).index.compute().item()
-    ...     sample = bgen["samples"].to_frame().query("id == '{}'".format(sampleid)).index
-    ...     sample = sample[0]
-    ...
-    ...     nalleles = bgen["variants"].loc[locus]["nalleles"]
-    ...     ploidy = 2
-    ...
-    ...     p = bgen["genotype"][locus].compute()["probs"][sample]
-    ...     # For unphased genotypes only.
-    ...     e = allele_expectation(bgen, locus)[sample]
-    ...
-    ...     alleles = bgen["variants"].loc[locus]["allele_ids"].compute().item().split(",")
-    ...
-    ...     tab = Texttable()
-    ...
-    ...     print(tab.add_rows(
-    ...         [
-    ...             ["", "AA", "AG", "GG", "E[.]"],
-    ...             ["p"] + list(p) + [1.0],
-    ...             ["#" + alleles[0], 2, 1, 0, e[0]],
-    ...             ["#" + alleles[1], 0, 1, 2, e[1]],
-    ...         ]
-    ...     ).draw())
-    +----+-------+-------+-------+-------+
-    |    |  AA   |  AG   |  GG   | E[.]  |
-    +====+=======+=======+=======+=======+
-    | p  | 0.012 | 0.987 | 0.001 | 1     |
-    +----+-------+-------+-------+-------+
-    | #A | 2     | 1     | 0     | 1.011 |
-    +----+-------+-------+-------+-------+
-    | #G | 0     | 1     | 2     | 0.989 |
-    +----+-------+-------+-------+-------+
-    >>> print("variant: {}".format(rsid))
-    variant: RSID_6
-    >>> print("sample : {}".format(sampleid))
-    sample : sample_005
+        >>> from texttable import Texttable
+        >>> from bgen_reader import read_bgen, allele_expectation, example_files
+        >>>
+        >>> sampleid = "sample_005"
+        >>> rsid = "RSID_6"
+        >>>
+        >>> with example_files("example.32bits.bgen") as filepath:
+        ...     bgen = read_bgen(filepath, verbose=False)
+        ...
+        ...     locus = bgen["variants"].query("rsid == '{}'".format(rsid)).index.compute().item()
+        ...     sample = bgen["samples"].to_frame().query("id == '{}'".format(sampleid)).index
+        ...     sample = sample[0]
+        ...
+        ...     nalleles = bgen["variants"].loc[locus]["nalleles"]
+        ...     ploidy = 2
+        ...
+        ...     p = bgen["genotype"][locus].compute()["probs"][sample]
+        ...     # For unphased genotypes only.
+        ...     e = allele_expectation(bgen, locus)[sample]
+        ...
+        ...     alleles = bgen["variants"].loc[locus]["allele_ids"].compute().item().split(",")
+        ...
+        ...     tab = Texttable()
+        ...
+        ...     print(tab.add_rows(
+        ...         [
+        ...             ["", "AA", "AG", "GG", "E[.]"],
+        ...             ["p"] + list(p) + [1.0],
+        ...             ["#" + alleles[0], 2, 1, 0, e[0]],
+        ...             ["#" + alleles[1], 0, 1, 2, e[1]],
+        ...         ]
+        ...     ).draw())
+        +----+-------+-------+-------+-------+
+        |    |  AA   |  AG   |  GG   | E[.]  |
+        +====+=======+=======+=======+=======+
+        | p  | 0.012 | 0.987 | 0.001 | 1     |
+        +----+-------+-------+-------+-------+
+        | #A | 2     | 1     | 0     | 1.011 |
+        +----+-------+-------+-------+-------+
+        | #G | 0     | 1     | 2     | 0.989 |
+        +----+-------+-------+-------+-------+
+        >>> print("variant: {}".format(rsid))
+        variant: RSID_6
+        >>> print("sample : {}".format(sampleid))
+        sample : sample_005
 
     Note
     ----

diff --git a/limix/stats/_kinship.py b/limix/stats/_kinship.py
@@ -2,24 +2,52 @@
 
 
 def linear_kinship(G, out=None, verbose=True):
-    r"""Estimate Kinship matrix via linear kernel.
+    """
+    Estimate Kinship matrix via linear kernel.
+
+    Let 𝑑 be the number of columns of ``G``. The resulting matrix is given by:
+
+    .. math::
+
+        𝙺 = 𝚇𝚇ᵀ/𝑑
+
+    where
+
+    .. math::
+
+        𝚇ᵢⱼ = (𝙶ᵢⱼ - 𝑚ⱼ) / 𝑠ⱼ
+
+    is the matrix ``G`` column-wise normalized by means 𝑚ⱼ and standard deviations 𝑠ⱼ.
+    NaNs are ignored so as to produce matrix ``K`` having only real numbers.
+
+    This functions is specially designed to also handle large matrices ``G`` that would
+    otherwise require a large amount of memory if it were to be loaded in memory first.
+    For those cases, libraries like Dask come in handy.
+
+    Parameters
+    ----------
+    G : array_like
+        Samples-by-variants matrix.
+    out : ndarray
+        A location into which the result is stored.
+    verbose : bool, optional
+        ``True`` for showing progress; ``False`` otherwise. Defauts to ``True``.
 
     Examples
     --------
     .. doctest::
 
         >>> from numpy.random import RandomState
-        >>> from numpy import array_str
         >>> from limix.stats import linear_kinship
         >>>
         >>> random = RandomState(1)
         >>> X = random.randn(4, 100)
         >>> K = linear_kinship(X, verbose=False)
-        >>> print(array_str(K, precision=4))
-        [[ 0.9131 -0.1928 -0.3413 -0.379 ]
-         [-0.1928  0.8989 -0.2356 -0.4704]
-         [-0.3413 -0.2356  0.9578 -0.3808]
-         [-0.379  -0.4704 -0.3808  1.2302]]
+        >>> print(K) # doctest: +FLOAT_CMP
+        [[ 0.91314823 -0.19283362 -0.34133897 -0.37897564]
+         [-0.19283362  0.89885153 -0.2356003  -0.47041761]
+         [-0.34133897 -0.2356003   0.95777313 -0.38083386]
+         [-0.37897564 -0.47041761 -0.38083386  1.23022711]]
     """
     from numpy import sqrt, zeros, asfortranarray, where, asarray, nanmean, std, isnan
     from scipy.linalg.blas import get_blas_funcs