From 4100fc4206d59e225bcb8fc908fd451520de813d Mon Sep 17 00:00:00 2001
From: Michael Zietz <michael.zietz@gmail.com>
Date: Mon, 24 Jun 2019 14:31:23 -0400
Subject: [PATCH] Add utility files for XSwap priors and network format
 conversion

Merges https://github.com/greenelab/xswap/pull/11

Adds network format conversion and XSwap edge prior computation utilities
Updates README with acknowledgements and examples
Adds tests for Roaring bitset warnings, XSwap prior computation, and format conversion
---
 README.md                      |  68 +++++++--
 tests-require.txt              |   3 +
 tests/test_formats.py          |  54 +++++++
 tests/test_permute.py          |  24 +++
 tests/test_prior.py            |  74 ++++++++++
 xswap/__init__.py              |   9 +-
 xswap/network_formats.py       |  78 ++++++++++
 xswap/{xswap.py => permute.py} |   0
 xswap/prior.py                 | 263 +++++++++++++++++++++++++++++++++
 9 files changed, 560 insertions(+), 13 deletions(-)
 create mode 100644 tests/test_formats.py
 create mode 100644 tests/test_prior.py
 create mode 100644 xswap/network_formats.py
 rename xswap/{xswap.py => permute.py} (100%)
 create mode 100644 xswap/prior.py

diff --git a/README.md b/README.md
index 14ae94a..fb23c66 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,68 @@
-# xswap
+# XSwap: Fast degree-preserving network permutation
+
+![](https://api.travis-ci.com/greenelab/xswap.svg?branch=master)
+
+XSwap is an algorithm for degree-preserving network randomization (permutation) [1].
+Permuted networks can be used for a number of purposes in network analysis, including for generating counterfactual distributions of features when only the network's degree sequence is maintained or for computing a prior probability of an edge given only the network's degree sequence.
+Overall, permuted networks allow one to quantify the effects of degree on analysis and prediction methods.
+Understanding this effect is useful when a network's degree sequence is subject to biases.
+This implementation is a modified version of the algorithm due to Hanhijärvi et al. with two additional parameters (`allow_self_loops` and `allow_antiparallel`), which enable greater generalizability to bipartite, directed, and undirected networks.
+
+[1] Sami Hanhijärvi, Gemma C. Garriga, Kai Puolamäki
+*Proceedings of the 2009 SIAM International Conference on Data Mining* (2009-04-30) <https://doi.org/f3mn58>
+DOI: [10.1137/1.9781611972795.67](https://doi.org/10.1137/1.9781611972795.67)
 
 ## Usage examples
 
+#### Permuting an edge list
+
+```python
+>>> edges = [(0, 1), (1, 0)]
+>>> permuted_edges, permutation_statistics = xswap.permute_edge_list(
+        edges, allow_self_loops=False, allow_antiparallel=True,
+        multiplier=10)
+>>> permuted_edges
+[(1, 0), (0, 1)]
+>>> permutation_statistics
+{'swap_attempts': 20, 'same_edge': 10, 'self_loop': 5, 'duplicate': 1,
+ 'undir_duplicate': 0, 'excluded': 0}
+```
+
+#### Computing degree-sequence based prior probabilities of edges existing
+
+```python
+>>> edges = [(0, 1), (1, 0)]
+>>> prior_prob_df = xswap.prior.compute_xswap_priors(
+        edges, n_permutations=10000, shape=(2, 2), allow_self_loops=True,
+        allow_antiparallel=True)
+>>> prior_prob_df
+   source_id  target_id   edge  source_degree  target_degree  xswap_prior
+0          0          0  False              1              1          0.5
+1          0          1   True              1              1          0.5
+2          1          0   True              1              1          0.5
+3          1          1  False              1              1          0.5
+```
+
 ## Choice of parameters
 
-### `directed` and `bipartite`
+#### Bipartite networks
+
+Bipartite networks should be indexed using the bi-adjacency matrix, meaning that the edge `(0, 0)` is from source node 0 to target node 0, and is not a self-loop.
+Moreover, bipartite networks should be permuted using `allow_self_loops=False` and `allow_antiparallel=True`.
 
-The `bipartite` argument determines the meaning of a node's value.
-A bipartite graph is a graph in which nodes can be divided into disjoint sets with connections exclusively between sets.
-For example, consider the graph shown in the figure below:
+#### Directed and undirected networks
 
-<a href="#bipartite_graph"><img src="docs/img/bipartite_graph.png" alt="Image of bipartite graph" width="50%" id="bipartite_graph"></a>
+For non-bipartite networks, the decisions of `allow_self_loops` and `allow_antiparallel` are not always the same.
+For undirected networks, set `allow_antiparallel=False`, as otherwise the edges (1, 0) and (0, 1), which represent the same edge, will be treated as separate.
+Antiparallel edges may or may not be allowed for directed networks, depending on context.
+Similarly, self-loops may or may not be allowed for directed or undirected networks, depending on the specific network being permuted.
 
-The adjacency matrix corresponding to a bipartite graph can be broken into four blocks.
+## Libraries
 
-<a href="#bipartite_adj"><img src="https://latex.codecogs.com/gif.latex?A&=\begin{bmatrix}0&B\\B^T&0\end{bmatrix}" title="Bipartite graph adjacency matrix" id="bipartite_adj" /></a>
+The XSwap library includes Roaring Bitmaps (https://github.com/RoaringBitmap/CRoaring), available under the Apache 2.0 license (https://github.com/RoaringBitmap/CRoaring/blob/LICENSE).
 
-The two diagonal blocks are entirely zero and the two off-diagonal blocks are the biadjacency matrix and its transpose.
+## Acknowledgments
 
-<a href="#biadj"><img src="https://latex.codecogs.com/gif.latex?B&=\begin{bmatrix}1&0&0&0\\0&1&1&0\\0&0&0&1\end{bmatrix}" title="Bipartite graph biadjacency matrix" id="biadj" /></a>
+Development of this project has largely taken place in the [Greene Lab](http://www.greenelab.com/) at the University of Pennsylvania. However, as an open source project under the `hetio` organization, this repository is grateful for its community of maintainers, contributors, and users.
 
-The biadjacency matrix is, in general, non-square.
-This means that the edge (0, 0) is not a self loop, as a 0 in the first position refers to a different node than a 0 in the second position.
+This work is funded in part by the Gordon and Betty Moore Foundation’s Data-Driven Discovery Initiative through Grants GBMF4552 to Casey Greene and GBMF4560 to Blair Sullivan.
diff --git a/tests-require.txt b/tests-require.txt
index fba4e2c..7213368 100644
--- a/tests-require.txt
+++ b/tests-require.txt
@@ -1,3 +1,6 @@
+numpy
+pandas
 pytest
 requests
+scipy
 setuptools
diff --git a/tests/test_formats.py b/tests/test_formats.py
new file mode 100644
index 0000000..dc8e8cd
--- /dev/null
+++ b/tests/test_formats.py
@@ -0,0 +1,54 @@
+import numpy
+import pytest
+import scipy.sparse
+
+import xswap.network_formats
+
+
+@pytest.mark.parametrize('matrix,correct_edges,include_reverse_edges', [
+    (numpy.array([[1,0,0,0],[0,0,1,0],[0,0,0,1]]), [(0, 0), (1, 2), (2, 3)], False),
+    (numpy.array([[1,0,0],[0,0,1],[0,1,1]]), [(0, 0), (1, 2), (2, 2)], False),
+    (numpy.array([[1,0,0],[0,0,1],[0,1,1]]), [(0, 0), (1, 2), (2, 1), (2, 2)], True),
+])
+def test_matrix_to_edges(matrix, correct_edges, include_reverse_edges):
+    edges = xswap.network_formats.matrix_to_edges(matrix, include_reverse_edges)
+    assert sorted(edges) == sorted(correct_edges)
+
+
+@pytest.mark.parametrize('edges,correct_matrix,add_reverse_edges,shape,dtype,sparse', [
+    (
+        [(0, 1), (0, 3), (2, 2)],
+        numpy.array([[0,1,0,1], [1,0,0,0], [0,0,1,0], [1,0,0,0]], dtype=int),
+        True, (4, 4), int, False),
+    (
+        [(0, 1), (0, 3), (2, 2)],
+        numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0], [0,0,0,0]], dtype=int),
+        False, (4, 4), int, False),
+    (
+        [(0, 1), (0, 3), (2, 2)],
+        numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=int),
+        False, (3, 4), int, False),
+    (
+        [(0, 1), (0, 3), (2, 2)],
+        numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=float),
+        False, (3, 4), float, False),
+    (
+        [(0, 1), (0, 3), (2, 2)],
+        numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=numpy.float32),
+        False, (3, 4), numpy.float32, False),
+    (
+        [(0, 1), (0, 3), (2, 2)],
+        scipy.sparse.csc_matrix([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=numpy.float32),
+        False, (3, 4), numpy.float32, True),
+])
+def test_edges_to_matrix(edges, correct_matrix, add_reverse_edges, shape, dtype, sparse):
+    matrix = xswap.network_formats.edges_to_matrix(
+        edge_list=edges, add_reverse_edges=add_reverse_edges, shape=shape,
+        dtype=dtype, sparse=sparse)
+
+    assert matrix.dtype == dtype
+    assert scipy.sparse.issparse(matrix) == sparse
+    if sparse:
+        assert (matrix != correct_matrix).nnz == 0
+    else:
+        assert numpy.array_equal(matrix, correct_matrix)
diff --git a/tests/test_permute.py b/tests/test_permute.py
index 8fc8d6d..8070704 100644
--- a/tests/test_permute.py
+++ b/tests/test_permute.py
@@ -1,4 +1,7 @@
+import tempfile
+
 import pytest
+import requests
 
 import xswap
 
@@ -20,3 +23,24 @@ def test_xswap_changes_edges(edges, permutable):
         assert new_edges != edges
     else:
         assert new_edges == edges
+
+
+def test_roaring_warning():
+    """
+    Check that a warning is given when using the much slower but far more general
+    Roaring bitset rather than the faster fully uncompressed bitset.
+    """
+    edges_url = "https://github.com/greenelab/xswap/raw/{}/{}".format(
+        "8c31b4cbdbbf2cfa5018b1277bbd0e9f6263e573", "graphs/GiG_edges_reduced.txt")
+    response = requests.get(edges_url)
+    with tempfile.NamedTemporaryFile() as tf:
+        tf.write(response.content)
+        edges = xswap.preprocessing.load_processed_edges(tf.name)
+
+    with pytest.warns(None):
+        permuted_edges, stats = xswap.permute_edge_list(edges, allow_self_loops=True,
+            allow_antiparallel=False, multiplier=0.1, seed=0, max_malloc=4000000000)
+
+    with pytest.warns(RuntimeWarning, match="Using Roaring bitset because of the large number of edges."):
+        permuted_edges, stats = xswap.permute_edge_list(edges, allow_self_loops=True,
+            allow_antiparallel=False, multiplier=0.1, seed=0, max_malloc=10)
diff --git a/tests/test_prior.py b/tests/test_prior.py
new file mode 100644
index 0000000..06091bc
--- /dev/null
+++ b/tests/test_prior.py
@@ -0,0 +1,74 @@
+import numpy
+import pandas
+import pytest
+
+import xswap
+
+
+@pytest.mark.parametrize('edges,true_prior,num_swaps,shape', [
+    ([(0, 0), (1, 1)], 0.5 * numpy.ones((2, 2)), 10000, (2, 2)),
+    ([(0, 1), (1, 0)], 0.5 * numpy.ones((2, 2)), 10000, (2, 2)),
+    ([(0, 0)], numpy.ones((1, 1)), 10, (1, 1)),
+    ([(0, 1), (1, 2), (3, 4), (1, 0)], numpy.zeros((5, 5)), 0, (5, 5)),
+    ([(0, 1), (1, 2), (3, 4), (1, 0)], numpy.zeros((4, 5)), 0, (4, 5)),
+])
+def test_prior_matrix(edges, true_prior, num_swaps, shape):
+    """
+    Check that `xswap.prior.compute_xswap_occurrence_matrix` is returning
+    reasonable results for very small networks where the correct prior is obvious.
+    """
+    occurrence_matrix = xswap.prior.compute_xswap_occurrence_matrix(
+        edges, n_permutations=num_swaps, shape=shape, allow_self_loops=True,
+        allow_antiparallel=True)
+    if num_swaps:
+        edge_prior = (occurrence_matrix / num_swaps).toarray()
+    else:
+        edge_prior = occurrence_matrix.toarray()
+    assert numpy.abs(edge_prior - true_prior).max() == pytest.approx(0, abs=0.01)
+
+
+@pytest.mark.parametrize('edges,dtypes,source_degrees,target_degrees,shape,allow_antiparallel', [
+    (
+        [(0, 2), (0, 3), (1, 2), (2, 3), (3, 4)],
+        {'id': numpy.uint16, 'edge': bool, 'degree': numpy.uint32, 'xswap_prior': float},
+        {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, (5, 5), False
+    ),
+    (
+        [(0, 2), (0, 3), (1, 2), (2, 3), (3, 4)],
+        {'id': numpy.int8, 'edge': int, 'degree': numpy.float, 'xswap_prior': numpy.float64},
+        {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, (5, 5), False
+    ),
+    (
+        [(0, 2), (0, 3), (1, 2), (1, 3)],
+        {'id': numpy.float16, 'edge': float, 'degree': float, 'xswap_prior': numpy.float32},
+        {0: 2, 1: 2, 2: 0, 3: 0}, {0: 0, 1: 0, 2: 2, 3: 2}, (4, 4), True
+    ),
+])
+def test_prior_dataframe(edges, dtypes, source_degrees, target_degrees, shape, allow_antiparallel):
+    """
+    Check that the `xswap.prior.compute_xswap_priors` performs correctly
+    """
+    prior_df = xswap.prior.compute_xswap_priors(edges, n_permutations=1000,
+        shape=shape, allow_self_loops=False, allow_antiparallel=allow_antiparallel, dtypes=dtypes)
+
+    assert isinstance(prior_df, pandas.DataFrame)
+    assert list(prior_df.columns) == ['source_id', 'target_id', 'edge', 'source_degree',
+                                      'target_degree', 'xswap_prior']
+    assert dict(prior_df.dtypes) == {
+        'source_id': dtypes['id'], 'target_id': dtypes['id'], 'edge': dtypes['edge'],
+        'source_degree': dtypes['degree'], 'target_degree': dtypes['degree'],
+        'xswap_prior': dtypes['xswap_prior']
+    }
+
+    assert prior_df.set_index('source_id')['source_degree'].to_dict() == source_degrees
+    assert prior_df.set_index('target_id')['target_degree'].to_dict() == target_degrees
+
+    # Ensure that all the edges are accounted for in the dataframe
+    for edge in edges:
+        assert prior_df.query('source_id == {} & target_id == {}'.format(*edge))['edge'].values[0]
+
+    # Whether directed-ness is correctly propagated through the pipeline
+    if allow_antiparallel:
+        assert prior_df['edge'].sum() == len(edges)
+    else:
+        assert prior_df['edge'].sum() == len(edges) * 2
diff --git a/xswap/__init__.py b/xswap/__init__.py
index 99806e7..f2605e0 100644
--- a/xswap/__init__.py
+++ b/xswap/__init__.py
@@ -1,11 +1,18 @@
-from xswap.xswap import permute_edge_list
+from xswap import network_formats
 from xswap import preprocessing
+from xswap import prior
+from xswap.permute import permute_edge_list
 
 __version__ = '0.0.2'
 
 __all__ = [
+    network_formats.edges_to_matrix,
+    network_formats.matrix_to_edges,
     permute_edge_list,
     preprocessing.load_str_edges,
     preprocessing.load_processed_edges,
     preprocessing.map_str_edges,
+    prior.compute_xswap_occurrence_matrix,
+    prior.compute_xswap_priors,
+    prior.approximate_xswap_prior,
 ]
diff --git a/xswap/network_formats.py b/xswap/network_formats.py
new file mode 100644
index 0000000..8acca1e
--- /dev/null
+++ b/xswap/network_formats.py
@@ -0,0 +1,78 @@
+from typing import List, Tuple, TypeVar
+
+import numpy
+import scipy.sparse
+
+
+def matrix_to_edges(matrix: numpy.ndarray, include_reverse_edges: bool=True):
+    """
+    Convert (bi)adjacency matrix to an edge list. Inverse of `edges_to_matrix`.
+
+    Parameters
+    ----------
+    matrix : numpy.ndarray
+        Adjacency matrix or biadjacency matrix of a network
+    include_reverse_edges : bool
+        Whether to return edges that are the inverse of existing edges. For
+        example, if returning [(0, 1), (1, 0)] is desired or not. If False,
+        then only edges where source <= target are returned. This parameter
+        should be `True` when passing a biadjacency matrix, as matrix positions
+        indicate separate nodes.
+
+    Returns
+    -------
+    edge_list : List[Tuple[int, int]]
+        Edge list with node ids as the corresponding matrix indices. For example,
+        if `matrix` has `matrix[0, 2] == 1`, then `(0, 2)` will be among the
+        returned edges.
+    """
+    sparse = scipy.sparse.coo_matrix(matrix)
+    edges = zip(sparse.row, sparse.col)
+
+    if not include_reverse_edges:
+        edges = filter(lambda edge: edge[0] <= edge[1], edges)
+    return list(edges)
+
+
+def edges_to_matrix(edge_list: List[Tuple[int, int]], add_reverse_edges: bool,
+                    shape: Tuple[int, int], dtype: TypeVar=bool, sparse: bool=True):
+    """
+    Convert edge list to (bi)adjacency matrix. Inverse of `matrix_to_edges`.
+
+    Parameters
+    ----------
+    edge_list : List[Tuple[int, int]]
+        An edge list mapped such that node ids correspond to desired matrix
+        positions. For example, (0, 0) will mean that the resulting matrix has
+        a positive value of type `dtype` in that position.
+    add_reverse_edges : bool
+        Whether to include the reverse of edges in the matrix. For example,
+        if `edge_list = [(1, 0)]` and `add_reverse_edge = True`, then the
+        returned matrix has `matrix[1, 0]` = `matrix[0, 1]` = 1. Else, the matrix
+        only has `matrix[1, 0]` = 1. If a biadjacency matrix is desired, then
+        set `add_reverse_edges = False`.
+    shape : Tuple[int, int]
+        Shape of the matrix to be returned. Allows edges to be converted to
+        a matrix even when there are nodes without edges.
+    dtype : data-type
+        Dtype of the returned matrix. For example, `int`, `bool`, `float`, etc.
+    sparse : bool
+        Whether a sparse matrix should be returned. If `False`, returns a dense
+        numpy.ndarray
+
+    Returns
+    -------
+    matrix : scipy.sparse.csc_matrix or numpy.ndarray
+    """
+    matrix = scipy.sparse.csc_matrix(
+        (numpy.ones(len(edge_list)), zip(*edge_list)), dtype=dtype, shape=shape,
+    )
+
+    if add_reverse_edges:
+        matrix = (matrix + matrix.T) > 0
+        matrix = matrix.astype(dtype)
+
+    if not sparse:
+        matrix = matrix.toarray()
+
+    return matrix
diff --git a/xswap/xswap.py b/xswap/permute.py
similarity index 100%
rename from xswap/xswap.py
rename to xswap/permute.py
diff --git a/xswap/prior.py b/xswap/prior.py
new file mode 100644
index 0000000..1efd3a2
--- /dev/null
+++ b/xswap/prior.py
@@ -0,0 +1,263 @@
+from typing import List, Tuple
+
+import numpy
+import pandas
+import scipy.sparse
+
+import xswap._xswap_backend
+import xswap.network_formats
+
+
+def compute_xswap_occurrence_matrix(edge_list: List[Tuple[int, int]],
+                                    n_permutations: int,
+                                    shape: Tuple[int, int],
+                                    allow_self_loops: bool = False,
+                                    allow_antiparallel: bool = False,
+                                    swap_multiplier: float = 10,
+                                    initial_seed: int = 0,
+                                    max_malloc: int = 4000000000):
+    """
+    Compute the XSwap prior probability for every node pair in a network. The
+    XSwap prior is the probability of a node pair having an edge between them in
+    degree-preserving permutations of a network. The prior value for a node
+    pair can be considered as the probability of an edge existing between two
+    nodes given only the network's degree sequence.
+
+    Parameters
+    ----------
+    edge_list : List[Tuple[int, int]]
+        Edge list representing the graph whose XSwap edge priors are to be
+        computed. Tuples contain integer values representing nodes. No value
+        should be greater than C++'s `INT_MAX`, in this case 2_147_483_647.
+        An adjacency matrix will be created assuming that a node's value is its
+        index in the matrix. If not, map edges (identifiers can be string or
+        otherwise) using `xswap.preprocessing.map_str_edges`.
+    n_permutations : int
+        The number of permuted networks used to compute the empirical XSwap prior
+    shape : Tuple[int, int]
+        The shape of the matrix to be returned. In other words, a tuple of the
+        number of source and target nodes.
+    allow_self_loops : bool
+        Whether to allow edges like (0, 0). In the case of bipartite graphs,
+        such an edge represents a connection between two distinct nodes, while
+        in other graphs it may represent an edge from a node to itself, in which
+        case an edge may or may not be meaningful depending on context.
+    allow_antiparallel : bool
+        Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case
+        of bipartite graphs, these edges represent two connections between four
+        distinct nodes, while for other graphs, these may be connections between
+        the same two nodes.
+    swap_multiplier : float
+        The number of edge swap attempts is determined by the product of the
+        number of existing edges and multiplier. For example, if five edges are
+        passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer
+        products will be rounded down to the nearest integer.
+    initial_seed : int
+        Random seed that will be passed to the C++ Mersenne Twister 19937 random
+        number generator. `initial_seed` will be used for the first permutation,
+        and the seed used for each subsequent permutation will be incremented by
+        one. For example, if `initial_seed` is 0 and `n_permutations` is 2, then
+        the two permutations will pass seeds 0 and 1, respectively.
+    max_malloc : int (`unsigned long long int` in C)
+        The maximum amount of memory to be allocated using `malloc` when making
+        a bitset to hold edges. An uncompressed bitset is implemented for
+        holding edges that is significantly faster than alternatives. However,
+        it is memory-inefficient and will not be used if more memory is required
+        than `max_malloc`. Above the threshold, a Roaring bitset will be used.
+
+    Returns
+    -------
+    edge_counter : scipy.sparse.csc_matrix
+        Adjacency matrix with entries equal to the number of permutations in
+        which a given edge appeared
+    """
+    if len(edge_list) != len(set(edge_list)):
+        raise ValueError("Edge list contained duplicate edges. "
+                         "XSwap does not support multigraphs.")
+
+    num_swaps = int(swap_multiplier * len(edge_list))
+
+    max_id = max(map(max, edge_list))
+
+    edge_counter = scipy.sparse.csc_matrix(shape, dtype=int)
+
+    for i in range(n_permutations):
+        permuted_edges, stats = xswap._xswap_backend._xswap(
+            edge_list, [], max_id, allow_self_loops, allow_antiparallel,
+            num_swaps, initial_seed + i, max_malloc)
+        permuted_matrix = xswap.network_formats.edges_to_matrix(
+            permuted_edges, add_reverse_edges=(not allow_antiparallel),
+            shape=shape, dtype=int, sparse=True)
+        edge_counter += permuted_matrix
+
+    return edge_counter
+
+
+def compute_xswap_priors(edge_list: List[Tuple[int, int]], n_permutations: int,
+                         shape: Tuple[int, int], allow_self_loops: bool = False,
+                         allow_antiparallel: bool = False,
+                         swap_multiplier: int = 10, initial_seed: int = 0,
+                         max_malloc: int = 4000000000,
+                         dtypes = {'id': numpy.uint16, 'degree': numpy.uint16,
+                                   'edge': bool, 'xswap_prior': float},
+                        ):
+    """
+    Compute the XSwap prior for every potential edge in the network. Uses
+    degree-grouping to maximize the effective number of permutations for each
+    node pair. That is, node pairs with the same source and target degrees can
+    be grouped when computing the XSwap prior, allowing there to be more
+    permutations for some node pairs than `n_permutations`.
+
+    Note that the mechanics of this function are separated to minimize memory use.
+
+    Parameters
+    ----------
+    edge_list : List[Tuple[int, int]]
+        Edge list representing the graph whose XSwap edge priors are to be
+        computed. Tuples contain integer values representing nodes. No value
+        should be greater than C++'s `INT_MAX`, in this case 2_147_483_647.
+        An adjacency matrix will be created assuming that a node's value is its
+        index in the matrix. If not, map edges (identifiers can be string or
+        otherwise) using `xswap.preprocessing.map_str_edges`.
+    n_permutations : int
+        The number of permuted networks used to compute the empirical XSwap prior
+    shape : Tuple[int, int]
+        The shape of the matrix to be returned. In other words, a tuple of the
+        number of source and target nodes.
+    allow_self_loops : bool
+        Whether to allow edges like (0, 0). In the case of bipartite graphs,
+        such an edge represents a connection between two distinct nodes, while
+        in other graphs it may represent an edge from a node to itself, in which
+        case an edge may or may not be meaningful depending on context.
+    allow_antiparallel : bool
+        Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case
+        of bipartite graphs, these edges represent two connections between four
+        distinct nodes, while for other graphs, these may be connections between
+        the same two nodes.
+    swap_multiplier : float
+        The number of edge swap attempts is determined by the product of the
+        number of existing edges and multiplier. For example, if five edges are
+        passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer
+        products will be rounded down to the nearest integer.
+    initial_seed : int
+        Random seed that will be passed to the C++ Mersenne Twister 19937 random
+        number generator. `initial_seed` will be used for the first permutation,
+        and the seed used for each subsequent permutation will be incremented by
+        one. For example, if `initial_seed` is 0 and `n_permutations` is 2, then
+        the two permutations will pass seeds 0 and 1, respectively.
+    max_malloc : int (`unsigned long long int` in C)
+        The maximum amount of memory to be allocated using `malloc` when making
+        a bitset to hold edges. An uncompressed bitset is implemented for
+        holding edges that is significantly faster than alternatives. However,
+        it is memory-inefficient and will not be used if more memory is required
+        than `max_malloc`. Above the threshold, a Roaring bitset will be used.
+    dtypes : dict
+        Dictionary mapping returned column types to dtypes. Keys should be
+        `'id'`, `'degree'`, `'edge'`, and `'xswap_prior'`. `dtype` need only
+        be changed from its defaults if the values of `id` or `degree` are
+        greater than the maxima in the default dtypes, or in cases where greater
+        precision is desired. (`numpy.uint16` has a maximum value of 65535.)
+
+    Returns
+    -------
+    prior_df : pandas.DataFrame
+        Columns are the following:
+        [source_id, target_id, edge, source_degree, target_degree, xswap_prior]
+    """
+    # Compute the adjacency matrix of the original (unpermuted) network
+    original_edges = xswap.network_formats.edges_to_matrix(
+        edge_list, add_reverse_edges=(not allow_antiparallel), shape=shape,
+        dtype=dtypes['edge'], sparse=True)
+
+    # Setup DataFrame for recording prior data
+    prior_df = pandas.DataFrame({
+        'source_id': numpy.repeat(numpy.arange(shape[0], dtype=dtypes['id']), shape[1]),
+        'target_id': numpy.tile(numpy.arange(shape[1], dtype=dtypes['id']), shape[0]),
+        'edge': original_edges.toarray().flatten(),
+    })
+    del original_edges
+
+    prior_df['source_degree'] = (prior_df
+                                 .groupby('source_id')
+                                 .transform(sum)['edge']
+                                 .astype(dtypes['degree']))
+    del prior_df['source_id']
+
+    prior_df['target_degree'] = (prior_df
+                                 .groupby('target_id')
+                                 .transform(sum)['edge']
+                                 .astype(dtypes['degree']))
+    del prior_df['target_id']
+
+    # Compute the number of occurrences of each edge across permutations
+    edge_counter = compute_xswap_occurrence_matrix(
+        edge_list=edge_list, n_permutations=n_permutations, shape=shape,
+        allow_self_loops=allow_self_loops, allow_antiparallel=allow_antiparallel,
+        swap_multiplier=swap_multiplier, initial_seed=initial_seed,
+        max_malloc=max_malloc)
+
+    prior_df['num_permuted_edges'] = edge_counter.astype(dtypes['degree']).toarray().flatten()
+    del edge_counter
+
+    # The number of edges that occurred across all node pairs with the same
+    # `source_degree` and `target_degree`
+    dgp_edge_count = (
+        prior_df
+        .groupby(['source_degree', 'target_degree'])
+        .transform(sum)['num_permuted_edges']
+        .values
+        .astype(dtypes['degree'])
+    )
+    del prior_df['num_permuted_edges']
+
+    # The effective number of permutations for every node pair, incorporating
+    # degree-grouping
+    num_dgp = (
+        n_permutations * prior_df.groupby(['source_degree', 'target_degree'])
+                                 .transform(len)['edge']
+                                 .values
+                                 .astype(dtypes['degree'])
+    )
+    xswap_prior = (dgp_edge_count / num_dgp).astype(dtypes['xswap_prior'])
+    del dgp_edge_count, num_dgp
+
+    prior_df['xswap_prior'] = xswap_prior
+    del xswap_prior
+
+    prior_df = (
+        prior_df
+        .assign(
+            source_id=numpy.repeat(numpy.arange(shape[0], dtype=dtypes['id']), shape[1]),
+            target_id=numpy.tile(numpy.arange(shape[1], dtype=dtypes['id']), shape[0]),
+        )
+        .filter(items=['source_id', 'target_id', 'edge', 'source_degree',
+                       'target_degree', 'xswap_prior'])
+    )
+    return prior_df
+
+
+def approximate_xswap_prior(source_degree, target_degree, num_edges):
+    """
+    Approximate the XSwap prior by assuming that the XSwap Markov Chain is stationary.
+    While this is not the case in reality, some networks' priors can be estimated
+    very well using this equation.
+
+    Parameters
+    ----------
+    source_degree : int, float, numpy.array, or pandas.Series
+        The source degree for a single node pair or a number of source degrees.
+        The type of object passed should match `target_degree`.
+    target_degree : int, float, numpy.array, or pandas.Series
+        The target degree for a single node pair or a number of target degrees.
+        The type of object passed should match `source_degree`.
+    num_edges : int or float
+        The total number of edges in the network
+
+    Returns
+    -------
+    approximate_prior : float, numpy.array, or pandas.Series
+        Output type matches the types of `source_degree` and `target_degree`.
+    """
+    return source_degree * target_degree / (
+        source_degree * target_degree + num_edges - source_degree - target_degree + 1
+    )