From 4100fc4206d59e225bcb8fc908fd451520de813d Mon Sep 17 00:00:00 2001 From: Michael Zietz Date: Mon, 24 Jun 2019 14:31:23 -0400 Subject: [PATCH] Add utility files for XSwap priors and network format conversion Merges https://github.com/greenelab/xswap/pull/11 Adds network format conversion and XSwap edge prior computation utilities Updates README with acknowledgements and examples Adds tests for Roaring bitset warnings, XSwap prior computation, and format conversion --- README.md | 68 +++++++-- tests-require.txt | 3 + tests/test_formats.py | 54 +++++++ tests/test_permute.py | 24 +++ tests/test_prior.py | 74 ++++++++++ xswap/__init__.py | 9 +- xswap/network_formats.py | 78 ++++++++++ xswap/{xswap.py => permute.py} | 0 xswap/prior.py | 263 +++++++++++++++++++++++++++++++++ 9 files changed, 560 insertions(+), 13 deletions(-) create mode 100644 tests/test_formats.py create mode 100644 tests/test_prior.py create mode 100644 xswap/network_formats.py rename xswap/{xswap.py => permute.py} (100%) create mode 100644 xswap/prior.py diff --git a/README.md b/README.md index 14ae94a..fb23c66 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,68 @@ -# xswap +# XSwap: Fast degree-preserving network permutation + +![](https://api.travis-ci.com/greenelab/xswap.svg?branch=master) + +XSwap is an algorithm for degree-preserving network randomization (permutation) [1]. +Permuted networks can be used for a number of purposes in network analysis, including for generating counterfactual distributions of features when only the network's degree sequence is maintained or for computing a prior probability of an edge given only the network's degree sequence. +Overall, permuted networks allow one to quantify the effects of degree on analysis and prediction methods. +Understanding this effect is useful when a network's degree sequence is subject to biases. +This implementation is a modified version of the algorithm due to Hanhijärvi et al. with two additional parameters (`allow_self_loops` and `allow_antiparallel`), which enable greater generalizability to bipartite, directed, and undirected networks. + +[1] Sami Hanhijärvi, Gemma C. Garriga, Kai Puolamäki +*Proceedings of the 2009 SIAM International Conference on Data Mining* (2009-04-30) +DOI: [10.1137/1.9781611972795.67](https://doi.org/10.1137/1.9781611972795.67) ## Usage examples +#### Permuting an edge list + +```python +>>> edges = [(0, 1), (1, 0)] +>>> permuted_edges, permutation_statistics = xswap.permute_edge_list( + edges, allow_self_loops=False, allow_antiparallel=True, + multiplier=10) +>>> permuted_edges +[(1, 0), (0, 1)] +>>> permutation_statistics +{'swap_attempts': 20, 'same_edge': 10, 'self_loop': 5, 'duplicate': 1, + 'undir_duplicate': 0, 'excluded': 0} +``` + +#### Computing degree-sequence based prior probabilities of edges existing + +```python +>>> edges = [(0, 1), (1, 0)] +>>> prior_prob_df = xswap.prior.compute_xswap_priors( + edges, n_permutations=10000, shape=(2, 2), allow_self_loops=True, + allow_antiparallel=True) +>>> prior_prob_df + source_id target_id edge source_degree target_degree xswap_prior +0 0 0 False 1 1 0.5 +1 0 1 True 1 1 0.5 +2 1 0 True 1 1 0.5 +3 1 1 False 1 1 0.5 +``` + ## Choice of parameters -### `directed` and `bipartite` +#### Bipartite networks + +Bipartite networks should be indexed using the bi-adjacency matrix, meaning that the edge `(0, 0)` is from source node 0 to target node 0, and is not a self-loop. +Moreover, bipartite networks should be permuted using `allow_self_loops=False` and `allow_antiparallel=True`. -The `bipartite` argument determines the meaning of a node's value. -A bipartite graph is a graph in which nodes can be divided into disjoint sets with connections exclusively between sets. -For example, consider the graph shown in the figure below: +#### Directed and undirected networks -Image of bipartite graph +For non-bipartite networks, the decisions of `allow_self_loops` and `allow_antiparallel` are not always the same. +For undirected networks, set `allow_antiparallel=False`, as otherwise the edges (1, 0) and (0, 1), which represent the same edge, will be treated as separate. +Antiparallel edges may or may not be allowed for directed networks, depending on context. +Similarly, self-loops may or may not be allowed for directed or undirected networks, depending on the specific network being permuted. -The adjacency matrix corresponding to a bipartite graph can be broken into four blocks. +## Libraries - +The XSwap library includes Roaring Bitmaps (https://github.com/RoaringBitmap/CRoaring), available under the Apache 2.0 license (https://github.com/RoaringBitmap/CRoaring/blob/LICENSE). -The two diagonal blocks are entirely zero and the two off-diagonal blocks are the biadjacency matrix and its transpose. +## Acknowledgments - +Development of this project has largely taken place in the [Greene Lab](http://www.greenelab.com/) at the University of Pennsylvania. However, as an open source project under the `hetio` organization, this repository is grateful for its community of maintainers, contributors, and users. -The biadjacency matrix is, in general, non-square. -This means that the edge (0, 0) is not a self loop, as a 0 in the first position refers to a different node than a 0 in the second position. +This work is funded in part by the Gordon and Betty Moore Foundation’s Data-Driven Discovery Initiative through Grants GBMF4552 to Casey Greene and GBMF4560 to Blair Sullivan. diff --git a/tests-require.txt b/tests-require.txt index fba4e2c..7213368 100644 --- a/tests-require.txt +++ b/tests-require.txt @@ -1,3 +1,6 @@ +numpy +pandas pytest requests +scipy setuptools diff --git a/tests/test_formats.py b/tests/test_formats.py new file mode 100644 index 0000000..dc8e8cd --- /dev/null +++ b/tests/test_formats.py @@ -0,0 +1,54 @@ +import numpy +import pytest +import scipy.sparse + +import xswap.network_formats + + +@pytest.mark.parametrize('matrix,correct_edges,include_reverse_edges', [ + (numpy.array([[1,0,0,0],[0,0,1,0],[0,0,0,1]]), [(0, 0), (1, 2), (2, 3)], False), + (numpy.array([[1,0,0],[0,0,1],[0,1,1]]), [(0, 0), (1, 2), (2, 2)], False), + (numpy.array([[1,0,0],[0,0,1],[0,1,1]]), [(0, 0), (1, 2), (2, 1), (2, 2)], True), +]) +def test_matrix_to_edges(matrix, correct_edges, include_reverse_edges): + edges = xswap.network_formats.matrix_to_edges(matrix, include_reverse_edges) + assert sorted(edges) == sorted(correct_edges) + + +@pytest.mark.parametrize('edges,correct_matrix,add_reverse_edges,shape,dtype,sparse', [ + ( + [(0, 1), (0, 3), (2, 2)], + numpy.array([[0,1,0,1], [1,0,0,0], [0,0,1,0], [1,0,0,0]], dtype=int), + True, (4, 4), int, False), + ( + [(0, 1), (0, 3), (2, 2)], + numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0], [0,0,0,0]], dtype=int), + False, (4, 4), int, False), + ( + [(0, 1), (0, 3), (2, 2)], + numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=int), + False, (3, 4), int, False), + ( + [(0, 1), (0, 3), (2, 2)], + numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=float), + False, (3, 4), float, False), + ( + [(0, 1), (0, 3), (2, 2)], + numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=numpy.float32), + False, (3, 4), numpy.float32, False), + ( + [(0, 1), (0, 3), (2, 2)], + scipy.sparse.csc_matrix([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=numpy.float32), + False, (3, 4), numpy.float32, True), +]) +def test_edges_to_matrix(edges, correct_matrix, add_reverse_edges, shape, dtype, sparse): + matrix = xswap.network_formats.edges_to_matrix( + edge_list=edges, add_reverse_edges=add_reverse_edges, shape=shape, + dtype=dtype, sparse=sparse) + + assert matrix.dtype == dtype + assert scipy.sparse.issparse(matrix) == sparse + if sparse: + assert (matrix != correct_matrix).nnz == 0 + else: + assert numpy.array_equal(matrix, correct_matrix) diff --git a/tests/test_permute.py b/tests/test_permute.py index 8fc8d6d..8070704 100644 --- a/tests/test_permute.py +++ b/tests/test_permute.py @@ -1,4 +1,7 @@ +import tempfile + import pytest +import requests import xswap @@ -20,3 +23,24 @@ def test_xswap_changes_edges(edges, permutable): assert new_edges != edges else: assert new_edges == edges + + +def test_roaring_warning(): + """ + Check that a warning is given when using the much slower but far more general + Roaring bitset rather than the faster fully uncompressed bitset. + """ + edges_url = "https://github.com/greenelab/xswap/raw/{}/{}".format( + "8c31b4cbdbbf2cfa5018b1277bbd0e9f6263e573", "graphs/GiG_edges_reduced.txt") + response = requests.get(edges_url) + with tempfile.NamedTemporaryFile() as tf: + tf.write(response.content) + edges = xswap.preprocessing.load_processed_edges(tf.name) + + with pytest.warns(None): + permuted_edges, stats = xswap.permute_edge_list(edges, allow_self_loops=True, + allow_antiparallel=False, multiplier=0.1, seed=0, max_malloc=4000000000) + + with pytest.warns(RuntimeWarning, match="Using Roaring bitset because of the large number of edges."): + permuted_edges, stats = xswap.permute_edge_list(edges, allow_self_loops=True, + allow_antiparallel=False, multiplier=0.1, seed=0, max_malloc=10) diff --git a/tests/test_prior.py b/tests/test_prior.py new file mode 100644 index 0000000..06091bc --- /dev/null +++ b/tests/test_prior.py @@ -0,0 +1,74 @@ +import numpy +import pandas +import pytest + +import xswap + + +@pytest.mark.parametrize('edges,true_prior,num_swaps,shape', [ + ([(0, 0), (1, 1)], 0.5 * numpy.ones((2, 2)), 10000, (2, 2)), + ([(0, 1), (1, 0)], 0.5 * numpy.ones((2, 2)), 10000, (2, 2)), + ([(0, 0)], numpy.ones((1, 1)), 10, (1, 1)), + ([(0, 1), (1, 2), (3, 4), (1, 0)], numpy.zeros((5, 5)), 0, (5, 5)), + ([(0, 1), (1, 2), (3, 4), (1, 0)], numpy.zeros((4, 5)), 0, (4, 5)), +]) +def test_prior_matrix(edges, true_prior, num_swaps, shape): + """ + Check that `xswap.prior.compute_xswap_occurrence_matrix` is returning + reasonable results for very small networks where the correct prior is obvious. + """ + occurrence_matrix = xswap.prior.compute_xswap_occurrence_matrix( + edges, n_permutations=num_swaps, shape=shape, allow_self_loops=True, + allow_antiparallel=True) + if num_swaps: + edge_prior = (occurrence_matrix / num_swaps).toarray() + else: + edge_prior = occurrence_matrix.toarray() + assert numpy.abs(edge_prior - true_prior).max() == pytest.approx(0, abs=0.01) + + +@pytest.mark.parametrize('edges,dtypes,source_degrees,target_degrees,shape,allow_antiparallel', [ + ( + [(0, 2), (0, 3), (1, 2), (2, 3), (3, 4)], + {'id': numpy.uint16, 'edge': bool, 'degree': numpy.uint32, 'xswap_prior': float}, + {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, (5, 5), False + ), + ( + [(0, 2), (0, 3), (1, 2), (2, 3), (3, 4)], + {'id': numpy.int8, 'edge': int, 'degree': numpy.float, 'xswap_prior': numpy.float64}, + {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, (5, 5), False + ), + ( + [(0, 2), (0, 3), (1, 2), (1, 3)], + {'id': numpy.float16, 'edge': float, 'degree': float, 'xswap_prior': numpy.float32}, + {0: 2, 1: 2, 2: 0, 3: 0}, {0: 0, 1: 0, 2: 2, 3: 2}, (4, 4), True + ), +]) +def test_prior_dataframe(edges, dtypes, source_degrees, target_degrees, shape, allow_antiparallel): + """ + Check that the `xswap.prior.compute_xswap_priors` performs correctly + """ + prior_df = xswap.prior.compute_xswap_priors(edges, n_permutations=1000, + shape=shape, allow_self_loops=False, allow_antiparallel=allow_antiparallel, dtypes=dtypes) + + assert isinstance(prior_df, pandas.DataFrame) + assert list(prior_df.columns) == ['source_id', 'target_id', 'edge', 'source_degree', + 'target_degree', 'xswap_prior'] + assert dict(prior_df.dtypes) == { + 'source_id': dtypes['id'], 'target_id': dtypes['id'], 'edge': dtypes['edge'], + 'source_degree': dtypes['degree'], 'target_degree': dtypes['degree'], + 'xswap_prior': dtypes['xswap_prior'] + } + + assert prior_df.set_index('source_id')['source_degree'].to_dict() == source_degrees + assert prior_df.set_index('target_id')['target_degree'].to_dict() == target_degrees + + # Ensure that all the edges are accounted for in the dataframe + for edge in edges: + assert prior_df.query('source_id == {} & target_id == {}'.format(*edge))['edge'].values[0] + + # Whether directed-ness is correctly propagated through the pipeline + if allow_antiparallel: + assert prior_df['edge'].sum() == len(edges) + else: + assert prior_df['edge'].sum() == len(edges) * 2 diff --git a/xswap/__init__.py b/xswap/__init__.py index 99806e7..f2605e0 100644 --- a/xswap/__init__.py +++ b/xswap/__init__.py @@ -1,11 +1,18 @@ -from xswap.xswap import permute_edge_list +from xswap import network_formats from xswap import preprocessing +from xswap import prior +from xswap.permute import permute_edge_list __version__ = '0.0.2' __all__ = [ + network_formats.edges_to_matrix, + network_formats.matrix_to_edges, permute_edge_list, preprocessing.load_str_edges, preprocessing.load_processed_edges, preprocessing.map_str_edges, + prior.compute_xswap_occurrence_matrix, + prior.compute_xswap_priors, + prior.approximate_xswap_prior, ] diff --git a/xswap/network_formats.py b/xswap/network_formats.py new file mode 100644 index 0000000..8acca1e --- /dev/null +++ b/xswap/network_formats.py @@ -0,0 +1,78 @@ +from typing import List, Tuple, TypeVar + +import numpy +import scipy.sparse + + +def matrix_to_edges(matrix: numpy.ndarray, include_reverse_edges: bool=True): + """ + Convert (bi)adjacency matrix to an edge list. Inverse of `edges_to_matrix`. + + Parameters + ---------- + matrix : numpy.ndarray + Adjacency matrix or biadjacency matrix of a network + include_reverse_edges : bool + Whether to return edges that are the inverse of existing edges. For + example, if returning [(0, 1), (1, 0)] is desired or not. If False, + then only edges where source <= target are returned. This parameter + should be `True` when passing a biadjacency matrix, as matrix positions + indicate separate nodes. + + Returns + ------- + edge_list : List[Tuple[int, int]] + Edge list with node ids as the corresponding matrix indices. For example, + if `matrix` has `matrix[0, 2] == 1`, then `(0, 2)` will be among the + returned edges. + """ + sparse = scipy.sparse.coo_matrix(matrix) + edges = zip(sparse.row, sparse.col) + + if not include_reverse_edges: + edges = filter(lambda edge: edge[0] <= edge[1], edges) + return list(edges) + + +def edges_to_matrix(edge_list: List[Tuple[int, int]], add_reverse_edges: bool, + shape: Tuple[int, int], dtype: TypeVar=bool, sparse: bool=True): + """ + Convert edge list to (bi)adjacency matrix. Inverse of `matrix_to_edges`. + + Parameters + ---------- + edge_list : List[Tuple[int, int]] + An edge list mapped such that node ids correspond to desired matrix + positions. For example, (0, 0) will mean that the resulting matrix has + a positive value of type `dtype` in that position. + add_reverse_edges : bool + Whether to include the reverse of edges in the matrix. For example, + if `edge_list = [(1, 0)]` and `add_reverse_edge = True`, then the + returned matrix has `matrix[1, 0]` = `matrix[0, 1]` = 1. Else, the matrix + only has `matrix[1, 0]` = 1. If a biadjacency matrix is desired, then + set `add_reverse_edges = False`. + shape : Tuple[int, int] + Shape of the matrix to be returned. Allows edges to be converted to + a matrix even when there are nodes without edges. + dtype : data-type + Dtype of the returned matrix. For example, `int`, `bool`, `float`, etc. + sparse : bool + Whether a sparse matrix should be returned. If `False`, returns a dense + numpy.ndarray + + Returns + ------- + matrix : scipy.sparse.csc_matrix or numpy.ndarray + """ + matrix = scipy.sparse.csc_matrix( + (numpy.ones(len(edge_list)), zip(*edge_list)), dtype=dtype, shape=shape, + ) + + if add_reverse_edges: + matrix = (matrix + matrix.T) > 0 + matrix = matrix.astype(dtype) + + if not sparse: + matrix = matrix.toarray() + + return matrix diff --git a/xswap/xswap.py b/xswap/permute.py similarity index 100% rename from xswap/xswap.py rename to xswap/permute.py diff --git a/xswap/prior.py b/xswap/prior.py new file mode 100644 index 0000000..1efd3a2 --- /dev/null +++ b/xswap/prior.py @@ -0,0 +1,263 @@ +from typing import List, Tuple + +import numpy +import pandas +import scipy.sparse + +import xswap._xswap_backend +import xswap.network_formats + + +def compute_xswap_occurrence_matrix(edge_list: List[Tuple[int, int]], + n_permutations: int, + shape: Tuple[int, int], + allow_self_loops: bool = False, + allow_antiparallel: bool = False, + swap_multiplier: float = 10, + initial_seed: int = 0, + max_malloc: int = 4000000000): + """ + Compute the XSwap prior probability for every node pair in a network. The + XSwap prior is the probability of a node pair having an edge between them in + degree-preserving permutations of a network. The prior value for a node + pair can be considered as the probability of an edge existing between two + nodes given only the network's degree sequence. + + Parameters + ---------- + edge_list : List[Tuple[int, int]] + Edge list representing the graph whose XSwap edge priors are to be + computed. Tuples contain integer values representing nodes. No value + should be greater than C++'s `INT_MAX`, in this case 2_147_483_647. + An adjacency matrix will be created assuming that a node's value is its + index in the matrix. If not, map edges (identifiers can be string or + otherwise) using `xswap.preprocessing.map_str_edges`. + n_permutations : int + The number of permuted networks used to compute the empirical XSwap prior + shape : Tuple[int, int] + The shape of the matrix to be returned. In other words, a tuple of the + number of source and target nodes. + allow_self_loops : bool + Whether to allow edges like (0, 0). In the case of bipartite graphs, + such an edge represents a connection between two distinct nodes, while + in other graphs it may represent an edge from a node to itself, in which + case an edge may or may not be meaningful depending on context. + allow_antiparallel : bool + Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case + of bipartite graphs, these edges represent two connections between four + distinct nodes, while for other graphs, these may be connections between + the same two nodes. + swap_multiplier : float + The number of edge swap attempts is determined by the product of the + number of existing edges and multiplier. For example, if five edges are + passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer + products will be rounded down to the nearest integer. + initial_seed : int + Random seed that will be passed to the C++ Mersenne Twister 19937 random + number generator. `initial_seed` will be used for the first permutation, + and the seed used for each subsequent permutation will be incremented by + one. For example, if `initial_seed` is 0 and `n_permutations` is 2, then + the two permutations will pass seeds 0 and 1, respectively. + max_malloc : int (`unsigned long long int` in C) + The maximum amount of memory to be allocated using `malloc` when making + a bitset to hold edges. An uncompressed bitset is implemented for + holding edges that is significantly faster than alternatives. However, + it is memory-inefficient and will not be used if more memory is required + than `max_malloc`. Above the threshold, a Roaring bitset will be used. + + Returns + ------- + edge_counter : scipy.sparse.csc_matrix + Adjacency matrix with entries equal to the number of permutations in + which a given edge appeared + """ + if len(edge_list) != len(set(edge_list)): + raise ValueError("Edge list contained duplicate edges. " + "XSwap does not support multigraphs.") + + num_swaps = int(swap_multiplier * len(edge_list)) + + max_id = max(map(max, edge_list)) + + edge_counter = scipy.sparse.csc_matrix(shape, dtype=int) + + for i in range(n_permutations): + permuted_edges, stats = xswap._xswap_backend._xswap( + edge_list, [], max_id, allow_self_loops, allow_antiparallel, + num_swaps, initial_seed + i, max_malloc) + permuted_matrix = xswap.network_formats.edges_to_matrix( + permuted_edges, add_reverse_edges=(not allow_antiparallel), + shape=shape, dtype=int, sparse=True) + edge_counter += permuted_matrix + + return edge_counter + + +def compute_xswap_priors(edge_list: List[Tuple[int, int]], n_permutations: int, + shape: Tuple[int, int], allow_self_loops: bool = False, + allow_antiparallel: bool = False, + swap_multiplier: int = 10, initial_seed: int = 0, + max_malloc: int = 4000000000, + dtypes = {'id': numpy.uint16, 'degree': numpy.uint16, + 'edge': bool, 'xswap_prior': float}, + ): + """ + Compute the XSwap prior for every potential edge in the network. Uses + degree-grouping to maximize the effective number of permutations for each + node pair. That is, node pairs with the same source and target degrees can + be grouped when computing the XSwap prior, allowing there to be more + permutations for some node pairs than `n_permutations`. + + Note that the mechanics of this function are separated to minimize memory use. + + Parameters + ---------- + edge_list : List[Tuple[int, int]] + Edge list representing the graph whose XSwap edge priors are to be + computed. Tuples contain integer values representing nodes. No value + should be greater than C++'s `INT_MAX`, in this case 2_147_483_647. + An adjacency matrix will be created assuming that a node's value is its + index in the matrix. If not, map edges (identifiers can be string or + otherwise) using `xswap.preprocessing.map_str_edges`. + n_permutations : int + The number of permuted networks used to compute the empirical XSwap prior + shape : Tuple[int, int] + The shape of the matrix to be returned. In other words, a tuple of the + number of source and target nodes. + allow_self_loops : bool + Whether to allow edges like (0, 0). In the case of bipartite graphs, + such an edge represents a connection between two distinct nodes, while + in other graphs it may represent an edge from a node to itself, in which + case an edge may or may not be meaningful depending on context. + allow_antiparallel : bool + Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case + of bipartite graphs, these edges represent two connections between four + distinct nodes, while for other graphs, these may be connections between + the same two nodes. + swap_multiplier : float + The number of edge swap attempts is determined by the product of the + number of existing edges and multiplier. For example, if five edges are + passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer + products will be rounded down to the nearest integer. + initial_seed : int + Random seed that will be passed to the C++ Mersenne Twister 19937 random + number generator. `initial_seed` will be used for the first permutation, + and the seed used for each subsequent permutation will be incremented by + one. For example, if `initial_seed` is 0 and `n_permutations` is 2, then + the two permutations will pass seeds 0 and 1, respectively. + max_malloc : int (`unsigned long long int` in C) + The maximum amount of memory to be allocated using `malloc` when making + a bitset to hold edges. An uncompressed bitset is implemented for + holding edges that is significantly faster than alternatives. However, + it is memory-inefficient and will not be used if more memory is required + than `max_malloc`. Above the threshold, a Roaring bitset will be used. + dtypes : dict + Dictionary mapping returned column types to dtypes. Keys should be + `'id'`, `'degree'`, `'edge'`, and `'xswap_prior'`. `dtype` need only + be changed from its defaults if the values of `id` or `degree` are + greater than the maxima in the default dtypes, or in cases where greater + precision is desired. (`numpy.uint16` has a maximum value of 65535.) + + Returns + ------- + prior_df : pandas.DataFrame + Columns are the following: + [source_id, target_id, edge, source_degree, target_degree, xswap_prior] + """ + # Compute the adjacency matrix of the original (unpermuted) network + original_edges = xswap.network_formats.edges_to_matrix( + edge_list, add_reverse_edges=(not allow_antiparallel), shape=shape, + dtype=dtypes['edge'], sparse=True) + + # Setup DataFrame for recording prior data + prior_df = pandas.DataFrame({ + 'source_id': numpy.repeat(numpy.arange(shape[0], dtype=dtypes['id']), shape[1]), + 'target_id': numpy.tile(numpy.arange(shape[1], dtype=dtypes['id']), shape[0]), + 'edge': original_edges.toarray().flatten(), + }) + del original_edges + + prior_df['source_degree'] = (prior_df + .groupby('source_id') + .transform(sum)['edge'] + .astype(dtypes['degree'])) + del prior_df['source_id'] + + prior_df['target_degree'] = (prior_df + .groupby('target_id') + .transform(sum)['edge'] + .astype(dtypes['degree'])) + del prior_df['target_id'] + + # Compute the number of occurrences of each edge across permutations + edge_counter = compute_xswap_occurrence_matrix( + edge_list=edge_list, n_permutations=n_permutations, shape=shape, + allow_self_loops=allow_self_loops, allow_antiparallel=allow_antiparallel, + swap_multiplier=swap_multiplier, initial_seed=initial_seed, + max_malloc=max_malloc) + + prior_df['num_permuted_edges'] = edge_counter.astype(dtypes['degree']).toarray().flatten() + del edge_counter + + # The number of edges that occurred across all node pairs with the same + # `source_degree` and `target_degree` + dgp_edge_count = ( + prior_df + .groupby(['source_degree', 'target_degree']) + .transform(sum)['num_permuted_edges'] + .values + .astype(dtypes['degree']) + ) + del prior_df['num_permuted_edges'] + + # The effective number of permutations for every node pair, incorporating + # degree-grouping + num_dgp = ( + n_permutations * prior_df.groupby(['source_degree', 'target_degree']) + .transform(len)['edge'] + .values + .astype(dtypes['degree']) + ) + xswap_prior = (dgp_edge_count / num_dgp).astype(dtypes['xswap_prior']) + del dgp_edge_count, num_dgp + + prior_df['xswap_prior'] = xswap_prior + del xswap_prior + + prior_df = ( + prior_df + .assign( + source_id=numpy.repeat(numpy.arange(shape[0], dtype=dtypes['id']), shape[1]), + target_id=numpy.tile(numpy.arange(shape[1], dtype=dtypes['id']), shape[0]), + ) + .filter(items=['source_id', 'target_id', 'edge', 'source_degree', + 'target_degree', 'xswap_prior']) + ) + return prior_df + + +def approximate_xswap_prior(source_degree, target_degree, num_edges): + """ + Approximate the XSwap prior by assuming that the XSwap Markov Chain is stationary. + While this is not the case in reality, some networks' priors can be estimated + very well using this equation. + + Parameters + ---------- + source_degree : int, float, numpy.array, or pandas.Series + The source degree for a single node pair or a number of source degrees. + The type of object passed should match `target_degree`. + target_degree : int, float, numpy.array, or pandas.Series + The target degree for a single node pair or a number of target degrees. + The type of object passed should match `source_degree`. + num_edges : int or float + The total number of edges in the network + + Returns + ------- + approximate_prior : float, numpy.array, or pandas.Series + Output type matches the types of `source_degree` and `target_degree`. + """ + return source_degree * target_degree / ( + source_degree * target_degree + num_edges - source_degree - target_degree + 1 + )