Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added the function construct_pdp_query to build cypher queries that c… #30

Merged
merged 6 commits into from
Jan 10, 2019
165 changes: 165 additions & 0 deletions hetio/neo4j.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,171 @@ def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_

return query

def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoint', index_hint=False, unique_nodes=True):
    """
    Create a cypher query for computing the path degree product for a type of path.

    This function is very similar to construct_dwpc_query, with the main
    changes occurring in the query's aggregation level: one row is returned
    per path rather than a single aggregate over all paths.

    Parameters
    ----------
    metarels : a metarels or MetaPath object
        the metapath (path type) to create a query for
    dwpc : number, optional
        the degree-weighted path count for the metapath. It is interpolated
        into the query text as the divisor of each path's PDP. If dwpc is not
        provided, a subquery will be added to calculate it.
    property : str
        which property to use for source and target node lookup
    join_hint : 'midpoint', bool, or int
        whether to add a join hint to tell neo4j to traverse from both ends of
        the path and join at a specific index. `'midpoint'` or `True` specifies
        joining at the middle node in the path (rounded down if an even number
        of nodes). `False` specifies not to add a join hint. An int specifies
        the node to join on.
    index_hint : bool
        whether to add index hints which specifies the properties of the source
        and target nodes to use for lookup. Enabling both `index_hint` and
        `join_hint` can cause the query to fail.
    unique_nodes : bool or str
        whether to exclude paths with duplicate nodes. To not enforce node
        uniqueness, use `False`. Methods for enforcing node uniqueness are:
        `nested` the path-length independent query (`ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)`)
        `expanded` for the combinatorial and path-length dependent form (`NOT (n0=n1 OR n0=n2 OR n0=n3 OR n1=n2 OR n1=n3 OR n2=n3)`).
        `labeled` to perform an intelligent version of `expanded` where only
        nodes with the same label are checked for duplicity. Specifying `True`,
        which is the default, uses the `labeled` method.

    Returns
    -------
    str
        a cypher query with the placeholders `{ source }`, `{ target }` and
        `{ w }` (each degree is raised to the power `-w`). The query returns
        `path`, `PDP` and `PERCENT_OF_DWPC` (100 * PDP / DWPC) for every
        matching path, ordered by `PERCENT_OF_DWPC` descending.

    Raises
    ------
    AssertionError
        if `join_hint` resolves to an index outside `0..len(metarels)` or if
        `unique_nodes` is not one of the supported values.
    """
    # TODO: much of the setup below duplicates construct_dwpc_query; consider
    # factoring the shared pieces into helpers that both functions call.

    # Convert metapath to metarels
    if isinstance(metarels, hetio.hetnet.MetaPath):
        metarels = metapath_to_metarels(metarels)
    length = len(metarels)

    # create cypher path query
    metapath_query = cypher_path(metarels)

    # create cypher path degree query: for every edge in the path, the degree
    # of its source node and of its target node along that relationship type
    degree_template = textwrap.dedent('''\
        size((n{i0}){dir0}[:{rel_type}]{dir1}()),
        size((){dir0}[:{rel_type}]{dir1}(n{i1}))''')
    degree_strs = list()
    for i, (_, _, rel_type, direction) in enumerate(metarels):
        degree_strs.append(degree_template.format(
            i0=i,
            i1=i + 1,
            rel_type=rel_type,
            dir0='<-' if direction == 'backward' else '-',
            dir1='->' if direction == 'forward' else '-',
        ))
    degree_query = ',\n'.join(degree_strs)

    using_query = ''
    # Specify index hint for node lookup
    if index_hint:
        using_query = '\n' + textwrap.dedent('''\
            USING INDEX n0:{source_label}({property})
            USING INDEX n{length}:{target_label}({property})
            ''').rstrip().format(
            property=property,
            source_label=metarels[0][0],
            target_label=metarels[-1][1],
            length=length,
        )

    # Specify join hint with node to join on
    if join_hint is not False:
        if join_hint is True or join_hint == 'midpoint':
            join_hint = length // 2
        join_hint = int(join_hint)
        assert join_hint >= 0, 'join_hint must not be negative'
        assert join_hint <= length, 'join_hint exceeds the number of nodes in the path'
        using_query += "\nUSING JOIN ON n{}".format(join_hint)

    # Unique node constraint (prevent paths with duplicate nodes)
    if unique_nodes == 'nested':
        unique_nodes_query = '\nAND ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)'
    elif unique_nodes == 'expanded':
        pairs = itertools.combinations(range(length + 1), 2)
        unique_nodes_query = format_expanded_clause(pairs)
    elif unique_nodes == 'labeled' or unique_nodes is True:
        # Only nodes sharing a label can be the same node, so only those
        # index pairs need an inequality check.
        labels = [metarel[0] for metarel in metarels]
        labels.append(metarels[-1][1])
        label_to_nodes = dict()
        for i, label in enumerate(labels):
            label_to_nodes.setdefault(label, list()).append(i)
        pairs = list()
        for nodes in label_to_nodes.values():
            pairs.extend(itertools.combinations(nodes, 2))
        unique_nodes_query = format_expanded_clause(pairs)
    else:
        assert unique_nodes is False, 'unsupported value for unique_nodes'
        unique_nodes_query = ''

    # Path-matching preamble shared by every stage of the query. It is
    # formatted exactly once here; the fragments appended below are plain
    # strings, so their `{ w }` placeholders are written with single braces.
    match_clause = textwrap.dedent('''\
        MATCH path = {metapath_query}{using_query}
        WHERE n0.{property} = {{ source }}
        AND n{length}.{property} = {{ target }}{unique_nodes_query}
        WITH
        [
        {degree_query}
        ] AS degrees, path''').format(
        metapath_query=metapath_query,
        using_query=using_query,
        unique_nodes_query=unique_nodes_query,
        degree_query=degree_query,
        length=length,
        property=property,
    )
    pdp_expression = 'reduce(pdp = 1.0, d in degrees| pdp * d ^ -{ w })'

    # combine cypher fragments into a single query and add PDP logic
    if dwpc is not None:
        query_lines = [
            match_clause,
            'WITH path, ' + pdp_expression + ' as PDP',
            'RETURN',
            'path,',
            'PDP,',
            '100 * (PDP / {}) AS PERCENT_OF_DWPC'.format(dwpc),
            'ORDER BY PERCENT_OF_DWPC DESC',
        ]
    else:
        # If the dwpc isn't provided, we'll have to calculate it before the PDP.
        # Doing so roughly doubles the query execution time, as it effectively
        # runs the query twice returning different degrees of aggregation.
        # TODO: find out (e.g. ask on Stack Overflow under the neo4j tag)
        # whether Cypher can do the sum aggregation and then combine it with
        # the pre-aggregated result table in a single pass.
        query_lines = [
            match_clause,
            'WITH sum(' + pdp_expression + ') as DWPC',
            '',
            match_clause + ', DWPC',
            'WITH path, DWPC, ' + pdp_expression + ' as PDP',
            'RETURN',
            'path,',
            'PDP,',
            '100 * (PDP / DWPC) AS PERCENT_OF_DWPC',
            'ORDER BY PERCENT_OF_DWPC DESC',
        ]

    return '\n'.join(query_lines)

def format_expanded_clause(pairs):
"""
Given an iterable of node-index pairs, return a cypher `WHERE` clause
Expand Down