First, we download the webpage of the Practical Gremlin book

In [2]:
from bs4 import BeautifulSoup
import requests
import re
response = requests.get("https://kelvinlawrence.net/book/Gremlin-Graph-Guide.html")

Then we extract all Gremlin queries on this page

In [3]:
pattern = re.compile(r"g\.(?:V|E|add).*(?:\;|$)")
newpattern = re.compile(r"g\.(?:V|E|add)(?:[^\)]+\)+[\.,][a-zA-Z])*[^\)]+\)+")

In [4]:
# Assuming the response from the GET request is stored in the variable 'response'
soup = BeautifulSoup(response.text, 'html.parser')

#find all sections
sections = soup.find_all('div', {"class": "sect1"})
section_filter = filter(lambda x: x.find('h2').get_text().split(".")[0] in ["3", "4", "5"], sections)

queries = []

for section in section_filter:
	# Find all code blocks in the HTML page
	code_blocks = section.find_all('code')

	# Extract the text from each code block
	code_texts = (line for code in code_blocks for line in code.get_text(strip=True).split("\n") if len(line)>0)

	# Print the extracted code texts
	for code_text in code_texts:
		# Find all occurrences of the pattern in the code text
		queries.extend(newpattern.findall(code_text))

In [5]:
len(queries)

908

In [6]:
for i, query in enumerate(queries):
	print(f"{i+1}: {query}")

1: g.V().hasLabel('airport').groupCount().by('country')
2: g.V().has('code','AUS').out().out().out().has('code','AGR').path().by('code')
3: g.V().has('code','AUS').repeat(out()).times(3).has('code','AGR').path().by('code')
4: g.V().hasLabel('airport')
5: g.V().has('code','DFW')
6: g.V().hasLabel('airport').has('code','DFW')
7: g.V().has('airport','code','DFW')
8: g.V().has('airport','code','DFW').next().getClass()
9: g.V().has('airport','code','DFW').values()
10: g.V().has('airport','code','DFW').values('city')
11: g.V().has('airport','code','DFW').values('runways','icao')
12: g.E().has('dist')
13: g.V().has('region')
14: g.V().hasNot('region')
15: g.V().not(has('region'))
16: g.V().hasLabel('airport').count()
17: g.V().hasLabel('airport').outE('route').count()
18: g.V().outE('route').count()
19: g.E().hasLabel('route').count()
20: g.V().groupCount().by(label)
21: g.V().label().groupCount()
22: g.E().groupCount().by(label)
23: g.E().label().groupCount()
24: g.V().group().by(label).by(c

Now, we'd like to find all steps used in these queries

In [7]:
steps = {}
steps_pattern = re.compile(r"(\w+)\(")
for query in queries:
	for step in steps_pattern.findall(query):
		steps[step] = steps.get(step, 0) + 1

In [8]:
print(len(steps))
print(sorted(steps.items(), key=lambda x: x[1], reverse=True))

131
[('V', 885), ('has', 736), ('by', 637), ('out', 415), ('values', 265), ('count', 241), ('hasLabel', 239), ('as', 209), ('select', 176), ('limit', 174), ('path', 149), ('fold', 108), ('outE', 92), ('order', 86), ('property', 83), ('group', 80), ('is', 80), ('valueMap', 79), ('where', 78), ('repeat', 74), ('next', 72), ('unfold', 72), ('inV', 65), ('gt', 63), ('properties', 52), ('constant', 49), ('groupCount', 48), ('addV', 45), ('project', 40), ('times', 38), ('simplePath', 37), ('sack', 36), ('to', 34), ('dedup', 34), ('neq', 34), ('local', 34), ('within', 34), ('hasId', 32), ('union', 31), ('addE', 31), ('until', 30), ('E', 28), ('from', 28), ('filter', 24), ('in', 23), ('label', 22), ('not', 21), ('id', 20), ('toList', 19), ('between', 19), ('lt', 18), ('sum', 18), ('choose', 18), ('emit', 18), ('eq', 15), ('bothE', 15), ('both', 14), ('without', 14), ('coalesce', 14), ('cap', 14), ('math', 14), ('drop', 14), ('value', 14), ('and', 13), ('mean', 12), ('sample', 12), ('identity',

In [9]:
#steps in PyMogwai
steps_in_pymogwai = {"filter", "has", "hasNot", "hasId", "hasName", "hasLabel", "is",
                     "contains", "simplePath", "limit", "range", "skip", "dedup", "identity", "name",
                     "value", "key", "id", "label", "properties", "values", "select", "order", "count", "path", "elementMap",
                     "max", "min", "sum", "mean", "out", "outE", "outV", "inE", "inV",
                     "in", "both", "bothE", "bothV", "repeat", "branch", "option", "until",
                     "times", "emit", "as", "by", "union", "local", "sideEffect", "property",
                     "toList", "next","iter","asPath", "hasNext",
                     "V", "E", "addV", "addE",
                     "gt", "gte", "lt", "lte", "inside", "outside", "within", "without", "between", "or", "not", "and"}
len(steps_in_pymogwai)

72

In [10]:
query_step_map = {query: steps_pattern.findall(query) for query in queries}
counter = 0
for query, steps in query_step_map.items():
	if len(set(steps)-steps_in_pymogwai)>0:
		counter += 1
		print(f"{query}: {set(steps)-steps_in_pymogwai}")
print(f"{counter} queries contain steps not in PyMogwai")

g.V().hasLabel('airport').groupCount().by('country'): {'groupCount'}
g.V().has('airport','code','DFW').next().getClass(): {'getClass'}
g.V().groupCount().by(label): {'groupCount'}
g.V().label().groupCount(): {'groupCount'}
g.E().groupCount().by(label): {'groupCount'}
g.E().label().groupCount(): {'groupCount'}
g.V().group().by(label).by(count()): {'group'}
g.V().hasLabel('country').group().by('code').by(out().count()): {'group'}
g.V().hasLabel('continent').group().by('code').by(out().count()): {'group'}
g.V().hasLabel('airport').groupCount().by('country').select('FR'): {'groupCount'}
g.V().hasLabel('airport').groupCount().by('country').select('FR','GR','BE'): {'groupCount'}
g.V().has('airport','code','AUS').out().values('code').fold(): {'fold'}
g.V().has('airport','code','AUS').out('route').values('code').fold(): {'fold'}
g.V(3).out().limit(5).path().by(values('code','city').fold()): {'fold'}
g.V().has('airport','code','AUS').out().as('a').out().as('b').path().by('code').from('a').to('b

In [11]:
valid_queries = []
with open("queries.txt", "w") as f:
	for query, steps in query_step_map.items():
		if len(set(steps)-steps_in_pymogwai)==0:
			valid_queries.append(query)
			f.write(f"{query}\n")
	print(f"Saved {len(valid_queries)} queries")

Saved 337 queries


In [12]:
#take a random sample of 100 queries
import random
random.seed(42)
sample = random.sample(valid_queries, 100)
print("\n".join(sample))

g.V().hasLabel('airport').not(outE('route')).count()
g.V().has(id,8).values('code')
g.V().not(has('region'))
g.V(3).both().dedup().count()
g.V().has('region','US-TX').has('longest',gte(12000))
g.V().has('code','AUS').out().has('country',without('US','CA')).values('city')
g.V().has('airport','code','SYD')
g.V(3).outE().limit(1).elementMap()
g.V().not(bothE()).count()
g.V().hasLabel('airport').range(0,20).values('code')
g.V().has('airport','country','US').outE().has('dist',within(100..200)).inV().has('country','US').path().by('code').by('dist').count()
g.V().has('code','XYZ')
g.E().hasLabel('route').count()
g.V().outE('route').count()
g.V().range(3500,-1)
g.V().has('airport','code','AUS').out().has('code',without('DFW','LAX')).out().has('code','SYD').path().by('code')
g.V(airports[x-1])
g.V().hasLabel('airport').values('longest').mean().next()
g.V().has('airport','code','AUS').as('aus').out().has('country',within('US','CA')).has('lon',lte(dfw)).has('code',without('PHX','LAX')).out().has(

just see how many of those queries we can run without an error

In [1]:
import sys
sys.path.insert(0,"../")
from mogwai.core.traversal import Traversal
from mogwai.core.traversal import MogwaiGraphTraversalSource
from mogwai.core.steps.statics import *
from mogwai.parser import graphml_to_mogwaigraph
import random

In [2]:
steps_map = {k: f"{k}_" for k in ["and", "or", "not", "in", "filter", "from", "to", "as", "global", "list", "map", "dict", "set", "id", "min", "max", "sum"]}

def convert_query(query:str):
	for step, replacement in steps_map.items():
		query = query.replace(f".{step}(", f".{replacement}(")
		query = query.replace(f"({step}(", f"({replacement}(")
	query.replace("__.", "")
	return query

In [4]:
graph = graphml_to_mogwaigraph(
    "documents/air-routes-latest.graphml",
    node_label_key="labelV",
    edge_label_key="labelE",
    node_name_key="code"
)
g = MogwaiGraphTraversalSource(graph)
#airports = g.V().has_label("airport").to_list().run()
#x = random.randint(1, len(airports))
#r = g.E().has_label('route').values('dist').max_().next().run()

In [5]:
with open("queries.txt", "r") as f:
	queries = f.readlines()
queries = [query.strip() for query in queries]
print(f"Loaded {len(queries)} queries")

Loaded 337 queries


In [8]:
could_not_create = []
could_not_execute = []
executed = []
for query in queries:
	mogwai_query = convert_query(query)
	print(f"Gremlin Query: {query}")
	print(f"Mogwai Query:  {mogwai_query}")
	try:
		q = eval(mogwai_query)
		print(f"Mogwai Query:  {q.print_query()}")
	except Exception as e:
		print("Could not create query:", e)
		could_not_create.append((query,e))
		try:
			res = q.run()
			print(f"Result:", res)
			executed.append((query, res))
		except Exception as e:
			could_not_execute.append((query,e))
			print("Could not execute query:", e)
	print("")

Gremlin Query: g.V().has('code','AUS').out().out().out().has('code','AGR').path().by('code')
Mogwai Query:  g.V().has('code','AUS').out().out().out().has('code','AGR').path().by('code')
Mogwai Query:  V -> Has -> Out -> Out -> Out -> Has -> Path

Gremlin Query: g.V().has('code','AUS').repeat(out()).times(3).has('code','AGR').path().by('code')
Mogwai Query:  g.V().has('code','AUS').repeat(out()).times(3).has('code','AGR').path().by('code')
Mogwai Query:  V -> Has -> Repeat(Out, x3) -> Has -> Path

Gremlin Query: g.V().hasLabel('airport')
Mogwai Query:  g.V().hasLabel('airport')
Mogwai Query:  V -> Contains

Gremlin Query: g.V().has('code','DFW')
Mogwai Query:  g.V().has('code','DFW')
Mogwai Query:  V -> Has

Gremlin Query: g.V().hasLabel('airport').has('code','DFW')
Mogwai Query:  g.V().hasLabel('airport').has('code','DFW')
Mogwai Query:  V -> Contains -> Has

Gremlin Query: g.V().has('airport','code','DFW')
Mogwai Query:  g.V().has('airport','code','DFW')
Mogwai Query:  V -> Has

Greml

You passed only one argument to `without` that is not a tuple, list, range or generator. This will be treated as a single value. If you want to check inequality to a single value, use `neq` instead.


Result: False

Gremlin Query: g.addV().property(id,n)
Mogwai Query:  g.addV().property(id,n)
Could not create query: MogwaiGraphTraversalSource.addV() missing 1 required positional argument: 'label'
Result: False

Gremlin Query: g.addV("airport").property("code",a[0],"iata",a[1]).next()
Mogwai Query:  g.addV("airport").property("code",a[0],"iata",a[1]).next()
Could not create query: name 'a' is not defined
Result: False

Gremlin Query: g.addV("airport").property("code",it[0],"iata",it[1]).next()
Mogwai Query:  g.addV("airport").property("code",it[0],"iata",it[1]).next()
Could not create query: name 'it' is not defined
Result: False

Gremlin Query: g.V().has('code','XYZ')
Mogwai Query:  g.V().has('code','XYZ')
Mogwai Query:  V -> Has

Gremlin Query: g.V().has('airport','code','DFW').properties().value()
Mogwai Query:  g.V().has('airport','code','DFW').properties().value()
Mogwai Query:  V -> Has -> Properties -> Value

Gremlin Query: g.V().has('airport','code','DFW').properties()
Mogwai

In [9]:
len(could_not_create), len(could_not_execute), len(executed)

(97, 7, 90)

In [7]:
print("\n".join([f"{q} -> {e}" for q, e in could_not_create]))

g.V().has('airport','code','LCY').outE().inV().path().by('code').by('dist') -> Step `Path` does not support multiple by-modulations.
g.V().has('airport','code','LCY').outE().inV().path().by('code').by('dist').by('code') -> Step `Path` does not support multiple by-modulations.
g.V().has('airport','code','LCY').outE().inV().path().by('code').by('dist').by('city') -> Step `Path` does not support multiple by-modulations.
g.V().has('airport','code','LCY').out().limit(5).values('runways').path().by('code').by('code').by() -> Step `Path` does not support multiple by-modulations.
g.V(3).out().limit(5).path().by(out().count()) -> Step `Path` does not support anonymous traversals as by-modulations.
g.V().has('type','airport').limit(10).as('a','b','c').select('a','b','c').by('code').by('region').by(out().count()) -> Traversal.as_() takes 2 positional arguments but 4 were given
g.V(1).as('a').V(2).as('a').select('a') -> 'Traversal' object has no attribute 'V'
g.V(1).as('a').V(2).as('a').select(fir

Now we try this again with manually filtered queries

In [None]:
with open("queries-sanitized.txt", "r") as f:
	queries = f.readlines()
queries = [query.strip() for query in queries]
print(f"Loaded {len(queries)} queries")
could_not_create = []
could_not_execute = []
for query in queries:
	mogwai_query = convert_query(query)
	print(f"Gremlin Query: {query}")
	print(f"Mogwai Query:  {mogwai_query}")
	try:
		q = eval(mogwai_query)
		print(f"Mogwai Query:  {q.print_query()}")
	except Exception as e:
		print("Could not create query:", e)
		could_not_create.append((query,e))
		try:
			print(f"Result:", q.run())
		except Exception as e:
			could_not_execute.append((query,e))
			print("Could not execute query:", e)
	print("")