Skip to content

Commit a62fec2

Browse files
author
Tomasz Zdybał
committed
Trigram indexing for regular expressions
Implementation of trigram indexing to avoid scanning/matching of all values with a regular expression. New tokenizer added (TrigramTokenizer). Task processing for the regexp function changed: - Trigram index is used to narrow the search space. - Only values returned from the trigram index lookup are matched with the regular expression. If the regexp is too wide-ranging (like: .*), or more than 1000000 values are returned for a trigram, execution is stopped for performance reasons.
1 parent 1e662e1 commit a62fec2

File tree

20 files changed

+3942
-37
lines changed

20 files changed

+3942
-37
lines changed

posting/index.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ func IndexTokens(attr, lang string, src types.Val) ([]string, error) {
7979
}
8080
tokens = append(tokens, toks...)
8181
}
82+
8283
return tokens, nil
8384
}
8485

query/query_test.go

Lines changed: 167 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ func populateGraph(t *testing.T) {
251251
addEdgeToTypedValue(t, "shadow_deep", 24, types.IntID, data.Value.([]byte), nil)
252252
}
253253

254-
// language stuff
254+
// Natural Language Processing test data
255255
// 0x1001 is uid of interest for language tests
256256
addEdgeToLangValue(t, "name", 0x1001, "Badger", "", nil)
257257
addEdgeToLangValue(t, "name", 0x1001, "European badger", "en", nil)
@@ -262,6 +262,23 @@ func populateGraph(t *testing.T) {
262262
addEdgeToLangValue(t, "name", 0x1001, "Барсук", "ru", nil)
263263
addEdgeToLangValue(t, "name", 0x1001, "Blaireau européen", "fr", nil)
264264

265+
// regex test data
266+
// 0x1234 is uid of interest for regex testing
267+
addEdgeToValue(t, "name", 0x1234, "Regex Master", nil)
268+
nextId := uint64(0x2000)
269+
patterns := []string{"mississippi", "missouri", "mission", "missionary",
270+
"whissle", "transmission", "zipped", "monosiphonic", "vasopressin", "vapoured",
271+
"virtuously", "zurich", "synopsis", "subsensuously",
272+
"admission", "commission", "submission", "subcommission", "retransmission", "omission",
273+
"permission", "intermission", "dimission", "discommission",
274+
}
275+
276+
for _, p := range patterns {
277+
addEdgeToValue(t, "value", nextId, p, nil)
278+
addEdgeToUID(t, "pattern", 0x1234, nextId, nil)
279+
nextId++
280+
}
281+
265282
addEdgeToValue(t, "name", 240, "Andrea With no friends", nil)
266283
time.Sleep(5 * time.Millisecond)
267284
}
@@ -2531,6 +2548,7 @@ func TestFilterRegexError(t *testing.T) {
25312548

25322549
func TestFilterRegex1(t *testing.T) {
25332550
populateGraph(t)
2551+
posting.CommitLists(10, 1)
25342552
query := `
25352553
{
25362554
me(id:0x01) {
@@ -2542,13 +2560,13 @@ func TestFilterRegex1(t *testing.T) {
25422560
}
25432561
`
25442562

2545-
js := processToFastJSON(t, query)
2546-
require.JSONEq(t,
2547-
`{"me":[{"name":"Michonne", "friend":[{"name":"Rick Grimes"},{"name":"Glenn Rhee"},{"name":"Daryl Dixon"}, {"name":"Andrea"}]}]}`, js)
2563+
_, err := processToFastJsonReq(t, query)
2564+
require.Error(t, err)
25482565
}
25492566

25502567
func TestFilterRegex2(t *testing.T) {
25512568
populateGraph(t)
2569+
posting.CommitLists(10, 1)
25522570
query := `
25532571
{
25542572
me(id:0x01) {
@@ -2560,18 +2578,18 @@ func TestFilterRegex2(t *testing.T) {
25602578
}
25612579
`
25622580

2563-
js := processToFastJSON(t, query)
2564-
require.JSONEq(t,
2565-
`{"me":[{"name":"Michonne", "friend":[{"name":"Rick Grimes"},{"name":"Glenn Rhee"}]}]}`, js)
2581+
_, err := processToFastJsonReq(t, query)
2582+
require.Error(t, err)
25662583
}
25672584

25682585
func TestFilterRegex3(t *testing.T) {
25692586
populateGraph(t)
2587+
posting.CommitLists(10, 1)
25702588
query := `
25712589
{
25722590
me(id:0x01) {
25732591
name
2574-
friend @filter(regexp(name, "^(Ri)")) {
2592+
friend @filter(regexp(name, "^Rick")) {
25752593
name
25762594
}
25772595
}
@@ -2583,6 +2601,139 @@ func TestFilterRegex3(t *testing.T) {
25832601
`{"me":[{"name":"Michonne", "friend":[{"name":"Rick Grimes"}]}]}`, js)
25842602
}
25852603

2604+
func TestFilterRegex4(t *testing.T) {
2605+
populateGraph(t)
2606+
posting.CommitLists(10, 1)
2607+
query := `
2608+
{
2609+
me(id:0x01) {
2610+
name
2611+
friend @filter(regexp(name, "((en)|(xo))n")) {
2612+
name
2613+
}
2614+
}
2615+
}
2616+
`
2617+
2618+
js := processToFastJSON(t, query)
2619+
require.JSONEq(t,
2620+
`{"me":[{"name":"Michonne", "friend":[{"name":"Glenn Rhee"},{"name":"Daryl Dixon"} ]}]}`, js)
2621+
}
2622+
2623+
func TestFilterRegex5(t *testing.T) {
2624+
populateGraph(t)
2625+
posting.CommitLists(10, 1)
2626+
query := `
2627+
{
2628+
me(id:0x01) {
2629+
name
2630+
friend @filter(regexp(name, "^[a-zA-z]*[^Kk ]?[Nn]ight")) {
2631+
name
2632+
}
2633+
}
2634+
}
2635+
`
2636+
2637+
js := processToFastJSON(t, query)
2638+
require.JSONEq(t,
2639+
`{"me":[{"name":"Michonne"}]}`, js)
2640+
}
2641+
2642+
func TestFilterRegex6(t *testing.T) {
2643+
populateGraph(t)
2644+
posting.CommitLists(10, 1)
2645+
time.Sleep(100 * time.Millisecond)
2646+
query := `
2647+
{
2648+
me(id:0x1234) {
2649+
pattern @filter(regexp(value, "miss((issippi)|(ouri))")) {
2650+
value
2651+
}
2652+
}
2653+
}
2654+
`
2655+
2656+
js := processToFastJSON(t, query)
2657+
require.JSONEq(t,
2658+
`{"me":[{"pattern":[{"value":"mississippi"}, {"value":"missouri"}]}]}`, js)
2659+
}
2660+
2661+
func TestFilterRegex7(t *testing.T) {
2662+
populateGraph(t)
2663+
posting.CommitLists(10, 1)
2664+
time.Sleep(100 * time.Millisecond)
2665+
query := `
2666+
{
2667+
me(id:0x1234) {
2668+
pattern @filter(regexp(value, "[aeiou]mission")) {
2669+
value
2670+
}
2671+
}
2672+
}
2673+
`
2674+
2675+
js := processToFastJSON(t, query)
2676+
require.JSONEq(t,
2677+
`{"me":[{"pattern":[{"value":"omission"}, {"value":"dimission"}]}]}`, js)
2678+
}
2679+
2680+
func TestFilterRegex8(t *testing.T) {
2681+
populateGraph(t)
2682+
posting.CommitLists(10, 1)
2683+
time.Sleep(100 * time.Millisecond)
2684+
query := `
2685+
{
2686+
me(id:0x1234) {
2687+
pattern @filter(regexp(value, "^(trans)?mission")) {
2688+
value
2689+
}
2690+
}
2691+
}
2692+
`
2693+
2694+
js := processToFastJSON(t, query)
2695+
require.JSONEq(t,
2696+
`{"me":[{"pattern":[{"value":"mission"}, {"value":"missionary"}, {"value":"transmission"}]}]}`, js)
2697+
}
2698+
2699+
func TestFilterRegex9(t *testing.T) {
2700+
populateGraph(t)
2701+
posting.CommitLists(10, 1)
2702+
time.Sleep(100 * time.Millisecond)
2703+
query := `
2704+
{
2705+
me(id:0x1234) {
2706+
pattern @filter(regexp(value, "s.{2,5}mission")) {
2707+
value
2708+
}
2709+
}
2710+
}
2711+
`
2712+
2713+
js := processToFastJSON(t, query)
2714+
require.JSONEq(t,
2715+
`{"me":[{"pattern":[{"value":"submission"}, {"value":"subcommission"}, {"value":"discommission"}]}]}`, js)
2716+
}
2717+
2718+
func TestFilterRegex10(t *testing.T) {
2719+
populateGraph(t)
2720+
posting.CommitLists(10, 1)
2721+
time.Sleep(100 * time.Millisecond)
2722+
query := `
2723+
{
2724+
me(id:0x1234) {
2725+
pattern @filter(regexp(value, "[^m]iss")) {
2726+
value
2727+
}
2728+
}
2729+
}
2730+
`
2731+
2732+
js := processToFastJSON(t, query)
2733+
require.JSONEq(t,
2734+
`{"me":[{"pattern":[{"value":"mississippi"}, {"value":"whissle"}]}]}`, js)
2735+
}
2736+
25862737
func TestToFastJSONFilterUID(t *testing.T) {
25872738
populateGraph(t)
25882739
query := `
@@ -5171,7 +5322,8 @@ func TestSchemaBlock1(t *testing.T) {
51715322
{Predicate: "loc", Type: "geo"}, {Predicate: "alive", Type: "bool"},
51725323
{Predicate: "shadow_deep", Type: "int"}, {Predicate: "friend", Type: "uid"},
51735324
{Predicate: "geometry", Type: "geo"}, {Predicate: "alias", Type: "string"},
5174-
{Predicate: "dob", Type: "date"}, {Predicate: "survival_rate", Type: "float"}}
5325+
{Predicate: "dob", Type: "date"}, {Predicate: "survival_rate", Type: "float"},
5326+
{Predicate: "value", Type: "string"}}
51755327
checkSchemaNodes(t, expected, actual)
51765328
}
51775329

@@ -5186,7 +5338,7 @@ func TestSchemaBlock2(t *testing.T) {
51865338
`
51875339
actual := processSchemaQuery(t, query)
51885340
expected := []*graphp.SchemaNode{
5189-
{Predicate: "name", Type: "string", Index: true, Tokenizer: []string{"term", "exact"}}}
5341+
{Predicate: "name", Type: "string", Index: true, Tokenizer: []string{"term", "exact", "trigram"}}}
51905342
checkSchemaNodes(t, expected, actual)
51915343
}
51925344

@@ -5226,12 +5378,12 @@ func TestSchemaBlock5(t *testing.T) {
52265378
`
52275379
actual := processSchemaQuery(t, query)
52285380
expected := []*graphp.SchemaNode{
5229-
{Predicate: "name", Type: "string", Index: true, Tokenizer: []string{"term", "exact"}}}
5381+
{Predicate: "name", Type: "string", Index: true, Tokenizer: []string{"term", "exact", "trigram"}}}
52305382
checkSchemaNodes(t, expected, actual)
52315383
}
52325384

52335385
const schemaStr = `
5234-
name:string @index(term, exact) .
5386+
name:string @index(term, exact, trigram ) .
52355387
alias:string @index(exact, term, fulltext) .
52365388
dob:date @index .
52375389
film.film.initial_release_date:date @index .
@@ -5243,6 +5395,7 @@ age : int .
52435395
shadow_deep : int .
52445396
friend:uid @reverse .
52455397
geometry:geo @index .
5398+
value:string @index(trigram) .
52465399
`
52475400

52485401
func TestMain(m *testing.M) {
@@ -5267,7 +5420,8 @@ func TestMain(m *testing.M) {
52675420

52685421
worker.StartRaftNodes(dir2)
52695422
// Load schema after nodes have started
5270-
schema.ParseBytes([]byte(schemaStr), 1)
5423+
err = schema.ParseBytes([]byte(schemaStr), 1)
5424+
x.Check(err)
52715425
defer os.RemoveAll(dir2)
52725426

52735427
os.Exit(m.Run())

tok/tok.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package tok
1919

2020
import (
21+
"bytes"
2122
"encoding/binary"
2223
"time"
2324

@@ -59,6 +60,7 @@ func init() {
5960
RegisterTokenizer(TermTokenizer{})
6061
RegisterTokenizer(ExactTokenizer{})
6162
RegisterTokenizer(BoolTokenizer{})
63+
RegisterTokenizer(TrigramTokenizer{})
6264
SetDefault(types.GeoID, "geo")
6365
SetDefault(types.IntID, "int")
6466
SetDefault(types.FloatID, "float")
@@ -238,6 +240,12 @@ func EncodeGeoTokens(tokens []string) {
238240
}
239241
}
240242

243+
func EncodeRegexTokens(tokens []string) {
244+
for i := 0; i < len(tokens); i++ {
245+
tokens[i] = encodeToken(tokens[i], TrigramTokenizer{}.Identifier())
246+
}
247+
}
248+
241249
type BoolTokenizer struct{}
242250

243251
func (t BoolTokenizer) Name() string { return "bool" }
@@ -251,3 +259,27 @@ func (t BoolTokenizer) Tokens(v types.Val) ([]string, error) {
251259
}
252260
func (t BoolTokenizer) Identifier() byte { return 0x9 }
253261
func (t BoolTokenizer) IsSortable() bool { return false }
262+
263+
type TrigramTokenizer struct{}
264+
265+
func (t TrigramTokenizer) Name() string { return "trigram" }
266+
func (t TrigramTokenizer) Type() types.TypeID { return types.StringID }
267+
func (t TrigramTokenizer) Tokens(sv types.Val) ([]string, error) {
268+
value, ok := sv.Value.(string)
269+
if !ok {
270+
return nil, x.Errorf("Trigram indices only supported for string types")
271+
}
272+
runes := bytes.Runes([]byte(value))
273+
l := len(runes) - 2
274+
if l > 0 {
275+
tokens := make([]string, l)
276+
for i := 0; i < l; i++ {
277+
trigram := string(runes[i : i+3])
278+
tokens[i] = encodeToken(trigram, t.Identifier())
279+
}
280+
return tokens, nil
281+
}
282+
return nil, nil
283+
}
284+
func (t TrigramTokenizer) Identifier() byte { return 0xA }
285+
func (t TrigramTokenizer) IsSortable() bool { return false }

tok/tok_test.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,33 @@ func TestTermTokenizer(t *testing.T) {
9898
val.Value = "Tokenizer works!"
9999

100100
tokens, err := tokenizer.Tokens(val)
101-
require.Nil(t, err)
101+
require.NoError(t, err)
102102
require.Equal(t, 2, len(tokens))
103103
id := tokenizer.Identifier()
104104
require.Equal(t, []string{encodeToken("tokenizer", id), encodeToken("works", id)}, tokens)
105105
}
106+
107+
func TestTrigramTokenizer(t *testing.T) {
108+
tokenizer, has := GetTokenizer("trigram")
109+
require.True(t, has)
110+
require.NotNil(t, tokenizer)
111+
val := types.ValueForType(types.StringID)
112+
val.Value = "Dgraph rocks!"
113+
tokens, err := tokenizer.Tokens(val)
114+
require.NoError(t, err)
115+
require.Equal(t, 11, len(tokens))
116+
id := tokenizer.Identifier()
117+
require.Equal(t, []string{
118+
encodeToken("Dgr", id),
119+
encodeToken("gra", id),
120+
encodeToken("rap", id),
121+
encodeToken("aph", id),
122+
encodeToken("ph ", id),
123+
encodeToken("h r", id),
124+
encodeToken(" ro", id),
125+
encodeToken("roc", id),
126+
encodeToken("ock", id),
127+
encodeToken("cks", id),
128+
encodeToken("ks!", id),
129+
}, tokens)
130+
}

0 commit comments

Comments
 (0)