Skip to content

Commit

Permalink
support for FTS (word) search operators
Browse files Browse the repository at this point in the history
  • Loading branch information
melissaboiko committed Aug 25, 2014
1 parent fb9910a commit 5b89027
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 38 deletions.
31 changes: 20 additions & 11 deletions README.md
Expand Up @@ -25,16 +25,25 @@ Sample usage

Some example queries:

$ myougiden -h # long help
$ myougiden tea ceremony # guess what kind of query to run
$ myougiden 茶 # ibid
$ myougiden -p 茶 # include partial matches
$ myougiden -p -f 茶 # ...but limit to frequent words
$ myougiden -p -f -t 茶 # ...and tab-separated, single-line output
$ myougiden -x '茶.' # regexp search
$ myougiden chanoyu # if no match is found, treat as rōmaji
$ myougiden -r kanji # forces rōmaji
$ myougiden -a uK # consult documentation for EDICT-style abbreviations
$ myougiden tea ceremony # guess what kind of query to run
$ myougiden 茶 # ibid
$ myougiden chanoyu # if no match is found, treat as rōmaji
$ myougiden -r kanji # forces rōmaji

$ myougiden -w flower tea # word search; return matches including both
$ myougiden -w flower -tea # matches include word 'flower' but not 'tea'
$ myougiden -w 'tea ceremony' # matches include the phrase in this order

$ myougiden -b 茶 # beginning word search; starts with 茶

$ myougiden -p 茶 # partial match anywhere
$ myougiden -p -f 茶 # ...but limit to frequent words
$ myougiden -p -f -t 茶 # ...and tab-separated, single-line output

$ myougiden -x '茶$' # regexp search

$ myougiden -h # long help
$ myougiden -a uK # consult documentation for abbreviations

Screenshots
===========
Expand All @@ -57,7 +66,7 @@ Then, you need to compile the dictionary database at least once:

$ sudo updatedb-myougiden -f
# This command downloads and compiles JMdict.
# It's a bit heavy, go have some coffee...
# It's a bit slow, go have some coffee...

That's it, have fun!

Expand Down
12 changes: 8 additions & 4 deletions bin/myougiden
Expand Up @@ -112,7 +112,8 @@ ag.add_argument('-a', '--abbrev', metavar='ABBREV', default=None,
help='''Print meaning of an abbreviation.''')


ap.add_argument('query', help='Text to look for.', metavar='QUERY', nargs='*')
ap.add_argument('query', help='Text to look for.', metavar='QUERY',
nargs=argparse.REMAINDER)


args = ap.parse_args()
Expand Down Expand Up @@ -144,11 +145,14 @@ if args.debug:
common.debug = True


args.query = ' '.join(args.query)
args.query_s = ' '.join(args.query)

# case sensitivity must be handled before opening db
if not args.case_sensitive:
if re.search("[A-Z]", args.query):
# debian sqlite currently doesn't support enhanced query syntax
# minus_keywords = re.sub('(NOT|OR|AND)', '', args.query_s)
# if (re.search("[A-Z]", minus_keywords)):
if (re.search("[A-Z]", args.query_s)):
args.case_sensitive = True

if not config:
Expand Down Expand Up @@ -203,7 +207,7 @@ elif args.abbrev:
print('Not found!')
sys.exit(0)

if args.query == '':
if args.query == []:
ap.print_help()
sys.exit(2)

Expand Down
85 changes: 62 additions & 23 deletions myougiden/search.py
Expand Up @@ -25,6 +25,7 @@ def __init__(self,
self.extent = extent

self.query = query
self.query_s = ' '.join(query)
self.case_sensitive = cmdline_args.case_sensitive
self.frequent = cmdline_args.frequent

Expand All @@ -35,10 +36,10 @@ def extent_sort_key(self):
return ['whole','word','beginning','partial'].index(self.extent)

def field_sort_key(self):
if tt.is_kana(self.args.query):
if tt.is_kana(self.args.query_s):
return ['reading', 'kanji'].index(self.field)

elif tt.is_latin(self.args.query):
elif tt.is_latin(self.args.query_s):
if self.args.field == 'reading':
# try it converted to kana first
return ['reading', 'gloss', 'kanji'].index(self.field)
Expand Down Expand Up @@ -71,23 +72,23 @@ def sort_key(self):

def __repr__(self):
return("'%s': regexp %s, field %s, extent %s\n sort key: %s" %
(self.query, self.regexp, self.field, self.extent,
(list(self.query), self.regexp, self.field, self.extent,
self.sort_key()))

def generate_search_conditions(args):
'''args = command-line argument dict (argparse object)'''

if args.regexp:
regexp_flags = (True,)
elif tt.has_regexp_special(args.query):
elif tt.has_regexp_special(args.query_s):
regexp_flags = (False, True)
else:
regexp_flags = (False,)

if args.field != 'auto':
fields = (args.field,)
else:
if tt.is_kana(args.query):
if tt.is_kana(args.query_s):
fields = ('kanji', 'reading')
else:
fields = ('kanji', 'reading', 'gloss')
Expand All @@ -103,7 +104,7 @@ def generate_search_conditions(args):
for field in fields:
for extent in extents:

if field == 'gloss' and extent == 'beginning':
if field == 'gloss' and extent == 'beginning' and args.extent == 'auto':
# when we search for e.g. 'man' in auto guesses, a beginning-only
# match would return 'manatee' but not 'humanity', which is
# typically not what we want
continue
Expand All @@ -116,11 +117,22 @@ def generate_search_conditions(args):
# useless combination requested, adjust
extent = 'whole'

if field == 'reading' and tt.is_latin(args.query):
if field == 'reading' and tt.is_latin(args.query_s):
# the 'reading' field auto-converts romaji to kana. as of this
# writing, JMdict has no romaji in reading fields.
queries = (romkan.to_hiragana(args.query),
romkan.to_katakana(args.query))
queries = ([romkan.to_hiragana(s) for s in args.query],
[romkan.to_katakana(s) for s in args.query])

# romkan will convert ASCII hyphen-minus to CJKV long 'ー'
# we back-convert it in start position, to preserve FTS
# operator '-'.
def fix_hyphen(s):
if len(s) > 1 and s[0] == 'ー':
s = '-' + s[1:]
return s

queries = [[fix_hyphen(s) for s in query]
for query in queries]
else:
queries = (args.query,)
# TODO: add wide-char
Expand All @@ -134,12 +146,40 @@ def search_by(cur, cond):
'''Main search function. Take a SearchCondition object, return list of ent_seqs.
'''

query = cond.query[:]
if ((cond.field == 'gloss' and cond.case_sensitive)
or cond.extent in ('whole', 'partial')):
fts=False
query_s = cond.query_s[:]
else:
fts=True
query_prep = []
for cmdline_arg in cond.query:
# unify Japanese spaces, newlines etc. to a single space
re.sub('\\s+', ' ', cmdline_arg)

# if the user called
#
# myougiden 'full phrase'
#
# or
# myougiden "full phrase"
#
# we'll get a single argv with spaces in it. in this case we
# translate the argv to the string
#
# "full phrase"
#
# (including the double quotes), which is what sqlite FTS looks
# for.
if ' ' in cmdline_arg:
cmdline_arg = '"' + cmdline_arg + '"'

if cond.extent == 'beginning':
cmdline_arg += '*'

query_prep.append(cmdline_arg)

query_s = ' '.join(query_prep)

if fts:
if cond.field == 'kanji':
Expand All @@ -163,23 +203,21 @@ def search_by(cur, cond):
operator = 'REGEXP ?'

if cond.extent == 'whole':
query = '^' + query + '$'
query_s = '^' + query_s + '$'
elif cond.extent == 'beginning':
query = '^' + query
query_s = '^' + query_s
elif cond.extent == 'word':
query = r'\b' + query + r'\b'
query_s = r'\b' + query_s + r'\b'

else:
if fts:
operator = 'MATCH ?'
if cond.extent == 'beginning':
query = query + '*'

else:

if cond.extent == 'whole':
operator = '= ?'
query = query.replace('\\', '\\\\')
query_s = query_s.replace('\\', '\\\\')
if cond.case_sensitive and cond.field == 'gloss':
where_extra = 'COLLATE BINARY';

Expand All @@ -190,13 +228,13 @@ def search_by(cur, cond):
operator = r"LIKE ? ESCAPE '\'"

# my editor doesn't like raw strings
# query = query.replace(r'\', r'\\')
query = query.replace('\\', '\\\\')
# query_s = query_s.replace(r'\', r'\\')
query_s = query_s.replace('\\', '\\\\')

query = query.replace('%', r'\%')
query = query.replace('_', r'\_')
query_s = query_s.replace('%', r'\%')
query_s = query_s.replace('_', r'\_')

query = '%' + query + '%'
query_s = '%' + query_s + '%'

if cond.frequent:
where_extra += ' AND %s.frequent = 1' % table
Expand All @@ -208,7 +246,7 @@ def search_by(cur, cond):
WHERE %s %s %s
;'''
% (table, cond.field, operator, where_extra),
[query])
[query_s])

res = []
for row in cur.fetchall():
Expand Down Expand Up @@ -245,8 +283,9 @@ def matched_regexp(conds):
'''

# TODO: there's some duplication between this logic and search_by()
# TODO: support word search

reg = conds.query
reg = conds.query_s
if not conds.regexp:
reg = re.escape(reg)

Expand Down

0 comments on commit 5b89027

Please sign in to comment.