Skip to content

Commit

Permalink
add words
Browse files Browse the repository at this point in the history
  • Loading branch information
ledao committed Dec 27, 2019
1 parent 3f77915 commit c5ac693
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 25 deletions.
2 changes: 1 addition & 1 deletion lufly/sys_data/cat_sqlitedb.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/bash

cat sys_table.sqlitedb.a* > sys_table.sqlitedb
cat sys_table.sqlite.a* > sys_table.sqlite


1 change: 1 addition & 0 deletions lufly/sys_data/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT * from wordphonetable where full = 'nage';
2 changes: 1 addition & 1 deletion lufly/sys_data/split_sqlitedb.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/bash

split -b 50M sys_table.sqlitedb sys_table.sqlitedb.
split -b 50M sys_table.sqlite sys_table.sqlite.


70 changes: 48 additions & 22 deletions scripts/add_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,17 @@
from tables import DelWordTable
from peewee import fn
from toolz.curried import pipe, map, groupby, filter, keymap, curry, take
from common import split_sy, get_double_dict, full_to_double
from common import get_full_to_xhe_transformer, get_full_to_zrm_transformmer, get_full_to_lu_transformmer, get_full, word_to_two
from common import full_to_two
from pypinyin import lazy_pinyin
import attr
# import attr


@attr.s(frozen=True)
class Item(object):
word = attr.ib(type=str,)
priority = attr.ib(type=int, default=1)
phones = attr.ib(type=str, default='')
# @attr.s(frozen=True)
# class Item(object):
# word = attr.ib(type=str,)
# priority = attr.ib(type=int, default=1)
# phones = attr.ib(type=list, default=list())


@curry
Expand All @@ -32,15 +33,45 @@ def for_each(proc, eles):
proc(e)


# @curry
# def cols_to_item(cols: List[str])->Item:
# if len(cols) == 1:
# return Item(word=cols[0])
# elif len(cols) == 2:
# return Item(word=cols[0], priority=int(cols[1]))
# elif len(cols) == 2 + len(cols[0]):
# return Item(word=cols[0], priority=cols[1], phones=list(filter(lambda e: len(e) != 0, [e.srtip() for e in cols[2:]])))
# else:
# raise RuntimeError("cols length not in [1,2]")

@curry
def cols_to_word_phone_table(cols: List[str], xhe_transformer, zrm_transformer) -> WordPhoneTable:
    """Build a ``WordPhoneTable`` row from one split line of the words file.

    Accepted layouts for ``cols``:

    * ``[word]`` -- priority defaults to 1; pinyin comes from ``get_full(word)``.
    * ``[word, priority]`` -- pinyin comes from ``get_full(word)``.
    * ``[word, priority, yin_1, ..., yin_n]`` -- one explicit pinyin syllable
      per character of ``word``.

    Args:
        cols: The columns of one input line.
        xhe_transformer: Passed to ``full_to_two`` to produce the ``xhe`` code.
        zrm_transformer: Passed to ``full_to_two`` to produce the ``zrm`` code.

    Returns:
        An unsaved ``WordPhoneTable`` instance. ``lu`` is left empty.

    Raises:
        RuntimeError: If the number of columns matches none of the layouts,
            or blank pinyin columns leave fewer syllables than characters.
        ValueError: If the priority column is not an integer literal.

    Because the function is curried, both transformers have to be bound
    before it is mapped over rows; calling it with ``cols`` alone yields a
    partial application rather than a table row.
    """
    word = cols[0]
    if len(cols) == 1:
        priority = 1
        # presumably get_full returns the word's full-pinyin syllables,
        # one per character -- confirm against common.get_full
        full = get_full(word)
    elif len(cols) == 2:
        # Keep priority an int in every branch (the default above is an int).
        priority = int(cols[1])
        full = get_full(word)
    elif len(cols) == 2 + len(word):
        priority = int(cols[1])
        full = [syllable for syllable in (col.strip() for col in cols[2:]) if syllable]
        if len(full) != len(word):
            # A blank column would otherwise silently shorten the pinyin.
            raise RuntimeError(
                f"expected {len(word)} pinyin columns for {word!r}, got {len(full)}")
    else:
        raise RuntimeError(
            f"cols length {len(cols)} not in [1, 2, {2 + len(word)}]")

    return WordPhoneTable(
        word=word,
        full=''.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        # NOTE(review): no transformer for "lu" is passed in, so it stays empty.
        lu="",
        priority=priority,
        updatedt=datetime.now()
    )



def contain_alpha(word: str) -> bool:
for c in word:
Expand All @@ -60,6 +91,7 @@ def contain_symbols(word: str) -> bool:
if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"USAGE: python3 {sys.argv[0]} words.txt", file=sys.stderr)
print("words format:word prioroty w1_yin w2_yin ...")
sys.exit(1)

_, words_path = sys.argv
Expand All @@ -75,25 +107,19 @@ def contain_symbols(word: str) -> bool:
set
)

with open(words_path, "r", encoding='utf8') as fin:

#FIXME: bug to fix, we have more phone type now.
ft_dict = get_double_dict()
xhe_transformer = get_full_to_xhe_transformer();
zrm_transformer = get_full_to_zrm_transformmer();
lu_transformer = get_full_to_lu_transformmer();

with open(words_path, "r", encoding='utf8') as fin:
to_add_words = pipe(fin,
map(lambda e: e.strip().split('\t')),
map(lambda e: e.strip().split(' ')),
filter(lambda e: len(e) in (1, 2)),
filter(lambda e: len(e[0]) <= 5),
filter(lambda e: not contain_alpha(
e[0]) and not contain_symbols(e[0])),
filter(lambda e: e[0] not in exist_words),
map(cols_to_item),
map(lambda e: (
e, map(lambda e: split_sy(e), lazy_pinyin(e.word)))),
map(lambda e: attr.evolve(e[0], phones=''.join(
full_to_double(e[1], ft_dict)))),
map(lambda e: WordPhoneTable(word=e.word, phones=e.phones,
priority=e.priority, updatedt=datetime.now())),
map(cols_to_word_phone_table)
)

with db.atomic():
Expand Down
2 changes: 1 addition & 1 deletion scripts/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

pwd = Path(__file__).parent

db = SqliteDatabase(str(Path(pwd) / "../lufly/sys_data/sys_table.sqlitedb"))
db = SqliteDatabase(str(Path(pwd) / "../lufly/sys_data/sys_table.sqlite"))


class BaseModel(Model):
Expand Down

0 comments on commit c5ac693

Please sign in to comment.