Skip to content

Commit

Permalink
write convertopengram.py
Browse files Browse the repository at this point in the history
  • Loading branch information
epico committed Oct 9, 2016
1 parent c798526 commit b116151
Showing 1 changed file with 68 additions and 0 deletions.
68 changes: 68 additions & 0 deletions tools/convertopengram.py
@@ -0,0 +1,68 @@
#!/usr/bin/python3
import os
from argparse import ArgumentParser

from distill import strip_tone

'''
convert the opengram dictionary file format
to libpinyin input file format,
the same format as utils/storage/gen_pinyin_table.cpp .
'''

#minimum pinyin frequency
minimum = 3

#default pinyin total frequency
total_frequency = 100


def handle_pinyin(outfile, word, num, pinyin):
    """Write one ``word<TAB>pinyin<TAB>freq`` row for a single pronunciation.

    Args:
        outfile: Writable text-mode file object that receives the row.
        word: The dictionary word the pronunciation belongs to.
        num: Number of pronunciations listed for *word*; used to split
            ``total_frequency`` evenly when no explicit share is given.
        pinyin: One pronunciation token, either ``"pinyin"`` or
            ``"pinyin:NN%"`` where ``NN%`` is its share of the total.

    Raises:
        ValueError: If ``strip_tone`` would alter the token, if the
            frequency part does not end in ``%``, or if it is not a number.
    """
    # no tones in opengram dictionary
    if strip_tone(pinyin) != pinyin:
        raise ValueError("unexpected tone in pinyin: %r" % (pinyin,))

    if ":" not in pinyin:
        # No explicit share: spread the total evenly over all pronunciations.
        freq = total_frequency / num
    else:
        pinyin, percent = pinyin.split(":", 1)
        if not percent.endswith("%"):
            raise ValueError("frequency must end with '%%': %r" % (percent,))
        # "NN%" is a percentage, so divide by 100 to keep this branch on the
        # same scale as the evenly-split branch above.
        freq = total_frequency * float(percent.rstrip("%")) / 100

    # Truncate to an integer and never go below the configured floor.
    freq = max(int(freq), minimum)

    # The file is opened in text mode, which already translates "\n" to the
    # platform line ending; writing os.linesep would double the "\r".
    outfile.write("\t".join((word, pinyin, str(freq))) + "\n")


def handle_line(outfile, line):
    """Convert one opengram dictionary line into libpinyin table rows.

    The line is expected to hold a word followed by one or more
    whitespace-separated pinyin tokens; each token is passed to
    ``handle_pinyin`` together with the total number of tokens.

    Args:
        outfile: Writable text-mode file object passed on to
            ``handle_pinyin``.
        line: One line of the input dictionary.

    Raises:
        ValueError: If the line has a word but no pinyin token.
    """
    fields = line.split()
    if not fields:
        # Blank or whitespace-only line: nothing to convert.
        return

    word, *pinyin_list = fields
    if not pinyin_list:
        raise ValueError("no pinyin given for word: %r" % (word,))

    num = len(pinyin_list)
    for pinyin in pinyin_list:
        handle_pinyin(outfile, word, num, pinyin)


def handle_file(infilename, outfilename):
    """Convert a whole opengram dictionary file.

    Reads *infilename* line by line and hands each line to ``handle_line``,
    which writes the converted rows to *outfilename*.

    Args:
        infilename: Path of the opengram dictionary to read.
        outfilename: Path of the output file; created or overwritten.
    """
    # Context managers close both files even if a line fails to convert.
    # The encoding is set explicitly so the result does not depend on the
    # locale of the machine running the script.
    with open(infilename, "r", encoding="utf-8") as infile, \
            open(outfilename, "w", encoding="utf-8") as outfile:
        # Iterate the file directly instead of loading every line up front.
        for oneline in infile:
            # Text mode already normalises line endings to "\n".
            handle_line(outfile, oneline.rstrip("\n"))


if __name__ == "__main__":
    # Command-line entry point: two positional arguments, the opengram
    # dictionary to read and the converted file to write.
    cli = ArgumentParser(description='convert opengram dictionary.')
    for arg_name, arg_help in (('infile', 'input file'),
                               ('outfile', 'output file')):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()
    # Echo the parsed arguments before converting.
    print(options)

    handle_file(options.infile, options.outfile)

0 comments on commit b116151

Please sign in to comment.