#!/usr/bin/python3
"""Convert an "interpolation" text model into "k mixture model" text format.

The input is read twice.  The first pass (``parseBody``) collects every
bigram item and accumulates, per token, the sum of bigram counts where the
token is the first word (``arraycount``) and where it is the second word
(``unigram``).  The second pass (``writeBody``) re-reads the input and
writes the converted lines using those totals.

All state is kept in module-level containers, so the conversion is meant to
be run once per process.
"""

import os
import sys
from argparse import ArgumentParser

# Bigram records from the first pass: (first token, second token, count).
items = []
# first token -> sum of bigram counts; emitted as "count" on unigram lines.
arraycount = {}
# second token -> sum of bigram counts; emitted as "freq" on unigram lines.
unigram = {}

# token -> phrase string, learned from the bigram lines.
phrases = {}
# Tokens whose unigram line has already been produced by writeUnigramItem.
seen_in_unigram = set()

# Token of the tag entry that is always written first in the \1-gram section.
_TAG_TOKEN = 1
# NOTE(review): the original tag line had no phrase field at all (and thus
# could not be parsed); "<start>" is the assumed phrase for the tag token and
# is only used when the bigram data does not supply one -- TODO confirm.
_TAG_FALLBACK_STRING = "<start>"


def _require(condition, message):
    """Raise ``ValueError(message)`` unless *condition* is true.

    Used instead of ``assert`` so that malformed input is still rejected
    when Python runs with ``-O``.
    """
    if not condition:
        raise ValueError(message)


def _remember_phrase(token, string):
    """Record the phrase for *token*, rejecting a conflicting earlier one."""
    known = phrases.setdefault(token, string)
    _require(
        known == string,
        "token {0} has conflicting strings {1!r} and {2!r}".format(
            token, known, string))


def writeRestUnigram(outfile):
    """Write unigram lines for tokens that had no line in the \\1-gram input.

    Tokens are visited in ``arraycount`` order, then in ``unigram`` order;
    any token already emitted (tracked in ``seen_in_unigram``) is skipped.

    Args:
        outfile: writable text file object.
    """
    for token in (*arraycount, *unigram):
        if token in seen_in_unigram:
            continue
        # writeUnigramItem checks the incoming count against unigram[token],
        # so hand it exactly that value (0 when the token never came second).
        inline = "\\item {0} {1} count {2}".format(
            token, phrases[token], unigram.get(token, 0))
        outfile.write(writeUnigramItem(inline) + "\n")


def parseUnigramItem(oneline):
    """Validate that a \\1-gram line is an ``\\item`` line; nothing is stored."""
    _require(oneline.startswith("\\item"),
             "not an \\item line: {0!r}".format(oneline))


def parseBigramItem(oneline):
    """Parse one bigram ``\\item`` line and accumulate it into module state.

    Expected fields, separated by single spaces:
    ``\\item <token1> <string1> <token2> <string2> count <n>``.

    Raises:
        ValueError: if the line does not have that shape, or a token is
            seen with a different string than before.
    """
    _require(oneline.startswith("\\item"),
             "not an \\item line: {0!r}".format(oneline))
    (linetype, firsttoken, firststring,
     secondtoken, secondstring, tagcount, count) = oneline.split(" ")
    _require(linetype == "\\item",
             "unexpected line type: {0!r}".format(linetype))
    _require(tagcount == "count",
             "expected 'count' tag, got {0!r}".format(tagcount))
    firsttoken = int(firsttoken)
    secondtoken = int(secondtoken)
    count = int(count)
    items.append((firsttoken, secondtoken, count))

    arraycount[firsttoken] = arraycount.get(firsttoken, 0) + count
    unigram[secondtoken] = unigram.get(secondtoken, 0) + count

    _remember_phrase(firsttoken, firststring)
    _remember_phrase(secondtoken, secondstring)


def parseHeader(oneline):
    """Validate the input header line."""
    _require(oneline == "\\data model interpolation",
             "unexpected header: {0!r}".format(oneline))


def parseEnd(oneline):
    """Validate the input end marker."""
    _require(oneline == "\\end",
             "unexpected end marker: {0!r}".format(oneline))


def parseBody(infile):
    """First pass: read *infile* and collect the bigram statistics.

    ``\\item`` lines outside a \\1-gram or \\2-gram section are ignored, as
    are lines that start with none of the known markers.

    Args:
        infile: iterable of text lines (e.g. an open text file).
    """
    state = None
    for rawline in infile:
        oneline = rawline.rstrip("\r\n")
        if oneline.startswith("\\data"):
            parseHeader(oneline)
        elif oneline.startswith("\\1-gram"):
            state = "1-gram"
        elif oneline.startswith("\\2-gram"):
            state = "2-gram"
        elif oneline.startswith("\\end"):
            state = None
            parseEnd(oneline)
        elif oneline.startswith("\\item"):
            if state == "1-gram":
                parseUnigramItem(oneline)
            elif state == "2-gram":
                parseBigramItem(oneline)


def writeUnigramItem(inline):
    """Convert one unigram ``\\item`` line and mark its token as written.

    Input fields: ``\\item <token> <string> count <n>``.  The output line is
    ``\\item <token> <string> count <C> freq <F>`` where ``C`` is the
    token's ``arraycount`` total and ``F`` its ``unigram`` total (0 when
    absent).

    Raises:
        ValueError: if the line is malformed or its count differs from the
            total accumulated from the bigram lines.
    """
    _require(inline.startswith("\\item"),
             "not an \\item line: {0!r}".format(inline))
    (linetype, token, string, tagcount, count) = inline.split(" ")
    _require(linetype == "\\item",
             "unexpected line type: {0!r}".format(linetype))
    _require(tagcount == "count",
             "expected 'count' tag, got {0!r}".format(tagcount))
    token = int(token)
    oldfreq = int(count)

    count = arraycount.get(token, 0)
    freq = unigram.get(token, 0)

    _require(
        oldfreq == freq,
        "token {0}: unigram count {1} does not match bigram total {2}".format(
            token, oldfreq, freq))
    outline = "\\item {0} {1} count {2} freq {3}".format(
        token, string, count, freq)

    # Remember the token so writeRestUnigram does not emit it again.
    seen_in_unigram.add(token)

    return outline


def writeBigramItem(inline):
    """Convert one bigram ``\\item`` line to the k mixture model layout.

    The extra fields are derived from the count alone: ``T`` and ``Mr``
    equal the count, ``N_n_0`` is always 1, and ``n_1`` is 1 only when the
    count is exactly 1.

    Raises:
        ValueError: if the line is malformed.
    """
    _require(inline.startswith("\\item"),
             "not an \\item line: {0!r}".format(inline))
    (linetype, firsttoken, firststring,
     secondtoken, secondstring, tagcount, count) = inline.split(" ")
    _require(linetype == "\\item",
             "unexpected line type: {0!r}".format(linetype))
    _require(tagcount == "count",
             "expected 'count' tag, got {0!r}".format(tagcount))
    firsttoken = int(firsttoken)
    secondtoken = int(secondtoken)
    count = int(count)
    T = count
    N_n_0 = 1
    n_1 = 1 if count == 1 else 0
    Mr = count
    return ("\\item {0} {1} {2} {3} count {4} "
            "T {5} N_n_0 {6} n_1 {7} Mr {8}").format(
                firsttoken, firststring, secondtoken, secondstring,
                count, T, N_n_0, n_1, Mr)


def writeHeader(inline):
    """Return the output header; totals are the sum of all bigram counts."""
    _require(inline == "\\data model interpolation",
             "unexpected header: {0!r}".format(inline))
    count = sum(item[2] for item in items)
    N = 1
    total_freq = count
    return ("\\data model \"k mixture model\" "
            "count {0} N {1} total_freq {2}").format(count, N, total_freq)


def writeEnd(inline):
    """Validate the end marker and return it unchanged."""
    _require(inline == "\\end",
             "unexpected end marker: {0!r}".format(inline))
    return "\\end"


def writeBody(infile, outfile):
    """Second pass: re-read *infile* and write the converted model.

    Must run after ``parseBody`` has filled the module-level totals.  The
    \\1-gram section always starts with the tag entry; unigram lines for
    tokens that only occur in bigrams are appended just before \\2-gram
    (this assumes \\2-gram follows \\1-gram in the input).

    Args:
        infile: iterable of text lines (e.g. an open text file).
        outfile: writable text file object.
    """
    state = None
    for rawline in infile:
        inline = rawline.rstrip("\r\n")
        if inline.startswith("\\data"):
            outfile.write(writeHeader(inline) + "\n")
        elif inline.startswith("\\1-gram"):
            state = "1-gram"
            outfile.write("\\1-gram\n")
            # Write the tag entry.  It needs a phrase field to be parseable,
            # and its count must equal the known frequency of the tag token
            # to satisfy the consistency check in writeUnigramItem.
            tagline = "\\item {0} {1} count {2}".format(
                _TAG_TOKEN,
                phrases.get(_TAG_TOKEN, _TAG_FALLBACK_STRING),
                unigram.get(_TAG_TOKEN, 0))
            outfile.write(writeUnigramItem(tagline) + "\n")
        elif inline.startswith("\\2-gram"):
            # Assume \2-gram is after \1-gram.
            writeRestUnigram(outfile)
            state = "2-gram"
            outfile.write("\\2-gram\n")
        elif inline.startswith("\\end"):
            state = None
            outfile.write(writeEnd(inline) + "\n")
        elif inline.startswith("\\item"):
            if state == "1-gram":
                outfile.write(writeUnigramItem(inline) + "\n")
            elif state == "2-gram":
                outfile.write(writeBigramItem(inline) + "\n")


def main():
    """Command-line entry point: ``interpolation_to_kmm.py INFILE OUTFILE``."""
    parser = ArgumentParser(description='Simple converter.')
    parser.add_argument('infile', help='input file')
    parser.add_argument('outfile', help='output file')

    args = parser.parse_args()
    print(args)

    # First pass: gather the totals.
    with open(args.infile, "r") as infile:
        parseBody(infile)

    # Second pass: emit the converted model.
    with open(args.infile, "r") as infile, \
            open(args.outfile, "w") as outfile:
        writeBody(infile, outfile)


if __name__ == '__main__':
    main()