In [1]:
# -*- coding: utf-8 -*-
import io
from random import randint

In [2]:
def segment(text, segs):
    """Split ``text`` into words according to the boundary flags in ``segs``.

    A ``'1'`` at index ``i`` of ``segs`` ends a word right after ``text[i]``.
    Whatever follows the last boundary is always returned as the final word,
    even when it is the empty string.

    Returns:
        list: the words, in order of appearance.
    """
    # Index just past every flagged character, i.e. where each word stops.
    cuts = [idx + 1 for idx, flag in enumerate(segs) if flag == '1']
    starts = [0] + cuts
    stops = cuts + [len(text)]
    return [text[begin:end] for begin, end in zip(starts, stops)]


def evaluate(text, segs):
    """Score a segmentation of ``text``; the caller treats lower as better.

    The score is the number of words produced by ``segment`` plus the length
    of the distinct words joined by single spaces.

    Returns:
        int: word count + length of the space-joined vocabulary.
    """
    tokens = segment(text, segs)
    vocabulary = set(tokens)
    lexicon = ' '.join(vocabulary)
    return len(tokens) + len(lexicon)



def flip(segs, pos):
    """Return a copy of ``segs`` with the flag at index ``pos`` toggled.

    ``'0'`` becomes ``'1'`` and ``'1'`` becomes ``'0'``; every other position
    is copied unchanged.
    """
    head = segs[:pos]
    tail = segs[pos+1:]
    toggled = str(1 - int(segs[pos]))
    return head + toggled + tail

def flip_n(segs, n):
    """Toggle ``n`` randomly chosen flags of ``segs`` and return the result.

    Positions are drawn independently with ``randint``, so the same position
    may be picked (and therefore toggled) more than once.

    Args:
        segs: string of ``'0'``/``'1'`` flags.
        n: number of random toggles to apply; values <= 0 apply none.

    Returns:
        A string of the same length as ``segs``. An empty ``segs`` is
        returned unchanged, since there is no position to toggle.
    """
    if not segs:
        return segs
    # Work on a mutable list and join once at the end: rebuilding the whole
    # string for every single toggle costs O(n * len(segs)).
    flags = list(segs)
    last_index = len(flags) - 1
    for _ in range(n):
        pos = randint(0, last_index)
        flags[pos] = str(1 - int(flags[pos]))
    return ''.join(flags)

def anneal(text, segs, iterations, rate):
    """Search for a lower-scoring segmentation of ``text`` by random flips.

    The search proceeds in rounds. Each round tries ``iterations`` candidates,
    every one obtained by toggling ``round(temperature)`` random flags of the
    round's starting segmentation, and keeps the best-scoring candidate (or
    the starting segmentation if none scores lower). The temperature starts
    at ``len(segs)`` and is divided by ``rate`` after each round until it is
    no longer above 0.1. After every round the current score and words are
    printed.

    Args:
        text: the unsegmented character stream.
        segs: starting segmentation, a string of ``'0'``/``'1'`` flags.
        iterations: number of candidates tried per round.
        rate: cooling divisor; must be greater than 1.

    Returns:
        tuple: ``(score, segs)`` for the best segmentation found.

    Raises:
        ValueError: if ``rate`` is not greater than 1, because the
            temperature would then never fall and the loop would not end.
    """
    if rate <= 1:
        raise ValueError("rate must be greater than 1, got %r" % (rate,))

    # Score the starting point up front so the returned score is defined even
    # when the loop below never runs (e.g. ``segs`` is empty).
    score = evaluate(text, segs)
    temperature = float(len(segs))
    while temperature > 0.1:
        best_segs, best = segs, score
        n_flips = int(round(temperature))

        # With zero flips every candidate equals ``segs`` and can never score
        # lower, so the inner loop is skipped for those rounds.
        if n_flips > 0:
            for _ in range(iterations):
                guess = flip_n(segs, n_flips)
                guess_score = evaluate(text, guess)
                if guess_score < best:
                    best, best_segs = guess_score, guess

        score, segs = best, best_segs
        temperature = temperature / rate
        # Single pre-formatted argument: prints "<score> <words>" whether
        # ``print`` is a statement or a function.
        print('%s %s' % (score, segment(text, segs)))
    return score, segs

In [3]:
# Normalize funcs
# Accented letters (s1) and their unaccented ASCII replacements (s0).
# The two strings are aligned index by index: s1[i] is replaced by s0[i].
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'

# Per-character lookup built once from the aligned strings above, so that
# normalize() does not have to rescan s1 for every input character.
_ACCENT_MAP = dict(zip(s1, s0))


def normalize(input_str):
    """Return ``input_str`` unaccented, lower-cased and without spaces.

    Steps, in order: every character listed in ``s1`` is replaced by the
    character at the same index in ``s0``; all ``' '`` characters are
    removed; the result is lower-cased; leading and trailing whitespace
    (such as a trailing newline) is stripped.

    Args:
        input_str: the text to normalize.

    Returns:
        The normalized string; empty input gives an empty string.
    """
    unaccented = ''.join(_ACCENT_MAP.get(c, c) for c in input_str)
    return unaccented.replace(' ', '').lower().strip()
    

In [4]:
# Load product names from file, one name per line
with io.open("terms.txt", mode="r", encoding="utf-8") as f:
    raw_names = f.readlines()
# Normalize every line and keep the result as a real list: `names` is read
# more than once further down (joined into the text and turned into the
# starting segmentation), which a one-shot iterator would not survive.
# Lines that normalize to an empty string are dropped because an empty name
# has no characters to segment.
names = [name for name in (normalize(line) for line in raw_names) if name]

In [5]:
# Default segmentation by product name
def str_to_bit(str):
    """Return the boundary flags that treat ``str`` as exactly one word.

    The result has one flag per character: ``'0'`` for every character but
    the last and ``'1'`` for the last one. An empty string yields an empty
    flag string, so the result always has the same length as the input.

    Args:
        str: the word to encode. (The parameter name shadows the built-in
            ``str`` inside this function; it is kept so that existing
            keyword callers continue to work.)

    Returns:
        A string of ``'0'``/``'1'`` flags with ``len(str)`` characters.
    """
    length = len(str)
    if length == 0:
        # No characters means no boundary to mark; emitting a '1' here would
        # shift every later flag by one position once results are joined.
        return ''
    return '0' * (length - 1) + '1'

def default_segmentation(terms):
    """Build the starting segmentation with one boundary after each term.

    Every non-empty term is encoded with ``str_to_bit`` and the encodings are
    concatenated in order. The very last flag is then removed, because the
    end of the joined text is always a word end and needs no flag.

    Args:
        terms: iterable of strings; it is consumed once, so a list or a
            one-shot iterator both work. Empty terms are skipped because
            they contribute no characters to the joined text.

    Returns:
        A string of ``'0'``/``'1'`` flags, one character shorter than the
        concatenation of ``terms`` (empty when there is nothing to encode).
    """
    flags = ''.join(str_to_bit(term) for term in terms if term)
    # Slice off the closing flag of the final term.
    return flags[:-1]
    

In [6]:
# Sample 1
# `text` is the character stream to segment. `seg1` is the starting
# segmentation: one '0'/'1' flag per position, where a '1' at index i ends a
# word right after text[i] (see segment()).
# NOTE(review): seg1 appears to be as long as text and ends in '1', which
# would explain the trailing '' word in the recorded output — confirm whether
# that final flag is intended.
text = "dienthoaiiphonex256gbhangnhapkhaudienthoaiiphone8plus64gbhangchinhhangfptdienthoaihtc10hangchinhhang"
seg1 = "0000000000000000000000000000000010000000000000000000000000000000000000001000000000000000000000000001"
# 10000 candidates per round; the temperature is divided by 1.1 after each round.
score, final_seg = anneal(text, seg1, 10000, 1.1)

107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangnhapkhau', 'dienthoaiiphone8plus64gbhangchinhhangfpt', 'dienthoaihtc10hangchinhhang', '']
107 ['dienthoaiiphonex256gbhangn

86 ['dienthoai', 'ipho', 'nex256gb', 'hang', 'nhapkhau', 'dienthoai', 'ipho', 'ne8plus64gbha', 'ngchinh', 'hang', 'fpt', 'dienthoai', 'htc10ha', 'ngchinh', 'hang']
86 ['dienthoai', 'ipho', 'nex256gb', 'hang', 'nhapkhau', 'dienthoai', 'ipho', 'ne8plus64gbha', 'ngchinh', 'hang', 'fpt', 'dienthoai', 'htc10ha', 'ngchinh', 'hang']
86 ['dienthoai', 'ipho', 'nex256gb', 'hang', 'nhapkhau', 'dienthoai', 'ipho', 'ne8plus64gbha', 'ngchinh', 'hang', 'fpt', 'dienthoai', 'htc10ha', 'ngchinh', 'hang']
86 ['dienthoai', 'ipho', 'nex256gb', 'hang', 'nhapkhau', 'dienthoai', 'ipho', 'ne8plus64gbha', 'ngchinh', 'hang', 'fpt', 'dienthoai', 'htc10ha', 'ngchinh', 'hang']
86 ['dienthoai', 'ipho', 'nex256gb', 'hang', 'nhapkhau', 'dienthoai', 'ipho', 'ne8plus64gbha', 'ngchinh', 'hang', 'fpt', 'dienthoai', 'htc10ha', 'ngchinh', 'hang']
86 ['dienthoai', 'ipho', 'nex256gb', 'hang', 'nhapkhau', 'dienthoai', 'ipho', 'ne8plus64gbha', 'ngchinh', 'hang', 'fpt', 'dienthoai', 'htc10ha', 'ngchinh', 'hang']
86 ['dienthoai',

In [7]:
#Sample 2
# Same text as Sample 1, but the starting segmentation `seg1` already has
# more boundaries placed ('1' at index i ends a word after text[i]).
# NOTE(review): as in Sample 1, seg1 appears to end with a '1' on the last
# character, which would explain the trailing '' word in the recorded
# output — confirm whether that is intended.
text = "dienthoaiiphonex256gbhangnhapkhaudienthoaiiphone8plus64gbhangchinhhangfptdienthoaihtc10hangchinhhang"
seg1 = "0000000010000010000000000000000010000000010000010000000010000000000001001000000001000010000000000001"
# 10000 candidates per round; the temperature is divided by 1.1 after each round.
score, final_seg = anneal(text, seg1, 10000, 1.1)

82 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang', '']
82 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang', '']
82 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang', '']
82 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang', '']
82 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang', '']
82 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang', '']
82 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'ha

80 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang']
80 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang']
80 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang']
80 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang']
80 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang']
80 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'dienthoai', 'htc10', 'hangchinhhang']
80 ['dienthoai', 'iphone', 'x256gbhangnhapkhau', 'dienthoai', 'iphone', '8plus64gb', 'hangchinhhang', 'fpt', 'di

In [None]:
# Tiki Product
# Segment all loaded names at once: the normalized names are joined into a
# single string, and the starting segmentation is whatever
# default_segmentation() builds from the same list of names.
text = ''.join(names)
seg = default_segmentation(names)
# 1000 candidates per round; the temperature is divided by 1.1 after each round.
score, final_seg = anneal(text, seg, 1000, 1.1)