# Base4096cn

Similar to base64, base4096cn is a binary-to-text encoding scheme that represents binary data as a string of Chinese words.


### Get top 4096 most frequent Chinese words

In [142]:
import urllib.request

# Download the word list: one entry per line, three tab-separated fields.
req = urllib.request.Request(
    "https://gist.githubusercontent.com/indiejoseph/eae09c673460aa0b56db/raw/ac66c1900b048e3c72a4388b2304893ca3b9a571/%E7%8E%B0%E4%BB%A3%E6%B1%89%E8%AF%AD%E5%B8%B8%E7%94%A8%E8%AF%8D%E8%A1%A8.txt",
    #headers={'Accept-encoding': 'gzip'}
)
# Only the read needs the connection; release it before parsing.
with urllib.request.urlopen(req) as response:
    raw_table = response.read().decode('utf8')


def parseLine(l):
    """Split one table line into ``[word, number]``.

    Returns ``None`` (after printing the offending line) when the line does
    not consist of exactly three tab-separated fields or when the third field
    is not an integer.
    """
    try:
        w, py, frq = l.split('\t')
        return [w, int(frq)]
    except ValueError:
        print('foo', l)
        return None


# Map each word to [word, third column]. A repeated word keeps its first
# position in the dict but takes the third column of its last occurrence.
dic = {}
for line in raw_table.split('\n'):
    if not line:
        continue
    word, _second, third = line.split('\t')
    dic[word] = [word, third]

# NOTE(review): the third column is compared as a *string*, so this ordering
# is lexicographic rather than numeric. It is kept as-is on purpose: changing
# the order would change the table and make previously encoded text
# undecodable. Confirm whether numeric ordering was intended.
ranked = sorted(dic.values(), key=lambda entry: entry[1])
#wordtable = [('\n',),('，',),('。',)] + wordtable
# 2**12 == 4096 entries: one word per 12-bit symbol.
wordtable = [entry[0] for entry in ranked[:2**12]]


### Convert bytes to base4096

In [170]:
def b4096encode(b):
    """Pack a byte sequence into a list of 12-bit symbols.

    Every group of three bytes becomes two symbols in the range 0..4095.
    The tail of the input is signalled with ``-1`` markers:

    * one leftover byte   -> ``[byte << 4, -1]``
    * two leftover bytes  -> two symbols, the low 8 bits of the second are 0,
      and no marker is added
    * length divisible by three (and non-empty) -> a single ``-1`` is
      appended after the last pair
    """
    total = len(b)
    symbols = []
    for start in range(0, total, 3):
        has_second = start + 1 < total
        has_third = start + 2 < total

        # First symbol: 8 bits of byte 0 plus the high nibble of byte 1.
        high = b[start] << 4
        if has_second:
            high |= b[start + 1] >> 4

        # Second symbol: low nibble of byte 1 plus all 8 bits of byte 2,
        # or the -1 marker when byte 1 does not exist.
        if has_second:
            low = (b[start + 1] & 0x0F) << 8
            if has_third:
                low |= b[start + 2]
        else:
            low = -1

        symbols.append(high)
        symbols.append(low)

        # The input ended exactly on a group boundary: say so explicitly.
        if start + 3 == total:
            symbols.append(-1)
    return symbols
    
def b4096decode(b):
    """Unpack a list of 12-bit symbols back into a list of byte values.

    Symbols are consumed in pairs. A negative symbol ends decoding at that
    point. When the final pair is not followed by a marker, only two bytes
    are taken from it (its low 8 bits are treated as padding).
    """
    count = len(b)
    out = []
    for pos in range(0, count, 2):
        first = b[pos]
        if first < 0:
            break
        # Top 8 bits of the first symbol are byte 0.
        out.append(first >> 4)

        second = b[pos + 1]
        if second < 0:
            break
        # Low nibble of the first symbol + high nibble of the second: byte 1.
        out.append((first & 0x0F) << 4 | second >> 8)

        # The last pair of an unterminated stream carries no third byte.
        if pos + 2 != count:
            out.append(second & 0xFF)
    return out


### Convert to Chinese words

In [144]:
def tochs(b):
    """Render a sequence of symbols as a string of words from ``wordtable``.

    Each non-negative symbol is replaced by ``wordtable[symbol]``; each
    negative symbol is rendered as the marker character '嘤'.

    The string is built from the last symbol to the first. Before a word is
    prepended, the text that would result is checked: if some longer table
    word is also a prefix of it, a longest-prefix reader would pick that
    longer word instead, so a ',' is inserted after the word to keep the
    boundary unambiguous.

    Raises ``IndexError`` if a symbol is not a valid index into ``wordtable``.
    """
    s = ''
    for symbol in reversed(b):
        if symbol < 0:
            # Prepend, like every other piece, so the marker keeps its
            # position relative to the surrounding symbols.
            s = '嘤' + s
            continue
        w = wordtable[symbol]
        ns = w + s
        # Assuming table entries are distinct, the word is ambiguous exactly
        # when a strictly longer entry is also a prefix of the new text.
        if any(len(a) > len(w) for a in wordtable if ns.startswith(a)):
            s = w + ',' + s  # separator stops the longer word from matching
        else:
            s = ns
    return s

def fromchs(s):
    """Parse a string of ``wordtable`` words back into a list of symbols.

    The text is read left to right:

    * ',' is a separator and produces nothing;
    * '嘤' produces ``-1``;
    * otherwise the longest table word found at the current position
      produces its index (the lowest index wins among equally long matches).

    Raises ``ValueError`` when no table word matches at some position.
    """
    if not s:
        return []

    symbols = []
    pos = 0
    end = len(s)
    # Advance an offset instead of re-slicing the string on every step.
    while pos < end:
        ch = s[pos]
        if ch == ',':
            pos += 1
            continue
        if ch == '嘤':
            symbols.append(-1)
            pos += 1
            continue

        best_index = -1
        best_len = 0
        for index, word in enumerate(wordtable):
            # Strict '>' keeps the first of several equally long matches.
            if len(word) > best_len and s.startswith(word, pos):
                best_index = index
                best_len = len(word)

        if best_index < 0:
            raise ValueError(
                f"not found: no table word matches at offset {pos} "
                f"({s[pos:pos + 16]!r})"
            )
        symbols.append(best_index)
        pos += best_len
    return symbols


In [137]:
def encode(raw):
    """Encode a byte sequence as a string of Chinese words.

    The bytes are first packed into 12-bit symbols, then each symbol is
    rendered as a word.
    """
    symbols = b4096encode(raw)
    return tochs(symbols)
                 
def decode(encoded):
    """Decode a string of Chinese words back into ``bytes``.

    The text is parsed into 12-bit symbols, which are then unpacked into
    byte values.
    """
    symbols = fromchs(encoded)
    byte_values = b4096decode(symbols)
    return bytes(byte_values)

In [174]:
# Example: encode the UTF-8 bytes of a URL; the resulting text is shown below.
encode('https://github.com/knilink/algorithms'.encode('utf8'))

'明朗表明奇妙意料求学鸳鸯毫升回复世事许诺简要骨架探险大漠高梁囚犯政治协商会议幌子忧伤开行扣押四肢小汽车明智丘陵嘤'

In [175]:
# Example: decode the encoded text back to bytes and read them as UTF-8.
decode('明朗表明奇妙意料求学鸳鸯毫升回复世事许诺简要骨架探险大漠高梁囚犯政治协商会议幌子忧伤开行扣押四肢小汽车明智丘陵嘤').decode('utf8')

'https://github.com/knilink/algorithms'