In [1]:
import math

def log2_ceil(n: int):
    """Return the smallest integer ``k`` such that ``2 ** k >= n``.

    For integers the result is computed with exact integer arithmetic
    (``(n - 1).bit_length()``) instead of ``math.log2``: the float conversion
    rounds large values, so e.g. ``2**53 + 1`` used to yield 53 instead of 54.

    Args:
        n: A positive number (normally the current dictionary size).

    Returns:
        The ceiling of the base-2 logarithm of ``n`` as an ``int``.

    Raises:
        ValueError: If ``n`` is not positive (same exception type that
            ``math.log2`` raises for such input).
    """
    if isinstance(n, int):
        if n <= 0:
            raise ValueError(f'log2_ceil expects a positive number, got {n!r}')
        # For n >= 1, the bit length of n - 1 equals ceil(log2(n)).
        return (n - 1).bit_length()
    # Non-int input (e.g. float): keep the original floating-point behaviour.
    return math.ceil(math.log2(n))


def pretty_print(n: int, size: int):
    """Format ``n`` as a binary string padded with leading zeros.

    The minimum width is ``log2_ceil(size)`` digits; a value that needs more
    digits than that is not truncated.
    """
    binary_spec = f'0{log2_ceil(size)}b'
    return format(n, binary_spec)


def get_compressed(code: list, alphabet_size: int):
    """Render every code in ``code`` as a zero-padded binary string.

    The size passed to ``pretty_print`` starts at ``alphabet_size`` and grows
    by one for each successive code, so the bit width follows the growing
    dictionary.

    Returns:
        A new list with one binary string per input code, in order.
    """
    return [
        pretty_print(item, alphabet_size + position)
        for position, item in enumerate(code)
    ]


In [2]:
class Trie():
    """Character trie in which every stored word carries a ``code`` payload.

    Each node is itself a ``Trie``: ``nodes`` maps a character to the child
    node, ``leaf`` marks nodes at which an inserted word ends, and ``code``
    holds the value stored for that word (``None`` when nothing was stored).
    """

    def __init__(self):
        self.nodes = {}
        self.leaf = False
        self.code = None

    def insert(self, word: str, code: int) -> None:
        """Store ``word`` with payload ``code``, creating nodes as needed.

        Inserting an existing word overwrites its payload.
        """
        node = self
        for char in word:
            child = node.nodes.get(char)
            if child is None:
                child = Trie()
                node.nodes[char] = child
            node = child
        node.leaf = True
        node.code = code

    def find(self, word: str):
        """Return the payload stored for ``word``.

        ``None`` is returned when the path for ``word`` does not exist, and
        also when the path exists but no payload was stored at its last node.
        """
        node = self
        for char in word:
            node = node.nodes.get(char)
            if node is None:
                return None
        return node.code


In [3]:
class Lzw():
    """LZW encoder/decoder whose phrase dictionaries are kept in ``Trie`` objects.

    The encoder maps phrases to integer codes; the decoder maps the decimal
    string of a code back to its phrase.
    """

    def __init__(self, end_symbol: str):
        # Character that marks the end of the input handed to ``encode``.
        self.end_symbol = end_symbol

    def encode(self, alphabet: list, word: str):
        """Compress ``word`` into a list of dictionary codes.

        Args:
            alphabet: Initial dictionary entries; entry ``k`` gets code ``k``.
            word: Text to compress. Encoding stops at the first occurrence of
                ``end_symbol``; if the symbol is absent the pending phrase is
                still flushed at the end of the input, so no data is lost.

        Returns:
            A dict with ``'data'`` (list of integer codes) and ``'data bits'``
            (the same codes as binary strings, from ``get_compressed``).
            An empty input yields empty lists.

        Raises:
            ValueError: If ``word`` contains a symbol that has no code in the
                initial dictionary.
        """
        root = Trie()
        for code, symbol in enumerate(alphabet):
            root.insert(symbol, code)
        next_code = len(alphabet)

        compressed_data = []
        # Invariant: ``phrase`` is either empty or has a code in ``root``.
        phrase = ''
        for char in word:
            if char == self.end_symbol:
                break
            candidate = phrase + char
            if root.find(candidate) is not None:
                # Keep extending the match while it is still in the dictionary.
                phrase = candidate
                continue
            if root.find(char) is None:
                raise ValueError(f'symbol {char!r} is not in the alphabet')
            # ``phrase`` cannot be empty here: an empty phrase would make
            # ``candidate == char``, which was just shown to be known.
            compressed_data.append(root.find(phrase))
            root.insert(candidate, next_code)
            next_code += 1
            phrase = char

        if phrase:
            # Flush the last match (terminator reached or input exhausted).
            compressed_data.append(root.find(phrase))

        return {
            'data': compressed_data,
            'data bits': get_compressed(compressed_data, len(alphabet)),
        }

    def decode(self, alphabet: list, word: list):
        """Expand a sequence of codes produced by ``encode``.

        Args:
            alphabet: The same initial dictionary that was used for encoding.
            word: Iterable of codes. Each item is converted with ``str`` before
                lookup, so integer codes and decimal strings are both accepted
                (a plain string is read as one single-digit code per
                character).

        Returns:
            A list with one decoded phrase per code; joining it gives the
            original text. An empty input yields an empty list.

        Raises:
            ValueError: If a code cannot be resolved, i.e. it is neither in the
                dictionary nor the code that is about to be created.
        """
        root = Trie()
        for code, symbol in enumerate(alphabet):
            root.insert(str(code), symbol)
        next_code = len(alphabet)

        codes = [str(code) for code in word]
        if not codes:
            return []

        previous = root.find(codes[0])
        if previous is None:
            raise ValueError(f'unknown first code {codes[0]!r}')
        data = [previous]

        for code in codes[1:]:
            entry = root.find(code)
            if entry is None:
                # The only code that may be missing is the one being defined
                # right now; its phrase is the previous phrase plus that
                # phrase's own first character.
                if code != str(next_code):
                    raise ValueError(f'invalid code {code!r} in compressed data')
                entry = previous + previous[0]
                root.insert(code, entry)
            else:
                root.insert(str(next_code), previous + entry[0])
            data.append(entry)
            previous = entry
            next_code += 1
        return data


In [4]:
### binary file read ###
# Round-trip demo on the bytes of data.txt. Every byte is rendered as its
# decimal value and the digits are concatenated, so WORD is a string of
# decimal digits and the final comparison is made on that digit string.
import numpy as np

WORD = ''
dtype = np.dtype('B')

try:
    with open("data.txt", "rb") as f:
        numpy_data = np.fromfile(f,dtype)
    WORD = ''.join(list(map(str, numpy_data)))
except IOError:
    print('Error While Opening the file!')

ALPHABET = [str(i) for i in range(256)]
END_SYMBOL = '#'

lzw = Lzw(END_SYMBOL)

if WORD:
    print(f'word    = {WORD}'[:70] + '...')

    encoded = lzw.encode(ALPHABET, WORD + END_SYMBOL)

    encoded = list(map(str, encoded['data']))
    encoded_string = ''.join(encoded)
    print(f'encoded = {encoded_string}'[:70] + '...')

    decoded = lzw.decode(ALPHABET, encoded)
    decoded_string = ''.join(decoded)
    print(f'decoded = {decoded_string}'[:70] + '...')
    print(f'word == decoded: {decoded_string == WORD}')
else:
    # Without input the round trip used to run on an empty word and fail
    # with an unrelated IndexError; skip it explicitly instead.
    print('No input data - skipping the round trip.')

word    = 691181011141211161041051101033210511532115111321191041051161...
encoded = 691181011141211161041051101033226315321151113211910410270610...
decoded = 691181011141211161041051101033210511532115111321191041051161...
word == decoded: True


In [5]:
### simple binary test ###
# Round trip over a two-symbol alphabet: encode, decode, compare with the input.

END_SYMBOL = '#'
ALPHABET = ['0', '1']
WORD = '011011100001010100010101010'

lzw = Lzw(END_SYMBOL)
print(f'word    = {WORD}')

# Keep only the integer codes and turn them into strings for the decoder.
encoded = lzw.encode(ALPHABET, WORD + END_SYMBOL)
encoded = [str(code) for code in encoded['data']]
encoded_string = ''.join(encoded)
print(f'encoded = {encoded_string}')

decoded = lzw.decode(ALPHABET, encoded)
decoded_string = ''.join(decoded)
print(f'decoded = {decoded_string}')

print(f'word == decoded: {decoded_string == WORD}')

word    = 011011100001010100010101010
encoded = 011230729474139
decoded = 011011100001010100010101010
word == decoded: True


In [6]:
### abacaba test ###
# Checks the encoder against known-good output and then decodes the
# encoder's own result, so the final comparison is a real round trip.
ALPHABET = ['a', 'b', 'c', 'd', 'e']
END_SYMBOL = '#'

WORD = 'abacabadabacabae'
ENCODED_WORD = '01025039864'
ENCODED_WORD_BITS = '0000010000100101000000111001100001100100'

print(f'word    = {WORD}')

lzw = Lzw(END_SYMBOL)

encoding = lzw.encode(ALPHABET, WORD + END_SYMBOL)
encoded = list(map(str, encoding['data']))
encoded_string = ''.join(encoded)
print(f'encoded = {encoded_string}')

# The expected constants were previously defined but never compared.
assert encoded_string == ENCODED_WORD, 'unexpected code sequence'
assert ''.join(encoding['data bits']) == ENCODED_WORD_BITS, 'unexpected bit stream'

# Decode the list of codes rather than the ENCODED_WORD string: iterating a
# string yields single characters, which only works while every code is a
# single digit.
decoded = lzw.decode(ALPHABET, encoded)
decoded_string = ''.join(decoded)
print(f'decoded = {decoded_string}')

print(f'word == decoded: {decoded_string == WORD}')

word    = abacabadabacabae
encoded = 01025039864
decoded = abacabadabacabae
word == decoded: True
