## LZW algorithm in Python
Kacper Dobek

In [1]:
ASCII_MAX = 256

def load_file(filename) -> str:
    with open(filename) as f:
        text = f.read()
    print("File loaded")

    return str(text)

# encoded values are stored in a list
def lzw_encode(data):
    next_code = ASCII_MAX
    encoded = []
    p = data[0]
    table = get_ascii_table()

    for i in range(1, len(data)):
        c = data[i]
        if p + c in table.keys():
            p += c
        else:
            encoded.append(table[p])
            table[p+c] = next_code
            next_code += 1
            p = c
    encoded.append(table[p])

    present_solution(encoded[:100])

    return encoded

def lzw_decode(data):
    next_code = ASCII_MAX
    decoded = ''

    table = get_ascii_table()
    table = {v: k for k, v in table.items()}

    p = data[0]
    decoded += table[p]
    for i in range(1, len(data)):
        c = data[i]
        if c not in table.keys():
            s = table[p]
            s += s[0]
        else:
            s = table[c]
        decoded += s
        k = s[0]
        table[next_code] = table[p] + k
        next_code += 1
        p = c
    print(decoded[:100])
    return decoded

# fills a dictionary with ASCII characters [0-255]
def get_ascii_table() -> dict:
    table = dict()
    for i in range(0, ASCII_MAX):
        table[chr(i)] = i

    return table

def present_solution(encoded):
    temp = encoded.copy()
    for i in range(len(temp)):
        if temp[i] >= ASCII_MAX:
            temp[i] = '*' + str(temp[i])
    print(temp)

A test on a short text

In [2]:
sample_text = "Six sleek swans swam swiftly southwards"
encoded = lzw_encode(sample_text)
decoded = lzw_decode(encoded)
print(decoded)

[83, 105, 120, 32, 115, 108, 101, 101, 107, '*259', 119, 97, 110, 115, '*265', 97, 109, '*265', 105, 102, 116, 108, 121, '*259', 111, 117, 116, 104, '*266', 114, 100, 115]
Six sleek swans swam swiftly southwards
Six sleek swans swam swiftly southwards


norm_wiki_sample.txt

In [3]:
sample_text = load_file("norm_wiki_sample.txt")
encoded = lzw_encode(sample_text)
decoded = lzw_decode(encoded)
print("Original size:", len(sample_text), "Encoded size:", len(encoded))
print("Ratio:", len(encoded) / len(sample_text))
print( "Is the text the same after decoding?", sample_text == decoded)

File loaded
[32, 97, 108, 98, 101, 114, 116, 32, 111, 102, 32, 112, 114, 117, 115, 115, 105, 97, 32, 49, 55, 32, 109, 97, 121, '*274', 52, 57, 48, 32, 50, '*284', '*278', 114, 99, 104, '*274', 53, 54, 56, 32, 119, 97, 115, 32, 116, 104, 101, 32, 108, '*298', '*262', 103, 114, 97, 110, 100, '*277', '*306', '*260', '*263', '*265', '*301', '*303', 116, 101, 117, 116, 111, 110, 105, 99, 32, 107, '*325', 103, 104, 116, '*299', 119, 104, 111, '*256', 102, '*320', 114, 32, 99, '*324', 118, '*260', 116, 105, 110, 103, '*300', '*337', 108, '*322', '*302']
 albert of prussia 17 may 1490 20 march 1568 was the last grand master of the teutonic knights who a
Original size: 10788941 Encoded size: 1581560
Ratio: 0.1465908470534782
Is the text the same after decoding? True


wiki_sample.txt

In [4]:
sample_text = load_file("wiki_sample.txt")
encoded = lzw_encode(sample_text)
decoded = lzw_decode(encoded)
print("Original size:", len(sample_text), "Encoded size:", len(encoded))
print("Ratio:", len(encoded) / len(sample_text))
print("Is the text the same after decoding?", sample_text == decoded)

File loaded
[64, 64, 49, 53, 49, 52, 32, 65, 108, 98, 101, 114, 116, 32, 111, 102, 32, 80, 114, 117, 115, 115, 105, 97, 32, 40, 32, 49, 55, 32, 77, 97, 121, '*282', 52, 57, 48, 32, 50, '*292', '*286', 114, 99, 104, '*282', 53, 54, 56, 32, 41, 32, 119, 97, 115, 32, 116, 104, 101, 32, 108, '*308', '*268', 71, 114, 97, 110, 100, '*285', '*316', '*266', '*269', '*271', '*311', '*313', 84, 101, 117, 116, 111, 110, 105, 99, 32, 75, '*335', 103, 104, 116, '*309', 44, '*306', 104, 111, 32, 97, 102, 116, '*325', 99, '*334']
@@1514 Albert of Prussia ( 17 May 1490 20 March 1568 ) was the last Grand Master of the Teutonic Kni
Original size: 11904620 Encoded size: 1794108
Ratio: 0.15070686842587164
Is the text the same after decoding? True


Let's work on the bmp file now.
I find the offset and encode the pixel data. The data is read from the bmp file byte by byte. It is then passed to the encoding function as a string of ASCII characters.

In [5]:
import struct

In [6]:
with open("lena.bmp", 'rb') as bmp:
    
    bmp.seek(10, 0) 
    offset = struct.unpack('I', bmp.read(4))
    offset = offset[0]

    bmp.seek(28, 0)
    bpp = struct.unpack('H', bmp.read(2))
    bpp = bpp[0]

    bmp.seek(34, 0)
    imageSize = struct.unpack('I', bmp.read(4))
    imageSize = imageSize[0]

    print(offset, bpp, imageSize)

    bmp.seek(offset, 0)
    imageData = ''

    for byte in range(imageSize):
        bmp_byte = bmp.read(1)
        char = struct.unpack('c', bmp_byte)
        char = char[0]
        imageData += str(char)[2]
    
    print(imageData[:200])


138 24 11524800
\$>\ :\\8\\:\ 9\\4\\3\\4\\0\\0\\2\\2\\+\\,\\,\\)\\(\\+\\)\\'\\)\\)\\(\\,\\,\\-\\.\\0\\/\\/\\/\\2\\1\\4\\5\\3\\2\\3\\4\\2\!6\\3\\3\!6\ 5\\1\\2\#8\ 5\!6\#8\#9\#9\$:\&<\(>\'=\'=\$:\&;\*?\*@ -C)4J'8M+9O+9


In [7]:
encoded = lzw_encode(imageData)
decoded = lzw_decode(encoded)
print("Original size:", len(imageData), "Encoded size:", len(encoded))
print("Ratio:", len(encoded) / len(imageData))
print("Is the data the same after decoding?", imageData == decoded)

[92, 36, 62, 92, 32, 58, 92, 92, 56, '*262', '*261', 32, 57, '*262', 52, '*262', 51, '*269', '*262', 48, '*274', '*262', 50, '*277', '*262', 43, '*262', 44, '*282', '*262', 41, '*262', 40, '*280', '*285', '*262', 39, '*290', 92, '*286', 92, '*288', 92, '*283', '*298', '*262', 45, '*262', 46, '*276', 92, 47, '*262', '*307', '*306', '*279', 92, 49, '*273', 92, 53, '*271', '*311', '*272', 92, '*270', 92, '*278', 33, 54, '*317', 92, '*272', '*324', '*259', '*316', '*312', '*279', 35, '*264', 32, '*316', '*329', '*334', 92, 35, '*268', '*341', '*256', '*261', 38, 60, '*296', '*258', 39, 61, 92, '*350', '*344', 92]
\$>\ :\\8\\:\ 9\\4\\3\\4\\0\\0\\2\\2\\+\\,\\,\\)\\(\\+\\)\\'\\)\\)\\(\\,\\,\\-\\.\\0\\/\\/\\/\\2\\1\
Original size: 11524800 Encoded size: 1574821
Ratio: 0.1366462758572817
Is the data the same after decoding? True


Sources https://www.geeksforgeeks.org/lzw-lempel-ziv-welch-compression-technique/ \
https://www.youtube.com/watch?v=0Kwqdkhgbfw