# Huffman Coding

## [Wikipedia](https://en.wikipedia.org/wiki/Huffman_coding).

![huff 1](HuffmanCodeAlg.png)  
![huff 2](1024px-Huffman_tree_2.svg.png)  
![huff 3](800px-Huffman_coding_visualisation.svg.png) 

--- 

## Python source [here](http://bhrigu.me/blog/2017/01/17/huffman-coding-python-implementation/).

In the original, decompressing a binary file only works on the very instance that performed the compression, because the author builds the heap, the code dictionaries, etc. during that step. In this implementation these structures are built at initialisation, so `compress` and `decompress` can be run separately (decompression requires both the original text file and the binary file to be present, since the code table is rebuilt from the original text).

In [1]:
from collections import Counter
import heapq
import os

In [2]:
class HeapNode:
    """Node of a Huffman tree, ordered by frequency so it can be stored in a heapq."""

    def __init__(self, char, freq):
        # Symbol carried by a leaf; merged (internal) nodes are created with None.
        self.char = char
        # Number of occurrences of the symbol, or the sum of both children's counts.
        self.freq = freq
        # Children, filled in only for merged nodes.
        self.left = None
        self.right = None

    def __lt__(self, other):
        """Return True when this node is less frequent than ``other``.

        heapq compares items with ``<``; defining it explicitly avoids relying
        on Python's reflected ``__gt__`` fallback.
        """
        if not isinstance(other, HeapNode):
            # Let Python raise TypeError instead of returning a truthy -1.
            return NotImplemented
        return self.freq < other.freq

    def __gt__(self, other):
        """Return True when this node is more frequent than ``other``."""
        if not isinstance(other, HeapNode):
            return NotImplemented
        return self.freq > other.freq

In [205]:
class HuffmanCoding:
    """Huffman coder for the text file located at ``path``.

    The code table is built at construction time from the file's text, so
    ``compress`` and ``decompress`` can be called independently of each other.
    Decompression still needs the original text file, because the table is
    rebuilt from it rather than stored in the binary file.
    """

    def __init__(self, path, limit=None):
        """Read the text and build the Huffman code table.

        Args:
            path: Path of the text file to encode.
            limit: If truthy, only the first ``limit`` characters are kept.

        Trailing whitespace is stripped from the text in both cases.
        """
        self.path = path
        self.heap = []
        self.codes = {}
        self.reverse_codes = {}
        with open(self.path, 'r') as f:
            content = f.read()
        if limit:
            content = content[:limit]
        self.text = content.rstrip()
        self.frequency = self.make_frequency_dict(self.text)
        self.make_heap(self.frequency)
        self.merge_nodes()
        self.make_codes()

    # functions for compression:

    def make_frequency_dict(self, text):
        """Return a Counter mapping each character of ``text`` to its count."""
        return Counter(text)

    def make_heap(self, freq_dict):
        """Push one leaf HeapNode per (character, frequency) pair onto the heap."""
        for char, freq in freq_dict.items():
            heapq.heappush(self.heap, HeapNode(char, freq))

    def merge_nodes(self):
        """Merge the two least frequent nodes until at most one root is left."""
        while len(self.heap) > 1:
            node1 = heapq.heappop(self.heap)
            node2 = heapq.heappop(self.heap)
            merged = HeapNode(None, node1.freq + node2.freq)
            merged.left = node1
            merged.right = node2
            heapq.heappush(self.heap, merged)

    def make_codes_helper(self, root, current_code):
        """Walk the tree below ``root`` and record the code of every leaf.

        ``current_code`` is the bit string leading to ``root``: '0' is appended
        when going left and '1' when going right.
        """
        if root is None:
            return
        if root.char is not None:
            self.codes[root.char] = current_code
            self.reverse_codes[current_code] = root.char
            return
        self.make_codes_helper(root.left, current_code + '0')
        self.make_codes_helper(root.right, current_code + '1')

    def make_codes(self):
        """Pop the tree root off the heap and (re)build both code tables.

        The tables are reset first so that calling this again for another
        tree does not leave entries of the previous one behind.
        """
        self.codes = {}
        self.reverse_codes = {}
        if not self.heap:
            # Empty text: there is no tree, hence nothing to encode.
            return
        root = heapq.heappop(self.heap)
        # A text made of one distinct symbol yields a root that is itself a
        # leaf; give it a 1-bit code, the empty code could not be decoded.
        first_code = '0' if root.char is not None else ''
        self.make_codes_helper(root, first_code)

    def get_encoded_text(self, text):
        """Return the concatenated codes of the characters of ``text``.

        Raises:
            KeyError: if a character has no entry in the code table.
        """
        return ''.join(self.codes[c] for c in text)

    # make encoded text a multiple of 8 (bytes)
    # append the appropriate number of 0s at the end
    # & prepend this information as an 8 bits string
    def pad_encoded_text(self, encoded_text):
        """Return ``encoded_text`` padded to whole bytes, with a 1-byte header.

        Between 1 and 8 zeros are appended (a full byte when the length is
        already a multiple of 8); the header holds that count in binary.
        """
        extra_padding = 8 - len(encoded_text) % 8    # complement of the modulo
        padded_info = '{:08b}'.format(extra_padding) # format: 0 to keep zeros, b for binary
        return padded_info + encoded_text + '0' * extra_padding

    def get_byte_array(self, padded_encoded_text):
        """Convert a bit string into a bytearray, 8 bits per byte.

        Raises:
            ValueError: if the length of the bit string is not a multiple of 8.
        """
        if len(padded_encoded_text) % 8 != 0:
            # Raise instead of printing and exiting with a success status.
            raise ValueError('Encoded text not padded properly')
        return bytearray(
            int(padded_encoded_text[i:i + 8], 2)
            for i in range(0, len(padded_encoded_text), 8)
        )

    def compress(self, limit=None):
        """Encode the text and write it to ``<path without extension>.bin``.

        Args:
            limit: If truthy, only the first ``limit`` characters of the text
                are compressed (trailing whitespace stripped, as in
                ``__init__``); otherwise the whole text is.

        Returns:
            The path of the binary file written.
        """
        filename, _ = os.path.splitext(self.path)
        output_path = filename + '.bin'

        text = self.text[:limit].rstrip() if limit else self.text
        # Encode before opening the output so a failure leaves no empty file.
        encoded_text = self.get_encoded_text(text)
        padded_encoded_text = self.pad_encoded_text(encoded_text)
        b = self.get_byte_array(padded_encoded_text)

        with open(output_path, 'wb') as output:
            output.write(b)

        print('Compressed to:', output_path)
        return output_path

    # functions for decompression

    def remove_padding(self, padded_encoded_text):
        """Strip the 1-byte padding header and the trailing padding bits.

        Raises:
            ValueError: if the input is shorter than the 8-bit header.
        """
        if len(padded_encoded_text) < 8:
            raise ValueError('Encoded text is too short to hold the padding info')
        extra_padding = int(padded_encoded_text[:8], 2)  # header back to int
        body = padded_encoded_text[8:]                   # text without header
        # Slice from the front: body[:-0] would wrongly drop everything.
        return body[:max(len(body) - extra_padding, 0)]

    def decode_text(self, encoded_text):
        """Translate a bit string back to text using the reverse code table.

        Trailing bits that do not complete a code are dropped.
        """
        current_code = ''
        decoded = []
        for bit in encoded_text:
            current_code += bit
            if current_code in self.reverse_codes:
                decoded.append(self.reverse_codes[current_code])
                current_code = ''
        return ''.join(decoded)

    def decompress(self, input_path):
        """Decode ``input_path`` into ``<path without extension>_decompressed.txt``.

        Returns:
            The path of the text file written.
        """
        filename, _ = os.path.splitext(self.path)
        output_path = filename + '_decompressed' + '.txt'

        with open(input_path, 'rb') as f:
            data = f.read()
        # Iterating over bytes yields ints; render each one as 8 bits.
        bit_string = ''.join('{:08b}'.format(byte) for byte in data)
        encoded_text = self.remove_padding(bit_string)
        decompressed_text = self.decode_text(encoded_text)

        # Open the output only once decoding succeeded.
        with open(output_path, 'w') as output:
            output.write(decompressed_text)

        print('Decompressed to:', output_path)
        return output_path

---

Testing

In [206]:
# Build the code table from fw.txt, then write the compressed file (fw.bin);
# compress() returns the path of the file it wrote.
h = HuffmanCoding('fw.txt')
h.compress(limit=50)

Compressed to: fw.bin


'fw.bin'

In the current implementation, it is possible to use an independent instance of `HuffmanCoding` for the decompression process.

In [207]:
# A separate instance rebuilds its code table from fw.txt at construction,
# so it can decode fw.bin without having called compress() itself.
u = HuffmanCoding('fw.txt')
u.decompress('fw.bin')

Decompressed to: fw_decompressed.txt


'fw_decompressed.txt'

---

Reminder: `rstrip` fn in Python.

In [41]:
# Show the built-in help for str.rstrip.
help(str.rstrip)

Help on method_descriptor:

rstrip(...)
    S.rstrip([chars]) -> str
    
    Return a copy of the string S with trailing whitespace removed.
    If chars is given and not None, remove characters in chars instead.



---

String formatting: `0` at the start to keep the zeros, `8` for 8 slots, `b` for binary.

In [33]:
# 7 rendered as a zero-filled, 8-character binary string.
print('{:08b}'.format(7))

00000111


---

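Illustration of the modulo 8 and of its complement to 8 (labelled `remainder` in the output below), i.e. the number of padding bits needed to reach the next multiple of 8.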

In [56]:
# For each length, show its modulo 8 and how many bits are missing to reach
# the next multiple of 8.
for i in range(10, 20):
    modulo = i % 8
    complement = 8 - modulo
    print('i: {} | modulo 8: {} | remainder: {}'.format(i, modulo, complement))

i: 10 | modulo 8: 2 | remainder: 6
i: 11 | modulo 8: 3 | remainder: 5
i: 12 | modulo 8: 4 | remainder: 4
i: 13 | modulo 8: 5 | remainder: 3
i: 14 | modulo 8: 6 | remainder: 2
i: 15 | modulo 8: 7 | remainder: 1
i: 16 | modulo 8: 0 | remainder: 8
i: 17 | modulo 8: 1 | remainder: 7
i: 18 | modulo 8: 2 | remainder: 6
i: 19 | modulo 8: 3 | remainder: 5


---
Converting binary string representations to ints

In [45]:
# Show the built-in help for int (note the optional `base` argument).
help(int)

Help on class int in module builtins:

class int(object)
 |  int(x=0) -> integer
 |  int(x, base=10) -> integer
 |  
 |  Convert a number or string to an integer, or return 0 if no arguments
 |  are given.  If x is a number, return x.__int__().  For floating point
 |  numbers, this truncates towards zero.
 |  
 |  If x is not a number or if base is given, then x must be a string,
 |  bytes, or bytearray instance representing an integer literal in the
 |  given base.  The literal can be preceded by '+' or '-' and be surrounded
 |  by whitespace.  The base defaults to 10.  Valid bases are 0 and 2-36.
 |  Base 0 means to interpret the base from the string as an integer literal.
 |  >>> int('0b100', base=0)
 |  4
 |  
 |  Methods defined here:
 |  
 |  __abs__(self, /)
 |      abs(self)
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __and__(self, value, /)
 |      Return self&value.
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __ceil__(...)
 |      Ceiling of

In [53]:
# Parse every 3-bit string as a base-2 integer.
for bits in ('000', '001', '010', '011', '100', '101', '110', '111'):
    print(int(bits, 2))

0
1
2
3
4
5
6
7


---

Reminder: `ord()` fn.

In [83]:
# Show the built-in help for ord().
help(ord)

Help on built-in function ord in module builtins:

ord(c, /)
    Return the Unicode code point for a one-character string.



In [86]:
# Code points of a space and of the first three lowercase letters.
for ch in (' ', 'a', 'b', 'c'):
    print(ord(ch))

32
97
98
99


In [137]:
# Show the built-in help for str.rjust.
help(str.rjust)

Help on method_descriptor:

rjust(...)
    S.rjust(width[, fillchar]) -> str
    
    Return S right-justified in a string of length width. Padding is
    done using the specified fill character (default is a space).



In [157]:
# bin() output carries a '0b' prefix that counts toward the width, so these
# 6-character strings are not padded at all by rjust(6, '0').
for number in (8, 10, 12):
    print(bin(number).rjust(6, '0'))

0b1000
0b1010
0b1100


In [144]:
# Slicing off the first two characters removes the '0b' prefix.
binary = bin(8)
print(binary)
print(binary[2:])

0b1000
1000


---

Manually testing padding.

In [179]:
def pad(txt):
    """Print every step of padding the bit string ``txt`` to whole bytes.

    Between 1 and 8 zeros are appended so the length becomes a multiple of 8,
    and that count is prepended as an 8-bit binary header. Nothing is
    returned; the function only prints.
    """
    print('string before:', txt)
    n_zeros = 8 - len(txt) % 8       # complement of the modulo
    print('end padding (pad): {} times 0: {}'.format(n_zeros, '0' * n_zeros))
    txt += '0' * n_zeros             # append the zeros at the end
    info = '{:08b}'.format(n_zeros)  # format: 0 to keep zeros, b for binary
    print('start padding (info):', info)  # info: count turned into binary representation
    # Same value built by hand: strip the '0b' prefix, then left-fill to 8 bits.
    as_byte = bin(n_zeros)[2:].rjust(8, '0')
    print('(which is {} as a byte (8 bits): {})'.format(n_zeros, as_byte))
    txt = info + txt                 # prepend the count as a header
    print('string after:', txt)
    print('--------------')

In [180]:
# Try the helper on a 3-bit and on a 4-bit string.
pad('111')
pad('1110')

string before: 111
end padding (pad): 5 times 0: 00000
start padding (info): 00000101
(which is 5 as a byte (8 bits): 00000101)
string after: 0000010111100000
--------------
string before: 1110
end padding (pad): 4 times 0: 0000
start padding (info): 00000100
(which is 4 as a byte (8 bits): 00000100)
string after: 0000010011100000
--------------


Now removing padding.

In [190]:
def unpad(padded_encoded_text):
    """Print every step of removing the padding from a padded bit string.

    The first 8 bits are read as the number of padding zeros at the end of
    the string; the header and those zeros are then removed. Nothing is
    returned; the function only prints.
    """
    padded_info = padded_encoded_text[:8]               # retrieve info
    extra_padding = int(padded_info, 2)                 # convert back to int
    print('info: {}, which is {} as an int'.format(padded_info, extra_padding))
    print('text before:', padded_encoded_text)
    body = padded_encoded_text[8:]                      # text without info
    # Slice from the front: body[:-0] would drop the whole text when the
    # header announces no padding.
    encoded_text = body[:max(len(body) - extra_padding, 0)]
    print('text after:', encoded_text)
    print('------------------')

In [191]:
# Feed the two padded strings printed by pad() above back through unpad().
unpad('0000010111100000')
unpad('0000010011100000')

info: 00000101, which is 5 as an int
text before: 0000010111100000
text after: 111
------------------
info: 00000100, which is 4 as an int
text before: 0000010011100000
text after: 1110
------------------


---

Manually testing compression

In [199]:
# Read the sample first, so the file is closed before the coder opens it again.
with open('fw.txt') as f:
    txt = f.read()[3:30]
print('text:', txt)

h = HuffmanCoding('fw.txt')
# __init__ already filled the code tables for the whole file; start again from
# a clean state so no stale entry lingers next to the codes built for `txt`.
h.heap, h.codes, h.reverse_codes = [], {}, {}

frq_dict = h.make_frequency_dict(txt)
print('frequency dict:', list(frq_dict.items()))
h.make_heap(frq_dict)
h.merge_nodes()
h.make_codes()
enc = h.get_encoded_text(txt)
print('encoded text:', enc)
print('with padding:', h.pad_encoded_text(enc))

text: The Restored Finnegans Wake
frequency dict: [('T', 1), ('h', 1), ('e', 5), (' ', 3), ('R', 1), ('s', 2), ('t', 1), ('o', 1), ('r', 1), ('d', 1), ('F', 1), ('i', 1), ('n', 3), ('g', 1), ('a', 2), ('W', 1), ('k', 1)]
encoded text: 11100101010001010110001001111011100111010001100001010111101000110110011110100001110010101101110001111100
with padding: 000010001110010101000101011000100111101110011101000110000101011110100011011001111010000111001010110111000111110000000000


---

Manually testing decompression.

In [202]:
# Walk through fw.bin one byte at a time, printing each conversion step and
# the bit string accumulated so far.
with open('fw.bin', 'rb') as f:
    bit_string = ""
    while True:
        byte = f.read(1)
        if not byte:
            # An empty read means the end of the file was reached.
            break
        print('decoding byte:', byte)
        byte = ord(byte)
        print('\tord:', byte)
        bits = '{:08b}'.format(byte)
        print('\tbits:', bits)
        bit_string = bit_string + bits
        print('\tbit string:', bit_string)
        print()

decoding byte: b'\x07'
	ord: 7
	bits: 00000111
	bit string: 00000111

decoding byte: b'\xb6'
	ord: 182
	bits: 10110110
	bit string: 0000011110110110

decoding byte: b'\xf9'
	ord: 249
	bits: 11111001
	bit string: 000001111011011011111001

decoding byte: b'\xec'
	ord: 236
	bits: 11101100
	bit string: 00000111101101101111100111101100

decoding byte: b'\xef'
	ord: 239
	bits: 11101111
	bit string: 0000011110110110111110011110110011101111

decoding byte: b'\xe2'
	ord: 226
	bits: 11100010
	bit string: 000001111011011011111001111011001110111111100010

decoding byte: b'\xef'
	ord: 239
	bits: 11101111
	bit string: 00000111101101101111100111101100111011111110001011101111

decoding byte: b'A'
	ord: 65
	bits: 01000001
	bit string: 0000011110110110111110011110110011101111111000101110111101000001

decoding byte: b'5'
	ord: 53
	bits: 00110101
	bit string: 000001111011011011111001111011001110111111100010111011110100000100110101

decoding byte: b'\xe3'
	ord: 227
	bits: 11100011
	bit string: 000001111011