# Content and Objectives

+ Lempel-Ziv-77 encoding of a string
+ Huffman encoding of the resulting 3-tuples
+ Application of this function to several examples 

# Import

In [24]:
# importing
import numpy as np
from collections import Counter

# Implementation of Lempel-Ziv-77

Encoding of an input sequence using the LZ77 algorithm. Default window length is 10

In [4]:
def LZ77_encode ( input_sequence, window_length = 10 ):
    '''
    Implementation of LZ77 encoding

    The sequence is scanned from left to right. At every position the longest
    match starting at most window_length symbols back is searched; a match may
    run into the part that is currently being encoded (overlapping copy).

    IN: input_sequence ( string, list or np.array of letters; may be empty )
        window_length ( maximum look-back distance, default 10 )

    OUT: list of 3-tuples with each tuple being (a,b,x) where a is how far we look back in the sequence, b is the length of the string to copy and x the next letter.
         If a match reaches the end of the input, x is the marker 'EOF'.
         An empty input yields an empty list.
    '''
    n = len( input_sequence )
    return_value = []

    # starting at k = 0 lets the first tuple (0,0,first letter) fall out of the
    # general case (no look-back possible yet) and lets an empty input return []
    k = 0
    while k < n:
        best_a = 0
        best_b = 0

        # test all possible look-backs inside the window of past symbols
        for a_test in range( 1, min( k, window_length ) + 1 ):
            b_test = 0
            # extend the match symbol by symbol, but never beyond the end of the input
            while k + b_test < n and input_sequence[ k + b_test ] == input_sequence[ k - a_test + b_test ]:
                b_test += 1

            # strict '>' keeps the smallest look-back among equally long matches
            if b_test > best_b:
                best_b = b_test
                best_a = a_test

        # letter following the match, or the end marker if the match used up the input
        if k + best_b < n:
            next_letter = input_sequence[ k + best_b ]
        else:
            next_letter = 'EOF'
        return_value.append( ( best_a, best_b, next_letter ) )

        # continue behind the copied part and the explicitly coded letter
        k += best_b + 1

    return return_value

Decoding algorithm, just the reconstruction of the string by looking up the data

In [7]:
def LZ77_decode( input_tuples ):
    '''
    Implementation of LZ77 decoding

    Every tuple (a,b,x) copies b letters starting a positions before the end of
    what has been decoded so far and then appends the letter x. The copy is done
    letter by letter, so it may overlap with the letters it produces (b > a).

    IN: list of 3-tuples (a,b,x) being generated by the LZ77 encoder

    OUT: reconstructed input sequence ( list of letters )

    RAISES: IndexError if a tuple wants to copy letters ( b > 0 ) from a look-back a
            that does not point into the already decoded part
    '''
    return_value = []

    for s in input_tuples:
        a, b, x = s[0], s[1], s[2]
        m = len( return_value )

        # a look-back beyond the start would become a negative list index and
        # silently copy from the wrong end, so reject it explicitly
        if b > 0 and not 1 <= a <= m:
            raise IndexError( 'look-back {} out of range for {} decoded letters'.format( a, m ) )

        for i in range( b ):
            return_value.append( return_value[ m - a + i ] )

        # the end marker carries no letter and terminates decoding
        if x == 'EOF':
            break
        return_value.append( x )

    return return_value

In [15]:
# round trip ( encode, show the tuples, decode again ) for four sample strings;
# the encodings stay available as enc1 ... enc4
demo_cases = (
    ( 'Example 1: abracadabra', 'abracadabra' ),
    ( 'Example 2: abracadabrad', 'abracadabrad' ),
    ( 'Example 3: HAHAHAHA!', 'HAHAHAHA!' ),
    ( 'Example 4: nachrichtentechnikistelegant', 'nachrichtentechnikistelegant' ),
)

enc1, enc2, enc3, enc4 = ( LZ77_encode( text ) for _, text in demo_cases )

for ( headline, _ ), tuples in zip( demo_cases, ( enc1, enc2, enc3, enc4 ) ):
    print( headline )
    print( tuples )
    # decoding has to give back the original string
    print( ''.join( LZ77_decode( tuples ) ) )
    print( '---------------' )



Example 1: abracadabra
[(0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'), (3, 1, 'c'), (2, 1, 'd'), (7, 4, 'EOF')]
abracadabra
---------------
Example 2: abracadabrad
[(0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'), (3, 1, 'c'), (2, 1, 'd'), (7, 4, 'd')]
abracadabrad
---------------
Example 3: HAHAHAHA!
[(0, 0, 'H'), (0, 0, 'A'), (2, 6, '!')]
HAHAHAHA!
---------------
Example 4: nachrichtentechnikistelegant
[(0, 0, 'n'), (0, 0, 'a'), (0, 0, 'c'), (0, 0, 'h'), (0, 0, 'r'), (0, 0, 'i'), (4, 2, 't'), (0, 0, 'e'), (10, 1, 't'), (3, 1, 'c'), (7, 1, 'n'), (0, 0, 'i'), (0, 0, 'k'), (2, 1, 's'), (9, 2, 'l'), (2, 1, 'g'), (0, 0, 'a'), (0, 0, 'n'), (7, 1, 'EOF')]
nachrichtentechnikistelegant
---------------


# Recursive Implementation of Huffman

In [32]:
# Huffman function
def huffman_recursive( symb_dict, show_steps = 0 ):
    '''
    Implementation of Huffman coding
    partly according to: https://gist.github.com/mreid/fdf6353ec39d050e972b

    The two least probable entries are merged repeatedly until only two entries
    are left; these two get the leading bit and every merge contributes one
    further bit. The reduction is carried out in a loop (the function name is
    kept for compatibility) and merged entries are tracked separately from the
    symbols, so a merged label such as 'a'+'b' cannot overwrite a symbol 'ab'.

    Note: Symbols have to support len() and concatenation with '+' among each
    other (e.g. all strings or all tuples); the concatenation is only used as
    label of a merged entry.

    IN: symb_dict ( dictionary of { letter : probability } )
        show_steps ( boolean allowing output of intermediate codes/intermediate steps )

    OUT: code_dict (dict of shape { letter: codeword } );
         a single symbol gets the codeword '0'

    RAISES: AssertionError if the probabilities do not sum up to 1.0 (approximately)
    '''

    # check that probability equals 1.0 (approximately)
    np.testing.assert_almost_equal( sum( symb_dict.values() ), 1.0 )

    # a single symbol has no partner to be merged with; use 1 bit
    if len( symb_dict ) == 1:
        return { symbol: '0' for symbol in symb_dict }

    # every node is ( label, contained symbols, probability );
    # list order corresponds to the order of the dict entries
    nodes = [ ( symbol, ( symbol, ), prob ) for symbol, prob in symb_dict.items() ]

    # codewords are built from the last bit to the first one by prepending
    code_dict = { symbol: '' for symbol in symb_dict }

    while len( nodes ) > 2:

        # sort w.r.t. increasing probability; the sort is stable, so ties keep list order
        ranked = sorted( enumerate( nodes ), key=lambda item: item[ 1 ][ 2 ] )

        # if activated, show intermediate entries for illustration
        if show_steps:
            print( [ ( node[ 0 ], round( node[ 2 ], 4 ) ) for _, node in ranked ] )
            print( )

        # find two least probable nodes
        ( idx_N, ( label_N, members_N, p_N ) ) = ranked[ 0 ]
        ( idx_N_1, ( label_N_1, members_N_1, p_N_1 ) ) = ranked[ 1 ]

        # prepend '0'/'1' for all symbols below the least/second least probable node
        for symbol in members_N:
            code_dict[ symbol ] = '0' + code_dict[ symbol ]
        for symbol in members_N_1:
            code_dict[ symbol ] = '1' + code_dict[ symbol ]

        # remove both nodes and append a new one with sum probability;
        # label is concatenation of the old labels
        nodes = [ node for idx, node in enumerate( nodes ) if idx not in ( idx_N, idx_N_1 ) ]
        nodes.append( ( label_N + label_N_1, members_N + members_N_1, p_N + p_N_1 ) )

    # two nodes left: if the second label is longer than the first one it
    # obtains leading '0' ( and the first one '1' ), otherwise the first one
    # obtains leading '0'
    if len( nodes[ 1 ][ 0 ] ) > len( nodes[ 0 ][ 0 ] ):
        leading_bits = ( '1', '0' )
    else:
        leading_bits = ( '0', '1' )

    for bit, ( _, members, _ ) in zip( leading_bits, nodes ):
        for symbol in members:
            code_dict[ symbol ] = bit + code_dict[ symbol ]

    return code_dict

# Applying LZ77 followed by Huffman coding to different examples

In [40]:
# two switches for
# choosing example to be used and
# choosing whether or not showing intermediate results of Huffman
example = 4
show_intermediate_steps = False

# look-up table instead of an if/elif chain: an unknown example number is
# reported right here instead of failing later on an undefined `string`
# (or silently re-using a `string` left over from an earlier run)
example_strings = {
    1: 'abracadabra',
    2: 'abracadabrad',
    3: 'HAHAHAHA!',
    4: 'nachrichtentechnikistelegant',
}

if example not in example_strings:
    raise ValueError( 'example has to be one of {}'.format( sorted( example_strings ) ) )

string = example_strings[ example ]

encoded_tuples = LZ77_encode(string)

# transform encoded_tuples to dict with relative frequencies as probabilities
symb_dict = { ele: count / len( encoded_tuples ) for ele, count in Counter( encoded_tuples ).items() }

# normalize to have sum equal to 1;
# a new dict is built instead of updating the dict while iterating over it
s = sum( symb_dict.values() )
symb_dict = { key: val / s for key, val in symb_dict.items() }


# apply Huffman function defined above
code = huffman_recursive( symb_dict , show_intermediate_steps )

# print various information
print('-------------------------')

print('Huffman coding: \n\n {}\n'.format( sorted( code.items(), key=lambda t: t[0] ) ) )

# determine average codeword length as sum of probability times codeword length
L = 0
for symbol, prob in symb_dict.items():
    L += prob * len( code[ symbol ] )


print('-------------------------')

print('Average codeword length: \tL = {:2.2f}'.format( L ) )

# entropy of the tuple distribution and its upper bound for this number of symbols
p_letters = list( symb_dict.values() )
print('Entropy: \t\t\tH(X) = {}'.format( - np.sum( p_letters * np.log2( p_letters ) ) ) )

print('Max. Entropie: \t\t\tH0 = {:2.2f}'.format( np.log2(len(p_letters))) )

-------------------------
Huffman coding: 

 [((0, 0, 'a'), '1110'), ((0, 0, 'c'), '0000'), ((0, 0, 'e'), '0100'), ((0, 0, 'h'), '0001'), ((0, 0, 'i'), '1111'), ((0, 0, 'k'), '1000'), ((0, 0, 'n'), '1101'), ((0, 0, 'r'), '0010'), ((2, 1, 'g'), '1011'), ((2, 1, 's'), '1001'), ((3, 1, 'c'), '0110'), ((4, 2, 't'), '0011'), ((7, 1, 'EOF'), '1100'), ((7, 1, 'n'), '0111'), ((9, 2, 'l'), '1010'), ((10, 1, 't'), '0101')]

-------------------------
Average codeword length: 	L = 4.00
Entropy: 			H(X) = 3.9321380397593764
Max. Entropie: 			H0 = 4.00


In [41]:
# translate every LZ77 tuple into its Huffman codeword, keeping the order of the tuples
# requires previous sections to be completed ( encoded_tuples, code and string have to exist )
coded_stream = [ code[ t ] for t in encoded_tuples ]

# concatenation of all codewords gives the transmitted bit sequence
bit_string = ''.join( coded_stream )

# print various information
print('-------------------------')

print('Original text: {}\n'.format( string ) )

print('Number of letters: {}\n\n'.format( len( string ) ) )

print('Coded: {}\n'.format( bit_string ) )

print('Bits with Huffman: {}\n\n'.format( len( bit_string ) ) )

-------------------------
Original text: nachrichtentechnikistelegant

Number of letters: 28


Coded: 1101111000000001001011110011010001010110011111111000100110101011111011011100

Bits with Huffman: 76


