# Example Huffman coding implementation

Distributions are represented as dictionaries of { 'symbol': probability }

Codes are dictionaries too: { 'symbol': 'codeword' }

In [1]:
from math import log2 as lb

def huffman_r(p):
    '''Return a Huffman code for an ensemble with distribution p.

    p is a dictionary { symbol: probability }; the result is a dictionary
    { symbol: codeword } where each codeword is a string of '0' and '1'.
    Symbols only have to be usable as dictionary keys; this function never
    concatenates them.

    Raises AssertionError if the probabilities do not sum to 1 (within
    1e-5) or if p holds fewer than two symbols.
    '''
    assert(abs(sum(p.values())-1) < 0.00001) # Ensure probabilities sum to 1
    assert len(p) >= 2, "a Huffman code needs at least two symbols"

    # Base case of only two symbols, assign 0 or 1 arbitrarily
    if len(p) == 2:
        return dict(zip(p.keys(), ['0', '1']))

    # Create a new distribution by merging lowest prob. pair.
    # The merged node is keyed by a fresh sentinel object instead of
    # a1 + a2: a concatenated (or, for numbers, summed) label can coincide
    # with a real symbol such as 'ab' next to 'a' and 'b', which would
    # overwrite that symbol's probability and corrupt the distribution.
    p_prime = p.copy()
    a1, a2 = lowest_prob_pair(p)
    p1, p2 = p_prime.pop(a1), p_prime.pop(a2)
    merged = object()
    p_prime[merged] = p1 + p2

    # Recurse and construct code on new distribution, then split the merged
    # node's codeword into the two codewords of the pair it replaced.
    c = huffman_r(p_prime)
    prefix = c.pop(merged)
    c[a1], c[a2] = prefix + '0', prefix + '1'
    return c

def lowest_prob_pair(p):
    '''Return the two least probable symbols of distribution p.

    The result is the tuple (lowest, second_lowest).  Symbols with equal
    probability keep the order in which they appear in p.
    '''
    assert len(p) >= 2  # A pair needs at least two symbols to choose from
    # Rank the symbols by ascending probability; sorted() is stable.
    ranked = sorted(p, key=p.get)
    return ranked[0], ranked[1]

def huffman(p):
    '''Build a Huffman code for distribution p and print a summary of it.

    Prints one row per symbol (ordered by symbol) with its probability,
    codeword and codeword length, followed by:
      navg - average codeword length, sum of p(x) * len(codeword(x))
      H(X) - entropy of p in bits
      Rhc  - (navg - H) / navg, in percent
      Hmax - log2 of the number of symbols
      Rscr - (Hmax - H) / Hmax, in percent

    Returns the code as a dictionary { 'symbol': 'codeword' }.
    '''
    c = huffman_r(p)   # dictionary { 'symbol': 'codeword' }
    navg = 0
    H = 0
    Hmax = lb(len(c))
    print("#\tSymbol\tProb.\tCode\tlength")
    for i, sim in enumerate(sorted(c)):
        prob, code = p[sim], c[sim]
        print("%d\t%s\t%.3f\t%s\t%d" %(i, sim, prob, code, len(code)))
        navg = navg + len(code)*prob
        # A zero-probability symbol adds nothing to the entropy
        # (p*log2(p) -> 0 as p -> 0); evaluating lb(0) would raise ValueError.
        if prob != 0:
            H = H + prob*(-lb(prob))
    print("navg =",navg, "H(X) = %.3f, Rhc = %.3f%%" %(H, (navg-H)/navg*100))
    print("Inf. Src.: Hmax = %.3f, Rscr=%.3f%%" %(Hmax, (Hmax-H)/Hmax*100))
    return c

def huffmanX(p,factor = 2):
    '''Huffman-code blocks of `factor` consecutive symbols drawn from p.

    The block distribution is built by calling expandCoding once per symbol
    position.  Its deviation from a total probability of 1 is printed, and
    it is then passed to huffman, which prints the code table and its
    statistics.  Returns None.

    factor should be at least 1: with a smaller value the loop never runs
    and an empty distribution is handed to huffman.
    '''
    pp = {}
    for _ in range(factor):
        pp = expandCoding(p, pp)
    # Report the error of the distribution that is actually encoded (pp),
    # not of the input p: pp is what huffman goes on to process.
    print("Sum of probabilities error:%e" %abs(sum(pp.values())-1))
    huffman(pp)
    
def expandCoding(p1,p2):
    '''Return the distribution of a symbol of p1 followed by a symbol of p2.

    Every pair (sim1, sim2) becomes the symbol sim1+sim2 with probability
    p1[sim1]*p2[sim2].  An empty p2 acts as the identity: the result is
    then a copy of p1.

    Raises AssertionError if p1 has fewer than two symbols, and ValueError
    if two different pairs concatenate to the same symbol (for example
    'a'+'aa' and 'aa'+'a'), because one entry would silently replace the
    other and probability mass would be lost.
    '''
    assert(len(p1)>=2)
    if not p2:
        # Hand back a copy so callers never alias the input dictionary.
        return dict(p1)
    pp = {}
    for sim1 in p1:
        for sim2 in p2:
            joint = sim1+sim2
            if joint in pp:
                raise ValueError(
                    "ambiguous symbol %r: more than one pair of symbols "
                    "concatenates to it" % (joint,))
            pp[joint] = p1[sim1]*p2[sim2]
    return pp

In [20]:
# Two-symbol source with P(x1) = 0.3 and P(x2) = 0.7, coded two symbols at a time.
primer = dict(x1=0.3, x2=0.7)
huffmanX(primer, factor=2)

Sum of probabilities error:0.000000e+00
#	Symbol	Prob.	Code	length
0	x1x1	0.090	110	3
1	x1x2	0.210	111	3
2	x2x1	0.210	10	2
3	x2x2	0.490	0	1
navg = 1.81 H(X) = 1.763, Rhc = 2.620%
Inf. Src.: Hmax = 2.000, Rscr=11.871%
