In [202]:
import time
import numpy as np
import itertools
import pandas as pd
import queue
from collections import deque
np.set_printoptions(suppress=True)

In this programming problem and the next you'll code up the greedy algorithm from the lectures on Huffman coding.

Download the text file below.

This file describes an instance of the problem. It has the following format:

[number_of_symbols]

[weight of symbol #1]

[weight of symbol #2]

...

For example, the third line of the file is "6852892," indicating that the weight of the second symbol of the alphabet is 6852892. (We're using weights instead of frequencies, like in the "A More Complex Example" video.)

Your task in this problem is to run the Huffman coding algorithm from lecture on this data set. What is the maximum length of a codeword in the resulting Huffman code?

ADVICE: If you're not getting the correct answer, try debugging your algorithm using some small test cases. And then post them to the discussion forum!

Continuing the previous problem, what is the minimum length of a codeword in your Huffman code?

In [239]:
class HuffmanNode(object):
    """Internal node of a Huffman tree.

    Each child slot (``left`` / ``right``) is either ``None`` or an indexable
    pair whose element 1 is the payload: a leaf symbol, or another
    ``HuffmanNode`` for a merged meta-symbol. Element 0 of the pair is not
    read by this class.
    """
    def __init__(self,left=None,right=None,root=None):
        self.left = left
        self.right = right
        self.root = root  # stored as given; no method of this class reads it

    def children(self):
        """Return the ``(left, right)`` child slots as a tuple."""
        return (self.left,self.right)

    #preorder traversal to generate the huffman encoding
    def preorder(self,path=None,G=None):
        """Collect the codeword of every leaf below this node.

        Going to a left child appends '0' to the path, going to a right child
        appends '1'.

        Args:
            path: Prefix already accumulated above this node ('' when omitted).
            G: Dict to fill with ``symbol -> codeword``; a new dict is created
                when omitted.

        Returns:
            The dict ``G``, with leaves inserted in left-to-right order.
        """
        if path is None:
            path = ''
        if G is None:
            G = {}

        # Explicit stack instead of recursion: a skewed Huffman tree can be as
        # deep as (number of symbols - 1), which exceeds Python's default
        # recursion limit for inputs of around a thousand symbols.
        pending = [(self, path)]
        while pending:
            item, prefix = pending.pop()
            if isinstance(item, HuffmanNode):
                # Push right before left so the left subtree is popped (and
                # therefore fully processed) first, matching preorder.
                if item.right is not None:
                    pending.append((item.right[1], prefix + '1'))
                if item.left is not None:
                    pending.append((item.left[1], prefix + '0'))
            else:
                G[item] = prefix

        return G

In [240]:
def huffman_code_heap(filename):
    """Build a Huffman code with a priority queue and report codeword lengths.

    The file ``'week11_file/' + filename`` is read as: first non-blank line is
    a header (the symbol count, which is skipped), every following non-blank
    line starts with the integer weight of the next symbol. Blank lines are
    ignored. Symbols are numbered from 0 in file order.

    Args:
        filename: Name of the input file inside ``week11_file/``.

    Returns:
        Tuple ``(min_length, max_length)`` of the shortest and longest
        codeword lengths in the resulting code.

    Raises:
        ValueError: If the file lists fewer than two symbols.
    """
    p = queue.PriorityQueue()
    symbol = 0
    header_seen = False
    with open('week11_file/'+filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # skip blank lines
            if not header_seen:
                header_seen = True  # header line: symbol count, not a weight
                continue
            #add all the symbols to the heap, key = frequencies
            # Entry layout is (weight, tie_breaker, payload). The unique
            # tie-breaker guarantees the payload (int symbol or HuffmanNode)
            # is never compared when two weights are equal.
            p.put((int(fields[0]), symbol, symbol))
            symbol += 1

    if symbol < 2:
        raise ValueError('a Huffman code needs at least two symbols, got %d' % symbol)

    # Merged nodes get tie-breakers larger than every leaf's, so on equal
    # weight leaves come out first and merged nodes come out oldest-first.
    order = symbol
    #while there are two or more nodes in the heap
    while p.qsize() > 1:
        #extract the two smallest-frequency symbols
        left_weight, _, left_item = p.get()
        right_weight, _, right_item = p.get()
        # HuffmanNode.preorder reads the payload at index 1 of each child pair.
        node = HuffmanNode([left_weight, left_item], [right_weight, right_item])
        #re-insert the new meta-symbol (new key = sum of two old ones)
        p.put((left_weight + right_weight, order, node))
        order += 1

    root = p.get()[2]
    G = root.preorder()
    lengths = [len(code) for code in G.values()]
    return min(lengths), max(lengths)
    

In [241]:
# Time the heap-based solver on the two small test inputs.
# Expected results: test1 -> (2, 5), test2 -> (3, 6).
for test_file in ('week11_1_test1.txt', 'week11_1_test2.txt'):
    start_time = time.time()
    print(huffman_code_heap(test_file))
    print("--- %s seconds ---" % (time.time() - start_time))

(2, 5)
--- 0.0035440921783447266 seconds ---
(3, 6)
--- 0.001893758773803711 seconds ---


In [242]:
# Time the heap-based solver on the full assignment input; expected (9, 19).
start_time = time.time()
heap_lengths = huffman_code_heap('week11_1.txt')
print(heap_lengths)
print("--- %s seconds ---" % (time.time() - start_time))

(9, 19)
--- 0.03994107246398926 seconds ---


In [243]:
def huffman_code_queue(filename):
    """Build a Huffman code with the two-queue method and report codeword lengths.

    The file ``'week11_file/' + filename`` is read as: first non-blank line is
    a header (the symbol count, which is skipped), every following non-blank
    line starts with the integer weight of the next symbol. Blank lines are
    ignored. Symbols are numbered from 0 in file order.

    Args:
        filename: Name of the input file inside ``week11_file/``.

    Returns:
        Tuple ``(min_length, max_length)`` of the shortest and longest
        codeword lengths in the resulting code.

    Raises:
        ValueError: If the file lists fewer than two symbols.
    """
    graph = []
    header_seen = False
    with open('week11_file/'+filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # skip blank lines
            if not header_seen:
                header_seen = True  # header line: symbol count, not a weight
                continue
            graph.append([int(fields[0]), len(graph)])

    if len(graph) < 2:
        raise ValueError('a Huffman code needs at least two symbols, got %d' % len(graph))

    # Stable sort on weight only: equal-weight leaves stay in symbol order.
    graph.sort(key=lambda entry: entry[0])

    #enqueue all leaf nodes into the first queue in increasing order of frequencies
    q_first = deque(graph)
    # Merged nodes are appended here; their weights are produced in
    # non-decreasing order, so this queue stays sorted without extra work.
    q_second = deque()

    #while there are two or more nodes in both of the queues, dequeue the two nodes with lowest weight by examining
    #the fronts of both queues
    while len(q_first) + len(q_second) > 1:
        min_node = []
        for _ in range(2):
            if not q_second:
                take_leaf = True
            elif not q_first:
                take_leaf = False
            else:
                # Compare weights only. Comparing the whole [weight, payload]
                # lists would fall through to the payloads on a tie and raise
                # TypeError (int vs HuffmanNode). Leaves win ties.
                take_leaf = q_first[0][0] <= q_second[0][0]
            min_node.append(q_first.popleft() if take_leaf else q_second.popleft())
        left, right = min_node
        #enqueue the new node into the second queue (key = sum of the two old frequencies)
        q_second.append([left[0] + right[0], HuffmanNode(left, right)])

    # With at least two symbols the last merge always lands in q_second.
    root = q_second[0][1]
    G = root.preorder()
    lengths = [len(code) for code in G.values()]
    return min(lengths), max(lengths)
    

In [245]:
# Time the two-queue solver on the two small test inputs.
# Expected results: test1 -> (2, 5), test2 -> (3, 6).
for test_file in ('week11_1_test1.txt', 'week11_1_test2.txt'):
    start_time = time.time()
    print(huffman_code_queue(test_file))
    print("--- %s seconds ---" % (time.time() - start_time))

(2, 5)
--- 0.0015480518341064453 seconds ---
(3, 6)
--- 0.000621795654296875 seconds ---


In [332]:
# Time the two-queue solver on the full assignment input; expected (9, 19).
start_time = time.time()
queue_lengths = huffman_code_queue('week11_1.txt')
print(queue_lengths)
print("--- %s seconds ---" % (time.time() - start_time))

(9, 19)
--- 0.03961801528930664 seconds ---


In this programming problem you'll code up the dynamic programming algorithm for computing a maximum-weight independent set of a path graph.

Download the text file below.

This file describes the weights of the vertices in a path graph (with the weights listed in the order in which vertices appear in the path). It has the following format:

[number_of_vertices]

[weight of first vertex]

[weight of second vertex]

...

For example, the third line of the file is "6395702," indicating that the weight of the second vertex of the graph is 6395702.

Your task in this problem is to run the dynamic programming algorithm (and the reconstruction procedure) from lecture on this data set. The question is: of the vertices 1, 2, 3, 4, 17, 117, 517, and 997, which ones belong to the maximum-weight independent set? (By "vertex 1" we mean the first vertex of the graph---there is no vertex 0.) In the box below, enter an 8-bit string, where the ith bit should be 1 if the ith of these 8 vertices is in the maximum-weight independent set, and 0 otherwise. For example, if you think that the vertices 1, 4, 17, and 517 are in the maximum-weight independent set and the other four vertices are not, then you should enter the string 10011010 in the box below.


In [328]:
def weighted_independent_set(filename):
    """Maximum-weight independent set of a path graph, by dynamic programming.

    The file ``'week11_file/' + filename`` is read as: first non-blank line is
    a header (the vertex count, which is skipped), every following non-blank
    line starts with the integer weight of the next vertex along the path.
    Blank lines are ignored. Weights are assumed to be non-negative.

    Args:
        filename: Name of the input file inside ``week11_file/``.

    Returns:
        Tuple ``(total_weight, S)`` where ``S`` is a ``deque`` of the chosen
        vertices as 1-based positions in increasing order. A file with no
        vertices yields ``(0, deque())``.
    """
    G = []
    header_seen = False
    with open('week11_file/'+filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # skip blank lines
            if not header_seen:
                header_seen = True  # header line: vertex count, not a weight
                continue
            G.append(int(fields[0]))

    if not G:
        return 0, deque()

    # A[i] is the maximum weight of an independent set of the first i vertices
    # (A is indexed 1-based over vertices, G is 0-based).
    A = [0, G[0]]
    for i in range(2, len(G)+1):
        #two cases: 1) Max-wt IS of Gi-1 and 2) max-wt IS of Gi-2 + {vn}
        A.append(max(A[i-1], A[i-2] + G[i-1]))

    #Reconstruction Algorithm
    S = deque()
    i = len(G)

    #Scan through array from right to left
    while i >= 1:
        skip_weight = A[i-1]
        # For i == 1 there is no prefix two steps back. Use 0 explicitly
        # instead of letting A[i-2] wrap around to A[-1], the grand total.
        take_weight = G[i-1] + (A[i-2] if i >= 2 else 0)
        if skip_weight >= take_weight:
            #Case 1 wins
            i -= 1
        else:
            #Case 2 wins
            S.appendleft(i)
            i -= 2

    return A[-1], S
    

In [329]:
# Small test 1. Expected max sum: 2616, chosen positions: [2, 4, 6, 8, 10].
start_time = time.time()
wis_result = weighted_independent_set('week11_2_test1.txt')
print(wis_result)
print("--- %s seconds ---" % (time.time() - start_time))

(2616, deque([2, 4, 6, 8, 10]))
--- 0.002702951431274414 seconds ---


In [330]:
# Small test 2. Expected max sum: 2533, chosen positions: [1, 3, 6, 9].
start_time = time.time()
wis_result = weighted_independent_set('week11_2_test2.txt')
print(wis_result)
print("--- %s seconds ---" % (time.time() - start_time))

(2533, deque([1, 3, 6, 9]))
--- 0.010534048080444336 seconds ---


In [331]:
# Full assignment input. Expected max sum: 2955353732; membership bit string
# for vertices 1, 2, 3, 4, 17, 117, 517, 997: 10100110.
start_time = time.time()
wis_result = weighted_independent_set('week11_2.txt')
print(wis_result)
print("--- %s seconds ---" % (time.time() - start_time))

(2955353732, deque([1, 3, 5, 8, 10, 13, 15, 18, 20, 22, 24, 26, 28, 31, 33, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 69, 72, 75, 77, 79, 81, 83, 85, 88, 90, 92, 94, 96, 98, 100, 103, 106, 108, 110, 112, 115, 117, 120, 122, 124, 126, 128, 131, 133, 136, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 160, 162, 164, 166, 168, 170, 173, 175, 177, 179, 181, 183, 185, 187, 190, 193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 214, 216, 218, 221, 223, 226, 228, 230, 232, 234, 236, 238, 240, 243, 245, 247, 249, 252, 254, 256, 258, 261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 312, 314, 316, 318, 321, 323, 325, 327, 329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 377, 379, 381, 383, 385, 387, 389, 391, 393, 395, 397, 399, 402, 404, 406, 408, 410, 413, 415, 417, 420, 422, 425, 427, 429, 431, 433, 435, 437, 439, 441, 443,