# 1.

# 2.

In [32]:
# Switches on the extra diagnostic printouts (frequency table and the two
# children of the tree root) produced by the script further down.
DEBUG = True

# Sample text whose per-character frequencies are used to build the Huffman
# code. Every character counts, including the newlines inside the literal.
string = """
Huffman coding
In computer science and information theory, a Huffman code is an optimal prefix code found using the algorithm developed by David A. Huffman while he was a Ph.D. student at MIT, and published in the 1952 paper "A Method for the Construction of Minimum-Redundancy Codes". The process of finding and/or using such a code is called Huffman coding and is a common technique in entropy encoding, including in lossless data compression. The algorithm's output can be viewed as a variable-length code table for encoding a source symbol (such as a character in a file). Huffman's algorithm derives this table based on the estimated probability or frequency of occurrence (weight) for each possible value of the source symbol. As in other entropy encoding methods, more common symbols are generally represented using fewer bits than less common symbols. Huffman's method can be efficiently implemented, finding a code in linear time to the number of input weights if these weights are sorted. However, although optimal among methods encoding symbols separately, Huffman coding is not always optimal among all compression methods.

History
In 1951, David A. Huffman and his MIT information theory classmates were given the choice of a term paper or a final exam. The professor, Robert M. Fano, assigned a term paper on the problem of finding the most efficient binary code. Huffman, unable to prove any codes were the most efficient, was about to give up and start studying for the final when he hit upon the idea of using a frequency-sorted binary tree and quickly proved this method the most efficient.

In doing so, the student outdid his professor, who had worked with information theory inventor Claude Shannon to develop a similar code. By building the tree from the bottom up instead of the top down, Huffman avoided the major flaw of the suboptimal Shannon-Fano coding.

Terminology
Huffman coding uses a specific method for choosing the representation for each symbol, resulting in a prefix code (sometimes called "prefix-free codes", that is, the bit string representing some particular symbol is never a prefix of the bit string representing any other symbol). Huffman coding is such a widespread method for creating prefix codes that the term "Huffman code" is widely used as a synonym for "prefix code" even when such a code is not produced by Huffman's algorithm.
"""


class NodeTree(object):
    """Inner node of a Huffman tree: a pair made of a left and a right child.

    Each child is whatever the caller stored, typically another ``NodeTree``
    or a leaf symbol; both default to ``None``.
    """

    def __init__(self, left=None, right=None):
        self.left, self.right = left, right

    def children(self):
        """Return the two children as a ``(left, right)`` tuple."""
        return self.left, self.right

    def nodes(self):
        """Return the ``(left, right)`` tuple; same result as ``children``."""
        return self.children()

    def __str__(self):
        # Children are rendered with str() and joined by an underscore.
        return "%s_%s" % self.children()


## Traverse the NodeTree along every branch to collect the code of each symbol
def huffmanCodeTree(node, left=True, binString=""):
    """Map every leaf symbol below ``node`` to its Huffman bit string.

    Args:
        node: Either a leaf symbol (a ``str``) or an inner node exposing
            ``children()`` that returns a ``(left, right)`` pair.
        left: Unused; kept so existing positional calls keep working.
        binString: Bits accumulated on the path from the root to ``node``.

    Returns:
        A dict ``{symbol: bit string}``; going left appends ``"0"`` and
        going right appends ``"1"``.
    """
    if isinstance(node, str):
        # A leaf reached with no bits on its path means the whole tree is
        # this one symbol. It still needs a one-bit code: an empty code
        # cannot be written to, or read back from, a bit stream.
        return {node: binString or "0"}
    left_child, right_child = node.children()
    codes = huffmanCodeTree(left_child, True, binString + "0")
    codes.update(huffmanCodeTree(right_child, False, binString + "1"))
    return codes

#if DEBUG:
#    print "Input file: " + sys.argv[1]

# Count how many times each character occurs in the sample text.
freq = {}
for c in string:
    freq[c] = freq.get(c, 0) + 1

# Order the table from most to least frequent; sorted() also turns the
# dict into a list of (character, count) tuples.
freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
print(freq)
print("-------------------------------")
if DEBUG:
    print(" Char | Freq ")
    for key, c in freq:
        print(" %4r | %d" % (key, c))
    print("--------------------")

nodes = freq

# Repeatedly merge the two rarest entries, which sit at the tail of the
# descending list, until only the root of the tree is left.
while len(nodes) > 1:
    (key2, c2), (key1, c1) = nodes[-2:]
    node = NodeTree(key1, key2)
    # A fresh list is built here, so the frequency table in `freq` is not
    # modified by the merging.
    nodes = nodes[:-2] + [(node, c1 + c2)]
    # Restore descending order by frequency, which is x[1].
    nodes.sort(key=lambda x: x[1], reverse=True)

root = nodes[0][0]
if DEBUG:
    left_child, right_child = root.nodes()
    print("left: %s" % left_child)
    print("right: %s" % right_child)


huffmanCode = huffmanCodeTree(root)

print(" Char | Freq  | Huffman code ")
print("-----------------------------")
for char, frequency in freq:
    print(" %-4r | %5d | %12s" % (char, frequency, huffmanCode[char]))


[(' ', 381), ('e', 216), ('o', 164), ('n', 147), ('i', 138), ('t', 133), ('a', 128), ('s', 117), ('r', 104), ('d', 87), ('f', 81), ('h', 80), ('m', 77), ('c', 70), ('l', 60), ('u', 59), ('p', 48), ('g', 44), ('y', 35), ('b', 33), ('w', 21), ('.', 19), (',', 18), ('v', 17), ('H', 16), ('\n', 11), ('"', 8), ('x', 7), ('T', 6), ('I', 5), ('M', 5), ('-', 5), ('A', 4), ('q', 4), ("'", 4), ('D', 3), ('1', 3), ('C', 3), ('(', 3), (')', 3), ('9', 2), ('5', 2), ('R', 2), ('F', 2), ('k', 2), ('S', 2), ('P', 1), ('2', 1), ('/', 1), ('B', 1), ('j', 1)]
-------------------------------
 Char | Freq 
  ' ' | 381
  'e' | 216
  'o' | 164
  'n' | 147
  'i' | 138
  't' | 133
  'a' | 128
  's' | 117
  'r' | 104
  'd' | 87
  'f' | 81
  'h' | 80
  'm' | 77
  'c' | 70
  'l' | 60
  'u' | 59
  'p' | 48
  'g' | 44
  'y' | 35
  'b' | 33
  'w' | 21
  '.' | 19
  ',' | 18
  'v' | 17
  'H' | 16
 '\n' | 11
  '"' | 8
  'x' | 7
  'T' | 6
  'I' | 5
  'M' | 5
  '-' | 5
  'A' | 4
  'q' | 4
  "'" | 4
  'D' | 3
  '1' | 3
  

In [50]:
import numpy as np
import cv2
from sklearn.cluster import KMeans
import heapq
import struct
from typing import Tuple, Dict

class HuffmanTreeNode:
    """Node of a Huffman tree.

    Stores a ``value`` and a ``frequency`` (both default to ``None``) plus
    ``left``/``right`` child links that start out as ``None``.
    """

    def __init__(self, value=None, frequency=None):
        self.value, self.frequency = value, frequency
        self.left = self.right = None

    def __lt__(self, other):
        # Nodes are ordered by frequency only, which is what lets heapq
        # keep them in a priority queue.
        return other.frequency > self.frequency

def image_to_data(image: np.ndarray) -> np.ndarray:
    """Reshape ``image`` into a 2-D array with three values per row."""
    pixel_rows = image.reshape(-1, 3)
    return pixel_rows

def quantize_image(image: np.ndarray, n_colors: int) -> Tuple[np.ndarray, np.ndarray]:
    """Reduce ``image`` to ``n_colors`` colors with k-means clustering.

    Returns:
        A pair ``(quantized_image, colormap)``: the image with every pixel
        replaced by the center of its cluster (same shape as ``image``),
        and the cluster centers themselves. Both are cast to ``uint8``.
    """
    pixels = image_to_data(image)
    model = KMeans(n_clusters=n_colors, n_init=10)
    model.fit(pixels)
    centers = model.cluster_centers_
    # Look up each pixel's cluster center through its label.
    recolored = centers[model.labels_]
    quantized_image = recolored.reshape(image.shape).astype(np.uint8)
    return quantized_image, centers.astype(np.uint8)

def to_indexed_image(image: np.ndarray, colormap: np.ndarray) -> np.ndarray:
    """Replace every pixel of ``image`` by the index of its nearest colormap entry.

    Args:
        image: Array whose last axis holds the three color components.
        colormap: Array of shape ``(n_colors, 3)``.

    Returns:
        Integer array of shape ``image.shape[:-1]``; each element is the
        row of ``colormap`` with the smallest Euclidean distance to the
        pixel (the first such row on ties).
    """
    # Same (-1, 3) flattening that image_to_data performs.
    data = image.reshape((-1, 3))
    # Promote to float before subtracting: with uint8 operands the
    # difference wraps around modulo 256 (10 - 20 becomes 246), which makes
    # close colors look far apart and selects the wrong entry.
    differences = colormap.astype(np.float64) - data.astype(np.float64)[:, np.newaxis]
    indexed_data = np.argmin(np.linalg.norm(differences, axis=2), axis=1)
    return indexed_data.reshape(image.shape[:-1])

def build_huffman_tree(frequency: Dict[int, int]) -> HuffmanTreeNode:
    """Build a Huffman tree from a ``{symbol: count}`` table and return its root.

    Every symbol starts as a leaf in a min-heap ordered by frequency. The
    two smallest nodes are popped and joined under a new parent whose
    frequency is their sum, until one node remains.
    """
    heap = []
    for symbol, count in frequency.items():
        heap.append(HuffmanTreeNode(value=symbol, frequency=count))
    heapq.heapify(heap)

    while len(heap) >= 2:
        first = heapq.heappop(heap)
        second = heapq.heappop(heap)
        parent = HuffmanTreeNode(frequency=first.frequency + second.frequency)
        # The node popped first becomes the left child.
        parent.left, parent.right = first, second
        heapq.heappush(heap, parent)

    return heap[0]

def build_huffman_codes(tree: "HuffmanTreeNode") -> Dict[int, str]:
    """Derive the bit string of every symbol stored in a Huffman tree.

    Args:
        tree: Root node. A node whose ``value`` is not ``None`` is treated
            as a leaf; otherwise its ``left``/``right`` children are
            followed.

    Returns:
        A dict ``{symbol: code}`` where a step to the left child appends
        ``"0"`` and a step to the right child appends ``"1"``. A tree that
        consists of one leaf only maps its symbol to ``"0"``. ``None``
        yields an empty dict.
    """
    codes = {}
    if tree is not None and tree.value is not None:
        # Single-symbol tree: the path from the root is empty, but an empty
        # code produces no bits when encoding and matches everywhere when
        # decoding, so give the lone symbol a one-bit code instead.
        codes[tree.value] = "0"
        return codes

    def traverse(node, code=""):
        if node is None:
            return
        if node.value is not None:
            codes[node.value] = code
            return
        traverse(node.left, code + "0")
        traverse(node.right, code + "1")

    traverse(tree)
    return codes

def huffman_compress(image: np.ndarray, colormap_size: int, output_file: str):
    """Quantize ``image`` to ``colormap_size`` colors and store it Huffman-coded.

    File layout written to ``output_file`` (integers are little-endian
    unsigned 32-bit unless noted otherwise):

        colormap_size
        colormap, 3 bytes per entry
        pixel count, image height, image width
        number of distinct palette indices
        one (index as 1 byte, count) record per distinct index
        Huffman bit stream, zero-padded up to a whole byte

    The height and width are part of the header so that the file can be
    decoded without access to the source image.

    Args:
        image: Three-dimensional array whose last axis holds the three
            color components of a pixel.
        colormap_size: Number of palette colors, from 1 to 256.
        output_file: Path of the file to write.

    Raises:
        ValueError: If ``colormap_size`` is outside 1..256.
    """
    if not 1 <= colormap_size <= 256:
        # Palette indices are stored in a single byte each.
        raise ValueError(f"colormap_size must be between 1 and 256, got {colormap_size}")

    quantized_image, colormap = quantize_image(image, colormap_size)
    indexed_image = to_indexed_image(quantized_image, colormap)
    height, width = indexed_image.shape

    # Plain Python ints keep the dict keys, the code lookup and struct.pack
    # independent of NumPy scalar types.
    symbols = indexed_image.ravel().tolist()
    values, counts = np.unique(indexed_image, return_counts=True)
    color_counts = dict(zip(values.tolist(), counts.tolist()))

    huffman_tree = build_huffman_tree(color_counts)
    huffman_codes = build_huffman_codes(huffman_tree)
    encoded_data = ''.join(huffman_codes[symbol] for symbol in symbols)
    # Pad only up to the next byte boundary; an already aligned stream gets
    # no padding at all (8 - n % 8 would append a whole spare byte).
    padded_encoded_data = encoded_data + '0' * ((-len(encoded_data)) % 8)
    encoded_bytes = bytearray(
        int(padded_encoded_data[i:i + 8], 2)
        for i in range(0, len(padded_encoded_data), 8)
    )

    with open(output_file, 'wb') as file:
        # Save colormap size
        file.write(struct.pack('<I', colormap_size))

        # Save colormap
        file.write(colormap.tobytes())

        # Save the number of pixels and the image dimensions
        file.write(struct.pack('<III', len(symbols), height, width))

        # Save the color counts (frequencies)
        file.write(struct.pack('<I', len(color_counts)))
        for color, count in color_counts.items():
            file.write(struct.pack('<BI', color, count))

        # Save the encoded bytes
        file.write(encoded_bytes)


def _decode_symbols(tree, encoded_data, symbol_count):
    """Decode up to ``symbol_count`` symbols by walking ``tree`` bit by bit."""
    decoded = []
    if symbol_count <= 0:
        return decoded
    node = tree
    for byte in encoded_data:
        # Bits were packed most significant bit first.
        for shift in range(7, -1, -1):
            node = node.right if (byte >> shift) & 1 else node.left
            if node.value is not None:
                decoded.append(node.value)
                if len(decoded) == symbol_count:
                    # Everything after this point is padding.
                    return decoded
                node = tree
    return decoded


def huffman_decompress(input_file: str, output_file: str):
    """Decode a file written by ``huffman_compress`` and save it as an image.

    The Huffman tree is rebuilt from the stored frequency table, the bit
    stream is decoded into palette indices, the indices are mapped through
    the stored colormap and the result is written with ``cv2.imwrite``.

    Args:
        input_file: Path of the compressed file.
        output_file: Path of the image to write.

    Raises:
        ValueError: If the header is inconsistent or the bit stream ends
            before all pixels are decoded.
        OSError: If ``cv2.imwrite`` reports that the image was not written.
    """
    with open(input_file, 'rb') as file:
        colormap_size = struct.unpack('<I', file.read(4))[0]
        colormap = np.frombuffer(file.read(3 * colormap_size), dtype=np.uint8).reshape(-1, 3)

        # Read the number of pixels and the image dimensions
        original_length, height, width = struct.unpack('<III', file.read(12))

        # Read color counts (frequencies)
        num_colors = struct.unpack('<I', file.read(4))[0]
        color_counts = {}
        for _ in range(num_colors):
            color, count = struct.unpack('<BI', file.read(5))
            color_counts[color] = count

        encoded_data = file.read()

    if height * width != original_length:
        raise ValueError(
            f"Corrupt file {input_file}: {height}x{width} image does not match "
            f"{original_length} stored pixels"
        )
    if not color_counts:
        raise ValueError(f"Corrupt file {input_file}: empty frequency table")

    if len(color_counts) == 1:
        # A single-color image carries no information in its bit stream.
        decoded_data = [next(iter(color_counts))] * original_length
    else:
        # Inserting the symbols in the stored order reproduces the tree
        # that was used for encoding.
        huffman_tree = build_huffman_tree(color_counts)
        decoded_data = _decode_symbols(huffman_tree, encoded_data, original_length)

    if len(decoded_data) != original_length:
        raise ValueError(
            f"Corrupt file {input_file}: decoded {len(decoded_data)} of "
            f"{original_length} pixels"
        )

    # Use the stored dimensions to shape the decoded data
    indexed_image = np.array(decoded_data, dtype=np.uint8).reshape(height, width)

    # Convert indexed image to quantized image using colormap
    quantized_image = colormap[indexed_image]

    # Save the decompressed image
    if not cv2.imwrite(output_file, quantized_image):
        raise OSError(f"Could not write image: {output_file}")



# Demo driver: compress and decompress one picture at several palette sizes.
if __name__ == "__main__":
    # Source picture and the two files written by every round trip.
    input_image_path = 'ZhongXinaSmaller.jpg'
    compressed_file = 'compressed.hmc'
    decompressed_file = 'result_huffman.bmp'

    input_image = cv2.imread(input_image_path)
    # Stop early with a readable message when no image was loaded.
    if input_image is None:
        raise ValueError(f"Could not open or find the image: {input_image_path}")

    # Palette sizes to try, from largest to smallest.
    colormap_sizes = [256, 128, 64, 32, 16, 8]
    # NOTE(review): every iteration writes to the same two file names, so
    # only the output of the last palette size is left on disk afterwards.
    for colormap_size in colormap_sizes:
        print(f"Compressing with colormap size: {colormap_size}")
        huffman_compress(input_image, colormap_size, compressed_file)
        huffman_decompress(compressed_file, decompressed_file)


Compressing with colormap size: 256
Compressing with colormap size: 128
Compressing with colormap size: 64
Compressing with colormap size: 32
Compressing with colormap size: 16
Compressing with colormap size: 8
