In [1]:
import heapq

In [41]:
class BinaryTreeNode:
    def __init__(self, value, frequency):
        self.value = value
        self.frequency = frequency
        self.left = None
        self.right = None

    # Overriding the method of heap as the push and pop should happen based on frequency value

    def __lt__(self, other):
        return self.frequency < other.frequency

    def __eq__(self, other):
        return self.frequency == other.frequency

In [58]:
class HuffmanCoding:

    def __init__(self, path):
        self.path = path
        self.heap = []
        self.codes = {}

    def __makeFrequencyDictionary(self, content):
        freq = {}
        for ele in content:
            freq[ele] = freq.get(ele, 0) + 1
        return freq

    def __buildHeap(self, freq:dict):
        for x in freq:
            newNode = BinaryTreeNode(x, freq[x])
            heapq.heappush(self.heap, newNode)

    def __buildTree(self):
        while len(self.heap) > 1:
            n1, n2 = heapq.heappop(self.heap), heapq.heappop(self.heap)
            sumFrequency = n1.frequency + n2.frequency
            newNode = BinaryTreeNode("", sumFrequency)
            newNode.left = n1
            newNode.right = n2
            heapq.heappush(self.heap, newNode)
        return

    def __buildCodeHelper(self, root, curr_bits):
        if not root:
            return
        if root.value:
            self.codes[root.value] = curr_bits
            return
        self.__buildCodeHelper(root.left, curr_bits + "0")
        self.__buildCodeHelper(root.right, curr_bits + "1")

    def __buildCode(self):
        root = heapq.heappop(self.heap)
        self.__buildCodeHelper(root, "")

    def __encodedText(self, text):
        encodedText = ""
        for char in text:
            encodedText += self.codes[char]
        return encodedText

    def __paddedEncodedText(self, encodedText):
        paddingAmount = 8 - (len(encodedText) % 8)
        encodedText += '0'*paddingAmount

        paddingInfo = "{0:08b}".format(paddingAmount)
        paddedEncodedText = paddingInfo + encodedText
        return paddedEncodedText

    def __getByteArray(self, paddedEncode):
        final = []
        for i in range(0, len(paddedEncode), 8):
            byte = paddedEncode[i:i+8]
            final.append(int(byte, 2))
        return final

    def compress(self):

        # Get file from given path
        filePath = self.path

        # Read content of file
        fileContent = []
        with open(filePath, encoding='utf8') as f:
            for ln in f:
                fileContent += ln.split()
        fileContent = "ababddgh"
        # Make frequency dictionary using the text
        freq = self.__makeFrequencyDictionary(fileContent)

        # Construct heap from frequency dict
        self.__buildHeap(freq)

        # Construct binary tree from heap
        self.__buildTree()

        # Construct codes from Binary tree
        self.__buildCode()

        # Create encoded text and put in file
        encodedText = self.__encodedText(fileContent)
        paddedEncodedText = self.__paddedEncodedText(encodedText)

        # Return the file as output
        byteArr = self.__getByteArray(paddedEncodedText)
        return byteArr

    def decompress(self):
        pass



In [59]:
h = HuffmanCoding("./demoTextFile.txt")
h.compress()

[6, 102, 13, 192]

In [8]:
heapq.heappush(h, 2)
h

TypeError: heap argument must be a list

In [10]:
type(h)

__main__.HuffmanCoding

In [37]:
with open('demoTextFile.txt', encoding='utf8') as f:
    for line in f:
        print(line.split())

['a', 'b', 'c', 'x', 'a', 'd', 'z']
