In [17]:
# Max-heap
class LargeHeap(object):
    """Fixed-capacity binary max-heap stored in a 1-indexed list.

    ``heap[0]`` is an unused placeholder so that the children of the node at
    index ``i`` live at ``2 * i`` and ``2 * i + 1``. Unused slots hold ``0``.
    """

    def __init__(self, cap=10):
        """Create an empty heap able to hold at most ``cap`` elements."""
        self.length = 0  # number of elements currently stored
        self.cap = cap  # maximum number of elements
        self.heap = [0] * (cap + 1)  # slot 0 is unused; data lives in 1..cap

    @classmethod
    def _parent(cls, child_idx):
        """Return the index of the parent of ``child_idx``."""
        return child_idx // 2

    @classmethod
    def _left(cls, parent_idx):
        """Return the index of the left child of ``parent_idx``."""
        return parent_idx * 2

    @classmethod
    def _right(cls, parent_idx):
        """Return the index of the right child of ``parent_idx``."""
        return parent_idx * 2 + 1

    def insert(self, data):
        """Add ``data`` to the heap.

        Returns ``-1`` (and stores nothing) when the heap is already full,
        otherwise returns ``None``.
        """
        if self.length >= self.cap:
            return -1
        self.heap[self.length + 1] = data
        self.length += 1
        self._heap_up()

    def pop(self):
        """Remove and return the largest element, or ``-1`` if the heap is empty."""
        if self.length < 1:
            return -1
        res = self.heap[1]
        # Move the last element to the root, clear its old slot, then sift down.
        self.heap[1] = self.heap[self.length]
        self.heap[self.length] = 0
        self.length -= 1
        self._heap_down()
        return res

    def peek(self):
        """Return the root slot without removing it.

        On an empty heap this is whatever slot 1 holds (the ``0`` placeholder).
        """
        return self.heap[1]

    def _heap_up(self):
        """Sift the most recently appended element up to its proper position."""
        c_idx, p_idx = self.length, LargeHeap._parent(self.length)
        while p_idx > 0 and self.heap[c_idx] > self.heap[p_idx]:
            self.heap[c_idx], self.heap[p_idx] = self.heap[p_idx], self.heap[c_idx]
            c_idx, p_idx = p_idx, LargeHeap._parent(p_idx)

    def _heap_down(self, count=None, idx=1):
        """Sift the element at ``idx`` down within the first ``count`` slots.

        ``count`` defaults to the current heap length; ``sort`` passes a
        smaller value to exclude the already-sorted tail.
        """
        # Compare against None explicitly so an explicit count of 0 is honoured.
        if count is None:
            count = self.length
        c_idx = idx
        while True:
            l_idx, r_idx = LargeHeap._left(c_idx), LargeHeap._right(c_idx)
            maxi_idx = c_idx
            if l_idx <= count and self.heap[c_idx] < self.heap[l_idx]:
                maxi_idx = l_idx
            if r_idx <= count and self.heap[maxi_idx] < self.heap[r_idx]:
                maxi_idx = r_idx

            if maxi_idx == c_idx:
                break
            self.heap[c_idx], self.heap[maxi_idx] = self.heap[maxi_idx], self.heap[c_idx]
            c_idx = maxi_idx

    def createFromList(self, alist):
        """Replace the heap contents with the elements of ``alist`` (heapify).

        Capacity and length are both set to ``len(alist)``; ``alist`` itself is
        not modified.
        """
        self.cap = len(alist)
        self.length = len(alist)
        self.heap = alist[:]
        self.heap.append(0)
        # Free up slot 0 by moving the first element to the newly appended slot.
        self.heap[0], self.heap[self.length] = self.heap[self.length], self.heap[0]
        # Sift down every internal node, starting from the last one.
        for i in range(self.length // 2, 0, -1):
            self._heap_down(idx=i)

    def sort(self):
        """Heapsort the stored elements in place into ascending order.

        After this call slots ``1..length`` are sorted ascending, so the list
        no longer satisfies the max-heap property.
        """
        # Index from ``length`` rather than from the end of the backing list,
        # so a partially filled heap does not pull in the 0 padding.
        for end in range(self.length, 1, -1):
            self.heap[1], self.heap[end] = self.heap[end], self.heap[1]
            self._heap_down(count=end - 1)

    def __repr__(self):
        """Return the backing list without the unused slot 0 (padding included)."""
        return str(self.heap[1:])

In [289]:
# Exercise LargeHeap: insert a few values, show the backing list, then pop
# everything to confirm the values come out in descending order.
hp = LargeHeap(10)
hp.insert(3)
hp.insert(9)
hp.insert(1)
hp.insert(8)
hp.insert(7)
hp.insert(3)
print(hp)
for _ in range(6):
    print(hp.pop())
# Heapify an existing list (the leading 0 is skipped) and heapsort it in place.
a = [0, 6, 3, 4, 0, 9, 2, 7, 5, -2, 8, 1, 6, 10]
h = LargeHeap(len(a))
h.createFromList(a[1:])
h.sort()
print(h)

[9, 8, 3, 3, 7, 1, 0, 0, 0, 0]
9
8
7
3
3
1
[-2, 0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10]


In [9]:
# Min-heap
class SmallHeap(object):
    """Fixed-capacity binary min-heap kept in a 1-indexed list.

    Slot 0 of ``heap`` is never used; vacant slots contain ``0``.
    """

    def __init__(self, cap=10):
        """Set up an empty heap that can store up to ``cap`` items."""
        self.cap = cap
        self.length = 0
        self.heap = [0] * (self.cap + 1)

    @classmethod
    def _parent(cls, idx):
        """Index of the parent of node ``idx``."""
        return idx // 2

    @classmethod
    def _left(cls, idx):
        """Index of the left child of node ``idx``."""
        return 2 * idx

    @classmethod
    def _right(cls, idx):
        """Index of the right child of node ``idx``."""
        return 2 * idx + 1

    def insert(self, data):
        """Store ``data``; return ``-1`` if the heap is full, else ``None``."""
        if self.length >= self.cap:
            return -1
        slot = self.length + 1
        self.heap[slot] = data
        self.length = slot
        self._heap_up()

    def pop(self):
        """Remove and return the smallest item, or ``-1`` when empty."""
        if self.length < 1:
            return -1
        last = self.length
        smallest = self.heap[1]
        # The last item takes over the root; its old slot is reset to 0.
        self.heap[1] = self.heap[last]
        self.heap[last] = 0
        self.length = last - 1
        self._heap_down()
        return smallest

    def peek(self):
        """Return the content of the root slot without removing it."""
        return self.heap[1]

    def _heap_up(self):
        """Bubble the last stored item up while it is smaller than its parent."""
        child = self.length
        parent = SmallHeap._parent(child)
        while parent > 0 and self.heap[child] < self.heap[parent]:
            self.heap[child], self.heap[parent] = self.heap[parent], self.heap[child]
            child = parent
            parent = SmallHeap._parent(child)

    def _heap_down(self):
        """Push the root item down until neither child is smaller than it."""
        node = 1
        while True:
            left = SmallHeap._left(node)
            right = SmallHeap._right(node)
            target = node
            if left <= self.length and self.heap[target] > self.heap[left]:
                target = left
            if right <= self.length and self.heap[target] > self.heap[right]:
                target = right

            if target == node:
                return
            self.heap[node], self.heap[target] = self.heap[target], self.heap[node]
            node = target

    def __repr__(self):
        """Show the backing list without the unused slot 0."""
        return str(self.heap[1:])

应用一：合并小文件

In [2]:
import os

In [3]:
# Directory whose files are merged (machine-specific absolute path).
input_path = r'C:\Users\hrajzl\Desktop\code\local\src\heap_app\merge_small_files\input'
# File that receives the merged lines.
# NOTE(review): "ouput" looks like a typo for "output"; the name is kept because a later cell uses it.
ouput_path = r'C:\Users\hrajzl\Desktop\code\local\src\heap_app\merge_small_files\output\out.txt'

In [4]:
def read_line(file, line):
    """Read one line of a text file by its 0-based line number.

    The file is opened and scanned from the start on every call.

    Args:
        file: Path of the file to read.
        line: 0-based index of the wanted line.

    Returns:
        A tuple ``(line_content, line + 1)`` where ``line_content`` keeps its
        trailing newline (if any), or ``None`` when the file has no such line
        or cannot be opened/decoded.
    """
    try:
        with open(file, 'r') as f:
            for i, line_content in enumerate(f):
                if i == line:
                    return (line_content, line + 1)
    except (OSError, UnicodeError):
        # Only I/O and decoding failures mean "no line available"; anything
        # else (KeyboardInterrupt, programming errors) must propagate.
        return None
    return None

In [5]:
def write_content_to_file(content, file):
    """Append ``content`` to ``file`` as one newline-terminated line.

    Missing parent directories are created. ``content`` is converted with
    ``str()`` first, and a trailing newline is added only if it lacks one.

    Args:
        content: Value to write.
        file: Path of the destination file (created if it does not exist).
    """
    d = os.path.dirname(file)
    # A bare filename has an empty dirname, which must not be passed to makedirs.
    if d:
        os.makedirs(d, exist_ok=True)

    text = str(content)
    if not text.endswith('\n'):
        text += '\n'
    with open(file, 'a') as f:
        f.write(text)

In [6]:
def merge_small_files(input_path, ouput_path):
    """Merge the lines of every file in ``input_path`` into ``ouput_path``.

    A min-heap holds one pending line per input file; the smallest pending
    line is repeatedly written out and replaced by the next line of the same
    file. Lines are compared as strings, with ties broken by the next line
    number and then the file index. Presumably each input file is already
    sorted, otherwise the output is not globally sorted (TODO confirm).

    Args:
        input_path: Directory containing the input files.
        ouput_path: File the merged lines are appended to.
    """
    input_files = os.listdir(input_path)

    # Size the heap to the number of files; the default capacity of 10 would
    # silently reject (and thus drop) any files beyond the tenth.
    h = SmallHeap(len(input_files))

    # Seed the heap with the first line of every readable, non-empty file.
    for i, name in enumerate(input_files):
        v = read_line(os.path.join(input_path, name), 0)
        if v:
            h.insert(v + (i,))

    while h.length:
        content, line_number, file_idx = h.pop()
        write_content_to_file(content, ouput_path)
        input_file = os.path.join(input_path, input_files[file_idx])
        value = read_line(input_file, line_number)

        # None means this file is exhausted; it simply leaves the heap.
        if value is not None:
            h.insert(value + (file_idx,))

In [8]:
# Run the merge. The output file is opened in append mode, so re-running this
# cell adds the merged lines to the file again.
merge_small_files(input_path, ouput_path)

应用二：利用堆求Top K

In [12]:
def find_top_k(alist, k):
    """Return the k-th largest element of ``alist``.

    A min-heap of capacity ``k`` keeps the ``k`` largest values seen so far;
    its root is therefore the k-th largest overall.
    """
    candidates = SmallHeap(k)

    # Fill the heap with the first k values.
    position = 0
    while position < k:
        candidates.insert(alist[position])
        position += 1

    # A later value displaces the current minimum only if it is larger.
    for value in alist[k:]:
        if not value > candidates.peek():
            continue
        candidates.pop()
        candidates.insert(value)

    return candidates.peek()

应用三：利用堆求中位数

In [18]:
def find_median(alist):
    """Return the median of ``alist`` using a min-heap and a max-heap.

    The min-heap ends up holding the largest ``len(alist) // 2`` values and
    the max-heap the remaining smaller ones, so the max-heap's root is the
    median. For an even number of elements this is the lower of the two
    middle values.

    Args:
        alist: Non-empty sequence of mutually comparable values.

    Returns:
        The median element of ``alist``.

    Raises:
        ValueError: If ``alist`` is empty.
    """
    length = len(alist)
    if length == 0:
        raise ValueError('find_median() arg is an empty sequence')
    small_cap = length // 2
    large_cap = length - small_cap

    small_heap = SmallHeap(small_cap)
    large_heap = LargeHeap(large_cap)

    for i in range(small_cap):
        small_heap.insert(alist[i])

    for item in alist[small_cap:]:
        # The length check covers single-element input, where the min-heap
        # has capacity 0 and nothing to peek at.
        if small_heap.length and item > small_heap.peek():
            # item belongs to the upper half; the evicted minimum moves down.
            tmp = small_heap.pop()
            small_heap.insert(item)
            large_heap.insert(tmp)
        else:
            # item belongs to the lower half; it must be kept, not dropped.
            large_heap.insert(item)
    return large_heap.peek()

In [22]:
# Odd-length example: the recorded result for the values 1..11 is 6.
find_median([1,2,3,4,5,6,7,8,9,10,11])

6