In [5]:
import numpy as np
import os

def generate_data(num_pages, page_size, output_dir, seed=None):
    """
    Generate pages of random integers and save each page as a numpy file.

    Every page holds ``page_size`` integers drawn from 0 to 9 (inclusive) and
    is written to ``output_dir`` as ``page_<i>.npy`` for ``i`` in
    ``0 .. num_pages - 1``. Existing files with those names are overwritten.

    :param num_pages: Number of pages to generate
    :param page_size: Number of elements per page
    :param output_dir: Directory to save the generated pages (created if missing)
    :param seed: Optional seed for reproducible pages. When None (the default)
        numpy's global random state is used, so existing callers see no change.
    """
    os.makedirs(output_dir, exist_ok=True)
    # A private RandomState makes seeded runs repeatable without touching
    # numpy's global random state; unseeded calls keep using the global one.
    rng = np.random if seed is None else np.random.RandomState(seed)
    for i in range(num_pages):
        data = rng.randint(0, 10, page_size)
        np.save(os.path.join(output_dir, f'page_{i}.npy'), data)


In [6]:
def sort_pages(input_dir, num_pages):
    """
    Sort each page on disk and store the sorted copy next to the original.

    Page ``i`` is read from ``page_<i>.npy`` in ``input_dir`` and its sorted
    contents are written to ``sorted_page_<i>.npy`` in the same directory.
    The unsorted page files are not modified.

    :param input_dir: Directory containing the unsorted pages
    :param num_pages: Number of pages to sort (pages 0 .. num_pages - 1)
    """
    for page_number in range(num_pages):
        source_path = os.path.join(input_dir, f'page_{page_number}.npy')
        target_path = os.path.join(input_dir, f'sorted_page_{page_number}.npy')
        # np.sort returns a sorted copy, so the loaded page itself is untouched.
        np.save(target_path, np.sort(np.load(source_path)))

In [7]:
import heapq

def merge_pages(input_dir, num_pages, buffer_size, output_file):
    """
    Merge all sorted pages into a single sorted file with a bounded fan-in.

    At most ``buffer_size - 1`` runs are merged at a time (the remaining buffer
    slot is conventionally the output buffer). When there are more runs than
    that, they are merged group by group into intermediate runs, pass after
    pass, until one final merge can produce the output. Intermediate runs are
    stored in ``input_dir`` as ``merge_pass_<p>_run_<g>.npy`` and removed
    before the function returns.

    The output is the raw bytes of the merged elements written back to back,
    without a ``.npy`` header. Read it with ``np.fromfile`` and the dtype of
    the pages, not with ``np.load``.

    Note: every run taking part in a merge is loaded whole with ``np.load``,
    and each intermediate run is built in memory before it is saved, so the
    fan-in is bounded but memory use is not strictly ``buffer_size`` pages.

    :param input_dir: Directory containing the sorted pages
        (``sorted_page_<i>.npy`` for ``i`` in ``0 .. num_pages - 1``)
    :param num_pages: Number of pages to merge
    :param buffer_size: Buffer size (n) indicating how many pages to load into memory
    :param output_file: File to write the final sorted output
    :raises ValueError: If more than one page has to be merged but
        ``buffer_size`` is below 3 (fewer than two input buffers)
    """
    fan_in = buffer_size - 1
    if num_pages > 1 and fan_in < 2:
        raise ValueError(
            f'buffer_size must be at least 3 to merge {num_pages} pages, got {buffer_size}'
        )

    runs = [os.path.join(input_dir, f'sorted_page_{i}.npy') for i in range(num_pages)]
    temp_runs = []  # intermediate files created here, removed in the finally block
    try:
        pass_index = 0
        # Reduce the number of runs until a single merge can consume them all.
        # max(..., 1) keeps the zero/one-page case out of the loop for any buffer_size.
        while len(runs) > max(fan_in, 1):
            merged_runs = []
            for group_index, start in enumerate(range(0, len(runs), fan_in)):
                group = runs[start:start + fan_in]
                if len(group) == 1:
                    # A lone leftover run is already sorted; carry it forward as is.
                    merged_runs.append(group[0])
                    continue
                buffers = [np.load(path) for path in group]
                # heapq.merge lazily yields the smallest head element of the runs.
                merged = np.fromiter(heapq.merge(*buffers), dtype=buffers[0].dtype)
                run_path = os.path.join(
                    input_dir, f'merge_pass_{pass_index}_run_{group_index}.npy'
                )
                np.save(run_path, merged)
                temp_runs.append(run_path)
                merged_runs.append(run_path)
            runs = merged_runs
            pass_index += 1

        # Final merge: stream the elements to the output as raw bytes.
        buffers = [np.load(path) for path in runs]
        with open(output_file, 'wb') as out_file:
            for val in heapq.merge(*buffers):
                out_file.write(val.tobytes())  # Write value to output file
    finally:
        for path in temp_runs:
            if os.path.exists(path):
                os.remove(path)

In [8]:
def external_merge_sort(input_dir, num_pages, page_size, buffer_size, output_file):
    """
    Run both phases of the external merge sort over the pages in ``input_dir``.

    Phase one sorts every page individually via ``sort_pages``; phase two
    merges the sorted pages into ``output_file`` via ``merge_pages``.

    :param input_dir: Directory containing the pages
    :param num_pages: Number of pages
    :param page_size: Number of elements per page (accepted but not used here)
    :param buffer_size: Buffer size (n), forwarded to the merge phase
    :param output_file: Output file to write the final sorted data
    """
    # Phase 1: produce a sorted copy of each page.
    sort_pages(input_dir=input_dir, num_pages=num_pages)
    # Phase 2: merge the sorted pages into the output file.
    merge_pages(
        input_dir=input_dir,
        num_pages=num_pages,
        buffer_size=buffer_size,
        output_file=output_file,
    )

# Example usage: write 100 pages of 10 random integers each into 'data',
# then run the external merge sort over those pages.
# NOTE: merge_pages writes raw element bytes, so despite its name the output
# file is not in .npy format.
generate_data(
    num_pages=100,
    page_size=10,
    output_dir='data',
)
external_merge_sort(
    input_dir='data',
    num_pages=100,
    page_size=10,
    buffer_size=10,
    output_file='sorted_data.npy',
)