In [1]:
import numpy as np
import os
import string
import random
import csv

# Function to convert a key to its binary hash representation
def hash_function(key):
    return '{0:016b}'.format(key)

#Initializes bucket
class Bucket:
    def __init__(self, local_depth, index, empty_spaces, id):
        self.id = id #Bucket ID
        self.local_depth = local_depth #Local depth of bucket
        self.index = index #List of numbers in a bucket
        self.empty_spaces = empty_spaces #Empty spaces in a bucket

#Initializes directory
class Directory:
    def __init__(self, global_depth, directory_records):
        self.global_depth = global_depth #Global depth of directory
        self.directory_records = directory_records #List of directory records

#Initializes directory record        
class DirectoryRecord:
    def __init__(self, bucket, hash_prefix):
        self.hash_prefix = hash_prefix #Hash prefix for directory record
        self.value = bucket #Bucket associated with directory record

bucket_capacity = 3 #Capacity of each bucket
bucket_number = 2 #Starting bucket ID's
global_depth = 1 #Initial global depth

# Initialization of buckets
bucket1 = Bucket(local_depth=1, empty_spaces=bucket_capacity, index=[], id=1)
bucket2 = Bucket(local_depth=1, empty_spaces=bucket_capacity, index=[], id=2)

# Initialization of directory records
directory_records = list()
directory_records.append(DirectoryRecord(hash_prefix=0, bucket=bucket1))
directory_records.append(DirectoryRecord(hash_prefix=1, bucket=bucket2))

#Initialize directory
directory = Directory(global_depth=1, directory_records=directory_records)


def insert(index):
    
    global directory
    global bucket_number
    
    t_id = index[0] #Gets ID from index
    hash_key = hash_function(int(t_id)) #Converts ID into binary
    hash_prefix = int(hash_key[-directory.global_depth:], 2) #Compute hash prefix based on global depth

    bucket = directory.directory_records[hash_prefix].value #Get corresponding bucket
    bucket.index.append(index) #Add index to the bucket
    bucket.empty_spaces -= 1 #Decrease the empty spaces in the bucket

    if bucket.empty_spaces < 0: #If bucket is overloaded handle overflow
        temp_memory = bucket.index
        bucket.empty_spaces = bucket_capacity #Stores bucket numbers temporarily
        bucket.index = [] #Clears bucket

        if directory.global_depth > bucket.local_depth: #If global depth is larger than bucket depth, split the bucket
            number_of_links = 2 ** (directory.global_depth - bucket.local_depth) #Calculate how many directory records point to the bucket
            bucket.local_depth += 1 #Increase the local depth of the bucket
            number_of_modify_links = number_of_links // 2 #Claculates the new number of links for the bucket
            
            #Creates new bucket
            new_bucket = Bucket(local_depth=bucket.local_depth, index=[], empty_spaces=bucket_capacity, id=bucket_number)
            bucket_number += 1
            
            #Updates directory records with new bucket references
            for directory_record in directory.directory_records: #Iterates over each directory record
                if directory_record.value == bucket: #Check if each directory record references the bucket that overflowed
                    if number_of_modify_links != 0: #If there are still links to modify
                        number_of_modify_links -= 1 
                    else:
                        directory_record.value = new_bucket #Assign new bucket to directory reocrd
            
            for item in temp_memory: #Reinserts items from temp_memory to the buckets
                insert(item)
 
        elif directory.global_depth == bucket.local_depth: #If global depth equals bucket local depth, expand directory
            new_directory_len = 2 * len(directory.directory_records) #Gets new directory length
            new_directory_records = [] #Initializes new directory records

            for directory_record_number in range(new_directory_len): #Initializes new directory with new records
                new_directory_records.append(DirectoryRecord(
                    hash_prefix=directory_record_number, 
                    bucket=Bucket(local_depth=1, index=[], empty_spaces=bucket_capacity, id=bucket_number)
                ))
                bucket_number += 1 #Increments bucket number for each bucket created
            
            #Creates new directory
            new_directory = Directory(global_depth=directory.global_depth + 1, directory_records=new_directory_records)

            
            for directory_record in directory.directory_records:
                hashkey1 = '0' + hash_function(directory_record.hash_prefix) #Further splits the directory values into 0's and 1's
                hashkey2 = '1' + hash_function(directory_record.hash_prefix)
                new_index1 = int(hashkey1[-new_directory.global_depth:], 2) #Ensures that the new directory records are placed properly
                new_index2 = int(hashkey2[-new_directory.global_depth:], 2)

                #Assigns the bucket to new directory entries
                new_directory.directory_records[new_index1].value = directory_record.value
                new_directory.directory_records[new_index2].value = directory_record.value

            #Updates directory
            directory = new_directory
             #Reinserts items into the buckets
            for item in temp_memory:
                insert(item)
    print_directory()

    
def print_directory():
    global directory
    print("Directory State:")
    for record in directory.directory_records:
        print(f"Binary: {record.hash_prefix}, Bucket ID: {record.value.id}, Local Depth: {record.value.local_depth}, Values: {record.value.index}")
    print()    
    

# Inserting values and printing the directory state
#for value in values:
insert([16])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []



In [2]:
insert([22])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [22]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []



In [3]:
insert([26])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [22], [26]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []



In [4]:
insert([20])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: []
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: []

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: []

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: []

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: []

Directory State:
Binary: 0, Bucket ID: 1, Local De

In [5]:
insert([3])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: []
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: [[3]]



In [6]:
insert([1])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: [[3]]



In [7]:
insert([12])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: [[3]]



In [8]:
insert([11])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: [[3], [11]]



In [9]:
insert([13])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: [[3], [11]]



In [10]:
insert([19])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: [[3], [11], [19]]



In [11]:
insert([38])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26], [38]]
Binary: 3, Bucket ID: 5, Local Depth: 1, Values: [[3], [11], [19]]



In [12]:
insert([47])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26], [38]]
Binary: 3, Bucket ID: 5, Local Depth: 2, Values: [[3]]

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26], [38]]
Binary: 3, Bucket ID: 5, Local Depth: 2, Values: [[3], [11]]

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26], [38]]
Binary: 3, Bucket ID: 5, Local Depth: 2, Values: [[3], [11], [19]]

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, L

In [13]:
insert([46])

Directory State:
Binary: 0, Bucket ID: 1, Local Depth: 1, Values: [[16], [20], [12]]
Binary: 1, Bucket ID: 2, Local Depth: 1, Values: [[1], [13]]
Binary: 2, Bucket ID: 4, Local Depth: 1, Values: [[22], [26], [38]]
Binary: 3, Bucket ID: 5, Local Depth: 2, Values: [[3], [11], [19]]
Binary: 4, Bucket ID: 11, Local Depth: 1, Values: []
Binary: 5, Bucket ID: 12, Local Depth: 1, Values: []
Binary: 6, Bucket ID: 13, Local Depth: 1, Values: [[46]]
Binary: 7, Bucket ID: 14, Local Depth: 1, Values: [[47]]



In [14]:
#When expanding the directory, I can't seem to successfully store and clear all
# of the values in the buckets, as seen in insert 47, only the contents of bucket 4 were 
#cleared and stored

In [15]:
import numpy as np
import heapq
import os

def generate_data(num_pages, page_size, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    for i in range(num_pages):
        data = np.random.randint(0, 100, page_size)
        np.save(os.path.join(output_dir, f'page_{i}.npy'), data)
        print(f'Generated page {i}: {data}')

def sort_pages(input_dir, num_pages):
    for i in range(num_pages):
        data = np.load(os.path.join(input_dir, f'page_{i}.npy'))
        sorted_data = np.sort(data)
        np.save(os.path.join(input_dir, f'sorted_page_{i}.npy'), sorted_data)
        print(f'Sorted page {i}: {sorted_data}')

def merge_pages(input_dir, num_pages, buffer_size, output_file):
    open_files = [open(os.path.join(input_dir, f'sorted_page_{i}.npy'), 'rb') for i in range(num_pages)]
    with open(output_file, 'wb') as out_file:
        buffers = [np.load(f) for f in open_files]
        min_heap = []
        for i, buffer in enumerate(buffers):
            if buffer.size > 0:
                heapq.heappush(min_heap, (buffer[0], i, 0))
        
        while min_heap:
            val, buffer_index, element_index = heapq.heappop(min_heap)
            out_file.write(val.tobytes())
            next_element_index = element_index + 1
            if next_element_index < buffers[buffer_index].size:
                heapq.heappush(min_heap, (buffers[buffer_index][next_element_index], buffer_index, next_element_index))

def external_merge_sort(input_dir, num_pages, page_size, buffer_size, output_file):
    sort_pages(input_dir, num_pages)
    merge_pages(input_dir, num_pages, buffer_size, output_file)

def display_data(file_path):
    data = np.fromfile(file_path, dtype=np.int32)
    print(f'Data in {file_path}: {data}')

# Example usage
output_dir = 'data'
generate_data(num_pages=5, page_size=10, output_dir=output_dir)
external_merge_sort(input_dir=output_dir, num_pages=5, page_size=10, buffer_size=10, output_file='sorted_data.bin')

# Display final merged data
display_data('sorted_data.bin')

Generated page 0: [90 63 60 27  1  0 80 42 57 73]
Generated page 1: [46 92 33 45 10 94 56 47 45 22]
Generated page 2: [84 37 49  0 76 61 47 77  4 49]
Generated page 3: [43 35 60 60  5 10 98 52 94 64]
Generated page 4: [83 52 56 95  3  8 78 45 98  5]
Sorted page 0: [ 0  1 27 42 57 60 63 73 80 90]
Sorted page 1: [10 22 33 45 45 46 47 56 92 94]
Sorted page 2: [ 0  4 37 47 49 49 61 76 77 84]
Sorted page 3: [ 5 10 35 43 52 60 60 64 94 98]
Sorted page 4: [ 3  5  8 45 52 56 78 83 95 98]
Data in sorted_data.bin: [ 0  0  1  3  4  5  5  8 10 10 22 27 33 35 37 42 43 45 45 45 46 47 47 49
 49 52 52 56 56 57 60 60 60 61 63 64 73 76 77 78 80 83 84 90 92 94 94 95
 98 98]
