11.1 You are given two sorted arrays, A and B, where A has a large enough buffer at
the end to hold B. Write a method to merge B into A in sorted order.

In [72]:
def merge(A, B, A_size):
    """Merge sorted list B into sorted list A in place.

    A holds its ``A_size`` sorted values at the front, followed by buffer
    slots. After the call the first ``A_size + len(B)`` slots of A hold all
    values of A and B in sorted order. Buffer slots beyond that position are
    left untouched, so an over-sized buffer is fine.

    Args:
        A: list with ``A_size`` sorted values followed by buffer slots.
        B: sorted list to merge into A.
        A_size: number of real (non-buffer) values at the front of A.

    Raises:
        ValueError: if A is too short to hold ``A_size + len(B)`` values.
    """
    merged_size = A_size + len(B)
    if merged_size > len(A):
        raise ValueError("A does not have enough buffer space to hold B")

    a_idx, b_idx = A_size - 1, len(B) - 1
    # Fill from the last slot of the merged region (not the last slot of A),
    # so nothing is misplaced when the buffer is larger than len(B).
    i = merged_size - 1

    # Work backwards so that unread values of A are never overwritten.
    while a_idx >= 0 and b_idx >= 0:
        if A[a_idx] > B[b_idx]:
            A[i] = A[a_idx]
            a_idx -= 1
        else:
            # On ties B's value is placed later, which keeps the merge stable.
            A[i] = B[b_idx]
            b_idx -= 1
        i -= 1

    # Handle remaining elements of B. At this point they are guaranteed
    # to be smaller than those handled in A. Remaining elements of A are
    # already in their final positions.
    while b_idx >= 0:
        A[i] = B[b_idx]
        b_idx -= 1
        i -= 1

def test_merge(A, B):
    """Merge B into a padded copy of A and assert the result is sorted.

    The copy of A is extended with one ``None`` buffer slot per element
    of B before being handed to ``merge``.
    """
    padded = list(A)
    padded.extend([None] * len(B))
    merge(padded, B, len(A))
    # Every adjacent pair must be in non-decreasing order.
    for left, right in zip(padded, padded[1:]):
        assert left <= right

# Values of A and B interleave
test_merge([1, 3, 5, 7, 9], [2, 4, 6, 8, 10])
# Every element of B is larger than every element of A
test_merge([1, 2, 3], [4, 5, 6])
# Every element of B is smaller than every element of A
test_merge([4, 5, 6], [1, 2, 3])
# Empty A
test_merge([], [1, 2, 3])
# Empty B
test_merge([1, 2, 3], [])

11.2 Write a method to sort an array of strings so that all the anagrams are next to each
other.

In [66]:
from collections import defaultdict, deque

# Complexity of this is O(nm) where m is the string length
# and n is the number of strings
def sort_anagrams(A):
    """Return the strings of A reordered so that anagrams are adjacent.

    Strings are grouped by their character counts. Groups are emitted in
    the order in which their first member appears in A, and strings keep
    their relative order within a group. Any characters are supported
    (not only ascii a-z).

    Args:
        A: iterable of strings.

    Returns:
        A new list with the same strings, anagrams next to each other.
    """
    def representation(s):
        # Two strings are anagrams exactly when their character counts are
        # equal. A frozenset of (char, count) pairs is a hashable form of
        # those counts and is built in O(len(s)) expected time.
        counts = {}
        for c in s:
            counts[c] = counts.get(c, 0) + 1
        return frozenset(counts.items())

    # Only need to group anagrams together. There is no specified order
    # between different anagrams
    groups = defaultdict(list)
    for a in A:
        groups[representation(a)].append(a)

    return [word for word_list in groups.values() for word in word_list]

# Example run: anagram groups come out in order of first appearance
sort_anagrams(["elvis", "silent", "admirer", "lives", "listen", "married"])

['elvis', 'lives', 'silent', 'listen', 'admirer', 'married']

11.3 Given a sorted array of n integers that has been rotated an unknown number of
times, write code to find an element in the array. You may assume that the array
was originally sorted in increasing order

In [61]:
def find(A, x):
    """Return an index of x in the rotated sorted array A, or -1 if absent.

    A is assumed to be sorted in increasing order and then rotated; it may
    contain duplicates. When x occurs several times, any one of its indices
    may be returned.

    The search is a binary search that falls back to exploring both halves
    only when duplicates make it impossible to tell which half is ordered.
    """
    def _find(low, high):
        if low > high:
            return -1

        mid = low + (high - low) // 2
        if A[mid] == x:
            return mid

        if A[low] < A[mid]:
            # Left part is ordered correctly
            if A[low] <= x <= A[mid]:
                return _find(low, mid - 1)
            return _find(mid + 1, high)

        if A[low] > A[mid]:
            # Left part contains the switching point so right
            # must be ordered correctly.
            if A[mid] <= x <= A[high]:
                return _find(mid + 1, high)
            return _find(low, mid - 1)

        # A[low] == A[mid]: repeats exist in either left or right side
        if A[high] != A[mid]:
            # The right side is not all repeats, so the left side must be
            # (it equals A[mid] != x throughout); only the right can hold x.
            return _find(mid + 1, high)

        # Both ends equal the middle: have to look in both sides
        left_result = _find(low, mid - 1)
        if left_result != -1:
            return left_result
        return _find(mid + 1, high)

    return _find(0, len(A) - 1)

# No rotation and no duplicates: every value sits at its own index
A1 = [0, 1, 2, 3, 4, 5]
assert all([find(A1, i) == i for i in range(len(A1))])

# Rotated array where the smallest value is repeated at both ends
A2 = [1, 2, 3, 1, 1, 1, 1]
assert find(A2, 2) == 1
assert find(A2, 3) == 2
# 1 occurs several times; any of its indices is acceptable
assert find(A2, 1) in [0, 3, 4, 5, 6]
assert find(A2, 4) == -1

# Rotated array where the largest value is repeated at both ends
A3 = [3, 3, 3, 3, 1, 2, 3]
assert find(A3, 1) == 4
assert find(A3, 2) == 5

# Empty input and all-equal input without the target
assert find([], 1) == -1
assert find([1, 1, 1, 1, 1], 2) == -1

11.4 Imagine you have a 20 GB file with one string per line. Explain how you would
sort the file

In [21]:
from heapq import heapify, heappush, heappop


def sort_big_file(big_file, chunk_size):
    """Sort big_file in place with a simulated external merge sort.

    ``big_file`` is a list standing in for a file with one string per line.
    It is read ``chunk_size`` lines at a time; each chunk is sorted and kept
    as its own "temp file" (a list). The sorted temp files are then k-way
    merged back into ``big_file`` using a heap.

    Args:
        big_file: mutable sequence of lines, sorted in place.
        chunk_size: number of lines that fit in memory at once (>= 1).

    Raises:
        ValueError: if chunk_size is less than 1 (the chunking loop could
            otherwise never advance).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Phase 1: load the file part by part, sort each part and write it to a
    # small temp file. Here we keep a list for each chunk instead.
    temp_files = [
        sorted(big_file[start:start + chunk_size])
        for start in range(0, len(big_file), chunk_size)
    ]

    # Phase 2: merge all temp files.

    # Keep a pointer to the current position in each temp file
    temp_file_pointers = [0] * len(temp_files)

    def read_next(tf_idx):
        """Return the next unread line of a temp file, or None if exhausted."""
        fp = temp_file_pointers[tf_idx]
        if fp >= len(temp_files[tf_idx]):
            return None
        temp_file_pointers[tf_idx] = fp + 1
        return temp_files[tf_idx][fp]

    # Construct a heap of the first line in each temp file. Every temp file
    # holds at least one line, so no None ends up on the heap here.
    h = [(read_next(tf_idx), tf_idx) for tf_idx in range(len(temp_files))]
    heapify(h)

    file_pointer = 0
    while h:
        # Get the smallest entry from the heap
        line, tf_idx = heappop(h)

        # Write this line back to the big file. (Should be buffered writes)
        big_file[file_pointer] = line
        file_pointer += 1

        # Push the next line from the temp file that the current line came
        # from to the heap
        next_line = read_next(tf_idx)
        if next_line is not None:
            heappush(h, (next_line, tf_idx))
        
        
# Use few temp files
def sort_big_file_unfinished(bf, chunk_size, num_lines, num_temp_files):
    """Unfinished external sort that uses a fixed number of temp files.

    Sorted chunks of ``chunk_size`` lines from ``bf`` are appended round-robin
    to ``num_temp_files`` lists that stand in for temp files.

    NOTE(review): the merge phase is incomplete. The outer ``while True`` loop
    contains no ``break`` or ``return``, and popped lines are never written
    anywhere (see the TODO at the bottom of the function).
    """
    temp_files = [[] for _ in range(num_temp_files)] # Should preallocate space but python
    temp_file_idx = 0
    file_pointer = 0
    while file_pointer < num_lines:
        # Load part of the file
        chunk = bf[file_pointer:file_pointer+chunk_size]
        
        # Sort it and write back to the end of current temp file
        temp_files[temp_file_idx].extend(sorted(chunk))
        
        # Step up file pointer to next part of the file
        file_pointer += chunk_size
        
        # Use next temp_file for next chunk
        temp_file_idx = (temp_file_idx + 1) % num_temp_files
    
    # Now start the merge process
    # One read position per temp file
    temp_file_pointers = [0 for _ in range(num_temp_files)]
    def read_next(tf_idx):
        # Returns the next line of the temp file, or None once it is exhausted.
        # The pointer is advanced even when the temp file is exhausted.
        fp = temp_file_pointers[tf_idx]
        temp_file_pointers[tf_idx] += 1
        if fp >= len(temp_files[tf_idx]):
            return None
        return temp_files[tf_idx][fp]
        
    while True:
        h = []
        hpush = lambda l, i: heappush(h, (l, i))
        hpop = lambda: heappop(h)
        
        # Place the first entries of each sorted chunk in the temp files on a heap
        # NOTE(review): read_next may return None here, which is pushed unchecked
        for tf_idx in range(num_temp_files):
            line = read_next(tf_idx)
            hpush(line, tf_idx)
        
        while h:
            line, tf_idx = hpop()
            # A pointer that is a multiple of chunk_size marks a chunk boundary
            if temp_file_pointers[tf_idx] % chunk_size != 0:
                # This chunk is not depleted so we add the next line from 
                # the chunk we got the current line from to the heap
                line = read_next(tf_idx)
                # NOTE(review): truthiness test also skips empty-string lines;
                # `is not None` may be what is intended
                if line:
                    hpush(line, tf_idx)
            
                # TODO: where to write this line now? back to big file?
                # Need to recursively handle the sorted runs of num_temp_files * chunk_size
    
    
# Basically merge sort; it would probably be slow with actual IO since there is a lot of it
def sort_big_file_mergesort(big_file, chunk_size):
    """Sort big_file in place with a top-down merge sort.

    Ranges of at most ``chunk_size`` lines are sorted directly (they are
    taken to fit in memory); larger ranges are split in half, sorted
    recursively and merged through a temporary list.
    """
    def merge(low, mid, high):
        # Merge the sorted runs [low, mid] and [mid+1, high].
        merged = []
        left, right = low, mid + 1
        while left <= mid and right <= high:
            if big_file[left] <= big_file[right]:
                merged.append(big_file[left])
                left += 1
            else:
                merged.append(big_file[right])
                right += 1
        # At most one of the two runs still has lines left.
        merged.extend(big_file[left:mid + 1])
        merged.extend(big_file[right:high + 1])
        # Write the merged run back over the range it came from.
        big_file[low:high + 1] = merged

    def big_merge_sort(low, high):
        if high - low + 1 > chunk_size:
            mid = low + (high - low) // 2
            big_merge_sort(low, mid)
            big_merge_sort(mid + 1, high)
            merge(low, mid, high)
            return
        # Small enough to sort in one go.
        big_file[low:high + 1] = sorted(big_file[low:high + 1])

    big_merge_sort(0, len(big_file) - 1)

    
LINE_LENGTH = 80
NUM_LINES = 10000
CHUNK_SIZE = 1000
NUM_TEMP_FILES = 3

# Simulate a big file
import random
import string
alphabet = string.ascii_letters + string.digits


def random_string():
    """Return a random line of LINE_LENGTH characters drawn from alphabet."""
    chars = (random.choice(alphabet) for _ in range(LINE_LENGTH))
    return "".join(chars)


def generate_big_file():
    """Return a list of NUM_LINES random lines standing in for a big file."""
    return [random_string() for _ in range(NUM_LINES)]

def test_sort(bf):
    """Assert that the first NUM_LINES entries of bf are in sorted order.

    NOTE(review): the range comes from the global NUM_LINES rather than
    len(bf), so bf is expected to hold exactly NUM_LINES lines.
    """
    for idx in range(NUM_LINES - 1):
        assert bf[idx] <= bf[idx + 1]

# Heap-based external sort on a fresh simulated file
big_file1 = generate_big_file()
sort_big_file(big_file1, CHUNK_SIZE)
test_sort(big_file1)
del big_file1

# Merge-sort variant on another fresh simulated file
big_file2 = generate_big_file()
sort_big_file_mergesort(big_file2, CHUNK_SIZE)
test_sort(big_file2)
del big_file2

# Don't let the big files be saved in this notebook
import gc
gc.collect()

9

11.5 Given a sorted array of strings which is interspersed with empty strings, write a
method to find the location of a given string

In [1]:
def find_interspersed(A, x):
    """Return an index of x in A, or -1 if x is not present.

    A is a sorted list of strings interspersed with empty strings. This is
    a binary search that, whenever the midpoint is an empty string (and x
    is not the empty string found there), searches the left half first and
    then the right half.
    """
    def search(lo, hi):
        if lo > hi:
            return -1

        middle = lo + (hi - lo) // 2
        probe = A[middle]
        if probe == x:
            return middle

        if probe != "":
            # Regular binary-search step on a non-empty midpoint.
            if x < probe:
                return search(lo, middle - 1)
            return search(middle + 1, hi)

        # An empty midpoint gives no ordering information: try both halves.
        hit = search(lo, middle - 1)
        return search(middle + 1, hi) if hit == -1 else hit

    return search(0, len(A) - 1)

def find_interspersed2(A, x):
    """Return the index of x in A, or -1 if x is not present.

    A is a sorted list of strings interspersed with empty strings. This is
    a binary search in which an empty midpoint is replaced by the nearest
    non-empty entry inside the current range before comparing.

    Empty strings are treated purely as gaps, so searching for "" returns -1.
    """
    def find(low, high):
        if low > high:
            return -1

        mid = low + (high - low) // 2
        if A[mid] == "":
            # Find the first non empty to left or right and set mid to it.
            # Candidates must stay inside [low, high], and it is the entry
            # at the index (not the index itself) that must be non-empty.
            left, right = mid - 1, mid + 1
            while True:
                if left < low and right > high:
                    # The whole range consists of empty strings
                    return -1
                if left >= low and A[left] != "":
                    mid = left
                    break
                if right <= high and A[right] != "":
                    mid = right
                    break
                left -= 1
                right += 1

        if A[mid] == x:
            return mid
        elif x < A[mid]:
            return find(low, mid - 1)
        else:
            return find(mid + 1, high)

    return find(0, len(A) - 1)
        
# Sorted strings with runs of empty strings between them
A = ["at", "", "", "", "ball", "", "", "car", "dad", "", ""]
# Both implementations must report the same positions
assert find_interspersed(A, "ball") == 4
assert find_interspersed(A, "car") == 7
assert find_interspersed2(A, "ball") == 4
assert find_interspersed2(A, "car") == 7

11.6 Given an M x N matrix in which each row and each column is sorted in ascending
order, write a method to find an element

In [25]:
def find_in_sorted_matrix(A, x):
    """Find x in a matrix whose rows and columns are each sorted ascending.

    Uses a staircase search starting at the top-right corner: the current
    value is the largest of its row (to the left) and the smallest of its
    column (downwards), so each comparison discards a whole row or column.
    Runs in O(M + N) for an M x N matrix.

    Only row-wise and column-wise ordering is required; a row's last value
    may be larger than the next row's first value.

    Args:
        A: list of equally long rows.
        x: value to look for.

    Returns:
        A ``(row, column)`` tuple for one occurrence of x, or None if x is
        not in the matrix (including when the matrix is empty).
    """
    if not A or not A[0]:
        return None

    row, col = 0, len(A[0]) - 1
    while row < len(A) and col >= 0:
        value = A[row][col]
        if value == x:
            return (row, col)
        if value > x:
            # Everything below in this column is even larger
            col -= 1
        else:
            # Everything to the left in this row is even smaller
            row += 1

    return None

# 5 x 3 example in which every row and every column is ascending
A = [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9],
     [10, 11, 12],
     [13, 14, 15]]

# Present values are reported as (row, column); a missing value gives None
assert find_in_sorted_matrix(A, 3) == (0, 2)
assert find_in_sorted_matrix(A, 10) == (3, 0)
assert find_in_sorted_matrix(A, 16) == None

11.7 A circus is designing a tower routine consisting of people standing atop one
another's shoulders. For practical and aesthetic reasons, each person must be
both shorter and lighter than the person below him or her. Given the heights
and weights of each person in the circus, write a method to compute the largest
possible number of people in such a tower

In [49]:
def circus_stunt(people):
    """Return the largest number of people that can form a tower.

    Each person is a ``(height, weight)`` pair, and a person may only stand
    on someone who is both strictly taller and strictly heavier.

    The people are sorted by height (then weight), after which the answer is
    the longest subsequence that is strictly increasing in both height and
    weight. This is the O(n^2) dynamic program; an O(n log n) solution
    exists for longest increasing subsequence.

    Args:
        people: iterable of ``(height, weight)`` pairs. It is not modified.

    Returns:
        The size of the tallest possible tower; 0 for no people.
    """
    ordered = sorted(people)  # Sorts on height first, then weight

    # dp[i] is the size of the largest tower with ordered[i] at the bottom.
    dp = []
    for i, (height, weight) in enumerate(ordered):
        best_above = max(
            (dp[j] for j in range(i)
             if ordered[j][0] < height and ordered[j][1] < weight),
            default=0,
        )
        dp.append(1 + best_above)

    # The best tower may have anyone at the bottom, not just the last
    # person in sorted order.
    return max(dp, default=0)
 
# (height, weight) pairs with ties in height and in weight; neither a shared
# height nor a shared weight may be stacked
few_people = [
    (168, 65),
    (168, 70),
    (183, 70),
    (183, 80),
    (185, 80)
]

assert circus_stunt(few_people) == 3 # (168, 65) on (183, 70) on (185, 80)
    
# Larger run on 1000 people with normally distributed heights and weights
import random
people = [(random.gauss(175, 12), random.gauss(75, 10)) for _ in range(1000)]
circus_stunt(people)

40

11.8 Imagine you are reading in a stream of integers. Periodically, you wish to be able
to look up the rank of a number x (the number of values less than or equal to x).
Implement the data structures and algorithms to support these operations. That
is, implement the method track(int x), which is called when each number
is generated, and the method getRankOfNumber(int x), which returns the
number of values less than or equal to x (not including x itself).