In [62]:
counts = []
import numpy as np
import pandas as pd
import urllib
for line in urllib.request.urlopen("http://benschmidt.org/sample_page_counts.txt").read().decode('utf-8').split("\n"):
    try:
        counts.append(np.array(list(map(int, line.rstrip().split(',')))))
    except:
        continue
    

In [63]:
len(counts)

5723

In [47]:
def current_scheme(pagecounts, chunk_target, overflow_strategy):
    cumsums = np.cumsum(pagecounts)
    breaks = np.zeros(cumsums.shape[0], np.int)

    # Store the start of chunks in an array.

    # 1-index the chunk names. 
    breaks[0] = 1

    
    # Last entry gives number of words
    ntokens = cumsums[-1]
    n_chunks = int(int((ntokens) / chunk_target))
    if n_chunks == 0:
        breaks[0] = 1
        return breaks
    # Use actual page counts not including zeros

    overflow = (ntokens % chunk_target)

    if overflow > chunk_target/2:
        overflow -= chunk_target
        n_chunks +=1

    # variable; how far do we want the next one to go?
    if overflow_strategy == "ends":
        target = chunk_target + overflow/2 # + avg_page_n/2
    elif overflow_strategy == "last":
        target = chunk_target
    elif overflow_strategy == "even":
        chunk_target += overflow / n_chunks
        target = chunk_target

    # Proportion of chunk_target that the length adjustment should cap at
    max_adjust = .1 * chunk_target
    # When the remaining words per chunk is higher/lower that x proportion
    #  of the chunk_target, add/remove a chunk.
    new_chunk_threshold = .4 * chunk_target

    i = 1
    while True:
        remaining_chunks = n_chunks - i
        if not remaining_chunks:
            break

        last_page = np.argmin(np.abs(cumsums - target))
        if last_page + 1 >= len(breaks):
            break

        breaks[last_page+1] = 1

        # Remainder adjust - nudge next section slightly, to try to balance
        # out consistently under or oversized parts.
        remaining_nwords = (cumsums[-1] - cumsums[last_page])
        remaining_word_per_chunk_diff = (remaining_nwords / remaining_chunks) - chunk_target
        if abs(remaining_word_per_chunk_diff) > new_chunk_threshold:
            n_chunks += np.sign(remaining_word_per_chunk_diff)

        if overflow_strategy == 'even':
            adjust = remaining_word_per_chunk_diff
        else:
            # Adjust slightly - allowing more adjustment early
            adjust = (0.5+0.5*remaining_chunks/n_chunks) * remaining_word_per_chunk_diff
        if np.abs(adjust) > max_adjust:
            adjust = max_adjust * np.sign(adjust)
        target = chunk_target + cumsums[last_page] + adjust
        i += 1

    return np.cumsum(breaks)



In [79]:
def end_chunks(page_counts, target, even = False, two_sided = True, procrastinate = False):
    
    # Register front and back offsets
    position = [0, len(page_counts)]

    breaks = np.zeros(page_counts.shape[0])
    breaks[0] = 1
    
    while True:
        forward = np.cumsum(page_counts)
        backward = np.cumsum(np.flip(page_counts))
        
        words_left = forward[-1]
        # Exit conditions
        if words_left < (target * 1.5):
            break
            
        overflow = words_left % target
    
        if (target - overflow) < overflow:
            overflow = -(target - overflow)
            
        if even:
            chunks_remaining = np.round(words_left/target)
            if chunks_remaining > 2 and two_sided:
                # The share belonging here
                overflow = overflow * 2 / chunks_remaining
            if (chunks_remaining > 1) and (two_sided == False):
                overflow = overflow/chunks_remaining
        # Split the overflow across the ends
        if two_sided:
            loc_target = target + overflow/2
        else:
            loc_target = target + overflow
        if procrastinate:
            # No overflow handling
            loc_target = target
            
        #What is this number supposed to be?    
        if two_sided and words_left < (target * 2.5):
            midpoint = np.argmin(np.abs(forward - words_left/2))
            breaks[midpoint + position[0] + 1]  = 1
            break

        best_front = np.argmin(np.abs(forward - loc_target))
        position[0] = position[0] + best_front + 1
        breaks[position[0]] = 1
        
        if two_sided:
            best_back = np.argmin(np.abs(backward - loc_target))
            position[1] = position[1] - best_back - 1
            breaks[position[1]] = 1
            new_end = page_counts.shape[0] - best_back - 1
        else:
            # Leave the back for later.
            new_end = page_counts.shape[0] + 1
            
        page_counts = page_counts[(best_front + 1):(new_end)]
        
    return np.cumsum(breaks)


In [49]:
def dumb_chunks(page_counts, target = 10000):
    cumsums = np.cumsum(page_counts)
    n_chunks = np.round(cumsums[-1]/target)
    chunk_size = (cumsums[-1]) // n_chunks + 1
    return cumsums // chunk_size

In [50]:


def test_algorithm(algorithm, print_every = False, **kwargs):
    forward = [[] for i in range(3)]
    backward = [[] for i in range(3)]
    mids = []
    centers = []
    all = []


    work_bias = []

    target = 10000
    vals = []
    for i, p in enumerate(counts):
        if print_every and i % print_every == 0:
            print(i)
        try:
            chunks = algorithm(p, **kwargs)
        except:
            raise
        v = pd.DataFrame({'chunk':chunks, "words": p}).groupby('chunk')['words']\
        .agg(['sum', 'count'])['sum']
        f = pd.DataFrame({"error": v.values - target})
        f['chunk'] = range(f.shape[0])
        f['book'] = i
        if len(f) < 3:
            f['which'] = "short"
        else:
            f["which"] = ["first"] + ["mid"] * (f.shape[0] - 2) + ["last"]
        vals.append(f)
    return pd.concat(vals, ignore_index = True)


# Current strategy

The current strategy isn't bad.

In [65]:
x = test_algorithm(current_scheme, chunk_target = 10000, overflow_strategy="even", print_every = 1000)
x.groupby('which')['error'].agg(['mean', 'std', 'min', 'max'])

0
1000
2000
3000
4000
5000


Unnamed: 0_level_0,mean,std,min,max
which,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
first,-2.386926,461.76995,-2378,10601
last,-1.398985,461.840757,-4733,3049
mid,-2.090569,323.311152,-2916,2830
short,-3221.590183,4700.954205,-9999,4977


### Variation within books

I think this is really important; the current method generally hovers in on an average for any individual book that may not be target chunk size. While overall stats are good, each book is typically 400 words of average chunks off the target, high or low.

In [66]:
x[x.which=='mid'].groupby('book')['error'].agg('mean').reset_index()['error'].agg(['mean', 'max', 'min', 'std'])

mean      -5.294905
max     1775.000000
min    -2330.000000
std      411.674475
Name: error, dtype: float64

# Two-sided end-clobbering

This produces slightly lower std deviations on the main class at the cost of significantly higher in first and last. 

In [67]:
x = test_algorithm(end_chunks, target = 10000, even = False, two_sided = True, procrastinate = False, print_every = 1000)
x.groupby('which')['error'].agg(['mean', 'std', 'min', 'max'])


0
1000
2000
3000
4000
5000


Unnamed: 0_level_0,mean,std,min,max
which,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
first,-13.892744,1454.993077,-2885,10601
last,-16.773218,1447.925811,-2933,3049
mid,0.271844,278.030147,-2999,3893
short,-989.549317,3066.104741,-9977,4977


### Much less deviation *within* books. 

While the overall stats are similar, this method is much better in that the average deviation by book is much more tightly constrained. Each book's mean error std of 51 words off the target, compared to 414 to the current method.

In [69]:
x[x.which=='mid'].groupby('book')['error'].agg('mean').reset_index()['error'].agg(['mean', 'max', 'min', 'std'])

mean     -0.306053
max     587.000000
min    -686.000000
std      51.753604
Name: error, dtype: float64

### Slightly better even chunking

The stats for even chunking (allocating the remainder slowly) come out a little better than the current method. But the within-book means still show much higher variance.

In [71]:
x = test_algorithm(end_chunks, target = 10000, even = True, two_sided = True, procrastinate = False)
x.groupby('which')['error'].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0_level_0,mean,std,min,max
which,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
first,-2.386926,461.76995,-2378,10601
last,-1.115718,442.570489,-1966,3049
mid,-4.03898,319.339288,-2830,3893
short,-989.549317,3066.104741,-9977,4977


In [74]:
x[x.which=='mid'].groupby('book')['error'].agg('mean').reset_index()['error'].agg(['mean', 'max', 'min', 'std'])

mean     -19.233618
max     4921.000000
min    -5195.000000
std      906.224510
Name: error, dtype: float64

In [75]:
# Clobber the middle. A weird strategy
x = test_algorithm(end_chunks, target = 10000, even = False, two_sided = True, procrastinate = True)
x.groupby('which')['error'].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0_level_0,mean,std,min,max
which,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
first,1.646922,217.191828,-2378,10601
last,1.708695,168.076652,-2930,1265
mid,-7.127571,726.881872,-9981,4998
short,-989.549317,3066.104741,-9977,4977


### Here's a clobber the last strategy.

It gets extremely low stds on the first and mid chunks (~222)

In [80]:
# Clobber the last.
x = test_algorithm(end_chunks, target = 10000, even = False, two_sided = False, procrastinate = True)
x.groupby('which')['error'].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0_level_0,mean,std,min,max
which,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
first,1.646922,217.191828,-2378,10601
last,-46.68754,2891.640905,-5509,4999
mid,-1.527759,222.893664,-3565,2495
short,-989.549317,3224.958209,-9977,4979


In [81]:
x[x.which=='mid'].groupby('book')['error'].agg('mean').reset_index()['error'].agg(['mean', 'max', 'min', 'std'])

mean      1.789731
max     681.857143
min    -499.141892
std      58.569201
Name: error, dtype: float64

### A single-sided even strategy

This works quite well; maybe slightly better than the current single-sided even strategy.

In [None]:
x = test_algorithm(end_chunks, target = 10000, even = True, two_sided = False, procrastinate = False)
x.groupby('which')['error'].agg(['mean', 'std', 'min', 'max'])

In [None]:
x[x.which=='mid'].groupby('book')['error'].agg('mean').reset_index()['error'].agg(['mean', 'max', 'min', 'std'])