In [None]:
import glob
import os.path
import random
import pprint

import numpy as np

In [None]:
# Build 1000 synthetic (file size, file name) tuples: sizes follow a scaled half-normal
# distribution and names are six random letters drawn from 'abcdefg'.
# Seed the generator first so that Restart & Run All reproduces the same data.
random.seed(0)
file_list = [
    (round(10000 * abs(random.normalvariate(0, 1))), ''.join(random.choices('abcdefg', k=6)))
    for _ in range(1000)
]

In [None]:
# Preview the first ten (size, name) tuples.
file_list[:10]

In [None]:
# Sort in descending order (by size, then by name for equal sizes) so that the
# largest files come first.
sorted_file_list = sorted(file_list, reverse=True)

In [None]:
# The ten largest entries (the list is sorted in descending order).
sorted_file_list[:10]

In [None]:
# The ten smallest entries.
sorted_file_list[-10:]

In [None]:
# Number of bins the files will be distributed over.
bin_count = 24

In [None]:
# One record per bin: the running total weight and the (size, name) tuples assigned so far.
bin_list = [{'bin_weight': 0.0, 'bin_contents': []} for _ in range(bin_count)]

In [None]:
# Inspect the bin records.
bin_list

In [None]:
# Ideal weight of each bin if the total could be split perfectly evenly. Divide by
# bin_count rather than a repeated literal so this stays correct when bin_count changes.
target_bin_weight = np.round(np.sum([weight for weight, name in sorted_file_list]) / bin_count)
target_bin_weight

In [None]:
# Start from empty bins so that re-running this cell does not pack every file a second time.
for packed_bin in bin_list:
    packed_bin['bin_weight'] = 0.0
    packed_bin['bin_contents'].clear()

# Greedy packing, largest file first: bin_list is re-sorted by weight after every
# placement, so bin_list[0] is always the lightest bin.
for (weight, name) in sorted_file_list:
    bin_list[0]['bin_contents'].append((weight, name))
    bin_list[0]['bin_weight'] += weight
    bin_list.sort(key=lambda bin: bin['bin_weight'])

In [None]:
# Collect the total weight of every bin, in the current order of bin_list.
bin_weights = [packed_bin['bin_weight'] for packed_bin in bin_list]

In [None]:
# Display the per-bin total weights.
bin_weights

In [None]:
def pack_file_lists(file_list, bin_count):
    """
    Greedily distribute files over 'bin_count' bins so that the files in each bin have
    approximately the same total number of bytes.

    Files are visited from largest to smallest (equal sizes are ordered by comparing the
    paths) and each one is placed in the bin that is currently lightest.

    Arguments:
    file_list iterable of tuples (file size, file path)
    bin_count number of bins into which files will be packed; must be at least 1

    Returns:
    a list of 'bin_count' lists of (file size, file path) tuples, ordered from the lightest
    bin to the heaviest; a bin that received no file is an empty list

    Raises:
    ValueError if 'bin_count' is less than 1

    Side effects:
    prints the target bin weight and the minimum and maximum bin weights
    """
    if bin_count < 1:
        raise ValueError('bin_count must be at least 1, got {}'.format(bin_count))

    # Materialize once: the input is traversed twice (sum and sort), and a generator
    # would be silently exhausted after the first pass.
    file_list = list(file_list)

    target_bin_weight = np.round(np.sum([weight for weight, name in file_list]) / bin_count)
    print('the target bin weight is {:8.1f}'.format(target_bin_weight))

    packed_bins = [{'bin_weight': 0.0, 'bin_contents': []} for _ in range(bin_count)]

    for (weight, name) in sorted(file_list, reverse=True):
        # packed_bins is kept sorted by weight, so index 0 is always the lightest bin.
        lightest_bin = packed_bins[0]
        lightest_bin['bin_contents'].append((weight, name))
        lightest_bin['bin_weight'] += weight
        # list.sort is stable, so equally heavy bins keep their previous relative order.
        packed_bins.sort(key=lambda packed_bin: packed_bin['bin_weight'])

    print('minimum bin weight: {:8.1f}\nmaximum bin weight: {:8.1f}'.format(
        packed_bins[0]['bin_weight'], packed_bins[-1]['bin_weight']))

    return [packed_bin['bin_contents'] for packed_bin in packed_bins]

In [None]:
# Pack the synthetic file list into 24 bins with the function defined above.
packed_file_lists = pack_file_lists(file_list=file_list, bin_count=24)

In [None]:
def make_packed_file_lists(file_glob, file_list_count):
    """
    Find the regular files matching 'file_glob' and pack them into 'file_list_count'
    lists using 'pack_file_lists'.

    Arguments:
    file_glob       glob pattern, expanded recursively so that '**' matches nested
                    directories; for example,
                        file_glob = '/home/jklynch/host/project/**/*.py'
    file_list_count number of lists into which the matching files will be packed

    Returns:
    the result of 'pack_file_lists' for the (file size, file path) tuples of the matches

    Side effects:
    prints the number of matching files and the first five (file size, file path) tuples,
    in addition to whatever 'pack_file_lists' prints
    """
    # A recursive pattern can also match directories, and a dangling symlink would make
    # os.path.getsize raise; keep regular files only.
    file_paths = [fp for fp in glob.glob(file_glob, recursive=True) if os.path.isfile(fp)]
    print('found {} file paths with "{}"'.format(len(file_paths), file_glob))

    file_size_path_list = [(os.path.getsize(fp), fp) for fp in file_paths]
    print('first 5 files:\n{}'.format(pprint.pformat(file_size_path_list[:5])))

    return pack_file_lists(file_size_path_list, bin_count=file_list_count)

In [None]:
# Pack every Python file found under the project directory into 5 lists.
# NOTE(review): the glob is an absolute, user-specific path; adjust it before running
# this cell on another machine.
packed_python_file_lists = make_packed_file_lists(
    file_glob='/home/jklynch/host/project/**/*.py',
    file_list_count=5)

In [None]:
# Print the file paths list by list; the index restarts at 0 for every list and the
# file size in each tuple is not shown.
for packed_python_file_list in packed_python_file_lists:
    for i, (weight, fp) in enumerate(packed_python_file_list):
        print('{}: {}'.format(i, fp))