In [9]:
import numpy as np
import pandas as pd
import multiprocessing as mp

In [10]:
r = np.random.normal(0, .01, size=(1000, 10000))
num_threads = 24

In [11]:
def barrier_touch(r, width=.5):
    t = dict()
    p = np.log((1 + r).cumprod(axis=0))
    for j in range(r.shape[1]):
        for i in range(r.shape[0]):
            if p[i, j] >= width or p[i, j] <= -width:
                t[j] = i
                continue
    return t

In [12]:
%%time
parts = np.linspace(0, r.shape[0], min(num_threads, r.shape[0]) + 1)
parts = np.ceil(parts).astype(int)
jobs = []
for i  in range(1, len(parts)):
    jobs.append(r[:, parts[i-1]:parts[i]])
pool = mp.Pool(processes=num_threads)
out = []
outputs = pool.imap_unordered(barrier_touch, jobs)
for out_ in outputs:
    out.append(out_)
pool.close()
pool.join()

CPU times: user 38.1 ms, sys: 104 ms, total: 143 ms
Wall time: 365 ms


In [13]:
%%time
out = barrier_touch(r)

CPU times: user 4.39 s, sys: 90.3 ms, total: 4.48 s
Wall time: 4.48 s


In [14]:
def lin_parts(num_atoms, num_threads):
    parts = np.linspace(0, num_atoms, min(num_threads, num_atoms) + 1)
    parts = np.ceil(parts).astype(int)
    return parts

In [15]:
import pandas as pd
import numpy as np

def process_jobs(jobs):
    out = []
    for job in jobs:
        out_ = expand_call(job)
        out.append(out_)
    return out

def mp_pandas_obj(func, pd_obj, num_threads=24, mp_batches=1, lin_mols=True, **kwargs):
    num_atoms = len(pd_obj[1])
    if lin_mols:
        parts = lin_parts(num_atoms, num_threads * mp_batches)
    else:
        parts = nested_parts(num_atoms, num_threads * mp_batches)
    jobs = []
    for i in range(1, len(parts)):
        job = {pd_obj[0]: pd_obj[1][parts[i - 1]:parts[i]], 'func': func}
        job.update(kwargs)
        jobs.append(job)
    if num_threads == 1:
        out = process_jobs(jobs)
    else:
        out = process_jobs(jobs, num_threads=num_threads)
    
    if isinstance(out[0], pd.DataFrame):
        df0 = pd.DataFrame()
    elif isinstance(out[0], pd.Series):
        df0 = pd.Series()
    else:
        return out
    
    for i in out:
        df0 = df0.append(i)
    df0 = df0.sort_index()
    return df0

In [16]:
import multiprocessing as mp
import time
import datetime
import sys


def report_progress(job_num, num_jobs, time0, task):
    msg = [float(job_num) / num_jobs, (time.time() - time0) / 60.]
    msg.append(msg[1] * (1 / msg[0] - 1))
    time_stamp  = str(datetime.datetime.fromtimestamp(time.time()))
    msg = time_stamp + ' ' + str(round(msg[0] * 100, 2)) + '% ' + task + ' done after ' + \
        str(round(msg[1], 2)) + ' minutes. Remaining ' + str(round(mgs[2], 2)) + ' minutes.'
    if job_num < num_jobs:
        sys.stderr.write(msg + '\r')
    else:
        sys.stderr.write(msg + '\n')
        

def process_jobs(jobs, task=None, num_threads=24):
    if task is None:
        task = jobs[0]['func'].__name__
    pool = mp.Pool(processes=num_threads)
    
    outputs = pool.imap_unordered(expand_call, jobs)
    out = []
    time0 =  time.time()
    for i, out_ in enumerate(outputs, 1):
        out.append(out_)
        report_progress(i, len(jobs), time0, task)
    pool.close()
    pool.join()
    return out

# 20.1

In [17]:
from itertools import product

dict0 = {'a': ['1', '2'], 'b': ['+', '*'], 'c': ['!', '@']}
jobs = (dict(zip(dict0,i)) for i in product(*dict0.values()))

In [18]:
list(product(*dict0.values()))

[('1', '+', '!'),
 ('1', '+', '@'),
 ('1', '*', '!'),
 ('1', '*', '@'),
 ('2', '+', '!'),
 ('2', '+', '@'),
 ('2', '*', '!'),
 ('2', '*', '@')]

In [19]:
list(jobs)

[{'a': '1', 'b': '+', 'c': '!'},
 {'a': '1', 'b': '+', 'c': '@'},
 {'a': '1', 'b': '*', 'c': '!'},
 {'a': '1', 'b': '*', 'c': '@'},
 {'a': '2', 'b': '+', 'c': '!'},
 {'a': '2', 'b': '+', 'c': '@'},
 {'a': '2', 'b': '*', 'c': '!'},
 {'a': '2', 'b': '*', 'c': '@'}]