In [10]:
import time


def costly_compute(data, column, delay=2):
    """Emulate a costly function by sleeping and returning a column.

    Parameters
    ----------
    data : 2-D array
        Array indexed as ``data[row, column]`` (e.g. a NumPy ndarray).
    column : int
        Index of the column to extract.
    delay : float, optional
        Number of seconds to sleep to simulate the expensive work.
        Defaults to 2.

    Returns
    -------
    The one-dimensional slice ``data[:, column]``.
    """
    time.sleep(delay)
    # ``data[column]`` on a 2-D array would select a *row*; slice along the
    # first axis so that the requested column is returned.
    return data[:, column]


def data_processing_mean(data, column):
    """Return the mean of the values given by ``costly_compute(data, column)``.

    The selection is delegated to ``costly_compute``; this function only
    reduces its result with ``.mean()``.
    """
    selected = costly_compute(data, column)
    return selected.mean()


Create some data using a fixed random seed, so that the data are deterministic across Python sessions.



In [11]:
import numpy as np
# Seeded generator: the same seed yields the same draws on every run.
rng = np.random.RandomState(seed=42)
# 10,000 rows by 4 columns of standard-normal samples.
data = rng.randn(10000, 4)

In [23]:
# Time the sequential run. perf_counter() is a monotonic, high-resolution
# clock, so the measured interval cannot be skewed by system clock
# adjustments the way time.time() can.
start = time.perf_counter()
# One call per column index, executed one after the other.
results = [data_processing_mean(data, col) for col in range(data.shape[1])]
stop = time.perf_counter()

print('\nSequential processing')
print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))


Sequential processing
Elapsed time for the entire processing: 8.02 s


In [17]:
from joblib import Memory

# Directory handed to joblib.Memory as the place where it stores its cache.
location = './cachedir'
# verbose=0 is passed to keep joblib's own logging quiet.
memory = Memory(location, verbose=0)
# Wrap costly_compute so that calls go through `memory`'s cache.
costly_compute_cached = memory.cache(costly_compute)

Define `data_processing_mean_using_cache`, which benefits from the cache by calling `costly_compute_cached`.

In [18]:
def data_processing_mean_using_cache(data, column):
    """Return the mean of the values given by ``costly_compute_cached``.

    Same reduction as ``data_processing_mean``, except that the values are
    obtained through the cached wrapper ``costly_compute_cached``.
    """
    cached_values = costly_compute_cached(data, column)
    return cached_values.mean()


Execute the processing in parallel, caching the intermediate results.

In [22]:
from joblib import Parallel, delayed

# Time the first parallel run. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by system
# clock adjustments the way time.time() can.
start = time.perf_counter()
# Dispatch one task per column index across two workers.
results = Parallel(n_jobs=2)(
    delayed(data_processing_mean_using_cache)(data, col)
    for col in range(data.shape[1]))
stop = time.perf_counter()

print('\nFirst round - caching the data')
print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))


First round - caching the data
Elapsed time for the entire processing: 4.04 s


# 2 workers = 2X faster

Because the columns are processed in parallel by two workers instead of sequentially, the total run time is roughly halved.

Now the same processing is executed again; the intermediate results obtained by calling `costly_compute_cached` will be loaded from the cache instead of being recomputed.

In [24]:
# Time the second parallel run. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by system
# clock adjustments the way time.time() can.
start = time.perf_counter()
# Same dispatch as the first round: one task per column index, two workers.
results = Parallel(n_jobs=2)(
    delayed(data_processing_mean_using_cache)(data, col)
    for col in range(data.shape[1]))
stop = time.perf_counter()

print('\nSecond round - reloading from the cache')
print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))


Second round - reloading from the cache
Elapsed time for the entire processing: 4.34 s
