## In this notebook, I experiment with joblib parallelization, which will later be applied to the random forest algorithm.

In [5]:
import numpy as np
import time

In [6]:
# Seeded generator so the synthetic data set is identical on every run.
rng = np.random.RandomState(seed=42)
# 10,000 rows by 4 columns of standard-normal samples.
data = rng.standard_normal(size=(10_000, 4))

In [7]:
def costly_compute(data, column, delay=2):
    """Emulate a costly function by sleeping and returning a column.

    Parameters
    ----------
    data : 2-D array
        Any object supporting ``data[:, column]`` indexing, such as a
        NumPy ndarray.
    column : int
        Index of the column to extract.
    delay : float, default 2
        Number of seconds to sleep to simulate the expensive work.

    Returns
    -------
    The values of ``data`` along column ``column``.
    """
    time.sleep(delay)
    # Index the second axis explicitly: ``data[column]`` on a 2-D array
    # selects a row, not a column.
    return data[:, column]

def data_processing_mean(data, column):
    """Return the mean of what ``costly_compute(data, column)`` yields."""
    extracted = costly_compute(data, column)
    return extracted.mean()

# Baseline: process the columns one after another and time the whole run.
# perf_counter() is the clock intended for measuring intervals; unlike
# time.time() it is not affected by system clock adjustments, and it
# includes time spent sleeping.
start = time.perf_counter()
results = [data_processing_mean(data, col) for col in range(data.shape[1])]
stop = time.perf_counter()

print('\nSequential processing')
print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))


Sequential processing
Elapsed time for the entire processing: 8.02 s


In [13]:
# Import joblib's parallel-execution helpers
from joblib import Parallel, delayed

In [12]:
def data_processing_mean_using_cache(data, column):
    """Return the mean of what ``costly_compute(data, column)`` yields.

    Despite the name, this function performs no caching itself; it calls
    ``costly_compute`` directly on every invocation.
    """
    column_values = costly_compute(data, column)
    column_mean = column_values.mean()
    return column_mean

# Same workload as the sequential run, dispatched as one task per column
# to two joblib workers.
# perf_counter() is the clock intended for measuring intervals; unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()
results = Parallel(n_jobs=2)(
    delayed(data_processing_mean_using_cache)(data, col)
    for col in range(data.shape[1]))
stop = time.perf_counter()

print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))

Elapsed time for the entire processing: 4.54 s
