In [None]:
# Embarrassingly Parallel for loops using Joblib
Example from documentation

In [102]:
from math import sqrt
# Serial baseline: take the square root of each squared integer, one after another.
[sqrt(i ** 2) for i in range(10)]

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

In [101]:
from joblib import Parallel, delayed
# Same computation as the serial cell: delayed(sqrt) wraps each call so that
# Parallel can run the calls with n_jobs=2 and collect the results in a list.
Parallel(n_jobs=2)(delayed(sqrt)(i ** 2) for i in range(10))

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

The progress meter: the higher the value of verbose, the more messages are printed.
If verbose > 50 then a message is returned for every task.

In [97]:
from time import sleep
from joblib import Parallel, delayed
# n_jobs=1 turns off the parallel code for debugging.
# 100 tasks that each sleep for 0.1 s; verbose=1 prints the progress lines shown below.
r = Parallel(n_jobs=1, verbose=1)(delayed(sleep)(.1) for _ in range(100)) 

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    4.8s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   10.0s finished


In [98]:
# The same 100 sleep tasks, now with n_jobs=2 and a higher verbosity (verbose=5).
r = Parallel(n_jobs=2, verbose=5)(delayed(sleep)(.1) for _ in range(100)) 

[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    1.4s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:    4.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    5.7s finished


In [99]:
# The same 100 sleep tasks with n_jobs=-1 (joblib's setting for "use all CPUs")
# and verbose=10 for more frequent progress lines.
r = Parallel(n_jobs=-1, verbose=10)(delayed(sleep)(.1) for _ in range(100)) 

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.4s finished


Running the same code on the simple sleep function shows the effect of increasing the number of separate jobs/processes.

## Reusing a pool of workers

In [208]:
def test_reuse():
    """Test Reusing a pool of workers """
    with Parallel(n_jobs=2) as parallel:
        accumulator = 0.
        n_iter = 0
        while accumulator < 1000:
            results = parallel(delayed(sqrt)(accumulator + i ** 2) for i in range(5))
            accumulator += sum(results)  # synchronization barrier
            n_iter += 1

In [209]:
def test_no_reuse():
    """Test Showing Parallel overhead by not Reusing a pool of workers"""
    accumulator = 0.
    n_iter = 0
    while accumulator < 1000:
        results = Parallel(n_jobs=2)(delayed(sqrt)(accumulator + i ** 2) for i in range(5))
        accumulator += sum(results)  # synchronization barrier
        n_iter += 1

In [None]:
# Time the variant that keeps one Parallel object open for every loop iteration.
%timeit test_reuse()

The slowest run took 4.85 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 1.46 s per loop


In [None]:
# Time the variant that builds a new Parallel object on every loop iteration.
%timeit test_no_reuse()

# Generators:
Similar to list comprehensions, but efficient with memory. When you create a list comprehension the whole list has to be stored in memory. This can be a problem if you use very large arrays.

The generator only creates one value at a time and then when it has used that value it forgets about it. Thus saving memory. As a result they can be used for iteration but only once.
You create a generator by using normal brackets "()" instead of square brackets "[]".

In [None]:
# List comprehension: squares of the multiples of 3 below 10, all stored in memory.
# Compare with `== 0`; `is 0` tests object identity, which only happens to work
# for small integers in CPython and raises a SyntaxWarning on Python 3.8+.
List = [x ** 2 for x in range(10) if x % 3 == 0]
print(List)
for val in List:
    print(val)

In [None]:
# Generator expression: same values as the list version, produced one at a time.
# Compare with `== 0`; `is 0` tests object identity, which only happens to work
# for small integers in CPython and raises a SyntaxWarning on Python 3.8+.
gen = (x ** 2 for x in range(10) if x % 3 == 0)
print(gen)  # shows the generator object, not its values
for val in gen:
    print(val)

In [None]:
print("Another-iteration") 
# The list is held in memory, so it can be printed and looped over again.
print(List)
for val in List:
    print(val)
    
# Re-iteration of the generator does not return any more values once an
# earlier loop has consumed it; only the generator object itself is printed.
print(gen)    
for val in gen:
    print(val)


For large arrays it may be more efficient to create the generator more than once instead of keeping a large list saved in memory.

## File Processing thought example

Say you had many files that you needed to get information out of, or that needed some processing/transformation.
Say you wanted to extract some information from each of the files, e.g. the time, coordinates, or some other header information.
Or you wanted to normalize spectra (spectra.fits) and save the result to a new file (spectra_normalised.fits).

If all the files (input and output) are independent and your processing is automatic then you would probably loop over the files. This should be able to be parallelized.

### Warning: nested parallel processes are probably not a good idea.

In [None]:
# Half code, half pseudocode for the many-file example.
# `calculations` is a placeholder for whatever processing each file needs.
import os

filenames = ["file1.txt", "file2.txt", "fileN.txt"]  # ...and every file in between

# Serial example
results = []
for fname in filenames:
    # Open file and load in data
    with open(fname, "r") as f:
        # Read in data
        data = f.readlines()
    # Do task
    ans = calculations(data)

    # Extract some information and/or save to a file.
    # Each input gets its own output name (file1.txt -> file1_processed.txt)
    # so that no iteration overwrites the result of another one.
    root, ext = os.path.splitext(fname)
    savefile = root + "_processed" + ext
    with open(savefile, "w") as g:
        # Output to file
        g.write(ans)

    # Collect the answer; `return` is only legal inside a function.
    results.append(ans)


# Turn the code inside the loop into its own function
def file_processing(filename, *args, savefile=None):
    """Read one file, run ``calculations`` on its lines and save the answer.

    Parameters
    ----------
    filename : str
        Path of the input file to read.
    *args
        Extra positional arguments. They are accepted so callers can pass
        additional settings, but this pseudocode body does not use them.
    savefile : str, optional
        Path of the output file. When omitted, the output name is the input
        name with ``_processed`` inserted before the extension
        (``spectra.fits`` -> ``spectra_processed.fits``), so every input file
        gets its own output and parallel calls do not overwrite each other.

    Returns
    -------
    The value returned by ``calculations``. It is passed to ``write()`` of a
    text-mode file, so it is expected to be a string.
    """
    import os.path  # function-scope import keeps this cell self-contained

    if savefile is None:
        root, ext = os.path.splitext(filename)
        savefile = root + "_processed" + ext

    # Open file and load in data (use the parameter, not an outer loop variable)
    with open(filename, "r") as f:
        # Read in data
        data = f.readlines()
    # Do task
    ans = calculations(data)

    # Extract some information and/or save to a file
    with open(savefile, "w") as g:
        # Output to file
        g.write(ans)

    return ans


# Serial example with function
# (`args` stands for any extra arguments the processing needs - pseudocode placeholder)
for fname in filenames:
    # Pass the loop variable; the name `filename` does not exist at this level.
    file_processing(fname, *args)
# or as a list comprehension
[file_processing(fname, *args) for fname in filenames]


# Parallel with joblib: one delayed call per file, run with n_jobs=2.
Parallel(n_jobs=2)(delayed(file_processing)(fname, *args) for fname in filenames)

# If you need to then you can write code to extract the results from all the separate savefiles


# Convolution Example


# Joblibs Other tools

## Memory
Example from Joblib documentation showing the caching of inputs and outputs of the function square().

When it is called with the same parameters again it just returns the result without recomputation.

In [None]:
import numpy as np
from joblib import Memory

# On-disk cache. `location` is the current name of this argument; the old
# `cachedir` keyword is no longer accepted by recent joblib releases.
mem = Memory(location='/tmp/joblib')

# The `np.float` alias has been removed from NumPy; the builtin `float` is the
# same dtype (float64).
# `a` is a 10001 x 10001 matrix - presumably chosen so the uncached call is
# slow enough to time (about 800 MB as float64).
a = np.vander(np.arange(10001)).astype(float)
b = np.vander(np.arange(5)).astype(float)

# Cached version of np.square: results are stored under the Memory location.
square = mem.cache(np.square)


In [None]:
# First call with `a`: computed, unless an earlier run already left it in the cache directory.
%time c = square(a) 

In [None]:
# First call with `b`: computed, unless an earlier run already left it in the cache directory.
%time d = square(b)

In [None]:
%time e = square(a) # Does not recompute square(a)

In [None]:
%time f = square(b) # Does not recompute square(b)

Timing these calls to square shows that the second call of the function with the same inputs gives a much faster result.

## Persistence
joblib.dump() and joblib.load() provide a replacement for pickle to work efficiently on Python objects containing large data, in particular large numpy arrays.

The filename is important here: .pkl will make a pickle-like persistence,
whereas .mmap will make a memory map location for shared access by parallel processes.

In [None]:
import os
from tempfile import mkdtemp

# Fresh scratch directory that will hold the persisted object.
savedir = mkdtemp()
# Target file inside that directory.
filename = os.path.join(savedir, 'test.pkl')
# Alternative target using the .mmap extension instead:
#filename = os.path.join(savedir, 'test.mmap')

In [None]:
# Then we create an object to be persisted:
# a list of (name, value) tuples holding a plain Python list and a NumPy array.
import numpy as np
to_persist = [('a', [1, 2, 3]), ('b', np.arange(10))]
# Alternative payload: one array of a million ones.
#to_persist = np.ones(int(1e6))

In [None]:
# which we save into savedir:
import joblib
# Write the object to the path held in `filename`.
joblib.dump(to_persist, filename)  

In [None]:
# We can then load the object from the file:
# NOTE(review): joblib.load is pickle-based, so only load files from sources you trust.
joblib.load(filename)
# Alternative: pass mmap_mode to memory-map the stored array data on load.
#joblib.load(filename, mmap_mode='r+')