# Comparison for list operations using comprehension and multiprocessing


### we would be comparing time taken for some random operation on HUGEEE lists

In [1]:
import numpy as np
from utilities import not_in
from multiprocessing import Pool
import itertools, time, datetime


### First operation

We will take two lists,say a and b. We want to create a third list c which has all elements of a which are not in b

a very simple list comprehension way of doing it is : [i for i in a if i not in b]

In [2]:
## our timer function

class CodeTimer:
    
    """
        Utility custom contextual class for calculating the time 
        taken for a certain code block to execute
    
    """
    def __init__(self, name=None):
        self.name = " '"  + name + "'" if name else ''

    def __enter__(self):
        self.start = time.clock()

    def __exit__(self, exc_type, exc_value, traceback):
        self.took = (time.clock() - self.start) * 1000.0
        time_taken = datetime.timedelta(milliseconds = self.took)
        print('Code block' + self.name + ' took(HH:MM:SS): ' + str(time_taken))



In [3]:
a = list(range(19800000)) 
b = [2,3,4]

timer = CodeTimer('list_comprehension')

with timer:
    [i for i in a if i not in b]
print(timer.took)
    
## we can see a list close to 19,800,000 is taking 4seconds. 

## remember we could be using setdiff1d method of numpy. bt that apparently works only for 1d arrays

Code block 'list_comprehension' took(HH:MM:SS): 0:00:04.611375
4611.374822791061


In [None]:
## this is the not in function defined in utitlies.py / we had to do it this way, since thats what jupyter expects
## else giving hell lot of not found in main module error
# def not_in(array, another_array):
#     return [i for i in array if i not in another_array]

In [14]:
## lets perform the same action using multiprocessing. we would chunk it into pieces and run those chunks in parallel


if __name__ == '__main__':  

#     a = list(range(19800000)) 
#     b = [2,3,4]


    timer = CodeTimer('mp')
    with timer:
        final = [None]

        with Pool(8) as p:

            added = p.starmap(not_in, [(i.tolist(),b) for i in np.array_split(a,780)])
        #     print(added)
            final = np.concatenate((final,added[0] if len(added)>0 else [None]), axis=0)
            # final.extend(p.starmap(not_in, [(i.tolist(),b) for i in np.array_split(a,2)]))

    print(timer.took)


Code block 'mp' took(HH:MM:SS): 0:00:07.302675
7302.675143872022


#### what we are seeing consistently is that , the usual list comprehension defeats the multiprocessing way by ~3secs every time

##### lets try a 2d array example

In [15]:
a = np.random.randint(1,100, (19800000,2))
a = a.tolist()
# a = [[22,2],[33,3],[44,4],[5,55]]
a.append([11,22])
a.append([22,31])

b = [[11,22],[22,31],[22,22],[11,1]]



timer = CodeTimer('list_comprehension')

with timer:
    [i for i in a if i not in b]
print(timer.took)
    

Code block 'list_comprehension' took(HH:MM:SS): 0:00:04.156448
4156.447902278501


In [16]:
if __name__ == '__main__':  

#     a = list(range(19800000)) 
#     b = [2,3,4]


    timer = CodeTimer('mp')
    with timer:
        final = [0,0]

        with Pool(8) as p:

            added = p.starmap(not_in, [(i.tolist(),b) for i in np.array_split(a,780)])
        #     print(added)
            final = np.concatenate((final,added[0] if len(added)>0 else [0,0]), axis=0)
            # final.extend(p.starmap(not_in, [(i.tolist(),b) for i in np.array_split(a,2)]))

    print(timer.took)

Code block 'mp' took(HH:MM:SS): 0:03:34.721808


ValueError: all the input arrays must have same number of dimensions