# Parallel Processing and Parallel Threading

In [None]:
'''
One rule of thumb (although it is debatable) is:
multithreading is good for IO-bound tasks
multiprocessing is good for CPU-bound tasks

The achievable speed-up also depends on inter-process communication (IPC) overhead, which is why a parallel run can sometimes take longer than the sequential one.
'''

In [None]:
'''
The following packages are explored:
1) multiprocessing
https://docs.python.org/3/library/multiprocessing.html
2) Ray
https://docs.ray.io/en/latest/ray-overview/getting-started.html
3) joblib
https://joblib.readthedocs.io/en/latest/
4) spark
https://spark.apache.org/docs/latest/api/python/getting_started/index.html
'''

In [1]:
from logging import getLogger, getLevelName, Formatter, StreamHandler

# Notebook-wide logger. Each record carries the process *name* and thread
# *name* of the caller, which is what lets the output show which worker
# handled a given call.
log = getLogger("TEST")
log.setLevel("INFO")
log_formatter = Formatter(
    "%(asctime)s [%(levelname)s] [%(processName)s] [%(threadName)s] "
    "[%(module)s] [%(funcName)s] [%(lineno)s] %(name)s: %(message)s"
)

console_handler = StreamHandler()
console_handler.setFormatter(log_formatter)

# getLogger("TEST") returns the same logger object on every execution of this
# cell, so blindly calling addHandler again would print every message once
# per re-run. Drop handlers left over from a previous execution first.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
log.addHandler(console_handler)

## multiprocessing

In [2]:
import time
import multiprocessing as mp

In [3]:
def dummy_func(ip, add_nbr):
    """Stand-in for a slow task: log once, block for one second, then add.

    Parameters
    ----------
    ip : value to be incremented.
    add_nbr : amount added to ``ip``.

    Returns
    -------
    The result of ``ip + add_nbr``.
    """
    log.info("msg")  # the log record identifies the calling process/thread
    time.sleep(1)    # simulated latency of the task
    total = ip + add_nbr
    return total

def chunks_mtdh(l, n):
    """Walk ``l`` in windows of ``n`` items and return the items as one flat list.

    Each window is merged into the result instead of being stored as a
    sub-list, so for a positive ``n`` the result holds the same items as
    ``l`` in the same order.
    """
    return [item for start in range(0, len(l), n) for item in l[start:start + n]]

In [4]:
# Work items for the experiments below: the integers 0..99.
inputs = list(range(100))
print(inputs)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [5]:
'''
single process and single thread
MainProcess - MainThread
'''
# Baseline: call dummy_func on every input, one after another, in the main
# thread of the main process.
cnst_nbr = 10
start = time.perf_counter()
outputs = [dummy_func(ip, cnst_nbr) for ip in inputs]

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

2023-04-05 17:21:01,760 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:02,765 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:03,767 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:04,773 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:05,774 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:06,780 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:07,781 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:08,788 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:09,793 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:10,797 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:21:11,

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 100.442442239


In [6]:
'''
multiple processes and single thread
ForkPoolWorker-1 - MainThread
ForkPoolWorker-2 - MainThread
...
ForkPoolWorker-n - MainThread
'''

start = time.perf_counter()
CPU_CNT = mp.cpu_count()
print(CPU_CNT)
cnst_nbr = 10
# Leave one core free, but never ask for zero workers: on a single-core
# machine CPU_CNT - 1 == 0 and mp.Pool(0) raises ValueError.
worker_cnt = max(1, CPU_CNT - 1)
with mp.Pool(worker_cnt) as pool:
    # chunks_mtdh hands back a flat list, so every element becomes one
    # (ip, cnst_nbr) argument tuple for dummy_func.
    chunks = chunks_mtdh(inputs, worker_cnt)
    # starmap blocks until every task has finished and returns the results
    # as a list in input order.
    outputs = pool.starmap(dummy_func, [(ip, cnst_nbr) for ip in chunks])

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

4


2023-04-05 17:23:25,461 [INFO] [ForkPoolWorker-2] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:25,461 [INFO] [ForkPoolWorker-1] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:25,462 [INFO] [ForkPoolWorker-3] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:26,467 [INFO] [ForkPoolWorker-2] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:26,467 [INFO] [ForkPoolWorker-1] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:26,469 [INFO] [ForkPoolWorker-3] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:27,471 [INFO] [ForkPoolWorker-2] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:27,471 [INFO] [ForkPoolWorker-1] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:27,472 [INFO] [ForkPoolWorker-3] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:23:28,474 [INFO] [ForkPoolWorker-1] [MainThread] [296807228

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 36.279214630999974


In [8]:
'''
single process and multiple threads
MainProcess - Thread-8
MainProcess - Thread-9
...
MainProcess - Thread-n
'''
# Import the submodule explicitly: `import multiprocessing as mp` on its own
# does not load `multiprocessing.pool`, so `mp.pool.ThreadPool` only resolves
# when an earlier cell has already created an `mp.Pool`.
from multiprocessing.pool import ThreadPool

start = time.perf_counter()
CPU_CNT = mp.cpu_count()
print(CPU_CNT)
cnst_nbr = 10
# Never ask for zero threads: on a single-core machine CPU_CNT - 1 == 0,
# which the pool rejects with ValueError.
worker_cnt = max(1, CPU_CNT - 1)
with ThreadPool(worker_cnt) as pool:
    # chunks_mtdh hands back a flat list, so every element becomes one
    # (ip, cnst_nbr) argument tuple for dummy_func.
    chunks = chunks_mtdh(inputs, worker_cnt)
    # starmap returns the results as a list in input order.
    outputs = pool.starmap(dummy_func, [(ip, cnst_nbr) for ip in chunks])

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

2023-04-05 17:27:20,611 [INFO] [MainProcess] [Thread-8] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:20,612 [INFO] [MainProcess] [Thread-9] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:20,613 [INFO] [MainProcess] [Thread-10] [2968072285] [dummy_func] [2] TEST: msg


4


2023-04-05 17:27:21,615 [INFO] [MainProcess] [Thread-8] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:21,618 [INFO] [MainProcess] [Thread-9] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:21,624 [INFO] [MainProcess] [Thread-10] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:22,618 [INFO] [MainProcess] [Thread-8] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:22,624 [INFO] [MainProcess] [Thread-9] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:22,626 [INFO] [MainProcess] [Thread-10] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:23,624 [INFO] [MainProcess] [Thread-8] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:23,628 [INFO] [MainProcess] [Thread-9] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:23,628 [INFO] [MainProcess] [Thread-10] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:24,625 [INFO] [MainProcess] [Thread-8] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:27:24,634 [INFO] [MainP

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 36.247737588999996


## ray

In [10]:
import time
import ray

In [11]:
# NOTE(review): same body as the dummy_func defined in the multiprocessing
# section; presumably repeated so this section can be run on its own.
def dummy_func(ip, add_nbr):
    """Emit one log record, wait one second, and return ``ip + add_nbr``."""
    log.info("msg")
    time.sleep(1)  # simulated work
    summed = ip + add_nbr
    return summed

def chunks_mtdh(l, n):
    """Step through ``l`` ``n`` items at a time and return all items in one flat list.

    The slices are concatenated, not nested, so the result is a list of
    the individual items rather than a list of chunks.
    """
    flattened = []
    for offset in range(0, len(l), n):
        window = l[offset:offset + n]
        flattened += window
    return flattened

In [12]:
# Work items for the Ray experiment: the integers 0..99.
inputs = [*range(100)]
print(inputs)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [13]:
# Sequential baseline for this section: one dummy_func call per input, in order.
cnst_nbr = 10
outputs = []
start = time.perf_counter()
for ip in inputs:
    outputs.append(dummy_func(ip, cnst_nbr))

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

2023-04-05 17:29:47,050 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:48,056 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:49,058 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:50,066 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:51,071 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:52,074 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:53,079 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:54,084 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:55,090 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:56,095 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:29:57,

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 100.39820348700005


In [14]:
# Ray is a distributed multiprocessing framework; here it runs as a local instance.
start = time.perf_counter()
CPU_CNT = mp.cpu_count()
print(CPU_CNT)
cnst_nbr = 10

# Leave one core free, but never request zero CPUs (CPU_CNT - 1 == 0 on a
# single-core machine).
ray.init(num_cpus=max(1, CPU_CNT - 1))
try:
    # Wrap the plain function as a Ray remote function, submit one task per
    # input, and block until every result is available.
    remote_dummy_func = ray.remote(dummy_func)
    outputs = ray.get([remote_dummy_func.remote(ip, cnst_nbr) for ip in inputs])
finally:
    # Shut down even when a task fails; otherwise the local Ray instance is
    # left running after an error.
    ray.shutdown()

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

4


2023-04-05 17:34:05,600	INFO worker.py:1528 -- Started a local Ray instance.


[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 43.62238994800009


## joblib

In [15]:
import time
import joblib
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects

In [16]:
# Show the installed joblib version; as the bare last expression of the cell
# its value is echoed as the cell output.
joblib.__version__

'1.2.0'

In [32]:
# from joblib.externals.loky import set_loky_pickler
# set_loky_pickler('pickle5')

In [36]:
@wrap_non_picklable_objects
def dummy_func(ip, add_nbr):
    """Log one message, pause for one second, and return ``ip + add_nbr``.

    Decorated with joblib's ``wrap_non_picklable_objects`` before being
    handed to ``Parallel`` in the cells that follow.
    """
    log.info("msg")
    time.sleep(1)  # simulated work
    outcome = ip + add_nbr
    return outcome

def chunks_mtdh(l, n):
    """Return the items of ``l`` as a flat list, gathered ``n`` at a time.

    Every ``n``-sized slice is merged into the result with ``extend``, so
    the result is a list of items, not a list of slices.
    """
    merged = []
    starts = range(0, len(l), n)
    for begin in starts:
        merged.extend(l[slice(begin, begin + n)])
    return merged

In [37]:
# Work items for the joblib experiment: the integers 0..99.
inputs = list(range(0, 100))
print(inputs)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [20]:
# Sequential baseline for this section: call dummy_func on each input in turn.
cnst_nbr = 10
start = time.perf_counter()
outputs = [dummy_func(item, cnst_nbr) for item in inputs]

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 100.21423147399992


In [38]:
# Author's note: works with Python 3.8; "ValueError: unsupported pickle protocol: 5"
# was seen otherwise. NOTE(review): confirm which interpreter/joblib versions are affected.
start = time.perf_counter()
CPU_CNT = mp.cpu_count()
print(CPU_CNT)
cnst_nbr = 10

# Leave one core free, but never pass n_jobs=0 (CPU_CNT - 1 == 0 on a
# single-core machine), which joblib rejects.
n_jobs = max(1, CPU_CNT - 1)
# prefer="processes" asks joblib for a process-based backend; results come
# back as a list in input order.
outputs = Parallel(n_jobs=n_jobs, prefer="processes")(
    delayed(dummy_func)(ip, cnst_nbr) for ip in inputs
)

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

4
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 34.13869442300006


In [39]:
# Author's note: works with Python 3.8; "ValueError: unsupported pickle protocol: 5"
# was seen otherwise. NOTE(review): confirm which interpreter/joblib versions are affected.
start = time.perf_counter()
CPU_CNT = mp.cpu_count()
print(CPU_CNT)
cnst_nbr = 10

# Leave one core free, but never pass n_jobs=0 (CPU_CNT - 1 == 0 on a
# single-core machine), which joblib rejects.
n_jobs = max(1, CPU_CNT - 1)
# prefer="threads" asks joblib for a thread-based backend, so the calls run
# inside the notebook's own process.
outputs = Parallel(n_jobs=n_jobs, prefer="threads")(
    delayed(dummy_func)(ip, cnst_nbr) for ip in inputs
)

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

2023-04-05 17:43:25,209 [INFO] [MainProcess] [Thread-25] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:25,210 [INFO] [MainProcess] [Thread-26] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:25,211 [INFO] [MainProcess] [Thread-27] [358232672] [dummy_func] [3] TEST: msg


4


2023-04-05 17:43:26,214 [INFO] [MainProcess] [Thread-25] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:26,218 [INFO] [MainProcess] [Thread-26] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:26,221 [INFO] [MainProcess] [Thread-27] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:27,221 [INFO] [MainProcess] [Thread-25] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:27,222 [INFO] [MainProcess] [Thread-26] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:27,223 [INFO] [MainProcess] [Thread-27] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:28,224 [INFO] [MainProcess] [Thread-25] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:28,225 [INFO] [MainProcess] [Thread-26] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:28,230 [INFO] [MainProcess] [Thread-27] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:29,230 [INFO] [MainProcess] [Thread-25] [358232672] [dummy_func] [3] TEST: msg
2023-04-05 17:43:29,231 [INFO] [MainProc

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 34.17786433200013


## spark

In [40]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [41]:
# NOTE(review): same body as the undecorated dummy_func defined in earlier
# sections; presumably repeated to drop the joblib decorator applied above.
def dummy_func(ip, add_nbr):
    """Write one log line, sleep for one second, then return ``ip + add_nbr``."""
    log.info("msg")
    time.sleep(1)  # simulated work
    value = ip + add_nbr
    return value

def chunks_mtdh(l, n):
    """Slice ``l`` into consecutive ``n``-item pieces and return their items flattened.

    The pieces are not kept as separate sub-lists; their items are
    emitted one by one into a single list.
    """
    total = len(l)
    pieces = (l[lo:lo + n] for lo in range(0, total, n))
    return [element for piece in pieces for element in piece]

In [42]:
# Work items for the Spark experiment: the integers 0..99.
inputs = list(range(100))
print(inputs)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [43]:
# Sequential baseline for this section: process the inputs one at a time.
cnst_nbr = 10
start = time.perf_counter()
outputs = [dummy_func(value, cnst_nbr) for value in inputs]

print(outputs)
end = time.perf_counter()
print(f"total time taken {end-start}")

2023-04-05 17:46:40,101 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:41,107 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:42,112 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:43,115 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:44,120 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:45,123 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:46,124 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:47,126 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:48,132 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:49,137 [INFO] [MainProcess] [MainThread] [2968072285] [dummy_func] [2] TEST: msg
2023-04-05 17:46:50,

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 100.48428956199996


In [44]:
import os
def append_to_path_once(directory):
    """Append ``directory`` to the PATH environment variable if it is not already listed.

    Re-running the cell therefore does not keep growing PATH with
    duplicate entries.
    """
    current = os.environ.get("PATH", "")
    if directory in current.split(os.pathsep):
        return
    os.environ["PATH"] = current + os.pathsep + directory if current else directory


# Point the JVM tooling at the local Java / Spark / Hadoop installs.
# NOTE(review): these absolute paths are specific to one machine; adjust
# them before running the notebook elsewhere.
for env_var, install_dir in (
    ("JAVA_HOME", "/Users/jaydeepchakraborty/JC/Software/jdk-20.jdk/Contents/Home"),
    ("SPARK_HOME", "/Users/jaydeepchakraborty/JC/Software/spark-3.3.2-bin-hadoop3"),
    ("HADOOP_HOME", "/Users/jaydeepchakraborty/JC/Software/hadoop-3.3.3/libexec"),
):
    os.environ[env_var] = install_dir
    append_to_path_once(install_dir)

In [45]:
# spark is parallel-processing
start = time.perf_counter()
cnst_nbr = 10

CPU_CNT = mp.cpu_count()
# Leave one core free, but never request local[0] (CPU_CNT - 1 == 0 on a
# single-core machine).
worker_cnt = max(1, CPU_CNT - 1)
spark = SparkSession.builder.master(f"local[{worker_cnt}]").appName("TEST").getOrCreate()
try:
    # Distribute the inputs as an RDD, apply dummy_func to each element on the
    # local workers, and collect the results back into a Python list.
    inputs_rdd = spark.sparkContext.parallelize(inputs)
    outputs = inputs_rdd.map(lambda x: dummy_func(x, cnst_nbr)).collect()
    print(outputs)
    end = time.perf_counter()
    print(f"total time taken {end-start}")
finally:
    # Release the session once the run is over, including when the job fails.
    spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/05 17:51:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[Stage 0:>                                                          (0 + 3) / 3]

23/04/05 17:51:15 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors




[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
total time taken 42.39749055099992


                                                                                