### 1. Parallel operation: running tasks simultaneously

- python built-ins
    1. Multithreading: the GIL (Global Interpreter Lock) prevents multiple threads from accessing the same Python object simultaneously, so the second thread must wait for the first thread to release the GIL.

In [1]:
import threading

In [2]:
def count(limit=10000000):
    """Increment a local counter from 0 until it reaches ``limit``.

    The loop is pure CPU work with no I/O.

    Args:
        limit: Number of increments to perform. Defaults to 10,000,000,
            so calling ``count()`` with no argument does the same amount
            of work as before.

    Returns:
        None.
    """
    x = 0
    while x < limit:
        x += 1

In [3]:
# multi-threading
def multithreading_run():
    """Run ``count`` on two threads and block until both have finished."""
    workers = [threading.Thread(target=count) for _ in range(2)]
    # Start every thread before waiting on any of them so the two overlap.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

In [4]:
def serial_run():
    """Call ``count`` twice, one call after the other, in the calling thread."""
    for _ in range(2):
        count()

In [5]:
%time multithreading_run()

CPU times: user 1.31 s, sys: 10 ms, total: 1.32 s
Wall time: 1.31 s


In [6]:
%time serial_run()

CPU times: user 1.14 s, sys: 1.72 ms, total: 1.14 s
Wall time: 1.14 s


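    2. Multiprocessing: each process runs its own Python interpreter with its own GIL, so CPU-bound work can run in parallel across processes.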

In [7]:
import multiprocessing

In [8]:
def multiprocessing_run():
    """Run ``count`` in two child processes and block until both have exited."""
    children = [multiprocessing.Process(target=count) for _ in range(2)]
    # Start every process before waiting on any of them so the two overlap.
    for child in children:
        child.start()
    for child in children:
        child.join()

In [9]:
%time multiprocessing_run()

CPU times: user 2.08 ms, sys: 5.26 ms, total: 7.34 ms
Wall time: 590 ms


### Utilizing Pool

In [10]:
import collections

# Record type with two fields: a name and a year (the year is kept as a string).
student = collections.namedtuple('student', 'name year')

# Immutable roster of records, built from (name, year) pairs in a fixed order.
students = tuple(
    student(name=who, year=when)
    for who, when in (
        ('Lee', '2007'),
        ('Kim', '2017'),
        ('Park', '2002'),
        ('Jung', '2003'),
        ('Cha', '2004'),
        ('Na', '2010'),
        ('Kang', '2001'),
        ('Yoon', '2014'),
        ('You', '2016'),
    )
)

In [11]:
import time
# Calendar year taken from the local clock, captured once when this cell runs.
current_year = time.localtime().tm_year
print(current_year)


def calculate_years_from_graudate(stu):
    """Compute how many years lie between ``stu.year`` and ``current_year``.

    Args:
        stu: Object exposing ``name`` and ``year`` attributes; ``year`` must
            be accepted by ``int()``.

    Returns:
        dict: ``{'name': stu.name, 'year': current_year - int(stu.year)}``.
    """
    who = stu.name
    elapsed = current_year - int(stu.year)
    return {'name': who, 'year': elapsed}

2019


In [12]:
def multi_pool_run():
    """Apply ``calculate_years_from_graudate`` to every student in a process pool.

    Returns:
        list: One result per entry of ``students``, in the same order as
        ``students`` (``Pool.map`` preserves input order).
    """
    # The context manager shuts the pool down on exit, even if map() raises;
    # without it the worker processes are left running after every call.
    with multiprocessing.Pool() as pool:
        return pool.map(calculate_years_from_graudate, students)

In [13]:
%time multi_pool_run()

CPU times: user 10.5 ms, sys: 19.8 ms, total: 30.3 ms
Wall time: 31.3 ms


In [14]:
%time list(map(lambda x: calculate_years_from_graudate(x), students))

CPU times: user 16 µs, sys: 4 µs, total: 20 µs
Wall time: 22.9 µs


[{'name': 'Lee', 'year': 12},
 {'name': 'Kim', 'year': 2},
 {'name': 'Park', 'year': 17},
 {'name': 'Jung', 'year': 16},
 {'name': 'Cha', 'year': 15},
 {'name': 'Na', 'year': 9},
 {'name': 'Kang', 'year': 18},
 {'name': 'Yoon', 'year': 5},
 {'name': 'You', 'year': 3}]

### With pandas DataFrame

In [15]:
import pandas as pd

In [16]:
# Read the gzip-compressed, tab-separated file into a DataFrame.
chem_ds = pd.read_csv(
    '/Users/grace/workspace/bio_dataset/chemical2pubtator/chemical2pubtator.gz',
    sep='\t',
    quotechar='"',
    compression='gzip',
)

In [17]:
chem_ds.head()

Unnamed: 0,PMID,MeshID,Mentions,Resource
0,1,MESH:D000432,methanol,tmChem|MESH
1,1,MESH:D005561,Formate,MESH
2,10,MESH:D004074,Digitoxin,MESH
3,1000,MESH:D009249,NADP,MESH
4,1000001,MESH:D002118,calcium,tmChem|MESH


In [18]:
chem_ds.dtypes

PMID         int64
MeshID      object
Mentions    object
Resource    object
dtype: object

In [19]:
# Keep only the first 5000 rows. The explicit copy makes this an independent
# frame, so assigning to its columns later does not trigger
# SettingWithCopyWarning.
chem_ds = chem_ds.iloc[:5000].copy()

In [20]:
def convert_to_str(data):
    """Return the ``str()`` representation of ``data``."""
    as_text = str(data)
    return as_text

In [29]:
%time chem_ds['Resource'] = chem_ds['Resource'].map(convert_to_str)

CPU times: user 1.88 ms, sys: 110 µs, total: 1.99 ms
Wall time: 1.96 ms


In [31]:
chem_ds.dtypes

PMID         int64
MeshID      object
Mentions    object
Resource    object
dtype: object

In [24]:
def convert_resource_to_str(data):
    """Return a copy of ``data`` whose ``Resource`` column holds ``str`` values.

    Args:
        data: DataFrame containing a ``Resource`` column.

    Returns:
        A new DataFrame with the same columns in the same order; every
        ``Resource`` value has been passed through ``str()``. The input
        frame is left unmodified.

    Raises:
        KeyError: If ``data`` has no ``Resource`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame (which may be a
    # slice of a larger one) is never written to.
    return data.assign(Resource=data['Resource'].map(str))

In [25]:
import numpy as np
from multiprocessing import cpu_count

# Worker count: whatever multiprocessing.cpu_count() reports for this machine.
num_cores = cpu_count()
print(num_cores)


def parallelize(df, func):
    """Apply ``func`` to row-chunks of ``df`` in a process pool and recombine.

    ``df`` is cut into ``num_cores`` consecutive row-chunks (sizes differ by
    at most one row; chunks may be empty when ``df`` has fewer rows than
    ``num_cores``). Each chunk is handed to ``func`` in a worker process and
    the results are concatenated in chunk order.

    Args:
        df: DataFrame to process.
        func: Callable taking one DataFrame chunk and returning an object
            ``pd.concat`` accepts. It is sent to worker processes, so it
            must be picklable.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks.
    """
    # Chunk sizes follow np.array_split's rule (the first ``extra`` chunks get
    # one more row), but slicing with .iloc avoids calling array_split on a
    # DataFrame, which goes through the deprecated DataFrame.swapaxes.
    base, extra = divmod(len(df), num_cores)
    sizes = [base + 1] * extra + [base] * (num_cores - extra)
    chunks, start = [], 0
    for size in sizes:
        chunks.append(df.iloc[start:start + size])
        start += size

    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(num_cores) as pool:
        processed = pool.map(func, chunks)

    # Return the processed data; returning ``df`` here would discard all of
    # the work done by the pool.
    return pd.concat(processed)

8


In [26]:
%time parallelize(chem_ds, convert_resource_to_str)

CPU times: user 20.9 ms, sys: 41.2 ms, total: 62.1 ms
Wall time: 153 ms


Unnamed: 0,PMID,MeshID,Mentions,Resource
0,1,MESH:D000432,methanol,tmChem|MESH
1,1,MESH:D005561,Formate,MESH
2,10,MESH:D004074,Digitoxin,MESH
3,1000,MESH:D009249,NADP,MESH
4,1000001,MESH:D002118,calcium,tmChem|MESH
5,1000001,MESH:D008274,magnesium,tmChem|MESH
6,1000002,MESH:D011374,progesterone,MESH|tmChem
7,1000003,MESH:D011188,potassium,tmChem|MESH
8,1000003,MESH:D012964,sodium,MESH|tmChem
9,1000004,MESH:D014529,uridine,MESH


In [27]:
def serial_df():
    """Convert ``chem_ds['Resource']`` to ``str`` one row at a time.

    Walks the frame with ``iterrows()`` as a row-by-row, single-process
    baseline, then stores the converted values back into ``chem_ds``.

    Returns:
        None. ``chem_ds`` is updated in place.
    """
    # iterrows() yields a copy of each row, so assigning to ``row[...]`` would
    # never reach chem_ds. Collect the converted values and write the column
    # back once, after iteration has finished.
    converted = [str(row['Resource']) for _, row in chem_ds.iterrows()]
    chem_ds['Resource'] = converted

In [28]:
%time serial_df()

CPU times: user 310 ms, sys: 1.97 ms, total: 312 ms
Wall time: 312 ms
