在某些情况下，multiprocessing 可以替代 threading：它通过多个进程来利用多个 CPU 核心，从而规避 Python 全局解释器锁（GIL）带来的计算瓶颈。

In [3]:
import multiprocessing
import time


def slow_worker():
    """Announce the start, pause for a tenth of a second, then announce the end."""
    pause_seconds = 0.1
    print('Starting worker')
    time.sleep(pause_seconds)
    print('Finished worker')



# Create the child process object; nothing runs until start() is called.
p = multiprocessing.Process(target=slow_worker)
print('BEFORE:', p, p.is_alive())

p.start()
print('DURING:', p, p.is_alive())

# Ask the child to stop. terminate() only targets this one process: any
# processes the child itself started are NOT terminated along with it.
p.terminate()
print('TERMINATED:', p, p.is_alive())

# join() after terminate() so the child's exit status is collected before
# the final state is printed.
p.join()
print('JOINED:', p, p.is_alive())

BEFORE: <Process(Process-1, initial)> False
DURING: <Process(Process-1, started)> True
TERMINATED: <Process(Process-1, started)> True
JOINED: <Process(Process-1, stopped[SIGTERM])> False


In [5]:
import sys

def exit_error():
    """Exit the current process with status code 1."""
    raise SystemExit(1)
    
def exit_ok():
    """Return normally without producing a value."""
    return None

def return_value():
    """Return the constant 1."""
    result = 1
    return result

def raises():
    """Raise a RuntimeError with a fixed message."""
    error = RuntimeError('There was an error')
    raise error
    
def terminated():
    """Sleep for three seconds and then return."""
    nap_seconds = 3
    time.sleep(nap_seconds)
    
# Start one child process per function, each named after the function it runs.
jobs = []
funcs = [exit_error, exit_ok, return_value, raises, terminated]
for f in funcs:
    print('Starting process for ', f.__name__)
    j = multiprocessing.Process(target=f, name=f.__name__)
    jobs.append(j)
    j.start()
    
# The last job runs `terminated`; stop it explicitly instead of waiting for it.
jobs[-1].terminate()
# Collect every child and report its exit code, right-aligned by name.
for j in jobs:
    j.join()
    print(f"{j.name:>15} exitcode={j.exitcode}")

Starting process for  exit_error
Starting process for  exit_ok
Starting process for  return_value
Starting process for  raises
Starting process for  terminated


Process raises:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-5-a4b3dc014785>", line 13, in raises
    raise RuntimeError('There was an error')
RuntimeError: There was an error


     exit_error exitcode=1
        exit_ok exitcode=0
   return_value exitcode=0
         raises exitcode=1
     terminated exitcode=-15


In [6]:
import logging
def worker():
    """Print a short message and flush standard output immediately."""
    print('doing some work', flush=True)
    
# Route multiprocessing's own log messages to stderr at DEBUG level, then run
# one child to completion so its lifecycle messages are shown.
multiprocessing.log_to_stderr(logging.DEBUG)
p = multiprocessing.Process(target=worker)
p.start()
p.join()

[INFO/Process-19] child process calling self.run()


doing some work


[INFO/Process-19] process shutting down
[DEBUG/Process-19] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-19] running the remaining "atexit" finalizers
[INFO/Process-19] process exiting with exitcode 0


## Passing Messages to Processes

In [3]:
class MyFancyClass:
    """Small named object whose method reports the process it runs in."""

    def __init__(self, name):
        self.name = name

    def do_sth(self):
        """Print this object's name together with the current process name."""
        current = multiprocessing.current_process()
        print(f"Doing sth for {self.name} in {current.name}")
        
def worker(q):
    """Take one object from the queue and invoke its do_sth() method."""
    q.get().do_sth()
    

# Two workers share one queue; each of them takes exactly one object from it.
queue = multiprocessing.Queue()
p1 = multiprocessing.Process(target=worker, args=(queue,))
p2 = multiprocessing.Process(target=worker, args=(queue,))
p1.start()
p2.start()
queue.put(MyFancyClass('sdjaklsdj'))
queue.put(MyFancyClass('gahahah'))
# Nothing more will be put on the queue: close it, wait for its background
# thread, then wait for both workers to finish.
queue.close()
queue.join_thread()
p1.join()
p2.join()

Doing sth for sdjaklsdj in Process-3
Doing sth for gahahah in Process-4


In [4]:
class Consumer(multiprocessing.Process):
    """Worker process that executes callables taken from a joinable queue.

    Each item fetched from ``task_queue`` is called with no arguments and its
    return value is put on ``result_queue``.  A ``None`` item is the poison
    pill that makes the consumer stop.
    """

    def __init__(self, task_queue, result_queue):
        super().__init__()
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        """Process tasks until a ``None`` poison pill is received.

        Every fetched item is acknowledged with ``task_done()`` even when the
        task raises, so a ``join()`` on the task queue cannot hang because of
        a failed task.  A task that raises yields the exception instance as
        its result, so exactly one result is produced per task.
        """
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                # Poison pill means shutdown
                print(f"{proc_name} Exiting")
                self.task_queue.task_done()
                break
            print(f"{proc_name}:{next_task}")
            try:
                answer = next_task()
            except Exception as exc:
                # Deliver the failure as this task's result instead of
                # killing the consumer and leaving the task unacknowledged.
                answer = exc
            finally:
                self.task_queue.task_done()
            self.result_queue.put(answer)
            
class Task:
    """Callable multiplication job for two operands."""

    def __init__(self, a, b):
        self.a, self.b = a, b

    def __str__(self):
        return f"{self.a} * {self.b}"

    def __call__(self):
        # Simulate a job that takes a little while.
        time.sleep(0.1)
        product = self.a * self.b
        return f"{self.a} * {self.b} = {product}"

# Establish communication queues
tasks = multiprocessing.JoinableQueue()
results = multiprocessing.Queue()

# Start consumers
num_consumers = multiprocessing.cpu_count()*2
print(f"create {num_consumers} consumers")

consumers = [Consumer(tasks, results) for _ in range(num_consumers)]
for w in consumers:
    w.start()
    
# Enqueue the jobs
num_jobs = 10
for i in range(num_jobs):
    tasks.put(Task(i, i))
    
# Add a poison pill for each consumer
for i in range(num_consumers):
    tasks.put(None)

# Wait for all tasks to finish
tasks.join()
# Read back exactly one result per submitted job.
while num_jobs:
    result = results.get()
    print("Result:", result)
    num_jobs -=1

create 8 consumers
Consumer-9:4 * 4
Consumer-5:0 * 0
Consumer-8:3 * 3
Consumer-7:2 * 2
Consumer-6:1 * 1
Consumer-10:5 * 5
Consumer-11:6 * 6
Consumer-12:7 * 7
Consumer-5:8 * 8
Consumer-9:9 * 9
Consumer-7 Exiting
Consumer-6 Exiting
Consumer-10 Exiting
Consumer-8 Exiting
Consumer-11 Exiting
Consumer-12 Exiting
Consumer-5 Exiting
Consumer-9 Exiting
Result: 0 * 0 = 0
Result: 4 * 4 = 16
Result: 2 * 2 = 4
Result: 1 * 1 = 1
Result: 5 * 5 = 25
Result: 3 * 3 = 9
Result: 6 * 6 = 36
Result: 7 * 7 = 49
Result: 8 * 8 = 64
Result: 9 * 9 = 81


## Signaling between Processes

In [8]:
def wait_for_event(e):
    """Block until the event is set, then print its state."""
    print('wait for event starting:')
    e.wait()
    state = e.is_set()
    print('wait_for_event, e.is_set() -->', state)

def wait_for_event_timeout(e, t):
    """Wait on the event for at most ``t`` seconds, then print its state."""
    print('wait for event timeout starting:')
    e.wait(t)
    state = e.is_set()
    print('wait_for_event_timeout, e.is_set() -->', state)
    
# One shared event: w1 waits on it without a timeout, w2 waits at most 2 seconds.
e = multiprocessing.Event()
w1 = multiprocessing.Process(target=wait_for_event, args=(e,), name='block')
w1.start()
w2 = multiprocessing.Process(target=wait_for_event_timeout, args=(e,2), name='non-block')
w2.start()
print('wait for call e is set:')
# The event is set only after 4 seconds, i.e. after w2's 2-second timeout.
time.sleep(4)
e.set()
print('main event is set')

wait for event starting:
wait for event timeout starting:
wait for call e is set:
wait_for_event_timeout, e.is_set() --> False
main event is set
wait_for_event, e.is_set() --> True


## Controlling Access to Resources

In [10]:
import sys

def work_with(lock, stream):
    """Write a message to ``stream`` while holding ``lock`` via a ``with`` block."""
    message = 'lock acquied via with \n'
    with lock:
        stream.write(message)
        
def work_not_with(lock, stream):
    """Write a message to ``stream`` using explicit acquire()/release() calls."""
    message = 'lock acquied via directly \n'
    lock.acquire()
    try:
        stream.write(message)
    finally:
        # Release the lock even if the write fails.
        lock.release()
        
# Both processes receive the same lock and write to sys.stdout.
lock = multiprocessing.Lock()
w = multiprocessing.Process(target=work_with, args=(lock, sys.stdout))
nw = multiprocessing.Process(target=work_not_with, args=(lock, sys.stdout))
w.start()
nw.start()
# Wait for both writers to finish.
w.join()
nw.join()

lock acquied via with 
lock acquied via directly 


## Synchronizing Operations

In [11]:
def stage_1(cond):
    """Announce readiness and notify everything waiting on ``cond``."""
    me = multiprocessing.current_process().name
    print('Starting ', me)
    with cond:
        print(f'{me} done and ready for stage_2')
        cond.notify_all()
        
def stage_2(cond):
    """Wait on ``cond`` for a notification, then report that this process runs."""
    me = multiprocessing.current_process().name
    print('Starting ', me)
    with cond:
        # No predicate is checked here; the function simply blocks in wait()
        # until the condition is notified.
        cond.wait()
        print(f'{me} running')
        
# One stage-1 process and two stage-2 processes share a single condition.
condition = multiprocessing.Condition()
s1 = multiprocessing.Process(target=stage_1, args=(condition, ), name = 's1')
s2_clients = [multiprocessing.Process(target=stage_2, args=(condition, ), name = f'stage2[{i}]') for i in range(1,3)]
for c in s2_clients:
    c.start()
    # Pause after each start, presumably so the stage-2 process is already
    # inside cond.wait() before s1 notifies -- TODO confirm.
    time.sleep(1)
s1.start()
s1.join()
for c in s2_clients:
    c.join()

Starting  stage2[1]
Starting  stage2[2]
Starting  s1
s1 done and ready for stage_2
stage2[2] running
stage2[1] running


## Controlling Concurrent Access to Resources

In [14]:
import random
class ActivePool:
    """Shared, lock-protected list of the names of currently active workers.

    The list lives in a manager process and is reached through a proxy, so
    the same pool object can be handed to several worker processes.
    """

    def __init__(self):
        # Keep a reference to the manager on the instance that created it.
        self.mgr = multiprocessing.Manager()
        self.active = self.mgr.list()
        self.lock = multiprocessing.Lock()

    def __getstate__(self):
        """Return the state to pickle, without the manager object.

        Passing the pool as a ``Process`` argument pickles it under the
        spawn/forkserver start methods.  Workers only need the list proxy and
        the lock, so the manager is replaced by ``None`` in the pickled copy;
        the original instance keeps its manager.
        """
        state = self.__dict__.copy()
        state['mgr'] = None
        return state

    def make_active(self, name):
        """Add ``name`` to the list of active workers."""
        with self.lock:
            self.active.append(name)

    def make_inactive(self, name):
        """Remove ``name`` from the list of active workers."""
        with self.lock:
            self.active.remove(name)

    def __str__(self):
        with self.lock:
            return str(self.active)
            
def worker(s, pool):
    """Register in ``pool`` while holding ``s``, idle briefly, then deregister."""
    me = multiprocessing.current_process().name
    with s:
        pool.make_active(me)
        print(f"Activating {me} now running {pool}")
        # Idle for a random fraction of a second while marked active.
        pause = random.random()
        time.sleep(pause)
        pool.make_inactive(me)
        
pool = ActivePool()
# The semaphore is created with a count of 3 and shared by all ten workers.
s = multiprocessing.Semaphore(3)
jobs = [multiprocessing.Process(target=worker, args=(s, pool), name = f'worker[{i}]') for i in range(10)]
for j in jobs:
    j.start()
    
# Poll until no job is alive: for each live job wait up to 0.1 s on it and
# print the pool's current contents.
while True:
    alive = 0
    for j in jobs:
        if j.is_alive():
            alive += 1
            j.join(timeout=0.1)
            print('now running ', pool)
    if alive == 0:
        print('Done')
        break

Activating worker[0] now running ['worker[0]', 'worker[1]']
Activating worker[1] now running ['worker[0]', 'worker[1]', 'worker[2]']
Activating worker[2] now running ['worker[0]', 'worker[1]', 'worker[2]']
now running  ['worker[0]', 'worker[1]', 'worker[2]']
now running  ['worker[0]', 'worker[1]', 'worker[2]']
Activating worker[3] now running ['worker[0]', 'worker[2]', 'worker[3]']
now running  ['worker[0]', 'worker[1]', 'worker[2]']
now running  ['worker[0]', 'worker[1]', 'worker[2]']
Activating worker[4] now running ['worker[0]', 'worker[2]', 'worker[4]']
now running  ['worker[0]', 'worker[2]', 'worker[3]']
now running  ['worker[0]', 'worker[2]']
Activating worker[5] now running ['worker[0]', 'worker[2]', 'worker[5]']
Activating worker[6] now running ['worker[2]', 'worker[5]', 'worker[6]']
now running  ['worker[0]', 'worker[2]', 'worker[4]']
now running  ['worker[0]', 'worker[2]', 'worker[4]']
Activating worker[7] now running ['worker[5]', 'worker[6]', 'worker[7]']
Activating worker[

## Managing Shared State

In [17]:
import pprint

def worker(d, key, value):
    """Store ``value`` under ``key`` in the mapping ``d``."""
    d[key] = value
    
# A manager-backed dict is passed to ten processes; each writes one entry.
mgr = multiprocessing.Manager()
d = mgr.dict()
jobs = [multiprocessing.Process(target=worker, args=(d, i, i*2)) for i in range(10)]
for j in jobs:
    j.start()
# Wait for every writer before reading the dict back.
for j in jobs:
    j.join()
pprint.pprint(d.items())

[(0, 0),
 (2, 4),
 (1, 2),
 (3, 6),
 (4, 8),
 (5, 10),
 (6, 12),
 (7, 14),
 (9, 18),
 (8, 16)]


## Shared Namespaces

In [18]:
def producer(ns, event):
    """Assign a value on the namespace, then signal the event."""
    payload = 'This is the value'
    ns.value = payload
    event.set()


def consumer(ns, event):
    """Print ``ns.value`` before and after waiting for the event.

    If reading the value before the event fails, the error text is printed
    instead and the function carries on to wait for the event.
    """
    try:
        print(f'Before event: {ns.value}')
    except Exception as err:
        print('Before event, error:', str(err))
    event.wait()
    print('After event:', ns.value)


# A manager-backed namespace and an event are shared by producer and consumer.
mgr = multiprocessing.Manager()
namespace = mgr.Namespace()
event = multiprocessing.Event()
p = multiprocessing.Process(target=producer,args=(namespace, event),)
c = multiprocessing.Process(target=consumer,args=(namespace, event),)

# The consumer is started before the producer.
c.start()
p.start()

c.join()
p.join()

Before event, error: 'Namespace' object has no attribute 'value'
After event: This is the value


In [19]:
def producer(ns, event):
    """Append to the list read from ``ns.my_list``, then signal the event."""
    # DOES NOT UPDATE GLOBAL VALUE!
    # The append acts on the object returned by the attribute read; nothing
    # is assigned back to ``ns``.
    items = ns.my_list
    items.append('This is the value')
    event.set()


def consumer(ns, event):
    """Print ``ns.my_list`` before and after waiting for the event."""
    before = ns.my_list
    print('Before event:', before)
    event.wait()
    after = ns.my_list
    print('After event :', after)



# The namespace starts out holding an empty list under `my_list`.
mgr = multiprocessing.Manager()
namespace = mgr.Namespace()
namespace.my_list = []

event = multiprocessing.Event()
p = multiprocessing.Process(target=producer,args=(namespace, event),)
c = multiprocessing.Process(target=consumer,args=(namespace, event),)

# The consumer is started before the producer.
c.start()
p.start()

c.join()
p.join()

Before event: []
After event : []


对 Namespace 中可变对象（例如列表）的原地修改不会自动同步到其他进程；需要将修改后的对象重新赋值给该属性，更改才会生效。

## Process Pools

In [4]:
def do_cal(data):
    """Return ``data`` multiplied by two."""
    doubled = data * 2
    return doubled

def start_pro():
    """Print the name of the process this function runs in."""
    proc = multiprocessing.current_process()
    print('Starting ', proc.name)
    
inputs = list(range(10))
print('Input:', inputs)
# NOTE: map() returns a lazy iterator, so the print below shows the map
# object itself rather than the doubled values.
bulidin_output = map(do_cal, inputs)
print('Buildin output', bulidin_output)

pool_size = multiprocessing.cpu_count()*2
# start_pro is passed as the pool's initializer; maxtasksperchild=2 limits
# how many tasks one worker process handles.
pool = multiprocessing.Pool(processes=pool_size, initializer=start_pro, maxtasksperchild=2,)
pool_outputs = pool.map(do_cal, inputs)
# No more work will be submitted; wait for the worker processes to finish.
pool.close()
pool.join()
print('Pool', pool_outputs)

Input: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Buildin output <map object at 0x11129c358>
Starting  ForkPoolWorker-2
Starting  ForkPoolWorker-3
Starting  ForkPoolWorker-4
Starting  ForkPoolWorker-5
Starting  ForkPoolWorker-6
Starting  ForkPoolWorker-7
Starting  ForkPoolWorker-8
Starting  ForkPoolWorker-9
Pool [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]


## Implementing MapReduce

In [5]:
import collections
import itertools
import string

class SimpleMapReduce:
    """Minimal map/reduce driver built on a multiprocessing worker pool.

    ``map_func`` is applied to every input and must return an iterable of
    ``(key, value)`` pairs.  The pairs are grouped by key and ``reduce_func``
    is applied to every ``(key, [values...])`` item.

    The worker pool is created in the constructor.  Call ``close()`` when
    done, or use the instance as a context manager, so the worker processes
    are shut down instead of lingering until the object is garbage collected.
    """

    def __init__(self, map_func, reduce_func, num_workers=None):
        self.map_func = map_func
        self.reduce_func = reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Stop accepting work and wait for the worker processes to exit."""
        self.pool.close()
        self.pool.join()

    def partition(self, mapped_values):
        """Group ``(key, value)`` pairs into ``(key, [values...])`` items."""
        partitioned_data = collections.defaultdict(list)
        for k, v in mapped_values:
            partitioned_data[k].append(v)
        return partitioned_data.items()

    def __call__(self, inputs, chunksize=1):
        """Run the map phase, group its output by key, then run the reduce phase.

        ``chunksize`` is forwarded to the pool for the map phase only.
        Returns the list of values produced by ``reduce_func``.
        """
        map_response = self.pool.map(self.map_func, inputs, chunksize=chunksize)
        # Flatten the per-input lists of pairs without unpacking them as arguments.
        partitioned_data = self.partition(itertools.chain.from_iterable(map_response))
        reduced_values = self.pool.map(self.reduce_func, partitioned_data)
        return reduced_values
    
            

In [9]:
def file_to_words(filename):
    """Read a text file and return a ``(word, 1)`` pair for every counted word.

    Lines whose first non-blank characters are ``..`` are skipped.
    Punctuation is replaced by spaces, words are lower-cased, and only purely
    alphabetic words that are not stop words are kept, in file order.

    The file is decoded as UTF-8 so the result does not depend on the
    platform's default encoding.
    """
    STOP_WORDS = {
        'a', 'an', 'and', 'are', 'as', 'be', 'by', 'for', 'if', 'in', 'is',
        'it', 'of', 'or', 'py', 'rst', 'that', 'the', 'to', 'with',
    }
    # Translation table mapping every ASCII punctuation character to a space.
    TR = str.maketrans({p: ' ' for p in string.punctuation})
    print(f"{multiprocessing.current_process().name} reading {filename}")
    output = []
    with open(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.lstrip().startswith('..'):
                continue
            line = line.translate(TR)
            for word in line.split():
                word = word.lower()
                if word.isalpha() and word not in STOP_WORDS:
                    output.append((word, 1))
    return output

def count_words(item):
    """Reduce a ``(word, occurrences)`` pair to ``(word, total)``."""
    word, occurrences = item
    total = sum(occurrences)
    return (word, total)

            

In [10]:
import operator
import glob
import pprint

# Run the word count over every *.py file in the current directory.
inputs = glob.glob('*.py')
mapper = SimpleMapReduce(file_to_words, count_words)
word_counts = mapper(inputs)
# Order by count, highest first: ascending sort on the count, then an
# in-place reversal.
word_counts.sort(key=operator.itemgetter(1))
word_counts.reverse()

print('TOP 20 WORD')
top20 = word_counts[:20]
pprint.pprint(top20)


ForkPoolWorker-18 reading subprocess_signal_parent_shell.py
ForkPoolWorker-20 reading subprocess_signal_setgrp.py
ForkPoolWorker-19 reading signal_parent.py
ForkPoolWorker-21 reading repeater.py
ForkPoolWorker-19 reading signal_child.py
TOP 20 WORD
[('signal', 24),
 ('import', 22),
 ('sys', 21),
 ('flush', 14),
 ('script', 14),
 ('child', 13),
 ('pid', 13),
 ('stdout', 10),
 ('print', 10),
 ('proc', 10),
 ('time', 10),
 ('os', 10),
 ('shell', 9),
 ('f', 8),
 ('file', 8),
 ('parent', 7),
 ('subprocess', 7),
 ('received', 6),
 ('sleep', 6),
 ('popen', 6)]
