# Chapter 5. Using Multiprocessing and ProcessPoolExecutor

In [7]:
import platform
# Report the version of the interpreter that executes this notebook.
platform.python_version()

'2.7.9'

## 리소스 (resource)

![resource](./images/resource.png)

- 공유 리소스 : 네트워크 사용자가 사용할 수 있는 폴더, 파일, 프린터 및 명명된 파이프와 같은 모든 리소스. 
   ( https://technet.microsoft.com/ko-kr/library/cc772501.aspx )

## thread

## process

## inter-process communication (IPC)

http://en.wikipedia.org/wiki/Inter-process_communication

## 선점형 vs 비선점형 OS

http://ko.wikipedia.org/wiki/%EC%8A%A4%EC%BC%80%EC%A4%84%EB%A7%81_(%EC%BB%B4%ED%93%A8%ED%8C%85)

# Understanding the concept of a process


## 1. Understanding the process model

## 2. Defining the states of a process

# Implementing multiprocessing communication

## 1. Using multiprocessing.Pipe

In [None]:
# %load multiprocessing_pipe.py
import os, random
from multiprocessing import Process, Pipe


def producer_task(conn):
    """Send one random integer in [1, 10] through *conn*, then close it.

    Args:
        conn: Connection object exposing ``send`` and ``close``; in this
            script it is one of the two ends returned by ``Pipe()``.
    """
    number = random.randint(1, 10)
    conn.send(number)
    pid = os.getpid()
    print('Value [%d] sent by PID [%d]' % (number, pid))
    conn.close()

def consumer_task(conn):
    """Receive one value from *conn* and print it with this process's PID.

    Args:
        conn: Connection object exposing ``recv``; the call blocks until a
            value is available.
    """
    received = conn.recv()
    pid = os.getpid()
    print('Value [%d] received by PID [%d]' % (received, pid))

if __name__ == '__main__':
    # Pipe() yields two connected endpoints; each worker gets one of them.
    producer_conn, consumer_conn = Pipe()
    consumer = Process(target=consumer_task, args=(consumer_conn,))
    producer = Process(target=producer_task, args=(producer_conn,))

    # Start the consumer first so it is already waiting on recv(),
    # then wait for both workers in the same order.
    for worker in (consumer, producer):
        worker.start()
    for worker in (consumer, producer):
        worker.join()



## 2. Understanding multiprocessing.Queue

# Using multiprocessing to compute Fibonacci series terms with multiple inputs

In [None]:
# %load multiprocessing_fibonacci.py

import sys, logging, time, os, random
from multiprocessing import Process, Queue, Pool, \
    cpu_count, current_process, Manager

# Console handler that prefixes every message with a timestamp.
formatter = logging.Formatter('%(asctime)s - %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.DEBUG)

# Attach the handler to the root logger and let DEBUG and above through.
logger = logging.getLogger()
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)

def producer_task(q, fibo_dict):
    """Put 15 random integers in [1, 20] on *q*, registering each in *fibo_dict*.

    Args:
        q: Queue-like object exposing ``put``.
        fibo_dict: Mapping that receives each generated value as a key
            with ``None`` as its initial entry.
    """
    for _ in range(15):
        number = random.randint(1, 20)
        # Register the key before queuing; the value starts out as None.
        fibo_dict[number] = None
        message = ("Producer [%s] putting value [%d] into queue.. "
                   % (current_process().name, number))
        logger.info(message)
        q.put(number)

def consumer_task(q, fibo_dict):
    """Drain *q*, storing the Fibonacci term for every value in *fibo_dict*.

    Each value ``n`` taken from the queue is mapped to the n-th Fibonacci
    term (F(0) = 0, F(1) = 1). The function returns once a ``get`` times
    out, i.e. when no item arrived within 0.05 seconds.

    Args:
        q: Queue-like object whose ``get(block, timeout)`` raises the
            standard ``Empty`` exception on timeout.
        fibo_dict: Mapping updated in place with ``value -> term``.
    """
    # The exception lives in ``queue`` on Python 3 and ``Queue`` on Python 2.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    while True:
        # Checking q.empty() first is racy with several consumers: another
        # worker may take the last item between the check and the get().
        # Treat the timeout itself as the "nothing left" signal instead.
        try:
            value = q.get(True, 0.05)
        except Empty:
            break
        a, b = 0, 1
        for _ in range(value):
            a, b = b, a + b
        # Store only the final term; writing inside the loop would update
        # the shared mapping once per iteration for no benefit.
        fibo_dict[value] = a
        logger.info("consumer [%s] getting value [%d] from queue..."
                    % (current_process().name, value))

if __name__ == '__main__':
    data_queue = Queue()
    number_of_cpus = cpu_count()
    manager = Manager()
    fibo_dict = manager.dict()

    # Run the producer to completion before any consumer starts.
    producer = Process(target=producer_task, args=(data_queue, fibo_dict))
    producer.start()
    producer.join()

    # One consumer process per CPU, all sharing the queue and the dict.
    consumer_list = [
        Process(target=consumer_task, args=(data_queue, fibo_dict))
        for _ in range(number_of_cpus)
    ]
    for consumer in consumer_list:
        consumer.start()
    for consumer in consumer_list:
        consumer.join()

    logger.info(fibo_dict)


# Crawling the Web using ProcessPoolExecutor

In [None]:
# %load process_pool_executor_web_crawler.py

import sys, time, random, re, requests
import concurrent.futures
from multiprocessing import Queue, cpu_count, current_process, Manager


def group_urls_task(urls, result_dict, html_link_regex):
    """Move one URL from *urls* into *result_dict* as a key mapped to None.

    Waits up to 0.05 seconds for an item. If none arrives, a message is
    printed and *result_dict* is left untouched.

    Args:
        urls: Queue-like object whose ``get(block, timeout)`` raises the
            standard ``Empty`` exception on timeout.
        result_dict: Mapping updated in place with ``url -> None``.
        html_link_regex: Unused here; kept so existing callers that pass
            it continue to work.
    """
    # ``queue`` is not imported at file level (only multiprocessing's
    # ``Queue`` factory is), so resolve the exception class locally.
    # The module is named ``queue`` on Python 3 and ``Queue`` on Python 2.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    try:
        url = urls.get(True, 0.05)
    except Empty:
        print('Nothing to be done, queue is empty')
        return
    result_dict[url] = None
    print("[%s] putting url [%s] in dictionary..." % (
        current_process().name, url))

def crawl_task(url, html_link_regex):
    """Fetch *url* and return the links matched by *html_link_regex*.

    The crawl is best-effort: if fetching or parsing fails, the exception
    info is printed and an empty link list is returned, so callers always
    receive a ``(url, links)`` pair.

    Args:
        url: Address to download.
        html_link_regex: Compiled pattern whose ``findall`` is applied to
            the response text.

    Returns:
        Tuple ``(url, links)`` where ``links`` is the ``findall`` result,
        or ``[]`` when the crawl failed.
    """
    links = []
    try:
        request_data = requests.get(url)
        print("[%s] crawling url [%s] ..." % (
            current_process().name, url))
        links = html_link_regex.findall(request_data.text)
    except Exception:
        # The previous ``raise`` was discarded by ``return`` in ``finally``;
        # keep the report-and-continue outcome, but state it explicitly and
        # stop swallowing KeyboardInterrupt/SystemExit.
        print(sys.exc_info())
    return (url, links)

if __name__ == '__main__':
    manager = Manager()
    urls = manager.Queue()
    # Seed URLs to crawl.
    for seed_url in ('http://www.google.com',
                     'http://br.bing.com/',
                     'https://duckduckgo.com/',
                     'https://github.com/',
                     'http://br.search.yahoo.com/'):
        urls.put(seed_url)
    result_dict = manager.dict()

    # Raw string so that ``\s`` reaches the regex engine instead of being
    # treated as an (invalid) string escape sequence.
    html_link_regex = \
        re.compile(r'<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')

    number_of_cpus = cpu_count()

    # Phase 1: move every queued URL into the shared dict as a key.
    with concurrent.futures.ProcessPoolExecutor(max_workers=number_of_cpus) as group_link_processes:
        for _ in range(urls.qsize()):
            group_link_processes.submit(group_urls_task, urls, result_dict, html_link_regex)

    # Phase 2: crawl each URL and store its links as they complete.
    with concurrent.futures.ProcessPoolExecutor(max_workers=number_of_cpus) as crawler_link_processes:
        future_tasks = {crawler_link_processes.submit(crawl_task, url, html_link_regex): url for url in result_dict.keys()}
        for future in concurrent.futures.as_completed(future_tasks):
            # Fetch the result once and unpack it.
            crawled_url, links = future.result()
            result_dict[crawled_url] = links

    for url, links in result_dict.items():
        # A page with no matching links (or a failed crawl) has nothing
        # at index 0, so report it instead of indexing.
        if links:
            print("[%s] with links : [%s..." % (url, links[0]))
        else:
            print("[%s] with no links found" % url)