Chapter 2. Working with large datasets faster: parallelization and the map function
====
### Mastering Large Datasets with Python by JT Wolohan 



### Listing 2.1 and 2.2 :: Formatting phone numbers with loops and maps

In [None]:
import re


class PhoneFormatter:
    """Render loosely formatted phone numbers as ``(XXX) XXX-XXXX``."""

    def __init__(self):
        # Matches a single digit; every other character is ignored.
        self.r = re.compile(r"\d")

    def pretty_format(self, phone_number):
        """Return ``phone_number`` formatted as ``(area) prefix-line``.

        Only the digits of the input are used, and they are counted from the
        right: the last four form the line number, the three before them the
        prefix and the three before those the area code. Digits further to
        the left (for example a leading country code) are dropped.
        """
        digits = "".join(self.r.findall(phone_number))
        area_code, first_3, last_4 = digits[-10:-7], digits[-7:-4], digits[-4:]
        return "({}) {}-{}".format(area_code, first_3, last_4)

In [None]:
# Sample inputs: the same ten digits written in four different styles
# (the last entry carries an extra "+1" in front of them).
phone_numbers = [
    "(123) 456-7890",
    "1234567890",
    "123.456.7890",
    "+1 123 456-7890"
]

# Formatter instance shared by the loop and map cells below.
P = PhoneFormatter()

In [None]:
# Loop version: format each number in turn and collect the results.
clean_numbers = []
for phone_number in phone_numbers:
    clean_numbers.append(P.pretty_format(phone_number))
print(clean_numbers)

In [None]:
# Map version: apply the formatter to every number; list() consumes the lazy
# map object so the formatted values themselves are printed.
print(list(map(P.pretty_format, phone_numbers)))

### Parallel blog processing

In [None]:
from datetime import date
from urllib import request

from multiprocessing import Pool

def days_between(start, stop):
    """Yield one blog URL per day from ``start`` up to, but excluding, ``stop``.

    Args:
        start: ``(year, month, day)`` sequence for the first day to include.
        stop: ``(year, month, day)`` sequence for the day to stop before.

    Yields:
        The blog base URL followed by the day formatted as ``MM-DD-YYYY``.
        Nothing is yielded when ``start`` is not earlier than ``stop``.
    """
    first_ordinal = date(*start).toordinal()
    end_ordinal = date(*stop).toordinal()
    # Consecutive ordinals are consecutive calendar days.
    for ordinal in range(first_ordinal, end_ordinal):
        day = date.fromordinal(ordinal)
        yield "http://jtwolohan.com/arch-rival-blog/" + day.strftime("%m-%d-%Y")

In [None]:
def get_url(path):
    """Fetch ``path`` and return the raw response body as bytes.

    The response is opened in a ``with`` block so it is closed as soon as the
    body has been read, rather than being left open after the call returns.

    Args:
        path: URL to open with ``urllib.request.urlopen``.

    Returns:
        The complete response body, undecoded.
    """
    with request.urlopen(path) as response:
        return response.read()

# Download every post in parallel. The pool is bound to `pool` rather than `P`
# so the PhoneFormatter instance `P` created earlier in the notebook is not
# rebound to a Pool, which would break re-running the phone-number cells.
with Pool() as pool:
    blog_posts = pool.map(get_url, days_between((2000, 1, 1), (2011, 1, 1)))

### Fizz Buzz - state and parallelization

In [51]:
class FizzBuzzer:
    """Stateful labeller: every third call yields "buzz", all others "fizz"."""

    def __init__(self):
        # Number of times foo() has been called on this instance.
        self.n = 0

    def foo(self, _):
        """Advance the call counter, then print and return this call's label.

        The argument is ignored; the label depends only on how many times
        this instance has been called so far.
        """
        self.n += 1
        x = "fizz" if self.n % 3 else "buzz"
        print(x)
        return x

In [None]:
# Sequential run: one instance is called 21 times in order, so its counter
# advances by exactly one per call.
FB = FizzBuzzer()
for i in range(21):
  FB.foo(i)

In [None]:
# Parallel run of the same stateful method. The pool is bound to `pool` rather
# than `P` so the name `P` (the PhoneFormatter created earlier in the
# notebook) is not rebound to a Pool.
with Pool() as pool:
    pool.map(FB.foo, range(1, 22))

### Wikipedia scraping

In [3]:
import json
from urllib import request, parse
from multiprocessing import Pool
from itertools import chain
import networkx as nx

In [8]:
def link_to_title(link):
    """Return the value stored under the ``"title"`` key of ``link``."""
    title = link["title"]
    return title

In [9]:
def clean_if_key(page, key):
    """Return the titles of the entries under ``page[key]``, if that key exists.

    Returns:
        An empty list when ``key`` is not in ``page``; otherwise a lazy ``map``
        (not a list) that applies ``link_to_title`` to each entry of
        ``page[key]``.
    """
    if key not in page.keys():
        return []
    return map(link_to_title, page[key])

In [20]:
def get_Wiki_links(pageTitle):
    """Fetch the links into and out of one English Wikipedia page.

    Sends a single MediaWiki API query for the page's ``links`` and
    ``linkshere`` properties, asking for up to 500 entries of each. Only that
    one response is read; no follow-up requests are made.

    Args:
        pageTitle: Title of the page to look up; it is URL-quoted before use.

    Returns:
        A dict with the keys ``"title"`` (``pageTitle`` exactly as given),
        ``"in-links"`` (titles of pages that link to this page) and
        ``"out-links"`` (titles of pages this page links to). Either list is
        empty when the response carries no such property for the page.
    """
    safe_title = parse.quote(pageTitle)
    url = ("https://en.wikipedia.org/w/api.php?action=query&"
           "prop=links|linkshere&pllimit=500&lhlimit=500&titles={}&"
           "format=json&formatversion=2").format(safe_title)
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(url) as response:
        page = response.read().decode('utf-8')
    j = json.loads(page)
    jpage = j['query']['pages'][0]
    # In the MediaWiki API, "links" are the links *from* this page (outbound)
    # and "linkshere" are the pages that link *to* it (inbound). Keep this
    # mapping straight: swapping it reverses every edge built from the result.
    outbound = clean_if_key(jpage, "links")
    inbound = clean_if_key(jpage, "linkshere")
    return {"title": pageTitle,
            "in-links": list(inbound),
            "out-links": list(outbound)}

In [21]:
def flatten_network(page):
    """Return the page's in-links followed by its out-links as one sequence."""
    inbound = page["in-links"]
    outbound = page["out-links"]
    return inbound + outbound

In [22]:
def page_to_edges(page):
    """Return the page's links as a list of ``(source, target)`` tuples.

    Out-links come first as ``(title, other)``, followed by in-links as
    ``(other, title)``.
    """
    edges = [(page['title'], target) for target in page['out-links']]
    edges.extend((source, page['title']) for source in page['in-links'])
    return edges

In [23]:
# Crawl one step out from the root page: fetch the root, then fetch every page
# it is linked with (in either direction) in parallel.
root = get_Wiki_links("Parallel_computing")
initial_network = flatten_network(root)
# Bound to `pool` rather than `P` so the PhoneFormatter instance `P` created
# earlier in the notebook is not rebound to a Pool.
with Pool() as pool:
    all_pages = pool.map(get_Wiki_links, initial_network)
    edge_lists = pool.map(page_to_edges, all_pages)
# Flatten into a real list. A bare chain iterator can be consumed only once,
# so re-running the graph-building cell would silently see no edges.
edges = list(chain.from_iterable(edge_lists))

In [None]:
# Build a directed graph from the (source, target) pairs and save it as GEXF.
G = nx.DiGraph()
G.add_edges_from(edges)
nx.readwrite.gexf.write_gexf(G, "./MyGraph.gexf")

[Ready for more? Go to chapter 3!](./Ch03_notebook.ipynb)