In [None]:
from dask import bag
import json
from bokeh.plotting import output_notebook
# Configure Bokeh to render its plots inline in this notebook's output cells.
output_notebook()

Some of this notebook is taken from [the Dask Examples repository](https://github.com/dask/dask-examples/blob/master/github-on-ec2.ipynb)

To gather the data, I ran this in my terminal from the `data` directory:

```bash
wget http://data.githubarchive.org/2016-01-01-{0..23}.json.gz
wget http://data.githubarchive.org/2015-12-31-{0..23}.json.gz
```

This is not (by any means) big data, but it is enough to serve as an example.

In [None]:
# Read every gzipped GitHub Archive file as lines of text and parse each line
# as one JSON event.
db = (
    bag.read_text(['../data/2016*.json.gz', '../data/2015*.json.gz'])
    .map(json.loads)
)

In [None]:
# Total number of events across all loaded files.
db.count().compute()

In [None]:
# Peek at a single parsed event to see what a record looks like.
first = db.take(1)[0]
first

In [None]:
# Last of the (up to) ten records returned by `take`, to compare with `first`.
tenth = db.take(10)[-1]
tenth

In [None]:
# Count how many events there are for each value of the 'type' field, timing the run.
%time db.pluck('type').frequencies().compute()

In [None]:
import re
time_pattern = re.compile('[\d\-]+T(?P<hour>[\d]+)')

pushes = db.filter(lambda x: x['type'] == 'PushEvent')
hours = pushes.pluck('created_at').map(lambda x: re.search(time_pattern, x).group('hour'))
top_10_hours = hours.frequencies().topk(10, key=lambda time, count: count)
%time top_10_hours.compute()

In [None]:
def get_hours(x):
    """Grouping key for ``foldby``: the hour captured from a PushEvent's ``created_at``."""
    timestamp = x['created_at']
    match = time_pattern.search(timestamp)
    return match.group('hour')

def binop(total, x):
    """Add the number of commits carried by PushEvent ``x`` to the running ``total``."""
    event_commits = x['payload']['commits']
    return total + len(event_commits)

def combine(total1, total2):
    """Merge two partial commit totals into a single total."""
    merged_total = total1 + total2
    return merged_total

commits = pushes.foldby(get_hours, binop, initial=0, combine=combine)
top_commits = commits.topk(10, key=lambda time, count: count)
%time top_commits.compute()

In [None]:
messages = pushes.pluck('payload').map(lambda x: ' '.join([c['message'].lower() for c in x['commits']]))
top_10_words = messages.str.split().concat().frequencies().topk(10, lambda word, count: count)
%time top_10_words.compute()

If you haven't used `nltk` before, you'll need to download its corpora first. To do so, use this:

`import nltk; nltk.download()`

Follow the prompt and select (d) for Download and then type: `stopwords`

Then you can use (q) to quit once the download is completed.

In [None]:
from nltk.corpus import stopwords

In [None]:
# Cache for the default stopword set, filled on first use.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it only on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def get_combined_messages(x, stop_words=None):
    """Join a push payload's commit messages, lower-cased, with stopwords removed.

    Parameters
    ----------
    x : dict
        A push payload; its ``'commits'`` entry is iterated and each commit's
        ``'message'`` is used.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stopword list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        # Look the stopwords up once per call instead of once per word; the
        # original re-evaluated `stopwords.words('english')` for every token.
        stop_words = _english_stopwords()
    long_str = ' '.join(c['message'].lower() for c in x['commits'])
    return ' '.join(w for w in long_str.split() if w not in stop_words)

In [None]:
# Build one stopword-free, lower-cased message string per push payload,
# then preview the first five.
long_strs = pushes.pluck('payload').map(get_combined_messages)
long_strs.take(5)

In [None]:
# Split each cleaned message into words, flatten the per-push word lists into
# one bag of words, count each word and keep the 20 most frequent.
# `flatten` is the current name of the method formerly called `Bag.concat`.
top_20_words = long_strs.str.split().flatten().frequencies().topk(20, lambda word, count: count)

In [None]:
from dask.diagnostics import Profiler

# Run the word-count computation under dask's Profiler, then plot what it recorded.
with Profiler() as prof:
    res = top_20_words.compute()

prof.visualize()

In [None]:
# Show the result computed under the profiler: the top 20 (word, count) pairs.
res