In [2]:
# Standard library
import json

# Third-party
from bokeh.plotting import output_notebook
from dask import bag

# Set Bokeh up to render its plots inside the notebook.
output_notebook()

Some of this notebook is taken from [the Dask Examples repository](https://github.com/dask/dask-examples/blob/master/github-on-ec2.ipynb)

To gather the data, I ran this in my terminal from the `data` directory:

```bash
wget http://data.githubarchive.org/2016-01-01-{0..23}.json.gz
wget http://data.githubarchive.org/2015-12-31-{0..23}.json.gz
```

This is not (by any means) big data, but it works well as an example.

In [3]:
# Build a bag over every matching gzipped file and parse each line as JSON.
db = (
    bag.read_text(['../data/2016*.json.gz', '../data/2015*.json.gz'])
    .map(json.loads)
)

In [4]:
# Total number of records in the bag.
db.count().compute()

859983

In [5]:
# Pull a single record and display it to see what an event looks like.
first = db.take(1)[0]
first

{'actor': {'avatar_url': 'https://avatars.githubusercontent.com/u/8684332?',
  'gravatar_id': '',
  'id': 8684332,
  'login': 'Deovandski',
  'url': 'https://api.github.com/users/Deovandski'},
 'created_at': '2016-01-01T00:00:01Z',
 'id': '3486701206',
 'payload': {'action': 'started'},
 'public': True,
 'repo': {'id': 29355212,
  'name': 'bssthu/KSP_GPWS',
  'url': 'https://api.github.com/repos/bssthu/KSP_GPWS'},
 'type': 'WatchEvent'}

In [6]:
# Take ten records and keep the last one; per the output below, this event
# also carries an 'org' entry and a list of commits inside 'payload'.
tenth = db.take(10)[-1]
tenth

{'actor': {'avatar_url': 'https://avatars.githubusercontent.com/u/2889931?',
  'gravatar_id': '',
  'id': 2889931,
  'login': 'dastgir',
  'url': 'https://api.github.com/users/dastgir'},
 'created_at': '2016-01-01T00:00:02Z',
 'id': '3486701226',
 'org': {'avatar_url': 'https://avatars.githubusercontent.com/u/6535184?',
  'gravatar_id': '',
  'id': 6535184,
  'login': 'ROClientSide',
  'url': 'https://api.github.com/orgs/ROClientSide'},
 'payload': {'before': '3a0e13ce2f3e4b0eb80db313a114c60a386f71f5',
  'commits': [{'author': {'email': '0e0740481823b422262defa4314b7e6de25f1598@rocketmail.com',
     'name': 'Dastgir'},
    'distinct': True,
    'message': '2014-10-10data_true_k.rgz',
    'sha': '4c77d2aec345ec9288a4def5f83646e1e0b122bf',
    'url': 'https://api.github.com/repos/ROClientSide/kRO-RAW/commits/4c77d2aec345ec9288a4def5f83646e1e0b122bf'}],
  'distinct_size': 1,
  'head': '4c77d2aec345ec9288a4def5f83646e1e0b122bf',
  'push_id': 919779014,
  'ref': 'refs/heads/master',
  'size

In [7]:
# Count how often each value of the 'type' field occurs, timing the run.
%time db.pluck('type').frequencies().compute()

CPU times: user 92 ms, sys: 36 ms, total: 128 ms
Wall time: 5.87 s


[('GollumEvent', 6048),
 ('ReleaseEvent', 3383),
 ('CommitCommentEvent', 3703),
 ('IssuesEvent', 34760),
 ('IssueCommentEvent', 60842),
 ('DeleteEvent', 15690),
 ('WatchEvent', 84809),
 ('PullRequestEvent', 36867),
 ('PublicEvent', 658),
 ('CreateEvent', 102296),
 ('PushEvent', 472118),
 ('ForkEvent', 26823),
 ('PullRequestReviewCommentEvent', 9219),
 ('MemberEvent', 2767)]

In [8]:
import re
time_pattern = re.compile('[\d\-]+T(?P<hour>[\d]+)')

pushes = db.filter(lambda x: x['type'] == 'PushEvent')
hours = pushes.pluck('created_at').map(lambda x: re.search(time_pattern, x).group('hour'))
top_10_hours = hours.frequencies().topk(10, key=lambda time, count: count)
%time top_10_hours.compute()

CPU times: user 104 ms, sys: 52 ms, total: 156 ms
Wall time: 6.12 s


[('16', 23666),
 ('17', 22998),
 ('15', 22889),
 ('14', 22078),
 ('18', 21891),
 ('19', 21707),
 ('20', 20977),
 ('22', 20813),
 ('13', 20653),
 ('21', 20329)]

In [9]:
def get_hours(x):
    """Grouping key for foldby: the hour taken from an event's 'created_at' string."""
    timestamp = x['created_at']
    found = time_pattern.search(timestamp)
    return found.group('hour')

def binop(total, x):
    """Fold step: add the number of commits in event `x` to the running total."""
    commit_list = x['payload']['commits']
    return total + len(commit_list)

def combine(total1, total2):
    """Merge two partial commit totals into a single total."""
    merged = total1 + total2
    return merged

commits = pushes.foldby(get_hours, binop, initial=0, combine=combine)
top_commits = commits.topk(10, key=lambda time, count: count)
%time top_commits.compute()

CPU times: user 120 ms, sys: 36 ms, total: 156 ms
Wall time: 6.47 s


[('16', 35982),
 ('17', 35502),
 ('15', 34556),
 ('19', 34045),
 ('14', 33403),
 ('18', 32819),
 ('22', 31887),
 ('20', 31542),
 ('13', 30938),
 ('21', 30915)]

In [10]:
messages = pushes.pluck('payload').map(lambda x: ' '.join([c['message'].lower() for c in x['commits']]))
top_10_words = messages.str.split().concat().frequencies().topk(10, lambda word, count: count)
%time top_10_words.compute()

CPU times: user 8.78 s, sys: 96 ms, total: 8.88 s
Wall time: 13.9 s


[('to', 141009),
 ('the', 129375),
 ('for', 81087),
 ('update', 77323),
 ('and', 74744),
 ('of', 63951),
 ('add', 63837),
 ('in', 60930),
 ('from', 60303),
 ('merge', 59281)]

If you haven't used `nltk` before, you'll need to download the stopwords corpus first. To do so, run:

`import nltk; nltk.download()`

Follow the prompt and select (d) for Download and then type: `stopwords`

Then you can use (q) to quit once the download is completed.

In [1]:
from nltk.corpus import stopwords

In [12]:
def get_combined_messages(x, stop_words=None):
    """Join a push payload's commit messages into one stop-word-free string.

    Args:
        x: A mapping with a 'commits' list; each commit has a 'message' string.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is used.

    Returns:
        The lowercased messages joined by single spaces, with every
        whitespace-separated word found in the stop-word collection removed.
    """
    if stop_words is None:
        # Fetch the list once per call rather than once per word.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks in the filter below.
    excluded = frozenset(stop_words)

    long_str = ' '.join(c['message'].lower() for c in x['commits'])
    return ' '.join(w for w in long_str.split() if w not in excluded)

In [14]:
# Run get_combined_messages over every push payload and preview five results.
long_strs = pushes.pluck('payload').map(get_combined_messages)
long_strs.take(5)

('857 threads update',
 '',
 '2014-10-10data_true_k.rgz',
 'update',
 'fix hsqldb pom.xml file change-id: i44280df790fce678649c36ed736eff93c0ab1ac7 signed-off-by: sharon aicler <saichler@cisco.com>')

In [15]:
# Split the filtered strings into words, merge the lists, count occurrences,
# and keep the twenty most frequent words. Nothing is computed yet.
top_20_words = (
    long_strs.str.split()
    .concat()
    .frequencies()
    .topk(20, lambda word, count: count)
)

In [16]:
from dask.diagnostics import Profiler

# Run the top-20 computation with the profiler active.
with Profiler() as prof:
    res = top_20_words.compute()

# Show what the profiler recorded as a plot.
prof.visualize()

<bokeh.plotting.figure.Figure at 0x7ffb6d59cb70>

In [17]:
# Display the result captured inside the profiled block.
res

[('update', 77323),
 ('add', 63837),
 ('merge', 59281),
 ('added', 44989),
 ('fix', 42818),
 ('-', 35355),
 ('request', 32685),
 ('pull', 31862),
 ('branch', 25603),
 ('git-svn-id:', 24360),
 ('new', 23980),
 ('updated', 20548),
 ('signed-off-by:', 20469),
 ('commit', 19870),
 ('fdecad78-55fc-0310-b1b2-d7d25cf747c9', 19411),
 ('test', 18522),
 ('use', 17921),
 ('change', 16499),
 ('remove', 16463),
 ('fixed', 16147)]