In [1]:
import resource


def raise_nofile_limit(target=4096):
    """Raise this process's soft open-file limit to at least ``target``.

    A ``!ulimit -n ...`` shell escape runs in a short-lived subshell, so it
    never changes the limit of the notebook kernel itself. This adjusts the
    limit of the running kernel process directly instead.
    NOTE(review): presumably the goal is to give the kernel enough file
    descriptors for the Dask client/scheduler connections -- confirm.

    Parameters
    ----------
    target : int
        Desired minimum soft limit for ``RLIMIT_NOFILE``.

    Returns
    -------
    int
        The soft limit in effect after the call.

    Raises
    ------
    ValueError, OSError
        Propagated from ``resource.setrlimit`` if the platform rejects the
        requested value.
    """
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    unlimited = resource.RLIM_INFINITY
    if soft == unlimited or soft >= target:
        # Already high enough; never lower an existing limit.
        return soft
    # The soft limit may only be raised as far as the hard limit allows.
    new_soft = target if hard == unlimited else min(target, hard)
    resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft, hard))
    return new_soft


# Displayed as the cell output so the effective limit is visible.
raise_nofile_limit(4096)

In [2]:
from dask.distributed import Client, progress
from dask_jobqueue import SGECluster

# What every SGE job asks the grid engine for: the short queue, a four-hour
# walltime and 24 slots in the "smp" parallel environment. Each job is given
# 24 cores and 250GB of memory and is configured with 24 worker processes.
sge_job_spec = dict(
    queue="grid_short.q",
    walltime="04:00:00",
    job_extra=["-pe smp 24"],
    cores=24,
    processes=24,
    memory="250GB",
    local_directory="/var/tmp",
)

# Network interfaces: "ib0" is the general setting, while the scheduler is
# explicitly pointed at "bond1" through its own options.
network_spec = dict(
    interface="ib0",
    scheduler_options={"interface": "bond1"},
)

cluster = SGECluster(**sge_job_spec, **network_spec)

# Ask for ten jobs with the specification above.
cluster.scale(jobs=10)

# Connect a client to the cluster.
client = Client(cluster)

  from distributed.utils import tmpfile


In [3]:
import dask.bag as db
import dask.dataframe as dd
import json

# Glob selecting the event dump files to read (the ".gz" suffix suggests
# gzip-compressed input -- compression handling is left to dask's default).
events_glob = "filtered/events.*.json.gz"

# Bag of raw text lines read from every file matching the glob.
lines = db.read_text(events_glob)

def parse_json(s):
    """Decode one line of text into an event dict.

    Parameters
    ----------
    s : str
        A single line of JSON text.

    Returns
    -------
    dict
        The decoded JSON object when ``s`` contains one. Malformed or empty
        text, a non-string input, or valid JSON whose top level is not an
        object (e.g. ``null`` or a list) all yield the sentinel
        ``{"type": "ParseError"}``, so every result is a dict.
    """
    try:
        event = json.loads(s)
    except (ValueError, TypeError, RecursionError):
        # Only decoding failures are handled here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return {"type": "ParseError"}
    if not isinstance(event, dict):
        # Valid JSON, but not an object: downstream code indexes by key.
        return {"type": "ParseError"}
    return event

# Decode every text line into an event record, element by element.
events = db.map(parse_json, lines)

In [17]:
def has_event_type(event_type):
    """Build a predicate selecting event records of one type.

    Parameters
    ----------
    event_type : str
        Value the record's ``"type"`` field must equal.

    Returns
    -------
    callable
        ``matches(event) -> bool``. Records that are not dicts, or that have
        no ``"type"`` key, do not match (instead of raising and failing the
        whole computation).
    """
    def matches(event):
        return isinstance(event, dict) and event.get("type") == event_type

    return matches


# One bag per event type of interest, all derived from the same `events` bag.
issues_events = events.filter(has_event_type("IssuesEvent"))
issue_comment_events = events.filter(has_event_type("IssueCommentEvent"))
pull_request_events = events.filter(has_event_type("PullRequestEvent"))
pull_request_review_comment_events = events.filter(has_event_type("PullRequestReviewCommentEvent"))
push_events = events.filter(has_event_type("PushEvent"))
release_events = events.filter(has_event_type("ReleaseEvent"))

In [29]:
def issue_record(e):
    """Flatten one issues event into the flat row written to ``./issues``.

    Expects ``e["payload"]["issue"]`` to be a dict; the pipeline below filters
    out events where it is not.
    """
    issue = e["payload"]["issue"]
    return {
        "action": e["payload"]["action"],
        "actor": e["actor"]["login"],
        "repo": e["repo"]["name"],
        "number": issue["number"],
        "title": issue["title"],
        "body": issue["body"],
        "created_at": issue["created_at"],
        "updated_at": issue["updated_at"],
        "closed_at": issue["closed_at"],
    }


# Column name -> dtype mapping handed to ``to_dataframe``.
issues_meta = {
    "action": "string",
    "actor": "string",
    "repo": "string",
    "number": "int64",
    "title": "string",
    "body": "string",
    "created_at": "datetime64[ns]",
    "updated_at": "datetime64[ns]",
    "closed_at": "datetime64[ns]",
}

(
    issues_events
    # Keep only events whose payload carries the issue as a dict; any other
    # payload shape is dropped before the fields are read.
    .filter(lambda e: isinstance(e["payload"]["issue"], dict))
    .map(issue_record)
    .to_dataframe(meta=issues_meta)
    .set_index("created_at")
    # Explicit keyword: the first positional parameter of ``repartition`` is
    # ``divisions``, not the partition count.
    .repartition(npartitions=10)
    .to_parquet("./issues", overwrite=True)
)

In [28]:
def pull_request_record(e):
    """Flatten one pull-request event into the row written to ``./pull_requests``.

    ``number`` and ``title`` are required on the pull request; the remaining
    pull-request fields are optional and come back as ``None`` when absent,
    except ``created_at``, which falls back to the event's own ``created_at``.
    """
    pr = e["payload"]["pull_request"]
    return {
        "action": e["payload"]["action"],
        "actor": e["actor"]["login"],
        "repo": e["repo"]["name"],
        "number": pr["number"],
        "title": pr["title"],
        "body": pr.get("body"),
        "created_at": pr.get("created_at", e["created_at"]),
        "updated_at": pr.get("updated_at"),
        "closed_at": pr.get("closed_at"),
        "merged_at": pr.get("merged_at"),
    }


# Column name -> dtype mapping handed to ``to_dataframe``.
pull_requests_meta = {
    "action": "string",
    "actor": "string",
    "repo": "string",
    "number": "int64",
    "title": "string",
    "body": "string",
    "created_at": "datetime64[ns]",
    "updated_at": "datetime64[ns]",
    "closed_at": "datetime64[ns]",
    "merged_at": "datetime64[ns]",
}

(
    pull_request_events
    .map(pull_request_record)
    .to_dataframe(meta=pull_requests_meta)
    .set_index("created_at")
    # Explicit keyword: the first positional parameter of ``repartition`` is
    # ``divisions``, not the partition count.
    .repartition(npartitions=10)
    .to_parquet("./pull_requests", overwrite=True)
)

In [31]:
def release_record(e):
    """Flatten one release event into the row written to ``./releases``.

    Every field read from ``e["payload"]["release"]`` is required; a missing
    key raises ``KeyError``.
    """
    release = e["payload"]["release"]
    return {
        "action": e["payload"]["action"],
        "actor": e["actor"]["login"],
        "repo": e["repo"]["name"],
        "tag_name": release["tag_name"],
        "name": release["name"],
        "created_at": release["created_at"],
        "published_at": release["published_at"],
        "tarball_url": release["tarball_url"],
        "zipball_url": release["zipball_url"],
    }


# Column name -> dtype mapping handed to ``to_dataframe``.
releases_meta = {
    "action": "string",
    "actor": "string",
    "repo": "string",
    "tag_name": "string",
    "name": "string",
    "created_at": "datetime64[ns]",
    "published_at": "datetime64[ns]",
    "tarball_url": "string",
    "zipball_url": "string",
}

(
    release_events
    .map(release_record)
    .to_dataframe(meta=releases_meta)
    .set_index("created_at")
    # Explicit keyword: the first positional parameter of ``repartition`` is
    # ``divisions``, not the partition count.
    .repartition(npartitions=10)
    .to_parquet("./releases", overwrite=True)
)