In [1]:
from pyspark.sql import SparkSession

import getpass
# OS login name of the current user; it namespaces the warehouse path and the app name.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session with master set to YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')  # 0 lets the Spark UI bind to an arbitrary free port
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Analyze GitHub Archive Data')
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load the `ghactivity` table from the per-user `<username>_raw` database as a DataFrame.
ghdata = spark.read.table(f'{username}_raw.ghactivity')

In [None]:
# Print the column tree of ghdata so nested fields (e.g. payload.*) can be looked up.
ghdata.printSchema()

In [4]:
# List each distinct value of the `type` column; truncate=False keeps long names intact.
(
    ghdata
    .select('type')
    .distinct()
    .show(truncate=False)
)

+-----------------------------+
|type                         |
+-----------------------------+
|PullRequestReviewEvent       |
|PushEvent                    |
|GollumEvent                  |
|ReleaseEvent                 |
|CommitCommentEvent           |
|CreateEvent                  |
|PullRequestReviewCommentEvent|
|IssueCommentEvent            |
|DeleteEvent                  |
|IssuesEvent                  |
|ForkEvent                    |
|PublicEvent                  |
|MemberEvent                  |
|WatchEvent                   |
|PullRequestEvent             |
+-----------------------------+



In [5]:
# List each distinct value of the nested `payload.ref_type` field (nulls included).
(
    ghdata
    .select('payload.ref_type')
    .distinct()
    .show()
)

+----------+
|  ref_type|
+----------+
|      null|
|       tag|
|    branch|
|repository|
+----------+



In [8]:
from pyspark.sql.functions import count, col, lit

# Number of events per payload.ref_type, most frequent first.
# NOTE(review): the recorded counts for this cell sum to 10,304,340, while the recorded
# ghdata.count() output is 5,686,929 -- the saved outputs look like they come from
# different runs; restart the kernel and run all cells to confirm.
(
    ghdata
    .groupBy('payload.ref_type')
    .agg(count(lit(1)).alias('event_count'))
    .orderBy(col('event_count').desc())
    .show()
)

+----------+-----------+
|  ref_type|event_count|
+----------+-----------+
|      null|    8550769|
|    branch|    1183829|
|repository|     438739|
|       tag|     131003|
+----------+-----------+



In [6]:
# Total number of rows in ghdata (displayed as the cell's result).
ghdata.count()

5686929

In [None]:
from pyspark.sql.functions import substring, count, col, lit

# Repositories created per day: keep CreateEvent rows whose ref_type is "repository",
# group them by the first 10 characters of created_at, and list the groups in order.
# (presumably created_at starts with a yyyy-MM-dd date -- confirm against the schema)
(
    ghdata
    .filter('payload.ref_type = "repository" AND type = "CreateEvent"')
    .groupBy(substring('created_at', 1, 10).alias('created_dt'))
    .agg(count(lit(1)).alias('repo_count'))
    .orderBy('created_dt')
    .show()
)