In [1]:
from pyspark.sql import SparkSession

import getpass
# Current OS login name; it scopes the warehouse directory and labels the app.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Analyze GitHub Archive Data')
    .master('yarn')
    .getOrCreate()
)

In [None]:
%%sh

# List the files in the current user's GitHub landing directory on HDFS.
# The path is quoted so the expanded ${USER} is never word-split or globbed.
hdfs dfs -ls "/user/${USER}/itv-github/landing/"

In [2]:
# Read the gzipped JSON file from the user's landing directory into a DataFrame.
ghdata = spark.read.json(
    f'/user/{username}/itv-github/landing/2021-01-13-0.json.gz'
)

In [None]:
# Print the schema of the loaded events DataFrame.
ghdata.printSchema()

In [7]:
# Count rows per value of the `type` column; truncate=False prints full values.
(
    ghdata
    .groupBy('type')
    .count()
    .show(truncate=False)
)

+-----------------------------+-----+
|type                         |count|
+-----------------------------+-----+
|PullRequestReviewEvent       |2493 |
|PushEvent                    |48569|
|GollumEvent                  |227  |
|ReleaseEvent                 |425  |
|CommitCommentEvent           |319  |
|CreateEvent                  |11762|
|PullRequestReviewCommentEvent|1727 |
|IssueCommentEvent            |6062 |
|DeleteEvent                  |2812 |
|IssuesEvent                  |2419 |
|ForkEvent                    |1697 |
|PublicEvent                  |376  |
|MemberEvent                  |206  |
|WatchEvent                   |4488 |
|PullRequestEvent             |7329 |
+-----------------------------+-----+



In [10]:
# Count rows per value of the nested `payload.action` field.
# Rows with no action value are grouped under null.
(
    ghdata
    .groupBy('payload.action')
    .count()
    .show(truncate=False)
)

+---------+-----+
|action   |count|
+---------+-----+
|null     |65762|
|created  |10282|
|reopened |67   |
|closed   |4407 |
|published|425  |
|opened   |5274 |
|added    |206  |
|started  |4488 |
+---------+-----+



In [14]:
# Inspect the data type of the nested `payload.head` field.
(
    ghdata
    .select('payload.head')
    .printSchema()
)

root
 |-- head: string (nullable = true)



In [16]:
# Preview the nested `payload.ref` field (show() prints the top 20 rows by default).
(
    ghdata
    .select('payload.ref')
    .show()
)

+--------------------+
|                 ref|
+--------------------+
|                null|
|                main|
|                null|
|snyk-fix-1e1002e1...|
|                null|
|   refs/heads/master|
|   refs/heads/master|
|     refs/heads/main|
|   refs/heads/master|
|                null|
|                null|
|   refs/heads/master|
|refs/heads/locald...|
|   refs/heads/master|
|     refs/heads/main|
|                null|
|                null|
|     refs/heads/main|
|refs/heads/cap35-...|
|                null|
+--------------------+
only showing top 20 rows

