In [4]:
import getpass

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, explode_outer

# OS login name; reused below for the warehouse path and the application name.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session on YARN.
# NOTE(review): 'spark.ui.port' = '0' presumably lets each session pick a free
# UI port on a shared gateway - confirm against the cluster setup.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Analyze GitHub Archive Data')
    .master('yarn')
    .getOrCreate()
)

In [None]:
%%sh

# List the contents of this user's itv-github landing directory on HDFS.
# NOTE(review): the path is built from $USER here but from getpass.getuser()
# in the Python cells - presumably the same account; confirm.
hdfs dfs -ls /user/${USER}/itv-github/landing/

In [5]:
# Load a single gzip-compressed JSON file from the landing directory.
# No schema is supplied, so Spark infers it from the data.
ghdata = spark.read.json(f'/user/{username}/itv-github/landing/2021-01-13-0.json.gz')

In [None]:
# Print the full schema of the loaded DataFrame, including nested fields.
ghdata.printSchema()

In [None]:
# Preview the `repo` column as a single (struct) column.
ghdata.select('repo').show()

In [8]:
# Inspect only the `repo` struct to see which fields it contains.
ghdata.select('repo').printSchema()

root
 |-- repo: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- url: string (nullable = true)



In [9]:
# Struct fields are addressed with dot notation; the resulting columns are
# named after the leaf field (id, name, url).
ghdata.select('repo.id', 'repo.name', 'repo.url').show()

+---------+--------------------+--------------------+
|       id|                name|                 url|
+---------+--------------------+--------------------+
| 67224522|   i-RIC/prepost-gui|https://api.githu...|
|329141406| kaneda96/React-quiz|https://api.githu...|
|221279833|archesproject/arc...|https://api.githu...|
|182814691|    Audentio/kinetic|https://api.githu...|
|  4542716|       NixOS/nixpkgs|https://api.githu...|
|329130975|   eterwin/schastota|https://api.githu...|
|104382627|littlebizzy/slick...|https://api.githu...|
|302490178|   qmk/qmk_keyboards|https://api.githu...|
|156042726|MaybeNotWrong/lc-sep|https://api.githu...|
|329144511|direwolf-github/e...|https://api.githu...|
| 91074692|zalando/postgres-...|https://api.githu...|
|280011532|       GeopJr/GeopJr|https://api.githu...|
| 32481543|cBioPortal/cbiopo...|https://api.githu...|
|270887418|feedarchive/freen...|https://api.githu...|
|322448852|ehenn345/hf_helpe...|https://api.githu...|
|325641835|machinegunhairy/P

In [None]:
# `repo.*` expands every field of the struct into its own column,
# shown here next to the top-level `created_at` column.
ghdata.select('created_at', 'repo.*').show()

In [11]:
# `payload.commits` is an array of structs (one struct per commit).
ghdata.select('payload.commits').printSchema()

root
 |-- commits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author: struct (nullable = true)
 |    |    |    |-- email: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- distinct: boolean (nullable = true)
 |    |    |-- message: string (nullable = true)
 |    |    |-- sha: string (nullable = true)
 |    |    |-- url: string (nullable = true)



In [14]:
# Number of rows in the loaded DataFrame, before any array is exploded.
ghdata.count()

90911

In [13]:
# explode() emits one output row per array element, so `commits` changes
# from an array of structs to a plain struct column.
(
    ghdata
    .select(explode('payload.commits').alias('commits'))
    .printSchema()
)

root
 |-- commits: struct (nullable = true)
 |    |-- author: struct (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- distinct: boolean (nullable = true)
 |    |-- message: string (nullable = true)
 |    |-- sha: string (nullable = true)
 |    |-- url: string (nullable = true)



In [15]:
# Row count after explode(): rows whose `payload.commits` is null or an
# empty array produce no output rows, so they drop out of this count.
(
    ghdata
    .select(explode('payload.commits').alias('commits'))
    .count()
)

75708

In [17]:
# explode_outer() behaves like explode() but keeps rows whose array is null
# or empty, emitting a single row with a null `commits` value for each.
(
    ghdata
    .select(explode_outer('payload.commits').alias('commits'))
    .count()
)

119495