In [None]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Analyze GitHub Archive Data'). \
    master('yarn'). \
    getOrCreate()

In [70]:
spark.conf.set('spark.sql.shuffle.partitions', 8)

In [71]:
process_dt = '2021-01-13'
ghdata = spark. \
    read. \
    json(f'/user/{username}/itv-github/landing/{process_dt}-*.json.gz')

In [72]:
from pyspark.sql.functions import substring, col

In [73]:
spark.sql(f'CREATE DATABASE IF NOT EXISTS {username}_raw')

DataFrame[]

In [74]:
spark.sql(f'DROP TABLE IF EXISTS {username}_raw.ghactivity')

DataFrame[]

In [75]:
%%sh

hdfs dfs -ls /user/${USER}/warehouse/${USER}_raw.db

In [76]:
%%sh

hdfs dfs -rm -R -skipTrash /user/${USER}/warehouse/${USER}_raw.db/ghactivity

rm: `/user/itversity/warehouse/itversity_raw.db/ghactivity': No such file or directory


CalledProcessError: Command 'b'\nhdfs dfs -rm -R -skipTrash /user/${USER}/warehouse/${USER}_raw.db/ghactivity\n'' returned non-zero exit status 1.

In [77]:
ghdata. \
    withColumn('year', substring('created_at', 1, 4)). \
    withColumn('month', substring('created_at', 6, 2)). \
    withColumn('day', substring('created_at', 9, 2)). \
    show()

+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+----+-----+---+
|               actor|          created_at|         id|                 org|             payload|public|                repo|             type|year|month|day|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+----+-----+---+
|[https://avatars....|2021-01-13T15:00:00Z|14806766244|                null|[opened,,,,,,,,,,...|  true|[328163487, heret...| PullRequestEvent|2021|   01| 13|
|[https://avatars....|2021-01-13T15:00:00Z|14806766250|[https://avatars....|[created,, [, COL...|  true|[155705622, conta...|IssueCommentEvent|2021|   01| 13|
|[https://avatars....|2021-01-13T15:00:00Z|14806766257|                null|[,,,, A robot pow...|  true|[329342078, gmone...|      CreateEvent|2021|   01| 13|
|[https://avatars....|2021-01-13T15:00:00Z|148

In [78]:
ghdata = ghdata. \
    withColumn('year', substring('created_at', 1, 4)). \
    withColumn('month', substring('created_at', 6, 2)). \
    withColumn('day', substring('created_at', 9, 2))

In [79]:
ghdata. \
    write. \
    partitionBy('year', 'month', 'day'). \
    saveAsTable(f'{username}_raw.ghactivity')

In [80]:
spark.sql(f'SHOW PARTITIONS {username}_raw.ghactivity').show(truncate=False)

+-------------------------+
|partition                |
+-------------------------+
|year=2021/month=01/day=13|
+-------------------------+



In [81]:
%%sh

hdfs dfs -ls -R /user/${USER}/warehouse/${USER}_raw.db/ghactivity

-rw-r--r--   2 itversity students          0 2021-01-24 09:48 /user/itversity/warehouse/itversity_raw.db/ghactivity/_SUCCESS
drwxr-xr-x   - itversity students          0 2021-01-24 09:47 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021
drwxr-xr-x   - itversity students          0 2021-01-24 09:47 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01
drwxr-xr-x   - itversity students          0 2021-01-24 09:48 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01/day=13
-rw-r--r--   2 itversity students  159325117 2021-01-24 09:47 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01/day=13/part-00000-b10eafc3-d753-4237-b5dd-eaafc9a0019d.c000.snappy.parquet
-rw-r--r--   2 itversity students  165619120 2021-01-24 09:48 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01/day=13/part-00001-b10eafc3-d753-4237-b5dd-eaafc9a0019d.c000.snappy.parquet
-rw-r--r--   2 itversity students  152417041 2021-01

In [82]:
spark.sql(f'''
    SELECT substring(created_at, 1, 10) AS created_dt, count(1)
    FROM {username}_raw.ghactivity
    GROUP BY created_dt
    ORDER BY created_dt
'''). \
    show()

+----------+--------+
|created_dt|count(1)|
+----------+--------+
|2021-01-13| 2829111|
+----------+--------+



In [92]:
from pyspark.sql.functions import substring, col
process_dt = input('Enter date to copy the data (yyyy-MM-dd): ')
spark.conf.set('spark.sql.shuffle.partitions', 8)
ghdata = spark. \
    read. \
    json(f'/user/{username}/itv-github/landing/{process_dt}-*.json.gz')
ghdata = ghdata. \
    withColumn('year', substring('created_at', 1, 4)). \
    withColumn('month', substring('created_at', 6, 2)). \
    withColumn('day', substring('created_at', 9, 2))
ghdata. \
    write. \
    mode('append'). \
    partitionBy('year', 'month', 'day'). \
    parquet(f'/user/{username}/warehouse/{username}_raw.db/ghactivity')

Enter date to copy the data (yyyy-MM-dd):  2021-01-16


In [93]:
spark.sql(f'SHOW PARTITIONS {username}_raw.ghactivity').show(truncate=False)

+-------------------------+
|partition                |
+-------------------------+
|year=2021/month=01/day=13|
|year=2021/month=01/day=14|
+-------------------------+



In [94]:
%%sh

hdfs dfs -ls -R /user/${USER}/warehouse/${USER}_raw.db/ghactivity

-rw-r--r--   2 itversity students          0 2021-01-24 09:59 /user/itversity/warehouse/itversity_raw.db/ghactivity/_SUCCESS
drwxr-xr-x   - itversity students          0 2021-01-24 09:47 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021
drwxr-xr-x   - itversity students          0 2021-01-24 09:59 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01
drwxr-xr-x   - itversity students          0 2021-01-24 09:48 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01/day=13
-rw-r--r--   2 itversity students  159325117 2021-01-24 09:47 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01/day=13/part-00000-b10eafc3-d753-4237-b5dd-eaafc9a0019d.c000.snappy.parquet
-rw-r--r--   2 itversity students  165619120 2021-01-24 09:48 /user/itversity/warehouse/itversity_raw.db/ghactivity/year=2021/month=01/day=13/part-00001-b10eafc3-d753-4237-b5dd-eaafc9a0019d.c000.snappy.parquet
-rw-r--r--   2 itversity students  152417041 2021-01

In [95]:
spark.sql(f'''
    MSCK REPAIR TABLE {username}_raw.ghactivity
''')

DataFrame[]

In [96]:
spark.sql(f'SHOW PARTITIONS {username}_raw.ghactivity').show(truncate=False)

+-------------------------+
|partition                |
+-------------------------+
|year=2021/month=01/day=13|
|year=2021/month=01/day=14|
|year=2021/month=01/day=15|
|year=2021/month=01/day=16|
+-------------------------+



In [97]:
spark.sql(f'''
    SELECT substring(created_at, 1, 10) AS created_dt, count(1)
    FROM {username}_raw.ghactivity
    GROUP BY created_dt
    ORDER BY created_dt
'''). \
    show()

+----------+--------+
|created_dt|count(1)|
+----------+--------+
|2021-01-13| 2829111|
|2021-01-14| 2857818|
|2021-01-15| 2652900|
|2021-01-16| 1964511|
+----------+--------+

