In [1]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrame

In [2]:
# Initializing SparkSession -- disabled variant, kept for reference only.
# Everything between the triple quotes is a bare string expression, so none of
# it is executed; the notebook just echoes the string back as the cell output.
# Compared with the active setup in the next cell, this variant does not set
# 'spark.jars', i.e. it relies on the BigQuery connector jar already being on
# Spark's classpath.
'''
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .master('local[*]') \
                    .appName('spark-read-from-bigquery') \
                    .config('parentProject', 'khung-playground') \
                    .config("credentialsFile", "../khung-playground-cb7110dd8c95.json").getOrCreate()
'''

'\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder                     .master(\'local[*]\')                     .appName(\'spark-read-from-bigquery\')                     .config(\'parentProject\', \'khung-playground\')                     .config("credentialsFile", "../khung-playground-cb7110dd8c95.json").getOrCreate()\n'

In [3]:
# The BigQuery connector jar must be visible to Spark. Two options:
#   * copy it into /usr/local/spark-3.2.1-bin-hadoop3.2/jars/, or
#   * point the 'spark.jars' setting at it while building the session (done here).
from pyspark.sql import SparkSession

# Local session using all cores; project and credentials file are passed to
# the connector through the session configuration.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('spark-read-from-bigquery')
    .config('parentProject', 'khung-playground')
    .config('spark.jars', '../spark-bigquery-latest_2.12.jar')
    .config("credentialsFile", "../khung-playground-cb7110dd8c95.json")
    .getOrCreate()
)

In [4]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

In [4]:
# BigQuery table that is loaded into df_language.
TABLE_LANGUAGE = "khung-playground.github.languages"
# BigQuery table that is loaded into df_commit.
TABLE_COMMIT = "khung-playground.github.commits"
# Language name used to filter the language rows; also the prefix of the output file name.
LANGUAGE = "Python"

In [5]:
def explode_nested_type(df: "DataFrame", data_type: str = "array") -> "DataFrame":
    """Flatten every top-level column of ``df`` whose type starts with ``data_type``.

    * ``"array"``  -- each matching column is replaced, under the same name,
      by one row per array element (``F.explode``). Rows whose array is null
      or empty are dropped by ``F.explode``. With several array columns the
      result contains every combination of their elements.
    * ``"struct"`` -- each matching column is replaced by its fields
      (``<column>.*``), which become top-level columns. Fields that share a
      name across different structs end up as duplicate column names.

    Columns of any other type are kept unchanged and come first in the
    result; the flattened columns follow in their original relative order.

    Args:
        df: Frame to flatten. It is not modified; a new frame is returned.
        data_type: Kind of nested column to flatten, ``"array"`` or ``"struct"``.

    Returns:
        A DataFrame with the plain columns followed by the flattened ones.

    Raises:
        ValueError: If ``data_type`` is neither ``"array"`` nor ``"struct"``.
    """
    # Reject unknown kinds up front: without this check the matching columns
    # would be left out of the final select and silently disappear.
    if data_type not in ("array", "struct"):
        raise ValueError(
            f"data_type must be 'array' or 'struct', got {data_type!r}"
        )

    plain_names = []
    nested_names = []
    for name, dtype in df.dtypes:
        if dtype.startswith(data_type):
            nested_names.append(name)
        else:
            plain_names.append(name)

    if data_type == "struct":
        # "<column>.*" expands a struct into one column per field.
        return df.select(
            *plain_names,
            *(F.col(f"{name}.*") for name in nested_names),
        )

    # Spark allows only one generator (such as explode) per select clause, so
    # the array columns are exploded one at a time instead of in one select.
    for name in nested_names:
        df = df.withColumn(name, F.explode(df[name]))

    # Restore the "plain columns first, flattened columns last" layout.
    return df.select(*plain_names, *nested_names)

### Language Dataset

In [6]:
# Load the languages table from BigQuery into a DataFrame
df_language = (
    spark.read
    .format('bigquery')
    .option('table', TABLE_LANGUAGE)
    .load()
)

In [7]:
# First pass (default data_type="array"): array-typed columns are exploded
# into one row per element.
df_language = explode_nested_type(df_language)
# Second pass: struct-typed columns are expanded so that their fields become
# top-level columns. Presumably the structs are the array elements produced
# by the first pass, which is why it runs second -- TODO confirm against the
# table schema.
df_language = explode_nested_type(df_language, "struct")

In [8]:
# Check the flattened layout: only top-level scalar columns should remain.
df_language.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- bytes: long (nullable = true)



In [9]:
# Keep only the rows whose language name equals LANGUAGE.
# The comparison is an exact string match, so the case has to match as well.
df_language = df_language.where(df_language["name"] == LANGUAGE)

In [10]:
# Columns that identify one language entry of one repository.
# NOTE: the name `duplicate_data` is reassigned later for the commit frame.
duplicate_data = ["repo_name", "name"]
# Keep a single row per (repo_name, name) pair. `bytes` is not part of the key,
# so which `bytes` value survives for a duplicated pair is not controlled here.
df_language = df_language.dropDuplicates(duplicate_data)

### Commit Dataset

In [28]:
# Only these columns of the commits table are needed downstream
commit_columns = ['commit', 'committer', 'repo_name']

# Load the commits table from BigQuery and narrow it to the columns above
df_commit = (
    spark.read
    .format('bigquery')
    .option('table', TABLE_COMMIT)
    .load()
    .select(*commit_columns)
)

In [29]:
# Expand the struct-typed columns of the selection into top-level columns.
df_commit = explode_nested_type(df_commit, 'struct')
# One of the expanded fields is called "name"; presumably it is renamed to keep
# it apart from the language frame's own "name" column once the two frames are
# joined. The "commiter_name" spelling is the one later cells refer to.
df_commit = df_commit.withColumnRenamed("name","commiter_name")

In [30]:
# Check the layout after the struct expansion and the rename.
df_commit.printSchema()

root
 |-- commit: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- commiter_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- time_sec: long (nullable = true)
 |-- tz_offset: long (nullable = true)
 |-- date: timestamp (nullable = true)



#### I am aware that some commit records share exactly the same timestamp and the same committer. In my view this does not make sense, so I treat such records as duplicates and drop them.

In [32]:
# Key for "same committer, same moment, same repository".
# The `commit` column is not part of the key, so rows that differ only in
# `commit` collapse into one; which of them is kept is not controlled here.
duplicate_data = ["commiter_name", "email", "time_sec", "tz_offset", "date", "repo_name"]
df_commit = df_commit.dropDuplicates(duplicate_data)


#### Join df_language with df_commit

In [33]:
# Join on repo_name. No `how` is passed, so this is Spark's default inner join:
# commits of repositories without a row in df_language are dropped.
# Passing the key as a list keeps a single repo_name column in the result.
df_commit_language = df_commit.join(df_language, on = ["repo_name"])

In [34]:
# Mark the joined frame for caching. cache() is lazy: the data is only stored
# once an action evaluates the frame. The call returns the DataFrame, which is
# why its repr appears as the cell output.
df_commit_language.cache()

DataFrame[repo_name: string, commit: string, commiter_name: string, email: string, time_sec: bigint, tz_offset: bigint, date: timestamp, name: string, bytes: bigint]

In [35]:
# Check the joined layout: commit columns followed by the language columns.
df_commit_language.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- commiter_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- time_sec: long (nullable = true)
 |-- tz_offset: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- name: string (nullable = true)
 |-- bytes: long (nullable = true)



#### Apply a window function to the merged data to calculate the time difference between consecutive commits

In [36]:
from pyspark.sql.window import Window

# One window per repository, rows ordered by commit date.
# NOTE: there is no tie-breaker, so rows of one repository that share the same
# `date` are ordered however Spark happens to process them.
partition_columns = ["repo_name"]
windowSpec = Window.partitionBy(partition_columns).orderBy("date")

df_commit_language = (
    df_commit_language
    # 1-based position of the commit within its repository
    .withColumn("commit_seq", F.row_number().over(windowSpec))
    # time_sec of the previous row in the window (null for the first row)
    .withColumn("lag_time_sec", F.lag("time_sec", 1).over(windowSpec))
    # seconds elapsed since the previous row (null for the first row)
    .withColumn("prv_time_diff", F.col("time_sec") - F.col("lag_time_sec"))
)

In [37]:
# Check that commit_seq, lag_time_sec and prv_time_diff were added.
df_commit_language.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- commiter_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- time_sec: long (nullable = true)
 |-- tz_offset: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- name: string (nullable = true)
 |-- bytes: long (nullable = true)
 |-- commit_seq: integer (nullable = false)
 |-- lag_time_sec: long (nullable = true)
 |-- prv_time_diff: long (nullable = true)



In [38]:
# Preview the result; show() without arguments truncates long string values.
df_commit_language.show()

+----------------+--------------------+------------------+--------------------+----------+---------+-------------------+------+-----+----------+------------+-------------+
|       repo_name|              commit|     commiter_name|               email|  time_sec|tz_offset|               date|  name|bytes|commit_seq|lag_time_sec|prv_time_diff|
+----------------+--------------------+------------------+--------------------+----------+---------+-------------------+------+-----+----------+------------+-------------+
|Microsoft/vscode|8f35cc4768393b254...|       Erich Gamma|8c10e1560732a7a60...|1447421978|       60|2015-11-13 13:39:38|Python| 2405|         1|        null|         null|
|Microsoft/vscode|6f9e2ae3907632e2f...|        Chris Dias|711c73f64afdce07b...|1447426118|       60|2015-11-13 14:48:38|Python| 2405|         2|  1447421978|         4140|
|Microsoft/vscode|0a2f0cbc5c7ebc457...|   Benjamin Pasero|d377d17da589177b3...|1447428762|       60|2015-11-13 15:32:42|Python| 2405|       

#### Write back to filesystem

In [40]:
import datetime

# Output name: <LANGUAGE>_commits_<today as YYYY-MM-DD>.parguet
# The ".parguet" spelling is kept as-is because the read-back cell loads a
# path with that same spelling.
write_file_date = datetime.date.today().isoformat()
file_name = f"{LANGUAGE}_commits_{write_file_date}.parguet"

# "overwrite" replaces any output already written under the same name.
(
    df_commit_language.write
    .mode("overwrite")
    .parquet(f"../spark_output/{file_name}")
)
                  

#### To read the data I cleaned

In [44]:
# Read back the output written by the previous cell. The path is built from
# `file_name` rather than a hard-coded date, so the cell keeps working when the
# notebook is re-run on a different day.
df = spark.read.parquet(f"../spark_output/{file_name}")
df.show()

+-----------+--------------------+-------------+--------------------+----------+---------+-------------------+------+-------+----------+------------+-------------+
|  repo_name|              commit|commiter_name|               email|  time_sec|tz_offset|               date|  name|  bytes|commit_seq|lag_time_sec|prv_time_diff|
+-----------+--------------------+-------------+--------------------+----------+---------+-------------------+------+-------+----------+------------+-------------+
|apple/swift|18844bc65229786b9...|Chris Lattner|721a06c08c6174084...|1279410659|        0|2010-07-17 23:50:59|Python|1954012|         1|        null|         null|
|apple/swift|afc81c1855bf71131...|Chris Lattner|721a06c08c6174084...|1279411451|        0|2010-07-18 00:04:11|Python|1954012|         2|  1279410659|          792|
|apple/swift|5e88a2175579b0b2e...|Chris Lattner|721a06c08c6174084...|1279414087|        0|2010-07-18 00:48:07|Python|1954012|         3|  1279411451|         2636|
|apple/swift|874

DataFrame[repo_name: string, commit: string, commiter_name: string, email: string, time_sec: bigint, tz_offset: bigint, date: timestamp, name: string, bytes: bigint, commit_seq: int, lag_time_sec: bigint, prv_time_diff: bigint]