## PySpark SQL -- BigQuery (read/write)

In [1]:
# Transform events table into events_dwh
# Convert the epoch-millisecond `timestamp` column to a datetime with second precision (yyyy-MM-dd HH:mm:ss)
# Run in Cloud Jupyter Notebook

import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# credentials_location = '/root/.google/credentials/google-creds.json'


# Spark configuration for the `events` job: register the GCS connector jar
# and turn on service-account authentication for the Hadoop GCS layer.
conf = SparkConf()
conf.setAppName('events')
conf.set("spark.jars", "/usr/lib/spark/jars/gcs-connector-hadoop3-latest.jar")
conf.set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
# No explicit key file is configured here. To use one, additionally set
# "spark.hadoop.google.cloud.auth.service.account.json.keyfile" to its path.

# Obtain the session for this notebook (an already-active one is reused).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

# Location of the source table in BigQuery.
project_id = "semar-de-project1"
dataset_id = "project1"
table_source = "events"

# Configure the BigQuery reader first, then load the source table with it.
bq_reader = (
    spark.read.format('bigquery')
    .option("temporaryGcsBucket", "dataproc-temp-asia-southeast2-212352110204-1oi7hped")
    .option("project", project_id)
    .option("dataset", dataset_id)
)
df = bq_reader.load(table_source)

# Register the frame under a name that Spark SQL queries can reference.
df.createOrReplaceTempView("temp_events")

# Convert `timestamp` into a real TIMESTAMP column, truncated to whole seconds
# exactly as the previous from_unixtime() call did.
# from_unixtime(..., fmt) returns a STRING, and BigQuery time partitioning
# needs a TIMESTAMP/DATE field, so the string form could never be partitioned.
# NOTE(review): assumes the source `timestamp` holds epoch milliseconds — confirm.
events_transform = spark.sql("""
select timestamp_seconds(cast(timestamp / 1000 as bigint)) as timestamp,
    visitorid, event, itemid, transactionid
from temp_events
""")

# events_transform.show()

# Target table; `project_id` and `dataset_id` are the values defined for the
# read above (the target lives in the same project and dataset).
table_target = "events_dwh"

# Partitioning / clustering layout of the target table.
# The connector has no "PARTITION BY" / "CLUSTER BY" options; it expects
# `partitionField` + `partitionType` and `clusteredFields`.
partition_field = "timestamp"
partition_type = "MONTH"      # monthly partitions, matching the old 'yyyy-MM' intent
cluster_column = "event"      # comma-separated list of top-level columns

# NOTE(review): if the target table already exists with a different (or no)
# partitioning spec, BigQuery may reject the overwrite; drop the old table
# first in that case — confirm against the current state of the dataset.
(
    events_transform.write
    .format("bigquery")
    .option("temporaryGcsBucket", "dataproc-temp-asia-southeast2-212352110204-1oi7hped")
    .option("table", f"{project_id}.{dataset_id}.{table_target}")
    .option("partitionField", partition_field)
    .option("partitionType", partition_type)
    .option("clusteredFields", cluster_column)
    .mode('Overwrite')
    .save()
)

# Stop Spark session
spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/01 01:12:23 INFO SparkEnv: Registering MapOutputTracker
24/04/01 01:12:23 INFO SparkEnv: Registering BlockManagerMaster
24/04/01 01:12:23 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/04/01 01:12:24 INFO SparkEnv: Registering OutputCommitCoordinator
                                                                                

In [None]:
# Transform item_properties table into item_properties_dwh
# Convert the epoch-millisecond `timestamp` column to a datetime with second precision (yyyy-MM-dd HH:mm:ss)
# Run in Cloud Jupyter Notebook

import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# credentials_location = '/root/.google/credentials/google-creds.json'


# Spark configuration for the `item_properties` job: register the GCS
# connector jar and turn on service-account authentication for the Hadoop
# GCS layer.
conf = SparkConf()
conf.setAppName('item_properties')
conf.set("spark.jars", "/usr/lib/spark/jars/gcs-connector-hadoop3-latest.jar")
conf.set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
# No explicit key file is configured here. To use one, additionally set
# "spark.hadoop.google.cloud.auth.service.account.json.keyfile" to its path.

# Obtain the session for this notebook (an already-active one is reused).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

# Location of the source table in BigQuery.
project_id = "semar-de-project1"
dataset_id = "project1"
table_source = "item_properties"

# Configure the BigQuery reader first, then load the source table with it.
bq_reader = (
    spark.read.format('bigquery')
    .option("temporaryGcsBucket", "dataproc-temp-asia-southeast2-212352110204-1oi7hped")
    .option("project", project_id)
    .option("dataset", dataset_id)
)
df = bq_reader.load(table_source)

# Register the frame under a name that Spark SQL queries can reference.
df.createOrReplaceTempView("temp_item_properties")

# Convert `timestamp` into a real TIMESTAMP column, truncated to whole seconds
# exactly as the previous from_unixtime() call did.
# from_unixtime(..., fmt) returns a STRING, and BigQuery time partitioning
# needs a TIMESTAMP/DATE field, so the string form could never be partitioned.
# NOTE(review): assumes the source `timestamp` holds epoch milliseconds — confirm.
item_properties_transform = spark.sql("""
select timestamp_seconds(cast(timestamp / 1000 as bigint)) as timestamp,
    itemid, property, value
from temp_item_properties
""")

# item_properties_transform.show()

# Target table; `project_id` and `dataset_id` are the values defined for the
# read above (the target lives in the same project and dataset).
table_target = "item_properties_dwh"

# Partitioning / clustering layout of the target table.
# The connector has no "PARTITION BY" / "CLUSTER BY" options; it expects
# `partitionField` + `partitionType` and `clusteredFields`.
partition_field = "timestamp"
partition_type = "MONTH"      # monthly partitions, matching the old 'yyyy-MM' intent
cluster_column = "property"   # comma-separated list of top-level columns

# NOTE(review): if the target table already exists with a different (or no)
# partitioning spec, BigQuery may reject the overwrite; drop the old table
# first in that case — confirm against the current state of the dataset.
(
    item_properties_transform.write
    .format("bigquery")
    .option("temporaryGcsBucket", "dataproc-temp-asia-southeast2-212352110204-1oi7hped")
    .option("table", f"{project_id}.{dataset_id}.{table_target}")
    .option("partitionField", partition_field)
    .option("partitionType", partition_type)
    .option("clusteredFields", cluster_column)
    .mode('Overwrite')
    .save()
)

# Stop Spark session
spark.stop()

24/04/01 01:13:21 INFO SparkEnv: Registering MapOutputTracker
24/04/01 01:13:21 INFO SparkEnv: Registering BlockManagerMaster
24/04/01 01:13:21 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/04/01 01:13:21 INFO SparkEnv: Registering OutputCommitCoordinator
24/04/01 01:13:23 ERROR ApplicationMaster: Uncaught exception: 
org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException: Application doesn't exist in cache appattempt_1711926671331_0006_000001
	at org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.throwApplicationDoesNotExistInCacheException(ApplicationMasterService.java:362)
	at org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.registerApplicationMaster(ApplicationMasterService.java:260)
	at org.apache.hadoop.yarn.api.impl.pb.service.ApplicationMasterProtocolPBServiceImpl.registerApplicationMaster(ApplicationMasterProtocolPBServiceImpl.java:90)
	at org.apache.hadoop.yarn.proto.ApplicationMasterProtocol$Applicatio