Bridging Spark SQL with JDBC

In [1]:
# Import required modules
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql import functions as f
from pyspark.sql import *
from pyspark.sql.types import *
# Delta is a storage layer for data lakes
from delta.tables import * 
# DeltaTable is the main class for Delta tables
from delta.tables import DeltaTable 

# Build the notebook's SparkSession (getOrCreate() returns an already-running
# session if one exists). It runs against the "local[1]" master and puts every
# jar under /home/jovyan/work/jars on the driver classpath.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("BridgeMySQL")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
    .getOrCreate()
)

# Only emit ERROR-level Spark log messages from here on.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Retrieve the Spark configuration and print it as "key = value" lines.
# Values of credential-like keys are masked so that secrets (for example
# spark.hadoop.fs.s3a.secret.key) are not written into the notebook output.
conf = spark.sparkContext.getConf().getAll()
conf_dict = dict(conf)

# Case-insensitive substrings that mark a configuration key as sensitive.
_SENSITIVE_MARKERS = ("secret", "password", "token", "access.key")

for key, value in conf_dict.items():
    is_sensitive = any(marker in key.lower() for marker in _SENSITIVE_MARKERS)
    # Mask only non-empty values, so an unset credential still shows as empty.
    shown = "*********(redacted)" if is_sensitive and value else value
    print("{} = {}".format(key, shown))

spark.hadoop.fs.s3a.connection.ssl.enabled = false
spark.hadoop.fs.s3a.path.style.access = true
spark.sql.hive.metastore.schema.verification = true
spark.driver.extraClassPath = /home/jovyan/work/jars/*
spark.hadoop.fs.s3a.block.size = 1000M
spark.sql.hive.metastore.schema.verification.record.version = true
spark.serializer = org.apache.spark.serializer.KryoSerializer
spark.hadoop.fs.s3a.endpoint = s3.amazonaws.com
spark.app.name = BridgeMySQL
spark.serializer.objectStreamReset = 100
spark.submit.deployMode = client
spark.sql.hive.metastore.version = 2.3.9
spark.master = local[1]
spark.hadoop.fs.s3a.secret.key = 
spark.sql.hive.metastore.sharedPrefixes = org.mariadb.jdbc,com.mysql.cj.jdbc,com.mysql.jdbc,org.postgresql,com.microsoft.sqlserver,oracle.jdbc
spark.executor.id = driver
spark.driver.host = myjupyter
spark.hadoop.fs.s3a.impl = org.apache.hadoop.fs.s3a.S3AFileSystem
spark.sql.catalogImplementation = hive
spark.sql.hive.metastore.jars = builtin
spark.rdd.compress = True
spark.dr

In [9]:
spark.sparkContext.setLogLevel("ERROR")

# Read the orders CSV into a DataFrame. Only the "header" option is set: no
# explicit schema and no inferSchema option is supplied, so printSchema()
# below shows whatever column types Spark assigned by default.
df = spark.read.format("csv") \
    .option("header", True) \
    .load("/home/jovyan/work/data/ecomm_orders.csv")
df.show(2)
df.printSchema()

# Convert to Avro. "avro" is the short name of Spark's Avro data source; the
# legacy "com.databricks.spark.avro" provider name only works through a
# backward-compatibility mapping. The spark-avro module must be on the
# classpath in either case.
df.write.format("avro") \
    .mode("overwrite") \
    .save("/home/jovyan/work/data/output_file.avro")

# Convert to Parquet (replaces any existing output at that path).
df.write.mode("overwrite").parquet("/home/jovyan/work/data/ecomm_orders.parquet")

# Convert to JSON (replaces any existing output at that path).
df.write.mode("overwrite").json("/home/jovyan/work/data/ecomm_orders.json")

+----+--------------+---------------+----------+-------------+--------------------+--------+----------+-------+--------------+--------------------+------------+----------+--------+----------+----------+------------+-----------+
|  id|     eventType|        subject| eventTime|customer_name|             address|    city|postalcode|country|         phone|               email|product_name|order_date|currency|order_mode|sale_price|order_number|dataVersion|
+----+--------------+---------------+----------+-------------+--------------------+--------+----------+-------+--------------+--------------------+------------+----------+--------+----------+----------+------------+-----------+
|3001|recordInserted|ecomm/customers|2021-01-01|   Julie Rich|Ap #255-3031 Dui ...|Billings|     80834|    USA|1-528-884-4331|Donec.felis@neque...|   microwave|18/05/2021|     EUR|       NEW|     32.34|         385|        1.0|
|3002|recordInserted|ecomm/customers|2021-01-01|Ratnali Kumar|Ap #476-7527 Aene...|Bhilw

In [10]:
# JDBC connection settings. Each value is looked up in the Spark runtime
# configuration; the second argument is the fallback used when the key is unset.
runtime_conf = spark.conf

# Driver class and server location.
jdbcDriver = runtime_conf.get("spark.jdbc.driver.class", "org.mariadb.jdbc.Driver")
dbHost = runtime_conf.get("spark.jdbc.host", "mysql")
dbPort = runtime_conf.get("spark.jdbc.port", "3306")

# Target database and table.
defaultDb = runtime_conf.get("spark.jdbc.default.db", "default")
dbTable = runtime_conf.get("spark.jdbc.table", "customers")

# Credentials.
# NOTE(review): the user/password fallbacks are hardcoded literals — consider
# supplying them through configuration or the environment instead.
dbUser = runtime_conf.get("spark.jdbc.user", "dataeng")
dbPass = runtime_conf.get("spark.jdbc.password", "dataengineering_user")

# NOTE(review): the URL uses the jdbc:mysql scheme while the default driver
# class is MariaDB's — confirm the installed connector version accepts it.
connectionUrl = f"jdbc:mysql://{dbHost}:{dbPort}/{defaultDb}"