In [1]:
import os
# Import required modules
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql import *
from pyspark.sql.types import *
# Delta is a storage layer for data lakes
from delta.tables import * 
# DeltaTable is the main class for Delta tables
from delta.tables import DeltaTable 

# Create (or reuse) a local, single-threaded SparkSession. The extraClassPath
# setting adds every jar under /home/jovyan/work/jars/ to the driver classpath;
# the JDBC driver jar is presumably among them (not visible from this notebook).
spark = (
    SparkSession.builder.master("local[1]")
    .appName("BridgeMySQL")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")


def _jdbc_setting(conf_key, default):
    """Resolve one JDBC connection setting.

    Lookup order:
      1. the environment variable named after ``conf_key``, upper-cased with
         dots replaced by underscores (``spark.jdbc.user`` -> ``SPARK_JDBC_USER``);
      2. the Spark runtime conf entry ``conf_key``;
      3. ``default``.

    Args:
        conf_key: Spark conf key, e.g. ``"spark.jdbc.host"``.
        default: value used when neither the environment nor the Spark conf
            provides one.

    Returns:
        The resolved setting as a string.
    """
    env_value = os.environ.get(conf_key.upper().replace(".", "_"))
    if env_value is not None:
        return env_value
    return spark.conf.get(conf_key, default)


# Retrieve database login details: environment variables win, then Spark conf,
# then the local-development defaults below.
jdbcDriver = _jdbc_setting("spark.jdbc.driver.class", "org.mariadb.jdbc.Driver")
dbHost = _jdbc_setting("spark.jdbc.host", "mysql")
dbPort = _jdbc_setting("spark.jdbc.port", "3306")
defaultDb = _jdbc_setting("spark.jdbc.default.db", "default")
dbTable = _jdbc_setting("spark.jdbc.table", "bettercustomers")
dbUser = _jdbc_setting("spark.jdbc.user", "dataeng")
# NOTE(review): the fallback password is a hard-coded development credential.
# Set SPARK_JDBC_PASSWORD (or spark.jdbc.password) anywhere other than a local sandbox.
dbPass = _jdbc_setting("spark.jdbc.password", "dataengineering_user")

# Build the JDBC URL.
# NOTE(review): the URL uses the jdbc:mysql scheme while the default driver class
# is MariaDB's. MariaDB Connector/J 3.x reportedly needs permitMysqlScheme for
# that scheme -- confirm against the driver version in the jars directory.
connection_url = f'jdbc:mysql://{dbHost}:{dbPort}/{defaultDb}'

# Read the table over JDBC into a Spark DataFrame.
customers_sdf = (
    spark.read.format("jdbc")
    .option("url", connection_url)
    .option("driver", jdbcDriver)
    .option("dbtable", dbTable)
    .option("user", dbUser)
    .option("password", dbPass)
    .load()
)

# Print the schema and register the DataFrame as the "customers" temp view so
# that later cells can query it with spark.sql.
customers_sdf.printSchema()
customers_sdf.createOrReplaceTempView("customers")

# Show the data (DataFrame.show() prints at most the first 20 rows by default).
customers_sdf.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


root
 |-- id: integer (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)



                                                                                

+---+-------------------+-------------------+----------+---------+--------------------+
| id|            created|            updated|first_name|last_name|               email|
+---+-------------------+-------------------+----------+---------+--------------------+
|  1|2021-02-16 00:16:06|2023-03-06 09:25:20|     Scott|   Haines|  scott@coffeeco.com|
|  2|2021-02-16 00:16:06|2023-03-06 09:25:20|      John|     Hamm|  john.hamm@acme.com|
|  3|2021-02-16 00:16:06|2023-03-06 09:25:20|      Milo|   Haines|mhaines@coffeeco.com|
|  4|2021-02-21 21:00:00|2023-03-06 09:25:20|     Penny|   Haines|  penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-03-06 09:25:20|     Cloud|     Fast| cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2023-03-06 09:25:20|   Marshal|   Haines|   paws@coffeeco.com|
|  7|2021-02-24 09:00:00|2023-03-06 09:25:20|    Willow|   Haines| willow@coffeeco.com|
|  8|2021-02-24 09:00:00|2023-03-06 09:25:20|    Clover|   Haines|    pup@coffeeco.com|
+---+-------------------+-------------------+----------+---------+--------------------+

### Query The View

In [3]:
# Select every row and column from the "customers" temp view, then print the result.
customers_sql = spark.sql(
    "SELECT * FROM customers"
)
customers_sql.show()

+---+-------------------+-------------------+----------+---------+--------------------+
| id|            created|            updated|first_name|last_name|               email|
+---+-------------------+-------------------+----------+---------+--------------------+
|  1|2021-02-16 00:16:06|2023-03-06 09:25:20|     Scott|   Haines|  scott@coffeeco.com|
|  2|2021-02-16 00:16:06|2023-03-06 09:25:20|      John|     Hamm|  john.hamm@acme.com|
|  3|2021-02-16 00:16:06|2023-03-06 09:25:20|      Milo|   Haines|mhaines@coffeeco.com|
|  4|2021-02-21 21:00:00|2023-03-06 09:25:20|     Penny|   Haines|  penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-03-06 09:25:20|     Cloud|     Fast| cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2023-03-06 09:25:20|   Marshal|   Haines|   paws@coffeeco.com|
|  7|2021-02-24 09:00:00|2023-03-06 09:25:20|    Willow|   Haines| willow@coffeeco.com|
|  8|2021-02-24 09:00:00|2023-03-06 09:25:20|    Clover|   Haines|    pup@coffeeco.com|
+---+-------------------+-------------------+----------+---------+--------------------+

In [6]:
# Keep only customers whose first_name matches the pattern '%c%', then print them.
# NOTE(review): the recorded output includes "Cloud" and "Clover", so the match
# was case-insensitive in that run -- presumably the filter was evaluated by the
# database rather than by Spark; verify before relying on either behaviour.
customers_sql = spark.sql(
    "SELECT * FROM customers WHERE first_name LIKE '%c%';"
)
customers_sql.show()

+---+-------------------+-------------------+----------+---------+-------------------+
| id|            created|            updated|first_name|last_name|              email|
+---+-------------------+-------------------+----------+---------+-------------------+
|  1|2021-02-16 00:16:06|2023-03-06 09:25:20|     Scott|   Haines| scott@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-03-06 09:25:20|     Cloud|     Fast|cloud.fast@acme.com|
|  8|2021-02-24 09:00:00|2023-03-06 09:25:20|    Clover|   Haines|   pup@coffeeco.com|
+---+-------------------+-------------------+----------+---------+-------------------+



In [7]:
# Keep only customers whose email matches the pattern '%c%', then print them.
# In the recorded run every row matched.
customers_sql = spark.sql(
    "SELECT * FROM customers WHERE email LIKE '%c%';"
)
customers_sql.show()

+---+-------------------+-------------------+----------+---------+--------------------+
| id|            created|            updated|first_name|last_name|               email|
+---+-------------------+-------------------+----------+---------+--------------------+
|  1|2021-02-16 00:16:06|2023-03-06 09:25:20|     Scott|   Haines|  scott@coffeeco.com|
|  2|2021-02-16 00:16:06|2023-03-06 09:25:20|      John|     Hamm|  john.hamm@acme.com|
|  3|2021-02-16 00:16:06|2023-03-06 09:25:20|      Milo|   Haines|mhaines@coffeeco.com|
|  4|2021-02-21 21:00:00|2023-03-06 09:25:20|     Penny|   Haines|  penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-03-06 09:25:20|     Cloud|     Fast| cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2023-03-06 09:25:20|   Marshal|   Haines|   paws@coffeeco.com|
|  7|2021-02-24 09:00:00|2023-03-06 09:25:20|    Willow|   Haines| willow@coffeeco.com|
|  8|2021-02-24 09:00:00|2023-03-06 09:25:20|    Clover|   Haines|    pup@coffeeco.com|
+---+-------------------+-------------------+----------+---------+--------------------+