# MySQL DataSource via JDBC DataFrameReader

In [1]:
pip install python-decouple

Note: you may need to restart the kernel to use updated packages.


In [2]:
## Importing the required libraries
import os
# Import required modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql import *
from pyspark.sql.types import *


from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import current_timestamp
from datetime import datetime

## Create Spark Session

In [3]:
# Create a SparkSession and set the extraClassPath configuration
spark = SparkSession.builder.master("local[1]") \
    .appName("MySQLSpark") \
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# To check the configuration

# Get the Spark configuration settings
config_settings = spark.sparkContext.getConf().getAll()

# Format the configuration settings as a string
formatted_settings = "\n".join([f"{key}={value}" for key, value in config_settings])

# Print the formatted settings
print(formatted_settings)
 

spark.hadoop.fs.s3a.connection.ssl.enabled=false
spark.hadoop.fs.s3a.path.style.access=true
spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
spark.app.name=MySQLSpark
spark.driver.port=33881
spark.driver.extraClassPath=/home/jovyan/work/jars/*
spark.sql.hive.metastore.schema.verification=true
spark.repl.local.jars=file:///home/jovyan/.ivy2/jars/io.delta_delta-core_2.12-2.0.0.jar,file:///home/jovyan/.ivy2/jars/io.delta_delta-storage-2.0.0.jar,file:///home/jovyan/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar,file:///home/jovyan/.ivy2/jars/org.codehaus.jackson_jackson-core-asl-1.9.13.jar
spark.sql.hive.metastore.schema.verification.record.version=true
spark.hadoop.fs.s3a.block.size=1000M
spark.app.initial.jar.urls=spark://myjupyter:33881/jars/io.delta_delta-core_2.12-2.0.0.jar,spark://myjupyter:33881/jars/org.antlr_antlr4-runtime-4.8.jar,spark://myjupyter:33881/jars/org.codehaus.jackson_jackson-core-asl-1.9.13.jar,spark://myjupyter:33881/jar

### Extract data from MySQL

In [5]:
from decouple import config

# Load database login details from .env file
db_host = config('DB_HOST')  # Replace 'DB_HOST' with the actual key in your .env file
db_port = config('DB_PORT')  # Replace 'DB_PORT' with the actual key in your .env file
db_name = config('DB_NAME')  # Replace 'DB_NAME' with the actual key in your .env file
db_user = config('DB_USER')  # Replace 'DB_USER' with the actual key in your .env file
db_password = config('DB_PASSWORD')  # Replace 'DB_PASSWORD' with the actual key in your .env file
db_driver = config('DB_DRIVER')  # Replace 'DB_DRIVER' with the actual key in your .env file

# Define MySQL JDBC connection properties using the loaded values
mysql_props = {
    "url": f"jdbc:mysql://{db_host}:{db_port}/{db_name}",
    "dbtable": "bettercustomers",
    "user": db_user,
    "password": db_password,
    "driver": db_driver
}

### Load Data From MySQL

In [6]:
# Load data from MySQL table
customers_df = spark.read \
    .format("jdbc") \
    .options(**mysql_props) \
    .load()

# Show the DataFrame
customers_df.show()

# Print the schema of the DataFrame
customers_df.printSchema()

+---+-------------------+-------------------+----------+---------+--------------------+
| id|            created|            updated|first_name|last_name|               email|
+---+-------------------+-------------------+----------+---------+--------------------+
|  1|2023-10-09 10:45:51|2023-10-09 10:45:51|      John|      Doe| johndoe@example.com|
|  2|2023-10-09 10:45:51|2023-10-09 10:45:51|      Jane|    Smith|janesmith@example...|
|  3|2023-10-09 10:45:51|2023-10-09 10:45:51|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-10-09 10:45:51|2023-10-09 10:45:51|     Alice|      Lee|alicelee@example.com|
|  5|2023-10-09 10:45:51|2023-10-09 10:45:51|     David|      Kim|davidkim@example.com|
|  6|2023-10-09 10:45:51|2023-10-09 10:45:51|     Linda|   Nguyen|lindanguyen@examp...|
|  7|2023-10-09 10:45:51|2023-10-09 10:45:51|      Mike|   Garcia|mikegarcia@exampl...|
|  8|2023-10-09 10:45:51|2023-10-09 10:45:51|     Emily|     Chen|emilychen@example...|
|  9|2023-10-09 10:45:51|2023-10

In [11]:
# Create a view for the customers DataFrame
customers_df.createOrReplaceTempView("bettercustomers")

In [13]:
# Query the view "bettercustomers_view"
result = spark.sql("""
    SELECT *
    FROM bettercustomers
    WHERE id BETWEEN 1 AND 5
""")

# Show the result of the query
result.show()

+---+-------------------+-------------------+----------+---------+--------------------+
| id|            created|            updated|first_name|last_name|               email|
+---+-------------------+-------------------+----------+---------+--------------------+
|  1|2023-10-09 10:45:51|2023-10-09 10:45:51|      John|      Doe| johndoe@example.com|
|  2|2023-10-09 10:45:51|2023-10-09 10:45:51|      Jane|    Smith|janesmith@example...|
|  3|2023-10-09 10:45:51|2023-10-09 10:45:51|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-10-09 10:45:51|2023-10-09 10:45:51|     Alice|      Lee|alicelee@example.com|
|  5|2023-10-09 10:45:51|2023-10-09 10:45:51|     David|      Kim|davidkim@example.com|
+---+-------------------+-------------------+----------+---------+--------------------+



In [14]:
# Insert new entries (11-20) into the "bettercustomers_view" using Spark SQL
spark.sql("""
    INSERT INTO bettercustomers (first_name, last_name, email, created, updated)
    VALUES
        ('Chris', 'Wilson', 'chriswilson@example.com', current_timestamp(), current_timestamp()),
        ('Eva', 'Martin', 'evamartin@example.com', current_timestamp(), current_timestamp()),
        ('Michael', 'Thompson', 'michaelthompson@example.com', current_timestamp(), current_timestamp()),
        ('Olivia', 'Lopez', 'olivialopez@example.com', current_timestamp(), current_timestamp()),
        ('Daniel', 'Hall', 'danielhall@example.com', current_timestamp(), current_timestamp()),
        ('Sophia', 'Gonzalez', 'sophiagonzalez@example.com', current_timestamp(), current_timestamp()),
        ('William', 'Parker', 'williamparker@example.com', current_timestamp(), current_timestamp()),
        ('Emma', 'Davis', 'emmadavis@example.com', current_timestamp(), current_timestamp()),
        ('Ava', 'Adams', 'avaadams@example.com', current_timestamp(), current_timestamp()),
        ('James', 'Baker', 'jamesbaker@example.com', current_timestamp(), current_timestamp())
""")

# Show the updated "bettercustomers_view" after insertion
spark.sql("SELECT * FROM bettercustomers").show()

AnalysisException: unknown requires that the data to be inserted have the same number of columns as the target table: target table has 6 column(s) but the inserted data has 5 column(s), including 0 partition column(s) having constant value(s).

## Create New Entries

In [8]:
# Define the timestamp conversion function
def ts(timeStr):
    return datetime.strptime(timeStr, '%Y-%m-%d %H:%M:%S')

# Create a function to get the current timestamp
def time():
    return datetime.now()

# Define the new records
new_records = [
    Row("11", ts("2023-10-09 11:00:00"), time(), "Penny", "Haines", "penny@coffeeco.com"),
    Row("12", ts("2023-10-09 12:00:00"), time(), "Cloud", "Fast", "cloud.fast@acme.com"),
    Row("13", ts("2023-10-09 13:00:00"), time(), "Marshal", "Haines", "paws@coffeeco.com"),
    Row("14", ts("2023-10-09 14:00:00"), time(), "Laura", "Smith", "laura.smith@example.com"),
    Row("15", ts("2023-10-09 15:00:00"), time(), "Sarah", "Johnson", "sarah.johnson@example.com"),
    Row("16", ts("2023-10-09 16:00:00"), time(), "Michael", "Lee", "michael.lee@example.com"),
    Row("17", ts("2023-10-09 17:00:00"), time(), "Ella", "Kim", "ella.kim@example.com"),
    Row("18", ts("2023-10-09 18:00:00"), time(), "Oliver", "Nguyen", "oliver.nguyen@example.com"),
    Row("19", ts("2023-10-09 19:00:00"), time(), "Sophia", "Garcia", "sophia.garcia@example.com"),
    Row("20", ts("2023-10-09 20:00:00"), time(), "Daniel", "Chen", "daniel.chen@example.com")
]

# Define the schema for the new records DataFrame
schema = [
    "id", "created", "updated", "first_name", "last_name", "email"
]

# Create a DataFrame from the new records
new_records_df = spark.createDataFrame(new_records, schema=schema)

# Append the new records to the existing DataFrame
existing_df = customers_df.union(new_records_df)

# Show the updated DataFrame
existing_df.show()

+---+-------------------+--------------------+----------+---------+--------------------+
| id|            created|             updated|first_name|last_name|               email|
+---+-------------------+--------------------+----------+---------+--------------------+
|  1|2023-10-09 10:45:51| 2023-10-09 10:45:51|      John|      Doe| johndoe@example.com|
|  2|2023-10-09 10:45:51| 2023-10-09 10:45:51|      Jane|    Smith|janesmith@example...|
|  3|2023-10-09 10:45:51| 2023-10-09 10:45:51|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-10-09 10:45:51| 2023-10-09 10:45:51|     Alice|      Lee|alicelee@example.com|
|  5|2023-10-09 10:45:51| 2023-10-09 10:45:51|     David|      Kim|davidkim@example.com|
|  6|2023-10-09 10:45:51| 2023-10-09 10:45:51|     Linda|   Nguyen|lindanguyen@examp...|
|  7|2023-10-09 10:45:51| 2023-10-09 10:45:51|      Mike|   Garcia|mikegarcia@exampl...|
|  8|2023-10-09 10:45:51| 2023-10-09 10:45:51|     Emily|     Chen|emilychen@example...|
|  9|2023-10-09 10:45

In [9]:
# Calculate the maximum existing ID in the DataFrame
max_existing_id = existing_df.selectExpr("max(id)").collect()[0][0] or 0

# Add an offset to the ID values to start from '1'
existing_df = existing_df.withColumn("id", existing_df["id"] + max_existing_id)


existing_df.show()

+----+-------------------+--------------------+----------+---------+--------------------+
|  id|            created|             updated|first_name|last_name|               email|
+----+-------------------+--------------------+----------+---------+--------------------+
|10.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|      John|      Doe| johndoe@example.com|
|11.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|      Jane|    Smith|janesmith@example...|
|12.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|       Bob|  Johnson|bobjohnson@exampl...|
|13.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|     Alice|      Lee|alicelee@example.com|
|14.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|     David|      Kim|davidkim@example.com|
|15.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|     Linda|   Nguyen|lindanguyen@examp...|
|16.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|      Mike|   Garcia|mikegarcia@exampl...|
|17.0|2023-10-09 10:45:51| 2023-10-09 10:45:51|     Emily|     Chen|emilychen@example...|
|18.0|2023

### Write into JDBC

In [33]:
# Write the appended DataFrame back to the database or another storage format
# Write the appended DataFrame back to the database using the values from .env
# Define your database connection details (you can also load them from .env)
db_host = "mysql"  # Replace with your MySQL host
db_port = "3306"    # Replace with your MySQL port
db_name = "default" # Replace with your database name
db_user = "dataeng" # Replace with your MySQL username
db_password = "dataengineering_user" # Replace with your MySQL password

# Define the JDBC URL
jdbc_url = f"jdbc:mysql://{db_host}:{db_port}/{db_name}"

# Define the table name
table_name = "bettercustomers"  # Replace with your desired table name

# Write the updated DataFrame to the "bettercustomers" table in MySQL
existing_df.write \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", table_name) \
    .option("user", db_user) \
    .option("password", db_password) \
    .option("driver", "org.mariadb.jdbc.Driver") \
    .mode("overwrite") \
    .save()

In [34]:
df = spark.read \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", table_name) \
    .option("user", db_user) \
    .option("password", db_password) \
    .option("driver", "org.mariadb.jdbc.Driver") \
    .load()

In [35]:
# Show the DataFrame
df.show()

+---+-------------------+-------------------+----------+---------+--------------------+
| id|            created|            updated|first_name|last_name|               email|
+---+-------------------+-------------------+----------+---------+--------------------+
| 11|2023-10-09 11:00:00|2023-10-09 10:37:48|     Penny|   Haines|  penny@coffeeco.com|
| 12|2023-10-09 12:00:00|2023-10-09 10:37:48|     Cloud|     Fast| cloud.fast@acme.com|
| 13|2023-10-09 13:00:00|2023-10-09 10:37:48|   Marshal|   Haines|   paws@coffeeco.com|
| 14|2023-10-09 14:00:00|2023-10-09 10:37:48|     Laura|    Smith|laura.smith@examp...|
| 15|2023-10-09 15:00:00|2023-10-09 10:37:48|     Sarah|  Johnson|sarah.johnson@exa...|
| 16|2023-10-09 16:00:00|2023-10-09 10:37:48|   Michael|      Lee|michael.lee@examp...|
| 17|2023-10-09 17:00:00|2023-10-09 10:37:48|      Ella|      Kim|ella.kim@example.com|
| 18|2023-10-09 18:00:00|2023-10-09 10:37:48|    Oliver|   Nguyen|oliver.nguyen@exa...|
| 19|2023-10-09 19:00:00|2023-10

In [7]:
# Define the timestamp conversion function
def ts(timeStr):
    return datetime.strptime(timeStr, '%Y-%m-%d %H:%M:%S')

# Create a function to get the current timestamp
def time():
    return datetime.now()

# Create some new customers
records = [
    Row("4", ts("2021-02-21 21:00:00"), time(), "Penny", "Haines", "penny@coffeeco.com"),
    Row("5", ts("2021-02-21 22:00:00"), time(), "Cloud", "Fast", "cloud.fast@acme.com"),
    Row("6", ts("2021-02-21 23:00:00"), time(), "Marshal", "Haines", "paws@coffeeco.com")
]

# Define the schema for the new customers DataFrame
schema = [
    "id", "created", "updated", "first_name", "last_name", "email"
]

# Create a DataFrame from the new records
new_customers = spark.createDataFrame(records, schema)

# Show the new DataFrame
new_customers.show()

+---+-------------------+--------------------+----------+---------+-------------------+
| id|            created|             updated|first_name|last_name|              email|
+---+-------------------+--------------------+----------+---------+-------------------+
|  4|2021-02-21 21:00:00|2023-09-16 11:41:...|     Penny|   Haines| penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-09-16 11:41:...|     Cloud|     Fast|cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2023-09-16 11:41:...|   Marshal|   Haines|  paws@coffeeco.com|
+---+-------------------+--------------------+----------+---------+-------------------+



## Load Table into MySQL & Append

In [8]:
# Define the JDBC connection properties
mysql_props = {
    "url": "jdbc:mysql://mysql:3306/default",  # Change the host, port, and database name as needed
    "dbtable": "Customers",  # The name of the MySQL table to which you want to write the data
    "user": "dataeng",  # Change the username and password as needed
    "password": "dataengineering_user",
    "driver": "org.mariadb.jdbc.Driver"  # Use the appropriate JDBC driver
}

# Write the newCustomers DataFrame to the MySQL table
new_customers.write \
    .format("jdbc") \
    .mode("append") \
    .options(**mysql_props) \
    .save()

In [26]:
# Define the JDBC connection properties for reading
mysql_read_props = {
    "url": "jdbc:mysql://mysql:3306/default",  # Change the host, port, and database name as needed
    "dbtable": "Customers",  # The name of the MySQL table to read data from
    "user": "dataeng",  # Change the username and password as needed
    "password": "dataengineering_user",
    "driver": "org.mariadb.jdbc.Driver"  # Use the appropriate JDBC driver
}

# Read data from the MySQL table into a DataFrame
customers_df = spark.read \
    .format("jdbc") \
    .options(**mysql_read_props) \
    .load()

# Show the content of the DataFrame
customers_df.show()

+---+-------------------+-------------------+----------+---------+-------------------+
| id|            created|            updated|first_name|last_name|              email|
+---+-------------------+-------------------+----------+---------+-------------------+
|  4|2021-02-21 21:00:00|2023-10-09 10:00:55|     Penny|   Haines| penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-10-09 10:00:55|     Cloud|     Fast|cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2023-10-09 10:00:55|   Marshal|   Haines|  paws@coffeeco.com|
+---+-------------------+-------------------+----------+---------+-------------------+



In [10]:
# # Combine the two DataFrames into one
# combined_dfs = new_customers.union(customers_df)

# # Add a new column with consecutive IDs
# windowSpec = Window.orderBy("created")
# combined_dfs = combined_dfs.withColumn("id", row_number().over(windowSpec))

# # Show the combined DataFrame
# combined_dfs.show()