# MySQL DataSource via JDBC DataFrameReader

In [11]:
## Importing the required libraries
import os
# Import required modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql import *
from pyspark.sql.types import *


from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import current_timestamp
from datetime import datetime

## Create Spark Session

In [4]:
# Build (or reuse) a single-threaded local SparkSession.
# spark.driver.extraClassPath points at the directory holding the JDBC driver jars.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("MySQLSpark")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
    .getOrCreate()
)

# Only surface errors in the driver log output
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Inspect the effective Spark configuration

# Collect every (key, value) pair from the active SparkContext's conf
config_settings = spark.sparkContext.getConf().getAll()

# Render one "key=value" entry per line
formatted_settings = "\n".join(
    "{}={}".format(key, value) for key, value in config_settings
)

# Emit the rendered settings
print(formatted_settings)
 

spark.driver.port=43369
spark.executor.id=driver
spark.app.name=MySQLSpark
spark.driver.host=myjupyter
spark.app.startTime=1694864450626
spark.driver.extraClassPath=/home/jovyan/work/jars/*
spark.sql.warehouse.dir=file:/home/jovyan/work/notebooks/spark-warehouse
spark.rdd.compress=True
spark.serializer.objectStreamReset=100
spark.submit.pyFiles=
spark.submit.deployMode=client
spark.app.id=local-1694864452675
spark.ui.showConsoleProgress=true
spark.master=local[1]


## Extract Data from MySQL

In [6]:
# Define the MySQL JDBC connection properties.
# The URL and credentials can be overridden through environment variables so
# that real secrets never need to be stored in the notebook; the fallbacks are
# the values used by the local demo environment.
mysql_props = {
    "url": os.environ.get("MYSQL_JDBC_URL", "jdbc:mysql://mysql:3306/default"),
    "dbtable": "customers",  # Table to read
    "user": os.environ.get("MYSQL_USER", "dataeng"),
    "password": os.environ.get("MYSQL_PASSWORD", "dataengineering_user"),
    "driver": "org.mariadb.jdbc.Driver",  # Use the appropriate JDBC driver
}

# Load the table from MySQL through the JDBC DataFrameReader
customers_df = (
    spark.read
    .format("jdbc")
    .options(**mysql_props)
    .load()
)

# Show the DataFrame
customers_df.show()

# Print the schema of the DataFrame
customers_df.printSchema()

+---+-------------------+-------------------+----------+---------+-------------------+
| id|            created|            updated|first_name|last_name|              email|
+---+-------------------+-------------------+----------+---------+-------------------+
|  1|2021-02-21 21:00:00|2023-09-16 11:35:27|     Penny|   Haines| penny@coffeeco.com|
|  2|2021-02-21 22:00:00|2023-09-16 11:35:27|     Cloud|     Fast|cloud.fast@acme.com|
|  3|2021-02-21 23:00:00|2023-09-16 11:35:27|   Marshal|   Haines|  paws@coffeeco.com|
+---+-------------------+-------------------+----------+---------+-------------------+

root
 |-- id: integer (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)



## Create New Entries

In [7]:



# Timestamp conversion helper
def ts(timeStr):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a naive datetime.

    Raises ValueError (from datetime.strptime) when timeStr does not match
    that format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(timeStr, timestamp_format)
    return parsed

# Helper returning the current timestamp
def time():
    """Return the current local date and time as a naive datetime."""
    current_moment = datetime.now()
    return current_moment

# Create some new customers.
# The ids are integers so that Spark infers a numeric column, matching the
# integer `id` column of the customers table. String ids would make Spark
# infer a string column and leave the conversion to the database on insert.
records = [
    Row(4, ts("2021-02-21 21:00:00"), time(), "Penny", "Haines", "penny@coffeeco.com"),
    Row(5, ts("2021-02-21 22:00:00"), time(), "Cloud", "Fast", "cloud.fast@acme.com"),
    Row(6, ts("2021-02-21 23:00:00"), time(), "Marshal", "Haines", "paws@coffeeco.com")
]

# Column names for the new customers DataFrame, in the same order as the Row
# values above; the column types are inferred from the data
schema = [
    "id", "created", "updated", "first_name", "last_name", "email"
]

# Create a DataFrame from the new records
new_customers = spark.createDataFrame(records, schema)

# Show the new DataFrame
new_customers.show()

+---+-------------------+--------------------+----------+---------+-------------------+
| id|            created|             updated|first_name|last_name|              email|
+---+-------------------+--------------------+----------+---------+-------------------+
|  4|2021-02-21 21:00:00|2023-09-16 11:41:...|     Penny|   Haines| penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-09-16 11:41:...|     Cloud|     Fast|cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2023-09-16 11:41:...|   Marshal|   Haines|  paws@coffeeco.com|
+---+-------------------+--------------------+----------+---------+-------------------+



## Load Table into MySQL & Append

In [8]:
# Define the JDBC connection properties for the write.
# The table name is spelled "customers", exactly as in the initial read: MySQL
# table names can be case-sensitive depending on the server configuration, and
# a differently-cased name could make the append create a second table.
# The URL and credentials can be overridden through environment variables; the
# fallbacks are the values used by the local demo environment.
mysql_props = {
    "url": os.environ.get("MYSQL_JDBC_URL", "jdbc:mysql://mysql:3306/default"),
    "dbtable": "customers",  # The MySQL table the rows are appended to
    "user": os.environ.get("MYSQL_USER", "dataeng"),
    "password": os.environ.get("MYSQL_PASSWORD", "dataengineering_user"),
    "driver": "org.mariadb.jdbc.Driver",  # Use the appropriate JDBC driver
}

# Append the new_customers DataFrame to the MySQL table.
# NOTE: mode("append") inserts the rows on every execution, so re-running this
# cell writes the same three customers again.
(
    new_customers.write
    .format("jdbc")
    .mode("append")
    .options(**mysql_props)
    .save()
)

In [9]:
# Define the JDBC connection properties for reading the table back.
# The table name is spelled "customers", exactly as in the initial read: MySQL
# table names can be case-sensitive depending on the server configuration.
# The URL and credentials can be overridden through environment variables; the
# fallbacks are the values used by the local demo environment.
mysql_read_props = {
    "url": os.environ.get("MYSQL_JDBC_URL", "jdbc:mysql://mysql:3306/default"),
    "dbtable": "customers",  # The MySQL table to read data from
    "user": os.environ.get("MYSQL_USER", "dataeng"),
    "password": os.environ.get("MYSQL_PASSWORD", "dataengineering_user"),
    "driver": "org.mariadb.jdbc.Driver",  # Use the appropriate JDBC driver
}

# Re-read the table so the DataFrame reflects the state after the append
customers_df = (
    spark.read
    .format("jdbc")
    .options(**mysql_read_props)
    .load()
)

# Show the content of the DataFrame
customers_df.show()

+---+-------------------+-------------------+----------+---------+-------------------+
| id|            created|            updated|first_name|last_name|              email|
+---+-------------------+-------------------+----------+---------+-------------------+
|  1|2021-02-21 21:00:00|2023-09-16 11:35:27|     Penny|   Haines| penny@coffeeco.com|
|  2|2021-02-21 22:00:00|2023-09-16 11:35:27|     Cloud|     Fast|cloud.fast@acme.com|
|  3|2021-02-21 23:00:00|2023-09-16 11:35:27|   Marshal|   Haines|  paws@coffeeco.com|
|  4|2021-02-21 21:00:00|2023-09-16 11:41:07|     Penny|   Haines| penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2023-09-16 11:41:07|     Cloud|     Fast|cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2023-09-16 11:41:07|   Marshal|   Haines|  paws@coffeeco.com|
+---+-------------------+-------------------+----------+---------+-------------------+



In [10]:
# NOTE(review): this whole cell is disabled; every statement below is commented out.
# If it is re-enabled: customers_df was re-read after the append, so it presumably
# already contains the rows held in new_customers and the union would repeat
# them. Confirm the intent first. Window and row_number are only in scope through
# the star imports at the top of the notebook.
# # Combine the two DataFrames into one
# combined_dfs = new_customers.union(customers_df)

# # Add a new column with consecutive IDs
# windowSpec = Window.orderBy("created")
# combined_dfs = combined_dfs.withColumn("id", row_number().over(windowSpec))

# # Show the combined DataFrame
# combined_dfs.show()