# MySQL DataSource via JDBC DataFrameReader

In [17]:
## Importing the required libraries
import os
# Import required modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql import *
from pyspark.sql.types import *

## Create Spark Session

In [18]:
# Build (or reuse) a Spark session called "MySQLSpark" on a local master with one
# worker thread. The driver classpath is extended with every jar under
# /home/jovyan/work/jars (presumably where the JDBC driver jar lives -- confirm).
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("MySQLSpark")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
    .getOrCreate()
)

# Keep notebook output readable: only ERROR-level log messages are shown.
spark.sparkContext.setLogLevel("ERROR")

In [19]:
# Inspect the effective Spark configuration of the running session.

# All (key, value) pairs currently set on the SparkContext's configuration
config_settings = spark.sparkContext.getConf().getAll()

# Render one "key=value" entry per line
formatted_settings = "\n".join(
    f"{key}={value}"
    for key, value in config_settings
)

# Emit the whole listing in a single print call
print(formatted_settings)
 

spark.app.id=local-1694794787556
spark.app.startTime=1694794785302
spark.executor.id=driver
spark.app.name=MySQLSpark
spark.driver.host=myjupyter
spark.driver.extraClassPath=/home/jovyan/work/jars/*
spark.sql.warehouse.dir=file:/home/jovyan/work/notebooks/spark-warehouse
spark.rdd.compress=True
spark.serializer.objectStreamReset=100
spark.submit.pyFiles=
spark.submit.deployMode=client
spark.ui.showConsoleProgress=true
spark.master=local[1]
spark.driver.port=44929


### Extract data from MySQL

In [24]:
# Define the MySQL JDBC connection properties.
# The URL, user and password are taken from the environment when the variables
# MYSQL_JDBC_URL / MYSQL_USER / MYSQL_PASSWORD are set, so real credentials never
# have to be committed with the notebook. The fallbacks are the original local
# demo values, which keeps the notebook runnable without any extra setup.
mysql_props = {
    "url": os.environ.get("MYSQL_JDBC_URL", "jdbc:mysql://mysql:3306/default"),
    "dbtable": "customers",
    "user": os.environ.get("MYSQL_USER", "dataeng"),
    "password": os.environ.get("MYSQL_PASSWORD", "dataengineering_user"),
    "driver": "org.mariadb.jdbc.Driver",  # Use the appropriate JDBC driver
}

# Load the `customers` table through the JDBC DataFrameReader
customers_df = (
    spark.read
    .format("jdbc")
    .options(**mysql_props)
    .load()
)

# Show the DataFrame
customers_df.show()

# Print the schema of the DataFrame
customers_df.printSchema()

+---+-------------------+-------------------+----------+---------+--------------------+
| id|            created|            updated|first_name|last_name|               email|
+---+-------------------+-------------------+----------+---------+--------------------+
|  1|2023-09-15 17:01:44|2023-09-15 17:01:44|     Scott|   Haines|  scott@coffeeco.com|
|  2|2023-09-15 17:01:44|2023-09-15 17:01:44|      John|     Hamm|  john.hamm@acme.com|
|  3|2023-09-15 17:01:44|2023-09-15 17:01:44|      Milo|   Haines|mhaines@coffeeco.com|
+---+-------------------+-------------------+----------+---------+--------------------+

root
 |-- id: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)



In [5]:
# Select a subset of the columns

In [25]:
# Project three of the customer columns and print the resulting rows.
(
    customers_df
    .select("id", "created", "email")
    .show()
)

+---+-------------------+--------------------+
| id|            created|               email|
+---+-------------------+--------------------+
|  1|2023-09-15 17:01:44|  scott@coffeeco.com|
|  2|2023-09-15 17:01:44|  john.hamm@acme.com|
|  3|2023-09-15 17:01:44|mhaines@coffeeco.com|
+---+-------------------+--------------------+



In [None]:
# New customer records as (id, first_name, last_name, email) tuples
new_records_data = [
    (4, "John", "Doe", "johndoe@example.com"),
    (5, "Jane", "Smith", "janesmith@example.com"),
    (6, "Bob", "Johnson", "bobjohnson@example.com"),
    (7, "Alice", "Lee", "alicelee@example.com"),
    (8, "David", "Kim", "davidkim@example.com"),
    (9, "Linda", "Nguyen", "lindanguyen@example.com"),
    (10, "Mike", "Garcia", "mikegarcia@example.com")
]

# Create a DataFrame for the new records and add the 'created' and 'updated'
# columns, both filled with the current timestamp
new_records_df = (
    spark.createDataFrame(new_records_data, ["id", "first_name", "last_name", "email"])
    .withColumn("created", f.current_timestamp())
    .withColumn("updated", f.current_timestamp())
)

# Append the new records to the existing DataFrame.
# DataFrame.union() matches columns by POSITION, and new_records_df has a different
# column order than customers_df (first_name/last_name/email would end up in the
# created/updated/first_name slots). Project the new rows onto the customers_df
# schema -- same column order, same types (e.g. `id` is a string there) -- and
# union by column name so the two frames line up correctly.
aligned_new_records = [
    f.col(field.name).cast(field.dataType)
    for field in customers_df.schema.fields
]
combined_df = customers_df.unionByName(new_records_df.select(*aligned_new_records))

# Show the combined DataFrame
combined_df.show()