In [1]:
import os

from pyspark.sql import SparkSession

# Settings applied to the session, in order: executor/driver memory limits,
# the driver result-size cap, the MySQL and PostgreSQL JDBC driver packages,
# the Arrow setting for PySpark, and the Kryo serializer.
spark_settings = {
    "spark.executor.memory": "1G",
    "spark.driver.memory": "1G",
    "spark.driver.maxResultSize": "1G",
    "spark.jars.packages": "com.mysql:mysql-connector-j:9.0.0,org.postgresql:postgresql:42.7.3",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

session_builder = SparkSession.builder.appName("Connect to Relational databases")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an existing session if there is one, otherwise create it.
spark = session_builder.getOrCreate()

# Restrict Spark's log output to the ERROR level.
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell, so the session object is displayed.
spark

:: loading settings :: url = jar:file:/opt/spark-3.5.1-bin-without-hadoop/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.mysql#mysql-connector-j added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4a846a7e-9ddc-4130-b38b-dc423006f8c8;1.0
	confs: [default]
	found com.mysql#mysql-connector-j;9.0.0 in central
	found com.google.protobuf#protobuf-java;4.26.1 in central
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
downloading https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/9.0.0/mysql-connector-j-9.0.0.jar ...
	[SUCCESSFUL ] com.mysql#mysql-connector-j;9.0.0!mysql-connector-j.jar (1113ms)
downloading https://repo1.maven.org/maven2/org/postgresql/postgresql/42.7.3/postgresql-42.7.3.jar ...
	[SUCCESSFUL ] org.postgresql#postgresql;42.7.3!postgresql.jar (314ms)
downloading https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/4.26.1/protobuf-jav

# MySQL

In [2]:
# Read the `actor` table of the `sakila` MySQL database over JDBC and
# inspect the resulting DataFrame.

# JDBC endpoint: host `mysql`, port 3306, database `sakila`.
jdbc_url = "jdbc:mysql://mysql:3306/sakila"

# Credentials are taken from the environment when available so that real
# secrets do not have to be stored in the notebook; the fallbacks are the
# original demo values, so the default behaviour is unchanged.
connection_properties = {
    "user": os.environ.get("MYSQL_JDBC_USER", "root"),
    "password": os.environ.get("MYSQL_JDBC_PASSWORD", "root"),
    "driver": "com.mysql.cj.jdbc.Driver",
}

# SQL executed on the database side.
query = """
            SELECT * FROM actor
        """

# The query is handed to the reader as a parenthesised subquery aliased `t`
# in place of a plain table name.
bh_bairros_df = spark.read \
                     .jdbc(url=jdbc_url,
                           table=f"({query}) AS t",
                           properties=connection_properties)

# Preview the rows, print the row count, then print the schema.
bh_bairros_df.show()
print(bh_bairros_df.count())
bh_bairros_df.printSchema()

                                                                                

+--------+----------+------------+-------------------+
|actor_id|first_name|   last_name|        last_update|
+--------+----------+------------+-------------------+
|       1|  PENELOPE|     GUINESS|2006-02-15 04:34:33|
|       2|      NICK|    WAHLBERG|2006-02-15 04:34:33|
|       3|        ED|       CHASE|2006-02-15 04:34:33|
|       4|  JENNIFER|       DAVIS|2006-02-15 04:34:33|
|       5|    JOHNNY|LOLLOBRIGIDA|2006-02-15 04:34:33|
|       6|     BETTE|   NICHOLSON|2006-02-15 04:34:33|
|       7|     GRACE|      MOSTEL|2006-02-15 04:34:33|
|       8|   MATTHEW|   JOHANSSON|2006-02-15 04:34:33|
|       9|       JOE|       SWANK|2006-02-15 04:34:33|
|      10| CHRISTIAN|       GABLE|2006-02-15 04:34:33|
|      11|      ZERO|        CAGE|2006-02-15 04:34:33|
|      12|      KARL|       BERRY|2006-02-15 04:34:33|
|      13|       UMA|        WOOD|2006-02-15 04:34:33|
|      14|    VIVIEN|      BERGEN|2006-02-15 04:34:33|
|      15|      CUBA|     OLIVIER|2006-02-15 04:34:33|
|      16|

In [3]:
from pyspark.sql import functions as F

# Write the DataFrame to `new_table` using the JDBC URL and connection
# properties currently bound to `jdbc_url` / `connection_properties`.
# The save mode is "overwrite" and the `truncate` option is set to "true".
(
    bh_bairros_df.write
    .option("truncate", "true")
    .jdbc(
        jdbc_url,
        "new_table",
        mode="overwrite",
        properties=connection_properties,
    )
)

# Print the schema of the DataFrame that was just written.
bh_bairros_df.printSchema()

root
 |-- actor_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- last_update: timestamp (nullable = true)



# Postgres

In [4]:
# Read the `products` table of the `mydb` PostgreSQL database over JDBC and
# inspect the resulting DataFrame.

# JDBC endpoint: host `postgres`, port 5432, database `mydb`.
# NOTE: `jdbc_url` and `connection_properties` are assigned here, so any cell
# executed after this one that uses these names targets this database.
jdbc_url = "jdbc:postgresql://postgres:5432/mydb"

# Credentials are taken from the environment when available so that real
# secrets do not have to be stored in the notebook; the fallbacks are the
# original demo values, so the default behaviour is unchanged.
connection_properties = {
    "user": os.environ.get("POSTGRES_JDBC_USER", "user"),
    "password": os.environ.get("POSTGRES_JDBC_PASSWORD", "pass"),
    "driver": "org.postgresql.Driver",
}

# SQL executed on the database side.
query = """
            SELECT * FROM products
        """

# The query is handed to the reader as a parenthesised subquery aliased `t`
# in place of a plain table name.
df = spark.read \
          .jdbc(url=jdbc_url,
                table=f"({query}) AS t",
                properties=connection_properties)

# Preview the rows, print the row count, then print the schema.
df.show()
print(df.count())
df.printSchema()

+--------------------+------------------+--------------------+------+--------------------+
|                  id|              name|              amount| price|       id_categories|
+--------------------+------------------+--------------------+------+--------------------+
|1.000000000000000000|         Lampshade|100.0000000000000...|800.00|4.000000000000000000|
|2.000000000000000000|Table for painting|1000.000000000000...|560.00|9.000000000000000000|
|3.000000000000000000|     Notebook desk|10000.00000000000...| 25.50|9.000000000000000000|
|4.000000000000000000|     Computer desk|350.0000000000000...|320.50|6.000000000000000000|
|5.000000000000000000|             Chair|3000.000000000000...|210.64|9.000000000000000000|
|6.000000000000000000|        Home alarm|750.0000000000000...|460.00|4.000000000000000000|
+--------------------+------------------+--------------------+------+--------------------+

6
root
 |-- id: decimal(38,18) (nullable = true)
 |-- name: string (nullable = true)
 |--

In [5]:
from pyspark.sql import functions as F

# Write the DataFrame to `new_table` using the JDBC URL and connection
# properties currently bound to `jdbc_url` / `connection_properties`.
# The save mode is "overwrite" and the `truncate` option is set to "true".
(
    df.write
    .option("truncate", "true")
    .jdbc(
        jdbc_url,
        "new_table",
        mode="overwrite",
        properties=connection_properties,
    )
)

# Print the schema of the DataFrame that was just written.
df.printSchema()

root
 |-- id: decimal(38,18) (nullable = true)
 |-- name: string (nullable = true)
 |-- amount: decimal(38,18) (nullable = true)
 |-- price: decimal(7,2) (nullable = true)
 |-- id_categories: decimal(38,18) (nullable = true)



                                                                                