In [1]:
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

# Location of the PostgreSQL JDBC driver jar handed to Spark via "spark.jars".
# The POSTGRES_JDBC_JAR environment variable overrides the path; the fallback
# is the original local Windows install location, so existing setups keep
# working unchanged.
postgres_jdbc_jar = os.environ.get(
    "POSTGRES_JDBC_JAR",
    "C:\\tools\\spark-3.3.2-bin-hadoop3\\jars\\postgresql-42.6.2.jar",
)

# Create (or reuse, if one already exists in this kernel) a Spark session that
# runs locally with the "local[*]" master and has the JDBC driver on its
# classpath.
spark = (
    SparkSession.builder
    .config("spark.jars", postgres_jdbc_jar)
    .master("local[*]")
    .appName("spark-app")
    .getOrCreate()
)

In [2]:
# Display the version string of the running Spark session as the cell output.
spark.version

'3.3.2'

In [10]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [21]:
# Declared layout for the customers data: five string columns, of which only
# the two identifier columns are marked as non-nullable.
schema_customers = (
    StructType()
    .add("customer_id", StringType(), False)
    .add("customer_unique_id", StringType(), False)
    .add("customer_zip_code_prefix", StringType(), True)
    .add("customer_city", StringType(), True)
    .add("customer_state", StringType(), True)
)

In [20]:
# Load the customers table from the local PostgreSQL database over JDBC.
#
# The JDBC source has no "schema" option, so a StructType passed under that
# key is silently ignored. The supported way to influence column types on a
# JDBC read is the "customSchema" option, which takes a DDL string of the form
# "col1 TYPE, col2 TYPE". It is derived here from schema_customers so the two
# stay in sync.
# NOTE: customSchema only overrides column *types*; nullability is still
# decided by the JDBC reader, so the nullable flags in schema_customers are
# not enforced and printSchema() keeps reporting nullable = true.
customers_ddl = ", ".join(
    f"{field.name} {field.dataType.simpleString()}"
    for field in schema_customers.fields
)

df_customers = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/postgres")
    .option("customSchema", customers_ddl)
    .option("dbtable", "olist_customers_dataset")
    # Credentials come from the environment when set; the defaults match the
    # local development database used so far.
    .option("user", os.environ.get("POSTGRES_USER", "root"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "root"))
    .option("driver", "org.postgresql.Driver")
    .load()
)

df_customers.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [8]:
# Load the orders table from the local PostgreSQL database over JDBC.
#
# No schema-inference option is set: "inferSchema" belongs to file sources
# such as CSV and has no effect on a JDBC read, where column types are taken
# from the database's own table metadata (hence the timestamp columns in the
# printed schema).
df_orders = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/postgres")
    .option("dbtable", "olist_orders_dataset")
    # Credentials come from the environment when set; the defaults match the
    # local development database used so far.
    .option("user", os.environ.get("POSTGRES_USER", "root"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "root"))
    .option("driver", "org.postgresql.Driver")
    .load()
)

df_orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [9]:
# Print the first 10 rows of the orders DataFrame as a text grid; long values
# (e.g. the id columns) appear truncated with "..." in the printed output.
df_orders.show(10)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51c-bdc5-467...|9ef432eb-6251-297...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc-8bc7-dce...|b0830fb4-747a-6c6...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9-100c-2d0...|41ce2a54-c0b0-3bf...|  