In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Spark application name and master URL used when building the session.
appName, master = "load_parquet", "local"

In [4]:
# Build (or reuse) the Spark session with the configured master and app name.
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .getOrCreate()
)

21/09/09 07:44:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Read in Parquet

In [5]:
# Load the three parquet datasets, in order, into Spark DataFrames.
customers_sdf, orders_sdf, geolocation_sdf = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        '/home/jovyan/filesystem/customers.parquet',
        '/home/jovyan/filesystem/orders.parquet',
        '/home/jovyan/filesystem/geolocation.parquet',
    )
)

                                                                                

In [6]:
# Print the column names, types and nullability of the customers DataFrame.
customers_sdf.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [8]:
# Print the column names, types and nullability of the orders DataFrame.
orders_sdf.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [21]:
# Print the column names, types and nullability of the geolocation DataFrame.
geolocation_sdf.printSchema()

root
 |-- geolocation_zip_code_prefix: string (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)



## Load to Postgres

- Download the PostgreSQL JDBC driver from [jdbc.postgresql.org](https://jdbc.postgresql.org/download/) and place it in the `/usr/local/spark/jars/` folder of the notebook container, e.g.:

    `docker cp drivers/postgresql-42.2.23.jar dbt-with-postgres_pyspark-notebook_1:/usr/local/spark/jars/`

- Schema (as in Postgres schema) has to exist beforehand. Here, only writing to `public` schema is allowed.

- After writing to the Postgres database, the schema (i.e. the column types of the table) is not preserved.

- To preserve schema, add an additional option for `createTableColumnTypes` or `createTableOptions`. This means there is no need to create the tables beforehand but the schema has to be specified here.

In [11]:
# Alternative method: DataFrameWriter.jdbc() instead of format("jdbc").save().
# mode="overwrite" replaces the table when it already exists; without it the
# default save mode raises if public.customers is present (e.g. on a re-run).
# NOTE(review): credentials are hardcoded; consider reading them from the environment.
customers_sdf.write.jdbc(
    url="jdbc:postgresql://postgres-dest:5432/destdb",
    table="public.customers",
    mode="overwrite",
    properties={"user": "destdb1", "password": "destdb1"},
)

These do not preserve schema:

In [12]:
# Write customers to Postgres, letting Spark pick the column types.
# mode("overwrite") replaces the table when it already exists; without it the
# default save mode raises, because public.customers was already written by the
# previous cell.
customers_sdf.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:postgresql://postgres-dest:5432/destdb") \
    .option("dbtable", "public.customers") \
    .option("user", "destdb1") \
    .option("password", "destdb1") \
    .save()

                                                                                

In [15]:
# Write orders to Postgres, letting Spark pick the column types.
# mode("overwrite") replaces the table when it already exists; without it the
# default save mode raises whenever public.orders is present (e.g. on a re-run).
orders_sdf.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:postgresql://postgres-dest:5432/destdb") \
    .option("dbtable", "public.orders") \
    .option("user", "destdb1") \
    .option("password", "destdb1") \
    .save()

                                                                                

In [16]:
# Write geolocation to Postgres, letting Spark pick the column types.
# mode("overwrite") replaces the table when it already exists; without it the
# default save mode raises whenever public.geolocation is present (e.g. on a re-run).
geolocation_sdf.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:postgresql://postgres-dest:5432/destdb") \
    .option("dbtable", "public.geolocation") \
    .option("user", "destdb1") \
    .option("password", "destdb1") \
    .save()

                                                                                

### Create Schema

In [53]:
# Show the (column name, Spark type) pairs of the customers DataFrame.
print(customers_sdf.dtypes)

[('customer_id', 'string'), ('customer_unique_id', 'string'), ('customer_zip_code_prefix', 'string'), ('customer_city', 'string'), ('customer_state', 'string')]


In [55]:
# Column names of each DataFrame, in schema order (first item of each dtypes pair).
customers_cols = [name for name, _ in customers_sdf.dtypes]
orders_cols = [name for name, _ in orders_sdf.dtypes]
geolocation_cols = [name for name, _ in geolocation_sdf.dtypes]

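The `TEXT` data type is not supported by Spark (the types given below must be valid Spark SQL types), so `VARCHAR(n)` / `CHAR(n)` are used for the string columns instead.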

In [73]:
# Target column types, listed in the same order as each DataFrame's columns.
# They are later paired with the column names and passed to the JDBC writer's
# `createTableColumnTypes` option.
customers_coltypes = ["VARCHAR(128)", "VARCHAR(128)", "CHAR(5)", "VARCHAR(128)", "CHAR(2)"]
orders_coltypes = ["VARCHAR(128)"] * 3 + ["TIMESTAMP"] * 5
# Latitude/longitude are doubles. A bare NUMERIC is parsed by Spark as
# DECIMAL(10,0), which creates zero-scale columns and rounds the coordinates to
# whole degrees, so an explicit precision and scale are given instead.
geolocation_coltypes = ["CHAR(5)", "DECIMAL(18,15)", "DECIMAL(18,15)", "VARCHAR(128)", "CHAR(2)"]

In [74]:
# Start each table's list of "<column> <type>" definitions empty.
customers_schema, orders_schema, geolocation_schema = [], [], []

In [75]:
def _column_definitions(cols, coltypes):
    """Pair each column name with its SQL type as "<name> <type>" strings.

    Args:
        cols: column names, in table order.
        coltypes: SQL type for each column, in the same order as ``cols``.

    Returns:
        A new list with one "<name> <type>" string per column.

    Raises:
        ValueError: if ``cols`` and ``coltypes`` differ in length (``zip`` would
            otherwise silently drop the unmatched trailing entries).
    """
    if len(cols) != len(coltypes):
        raise ValueError(
            "expected one type per column, got %d columns and %d types"
            % (len(cols), len(coltypes))
        )
    return [col + " " + coltype for col, coltype in zip(cols, coltypes)]


# Rebuild each list from scratch (instead of appending to an existing one) so
# that re-running this cell does not accumulate duplicate column definitions.
customers_schema = _column_definitions(customers_cols, customers_coltypes)
orders_schema = _column_definitions(orders_cols, orders_coltypes)
geolocation_schema = _column_definitions(geolocation_cols, geolocation_coltypes)

### Load to Postgres

These will use the schema specified in the previous section

In [79]:
# Write customers to Postgres with explicit column types.
# `createTableColumnTypes` only takes effect when Spark creates the table, so
# mode("overwrite") is used to replace the existing table; with the default
# save mode this write raises because public.customers was already written
# earlier in the notebook.
customers_sdf.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:postgresql://postgres-dest:5432/destdb") \
    .option("createTableColumnTypes", ', '.join(customers_schema)) \
    .option("dbtable", "public.customers") \
    .option("user", "destdb1") \
    .option("password", "destdb1") \
    .save()

                                                                                

In [77]:
# Write orders to Postgres with explicit column types.
# `createTableColumnTypes` only takes effect when Spark creates the table, so
# mode("overwrite") is used to replace the existing table; with the default
# save mode this write raises because public.orders was already written
# earlier in the notebook.
orders_sdf.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:postgresql://postgres-dest:5432/destdb") \
    .option("createTableColumnTypes", ', '.join(orders_schema)) \
    .option("dbtable", "public.orders") \
    .option("user", "destdb1") \
    .option("password", "destdb1") \
    .save()

                                                                                

In [78]:
# Write geolocation to Postgres with explicit column types.
# `createTableColumnTypes` only takes effect when Spark creates the table, so
# mode("overwrite") is used to replace the existing table; with the default
# save mode this write raises because public.geolocation was already written
# earlier in the notebook.
geolocation_sdf.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:postgresql://postgres-dest:5432/destdb") \
    .option("createTableColumnTypes", ', '.join(geolocation_schema)) \
    .option("dbtable", "public.geolocation") \
    .option("user", "destdb1") \
    .option("password", "destdb1") \
    .save()

                                                                                