# Starting Spark Session

In [1]:
import os
from pyspark.sql import SparkSession

# MinIO (S3-compatible object storage) connection settings, taken from the environment
minio_access_key = os.getenv('MINIO_ACCESS_KEY')
minio_secret_key = os.getenv('MINIO_SECRET_KEY')
minio_url = os.getenv('MINIO_URL')

# Postgres credentials used by the JDBC reads in the cells below.
# The password must come from its own variable, not from POSTGRES_USER.
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')

# Spark session & context
#   - extraClassPath puts the Postgres JDBC driver jar on the driver classpath
#   - the fs.s3a.* settings point the s3a:// filesystem at the MinIO endpoint,
#     using path-style bucket addressing
spark = SparkSession.builder \
    .appName("read-postgres") \
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/postgresql-42.4.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", minio_url) \
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key) \
    .config("spark.hadoop.fs.s3a.path.style.access", True) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

In [2]:
# Customers table; read from `public.customers` in the `airflow` database over JDBC

df_customers = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://oasispostgresdb:5432/airflow",
        dbtable="public.customers",
        user=user,
        password=password,
    )
    .load()
)

# Print the first rows (default row limit) to check the read
df_customers.show()

+---+-------------------+--------------------+----------+---------+--------------------+
| id|            created|             updated|first_name|last_name|               email|
+---+-------------------+--------------------+----------+---------+--------------------+
|  1|2021-02-16 00:16:06|2024-05-12 21:05:...|     Scott|   Haines|  scott@coffeeco.com|
|  2|2021-02-16 00:16:06|2024-05-12 21:05:...|      John|     Hamm|  john.hamm@acme.com|
|  3|2021-02-16 00:16:06|2024-05-12 21:05:...|      Milo|   Haines|mhaines@coffeeco.com|
|  4|2021-02-21 21:00:00|2024-05-12 21:05:...|     Penny|   Haines|  penny@coffeeco.com|
|  5|2021-02-21 22:00:00|2024-05-12 21:05:...|     Cloud|     Fast| cloud.fast@acme.com|
|  6|2021-02-21 23:00:00|2024-05-12 21:05:...|   Marshal|   Haines|   paws@coffeeco.com|
|  7|2021-02-24 09:00:00|2024-05-12 21:05:...|    Willow|   Haines| willow@coffeeco.com|
|  8|2021-02-24 09:00:00|2024-05-12 21:05:...|    Clover|   Haines|    pup@coffeeco.com|
+---+----------------

In [2]:
# Load the parquet dataset stored in the `silver` bucket (s3a filesystem)
df = spark.read.format("parquet").load("s3a://silver/data.parquet")

# Print the first rows of the loaded DataFrame
df.show()

+-----------+-----------+-------------------+
|category_id|       name|        last_update|
+-----------+-----------+-------------------+
|          1|     Action|2006-02-15 09:46:27|
|          2|  Animation|2006-02-15 09:46:27|
|          3|   Children|2006-02-15 09:46:27|
|          4|   Classics|2006-02-15 09:46:27|
|          5|     Comedy|2006-02-15 09:46:27|
|          6|Documentary|2006-02-15 09:46:27|
|          7|      Drama|2006-02-15 09:46:27|
|          8|     Family|2006-02-15 09:46:27|
|          9|    Foreign|2006-02-15 09:46:27|
|         10|      Games|2006-02-15 09:46:27|
|         11|     Horror|2006-02-15 09:46:27|
|         12|      Music|2006-02-15 09:46:27|
|         13|        New|2006-02-15 09:46:27|
|         14|     Sci-Fi|2006-02-15 09:46:27|
|         15|     Sports|2006-02-15 09:46:27|
|         16|     Travel|2006-02-15 09:46:27|
+-----------+-----------+-------------------+



## Read Postgres

### Actor Data

In [4]:
# Actor table; read from `public.actor` in the `dvdrental` database over JDBC

df_actor = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://oasispostgresdb:5432/dvdrental",
        dbtable="public.actor",
        user=user,
        password=password,
    )
    .load()
)

In [5]:
# Display the first 10 actor rows

df_actor.show(n=10)

+--------+----------+------------+--------------------+
|actor_id|first_name|   last_name|         last_update|
+--------+----------+------------+--------------------+
|       1|  Penelope|     Guiness|2013-05-26 14:47:...|
|       2|      Nick|    Wahlberg|2013-05-26 14:47:...|
|       3|        Ed|       Chase|2013-05-26 14:47:...|
|       4|  Jennifer|       Davis|2013-05-26 14:47:...|
|       5|    Johnny|Lollobrigida|2013-05-26 14:47:...|
|       6|     Bette|   Nicholson|2013-05-26 14:47:...|
|       7|     Grace|      Mostel|2013-05-26 14:47:...|
|       8|   Matthew|   Johansson|2013-05-26 14:47:...|
|       9|       Joe|       Swank|2013-05-26 14:47:...|
|      10| Christian|       Gable|2013-05-26 14:47:...|
+--------+----------+------------+--------------------+
only showing top 10 rows



### Category Data

In [6]:
# Category table; read from `public.category` in the `dvdrental` database over JDBC

df_category = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://oasispostgresdb:5432/dvdrental",
        dbtable="public.category",
        user=user,
        password=password,
    )
    .load()
)

In [6]:
# Display the first 10 category rows
df_category.show(n=10)

+-----------+-----------+-------------------+
|category_id|       name|        last_update|
+-----------+-----------+-------------------+
|          1|     Action|2006-02-15 09:46:27|
|          2|  Animation|2006-02-15 09:46:27|
|          3|   Children|2006-02-15 09:46:27|
|          4|   Classics|2006-02-15 09:46:27|
|          5|     Comedy|2006-02-15 09:46:27|
|          6|Documentary|2006-02-15 09:46:27|
|          7|      Drama|2006-02-15 09:46:27|
|          8|     Family|2006-02-15 09:46:27|
|          9|    Foreign|2006-02-15 09:46:27|
|         10|      Games|2006-02-15 09:46:27|
+-----------+-----------+-------------------+
only showing top 10 rows

