# PostgreSQL & Spark

- Learn how to connect to PostgreSQL from Apache Spark over Java Database Connectivity (JDBC): fetch a table's schema definition, read and query its data, and write results back to the database.

## Import Libraries

In [2]:
import os
# Import required modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql import *
from pyspark.sql.types import *
# # Delta is a storage layer for data lakes
# from delta.tables import * 
# # DeltaTable is the main class for Delta tables
# from delta.tables import DeltaTable

## Initialize the Spark Session

In [3]:
# Build (or reuse) a local SparkSession running on a single worker thread.
# extraClassPath adds every jar under /home/jovyan/work/jars to the driver
# classpath (presumably where the PostgreSQL JDBC driver lives - confirm).
session_builder = (
    SparkSession.builder
    .master("local[1]")
    .appName("PostgresSpark")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
)
spark = session_builder.getOrCreate()

# Only log errors so Spark's INFO/WARN chatter does not flood the cell output.
spark.sparkContext.setLogLevel("ERROR")

## Extract Data

In [4]:
# Load the "bettercustomers" table from Postgres into a Spark DataFrame over JDBC.
# Connection settings are read from environment variables so real credentials
# never have to be committed with the notebook. The fallbacks are the values
# this cell previously hardcoded, so it runs unchanged when nothing is set.
pg_url = os.environ.get("OASIS_PG_URL", "jdbc:postgresql://oasispostgres:5432/oasiscorp")
pg_user = os.environ.get("OASIS_PG_USER", "oasis")
pg_password = os.environ.get("OASIS_PG_PASSWORD", "oasis")

customers_sdf = (
    spark.read.format("jdbc")
    .option("url", pg_url)
    .option("dbtable", "bettercustomers")
    .option("user", pg_user)
    .option("password", pg_password)
    .load()
)

# Preview the DataFrame; show() prints a limited number of rows and truncates long values.
customers_sdf.show()

+---+--------------------+--------------------+----------+---------+--------------------+
| id|             created|             updated|first_name|last_name|               email|
+---+--------------------+--------------------+----------+---------+--------------------+
|  1|2023-09-15 15:34:...|2023-09-15 15:34:...|      John|      Doe| johndoe@example.com|
|  2|2023-09-15 15:34:...|2023-09-15 15:34:...|      Jane|    Smith|janesmith@example...|
|  3|2023-09-15 15:34:...|2023-09-15 15:34:...|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-09-15 15:34:...|2023-09-15 15:34:...|     Alice|      Lee|alicelee@example.com|
|  5|2023-09-15 15:34:...|2023-09-15 15:34:...|     David|      Kim|davidkim@example.com|
|  6|2023-09-15 15:34:...|2023-09-15 15:34:...|     Linda|   Nguyen|lindanguyen@examp...|
|  7|2023-09-15 15:34:...|2023-09-15 15:34:...|      Mike|   Garcia|mikegarcia@exampl...|
|  8|2023-09-15 15:34:...|2023-09-15 15:34:...|     Emily|     Chen|emilychen@example...|
|  9|2023-

### View The Schema

In [5]:
# Print the schema of the DataFrame loaded from Postgres: one line per column
# with its name, Spark data type and nullability.
customers_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)



## Register the View

In [6]:
# Register the DataFrame as a temporary view named "customers" (replacing any
# existing view of that name) so it can be queried by name through spark.sql().
customers_sdf.createOrReplaceTempView("customers")

### Query the View

In [7]:
# Run a plain SQL statement against the "customers" temp view and preview the result.
select_all_query = "SELECT * FROM customers"
customers_sql = spark.sql(select_all_query)
customers_sql.show()

+---+--------------------+--------------------+----------+---------+--------------------+
| id|             created|             updated|first_name|last_name|               email|
+---+--------------------+--------------------+----------+---------+--------------------+
|  1|2023-09-15 15:34:...|2023-09-15 15:34:...|      John|      Doe| johndoe@example.com|
|  2|2023-09-15 15:34:...|2023-09-15 15:34:...|      Jane|    Smith|janesmith@example...|
|  3|2023-09-15 15:34:...|2023-09-15 15:34:...|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-09-15 15:34:...|2023-09-15 15:34:...|     Alice|      Lee|alicelee@example.com|
|  5|2023-09-15 15:34:...|2023-09-15 15:34:...|     David|      Kim|davidkim@example.com|
|  6|2023-09-15 15:34:...|2023-09-15 15:34:...|     Linda|   Nguyen|lindanguyen@examp...|
|  7|2023-09-15 15:34:...|2023-09-15 15:34:...|      Mike|   Garcia|mikegarcia@exampl...|
|  8|2023-09-15 15:34:...|2023-09-15 15:34:...|     Emily|     Chen|emilychen@example...|
|  9|2023-

In [8]:
# Customers whose first_name contains a lowercase "c" (LIKE matches case-sensitively).
# The statement is passed without a trailing ";": spark.sql() takes a single
# statement, and older Spark (2.x) parsers raise a ParseException on the terminator.
first_name_c = spark.sql("SELECT * FROM customers WHERE first_name LIKE '%c%'")
first_name_c.show()

+---+--------------------+--------------------+----------+---------+--------------------+
| id|             created|             updated|first_name|last_name|               email|
+---+--------------------+--------------------+----------+---------+--------------------+
|  4|2023-09-15 15:34:...|2023-09-15 15:34:...|     Alice|      Lee|alicelee@example.com|
+---+--------------------+--------------------+----------+---------+--------------------+



## Aggregations

In [9]:
# Example aggregations: with no groupBy, agg() collapses the whole customers
# DataFrame into a single summary row. Functions are referenced through the
# `f` alias so it is explicit that these are Spark column functions rather
# than Python's built-in sum/min/max.
summary_columns = [
    f.count("*").alias("total_rows"),
    f.sum("id").alias("id_sum"),
    f.avg("id").alias("id_avg"),
    f.min("id").alias("id_min"),
    f.max("id").alias("id_max"),
    f.countDistinct("email").alias("unique_emails"),
    # NOTE(review): first()/last() depend on the order in which rows arrive;
    # no ordering is imposed here, so these two values may vary between runs.
    f.first("first_name").alias("first_name_first"),
    f.last("last_name").alias("last_name_last"),
    f.collect_list("email").alias("all_emails"),
    f.collect_set("last_name").alias("unique_last_names"),
]
stats_sdf = customers_sdf.agg(*summary_columns)

stats_sdf.show()

+----------+------+------+------+------+-------------+----------------+--------------+--------------------+--------------------+
|total_rows|id_sum|id_avg|id_min|id_max|unique_emails|first_name_first|last_name_last|          all_emails|   unique_last_names|
+----------+------+------+------+------+-------------+----------------+--------------+--------------------+--------------------+
|        10|    55|   5.5|     1|    10|           10|            Jane|          Zhao|[janesmith@exampl...|[Johnson, Smith, ...|
+----------+------+------+------+------+-------------+----------------+--------------+--------------------+--------------------+



## Write Results

In [None]:
# Statistical Table 

In [11]:
# Write the aggregated statistics back to Postgres as table "customers_stats".
# mode("overwrite") replaces any existing contents of that table on each run.
# Connection settings are read from environment variables so real credentials
# never have to be committed with the notebook. The fallbacks are the values
# this cell previously hardcoded, so it runs unchanged when nothing is set.
pg_url = os.environ.get("OASIS_PG_URL", "jdbc:postgresql://oasispostgres:5432/oasiscorp")
pg_user = os.environ.get("OASIS_PG_USER", "oasis")
pg_password = os.environ.get("OASIS_PG_PASSWORD", "oasis")

(
    stats_sdf.write.format("jdbc")
    .option("url", pg_url)
    .option("dbtable", "customers_stats")
    .option("user", pg_user)
    .option("password", pg_password)
    .mode("overwrite")
    .save()
)