# PostgreSQL & Spark

- Learn to connect to Postgres, fetch table (schema) definitions, read, query, and write data using Apache Spark and Java Database Connectivity (JDBC).

## Import Libraries

In [1]:
import os
# Import required modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql import *
from pyspark.sql.types import *
# # Delta is a storage layer for data lakes
# from delta.tables import * 
# # DeltaTable is the main class for Delta tables
# from delta.tables import DeltaTable

## Initiate Spark Session

In [2]:
# Start (or reuse) a local Spark session for this notebook, running with a
# single worker thread. Every jar under /home/jovyan/work/jars/ is added to the
# driver classpath -- presumably that is where the PostgreSQL JDBC driver
# lives (confirm against the container setup).
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("PostgresSpark")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
    .getOrCreate()
)

# Keep cell output readable: only messages at ERROR level are logged
spark.sparkContext.setLogLevel("ERROR")

## Extract Data

In [3]:
# Read the "bettercustomers" table from PostgreSQL into a Spark DataFrame.
# NOTE(review): the connection URL and credentials are hard-coded in this cell;
# consider sourcing them from environment variables before sharing the notebook.
customers_sdf = (
    spark.read.format("jdbc")
    .options(
        url="jdbc:postgresql://oasispostgres:5432/oasiscorp",
        dbtable="bettercustomers",
        user="oasis",
        password="oasis",
    )
    .load()
)

# Preview the first rows (show() truncates long cell values by default)
customers_sdf.show()

+---+--------------------+--------------------+----------+---------+--------------------+
| id|             created|             updated|first_name|last_name|               email|
+---+--------------------+--------------------+----------+---------+--------------------+
|  1|2023-09-15 15:34:...|2023-09-15 15:34:...|      John|      Doe| johndoe@example.com|
|  2|2023-09-15 15:34:...|2023-09-15 15:34:...|      Jane|    Smith|janesmith@example...|
|  3|2023-09-15 15:34:...|2023-09-15 15:34:...|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-09-15 15:34:...|2023-09-15 15:34:...|     Alice|      Lee|alicelee@example.com|
|  5|2023-09-15 15:34:...|2023-09-15 15:34:...|     David|      Kim|davidkim@example.com|
|  6|2023-09-15 15:34:...|2023-09-15 15:34:...|     Linda|   Nguyen|lindanguyen@examp...|
|  7|2023-09-15 15:34:...|2023-09-15 15:34:...|      Mike|   Garcia|mikegarcia@exampl...|
|  8|2023-09-15 15:34:...|2023-09-15 15:34:...|     Emily|     Chen|emilychen@example...|
|  9|2023-

In [4]:
# Print only the two timestamp columns, untruncated, so the full
# fractional-second values are visible.
customers_sdf.select("created","updated").show(truncate=False)

+--------------------------+--------------------------+
|created                   |updated                   |
+--------------------------+--------------------------+
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|
|2023-09-16 09:00:00       |2023-09-16 11:52:17.289663|
|2023-09-16 10:00:00       |2023-09-16 11:52:17.289691|
|2023-09-16 11:00:00       |2023-09-16 11:52:17.28971 |
|2023-09-16 12:00:00       |2023-09-16 11:52:17.289727|
|2023-09-16 13:00:00       |2023-09-16 11:52:17.

### View The Schema

In [5]:
# Print each column's name, Spark data type and nullability
customers_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)



## Register the View

In [6]:
# Expose the DataFrame to Spark SQL under the name "customers"
# (any existing temporary view with that name is replaced).
customers_sdf.createOrReplaceTempView("customers")

### Query the View

In [7]:
# Query the view: select every column and every row through Spark SQL
customers_sql = spark.sql("SELECT * FROM customers")
# Preview the query result
customers_sql.show()

+---+--------------------+--------------------+----------+---------+--------------------+
| id|             created|             updated|first_name|last_name|               email|
+---+--------------------+--------------------+----------+---------+--------------------+
|  1|2023-09-15 15:34:...|2023-09-15 15:34:...|      John|      Doe| johndoe@example.com|
|  2|2023-09-15 15:34:...|2023-09-15 15:34:...|      Jane|    Smith|janesmith@example...|
|  3|2023-09-15 15:34:...|2023-09-15 15:34:...|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-09-15 15:34:...|2023-09-15 15:34:...|     Alice|      Lee|alicelee@example.com|
|  5|2023-09-15 15:34:...|2023-09-15 15:34:...|     David|      Kim|davidkim@example.com|
|  6|2023-09-15 15:34:...|2023-09-15 15:34:...|     Linda|   Nguyen|lindanguyen@examp...|
|  7|2023-09-15 15:34:...|2023-09-15 15:34:...|      Mike|   Garcia|mikegarcia@exampl...|
|  8|2023-09-15 15:34:...|2023-09-15 15:34:...|     Emily|     Chen|emilychen@example...|
|  9|2023-

In [8]:
# Customers whose first name contains a lowercase "c" anywhere.
# LIKE is case-sensitive in Spark SQL, so a capital "C" does not match the pattern.
first_name_c = spark.sql("SELECT * FROM customers WHERE first_name LIKE '%c%';")
first_name_c.show()

+---+--------------------+--------------------+----------+---------+--------------------+
| id|             created|             updated|first_name|last_name|               email|
+---+--------------------+--------------------+----------+---------+--------------------+
|  4|2023-09-15 15:34:...|2023-09-15 15:34:...|     Alice|      Lee|alicelee@example.com|
| 13| 2023-09-16 11:00:00|2023-09-16 11:52:...|     Grace| Gonzalez|   grace@example.com|
+---+--------------------+--------------------+----------+---------+--------------------+



## Aggregations

In [9]:
# Example aggregations over the whole customers DataFrame (produces one row).
# Every aggregate is called through the `f` alias of pyspark.sql.functions, so
# the cell no longer relies on the wildcard import having replaced Python's
# built-in sum/min/max with the Spark column functions.
# NOTE: first() and last() depend on row order, which is not fixed here, so
# "first_name_first" and "last_name_last" may differ between runs.
stats_sdf = customers_sdf.agg(
    f.count("*").alias("total_rows"),
    f.sum("id").alias("id_sum"),
    f.avg("id").alias("id_avg"),
    f.min("id").alias("id_min"),
    f.max("id").alias("id_max"),
    f.countDistinct("email").alias("unique_emails"),
    f.first("first_name").alias("first_name_first"),
    f.last("last_name").alias("last_name_last"),
    f.collect_list("email").alias("all_emails"),
    f.collect_set("last_name").alias("unique_last_names"),
)

stats_sdf.show()

+----------+------+------+------+------+-------------+----------------+--------------+--------------------+--------------------+
|total_rows|id_sum|id_avg|id_min|id_max|unique_emails|first_name_first|last_name_last|          all_emails|   unique_last_names|
+----------+------+------+------+------+-------------+----------------+--------------+--------------------+--------------------+
|        20|   210|  10.5|     1|    20|           20|           Grace|         Lewis|[grace@example.co...|[Smith, Lee, Walk...|
+----------+------+------+------+------+-------------+----------------+--------------+--------------------+--------------------+



## Write Results

In [None]:
# Statistical Table 

In [10]:
# Persist the aggregated statistics to PostgreSQL as table "customers_stats".
# Overwrite mode replaces whatever the table held before, so re-running this
# cell is safe.
# NOTE(review): connection URL and credentials are hard-coded in this cell.
(
    stats_sdf.write.format("jdbc")
    .options(
        url="jdbc:postgresql://oasispostgres:5432/oasiscorp",
        dbtable="customers_stats",
        user="oasis",
        password="oasis",
    )
    .mode("overwrite")
    .save()
)

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import IntegerType, StringType, TimestampType
from datetime import datetime
from faker import Faker
import random

Collecting faker
  Downloading Faker-19.6.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: faker
Successfully installed faker-19.6.1
Note: you may need to restart the kernel to use updated packages.


## New Records

In [18]:
# Define the timestamp conversion function
def ts(timeStr):
    """Parse a ``'YYYY-MM-DD HH:MM:SS'`` string into a naive ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) when the text does not
    match that layout.
    """
    layout = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(timeStr, layout)
    return parsed

# Create a function to get the current timestamp
def time():
    """Return the current local date and time as a naive ``datetime``."""
    current = datetime.now()
    return current



# Hand-written customer records (ids 21-30) to append to the table.
# "created" is a fixed timestamp parsed by ts(); "updated" is the moment this
# cell runs, evaluated separately for each row.
new_customer_records = [
    (21, ts("2023-09-16 09:00:00"), datetime.now(), "Alice", "Smith", "alice.smith@example.com"),
    (22, ts("2023-09-16 10:00:00"), datetime.now(), "Bob", "Johnson", "bob.johnson@example.com"),
    (23, ts("2023-09-16 11:00:00"), datetime.now(), "Charlie", "Brown", "charlie.brown@example.com"),
    (24, ts("2023-09-16 12:00:00"), datetime.now(), "David", "Wilson", "david.wilson@example.com"),
    (25, ts("2023-09-16 13:00:00"), datetime.now(), "Emma", "Davis", "emma.davis@example.com"),
    (26, ts("2023-09-16 14:00:00"), datetime.now(), "Frank", "Miller", "frank.miller@example.com"),
    (27, ts("2023-09-16 15:00:00"), datetime.now(), "Grace", "Brown", "grace.brown@example.com"),
    (28, ts("2023-09-16 16:00:00"), datetime.now(), "Henry", "Garcia", "henry.garcia@example.com"),
    (29, ts("2023-09-16 17:00:00"), datetime.now(), "Isabella", "Martinez", "isabella.martinez@example.com"),
    (30, ts("2023-09-16 18:00:00"), datetime.now(), "James", "Lee", "james.lee@example.com")
]


# Explicit schema for the new customers DataFrame. Declaring the Spark types up
# front yields an integer "id" directly, instead of letting Spark infer a long
# from the Python ints and casting the column afterwards.
new_customers_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("created", TimestampType(), True),
    StructField("updated", TimestampType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("email", StringType(), True),
])

# Column names, kept under the original variable name as a plain list
schema = [field.name for field in new_customers_schema.fields]

# Create a DataFrame for the new customers
new_customers_df = spark.createDataFrame(new_customer_records, new_customers_schema)

# Show the new DataFrame
new_customers_df.show()

+---+-------------------+--------------------+----------+---------+--------------------+
| id|            created|             updated|first_name|last_name|               email|
+---+-------------------+--------------------+----------+---------+--------------------+
| 21|2023-09-16 09:00:00|2023-09-16 11:58:...|     Alice|    Smith|alice.smith@examp...|
| 22|2023-09-16 10:00:00|2023-09-16 11:58:...|       Bob|  Johnson|bob.johnson@examp...|
| 23|2023-09-16 11:00:00|2023-09-16 11:58:...|   Charlie|    Brown|charlie.brown@exa...|
| 24|2023-09-16 12:00:00|2023-09-16 11:58:...|     David|   Wilson|david.wilson@exam...|
| 25|2023-09-16 13:00:00|2023-09-16 11:58:...|      Emma|    Davis|emma.davis@exampl...|
| 26|2023-09-16 14:00:00|2023-09-16 11:58:...|     Frank|   Miller|frank.miller@exam...|
| 27|2023-09-16 15:00:00|2023-09-16 11:58:...|     Grace|    Brown|grace.brown@examp...|
| 28|2023-09-16 16:00:00|2023-09-16 11:58:...|     Henry|   Garcia|henry.garcia@exam...|
| 29|2023-09-16 17:00

In [19]:
# Print the column names and types of the new rows before they are written to the database
new_customers_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)



## Write Results to the Database

In [20]:
# Append the new customer rows to the existing "bettercustomers" table.
# NOTE: append mode adds these rows on every execution, so re-running this cell
# attempts to insert ids 21-30 a second time.
# NOTE(review): connection URL and credentials are hard-coded in this cell.
(
    new_customers_df.write.format("jdbc")
    .options(
        url="jdbc:postgresql://oasispostgres:5432/oasiscorp",
        dbtable="bettercustomers",
        user="oasis",
        password="oasis",
    )
    .mode("append")
    .save()
)

## Reload the Updated Table

In [21]:
# Read "bettercustomers" from PostgreSQL again, into a second DataFrame, so the
# rows appended above can be inspected.
# NOTE(review): connection URL and credentials are hard-coded in this cell.
customers_sdf2 = (
    spark.read.format("jdbc")
    .options(
        url="jdbc:postgresql://oasispostgres:5432/oasiscorp",
        dbtable="bettercustomers",
        user="oasis",
        password="oasis",
    )
    .load()
)

# Preview the first rows (show() truncates long cell values by default)
customers_sdf2.show()

+---+--------------------+--------------------+----------+---------+--------------------+
| id|             created|             updated|first_name|last_name|               email|
+---+--------------------+--------------------+----------+---------+--------------------+
|  1|2023-09-15 15:34:...|2023-09-15 15:34:...|      John|      Doe| johndoe@example.com|
|  2|2023-09-15 15:34:...|2023-09-15 15:34:...|      Jane|    Smith|janesmith@example...|
|  3|2023-09-15 15:34:...|2023-09-15 15:34:...|       Bob|  Johnson|bobjohnson@exampl...|
|  4|2023-09-15 15:34:...|2023-09-15 15:34:...|     Alice|      Lee|alicelee@example.com|
|  5|2023-09-15 15:34:...|2023-09-15 15:34:...|     David|      Kim|davidkim@example.com|
|  6|2023-09-15 15:34:...|2023-09-15 15:34:...|     Linda|   Nguyen|lindanguyen@examp...|
|  7|2023-09-15 15:34:...|2023-09-15 15:34:...|      Mike|   Garcia|mikegarcia@exampl...|
|  8|2023-09-15 15:34:...|2023-09-15 15:34:...|     Emily|     Chen|emilychen@example...|
|  9|2023-

In [22]:
# Show the contents of the reloaded DataFrame again, this time without
# truncating long values such as timestamps and e-mail addresses
customers_sdf2.show(truncate=False)

+---+--------------------------+--------------------------+----------+---------+-----------------------+
|id |created                   |updated                   |first_name|last_name|email                  |
+---+--------------------------+--------------------------+----------+---------+-----------------------+
|1  |2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|John      |Doe      |johndoe@example.com    |
|2  |2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|Jane      |Smith    |janesmith@example.com  |
|3  |2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|Bob       |Johnson  |bobjohnson@example.com |
|4  |2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|Alice     |Lee      |alicelee@example.com   |
|5  |2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|David     |Kim      |davidkim@example.com   |
|6  |2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680532|Linda     |Nguyen   |lindanguyen@example.com|
|7  |2023-09-15 15:34:29.680532|2023-09-15 15:34:29.680

In [26]:
# Register the reloaded DataFrame as the temporary view "customerss". The name
# differs from the earlier "customers" view, so that view is left in place.
customers_sdf2.createOrReplaceTempView("customerss")

In [28]:
# Select the rows with id greater than 20, i.e. the records appended earlier.
# NOTE(review): this reuses the variable name first_name_c from the earlier
# LIKE query even though the filter here is on id, not first_name.
first_name_c = spark.sql("SELECT * FROM customerss WHERE id > 20")
first_name_c.show()

+---+-------------------+--------------------+----------+---------+--------------------+
| id|            created|             updated|first_name|last_name|               email|
+---+-------------------+--------------------+----------+---------+--------------------+
| 21|2023-09-16 09:00:00|2023-09-16 11:58:...|     Alice|    Smith|alice.smith@examp...|
| 22|2023-09-16 10:00:00|2023-09-16 11:58:...|       Bob|  Johnson|bob.johnson@examp...|
| 23|2023-09-16 11:00:00|2023-09-16 11:58:...|   Charlie|    Brown|charlie.brown@exa...|
| 24|2023-09-16 12:00:00|2023-09-16 11:58:...|     David|   Wilson|david.wilson@exam...|
| 25|2023-09-16 13:00:00|2023-09-16 11:58:...|      Emma|    Davis|emma.davis@exampl...|
| 26|2023-09-16 14:00:00|2023-09-16 11:58:...|     Frank|   Miller|frank.miller@exam...|
| 27|2023-09-16 15:00:00|2023-09-16 11:58:...|     Grace|    Brown|grace.brown@examp...|
| 28|2023-09-16 16:00:00|2023-09-16 11:58:...|     Henry|   Garcia|henry.garcia@exam...|
| 29|2023-09-16 17:00