In [23]:
from pyspark.sql import SparkSession

# Location of the PostgreSQL JDBC driver jar; replace with the path on your machine.
jar_path = "path/to/postgresql-42.x.x.jar"

# Build (or reuse) a local SparkSession with the driver jar registered via spark.jars.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TestSession")
    .config("spark.jars", jar_path)
    .getOrCreate()
)

print(spark.version)

3.5.2


In [24]:
# Confirm which jar(s) the running session has configured under spark.jars.
registered_jars = spark.conf.get("spark.jars")
print(registered_jars)

postgresql-42.7.5.jar


In [25]:
# Load the shopping-trends CSV: the first row is treated as the header and
# column types are inferred from the data.
shopping = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("data/shopping_trends.csv")
)

In [26]:
# Print the column names and types of the loaded DataFrame.
shopping.printSchema()

root
 |-- Customer ID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Item Purchased: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Purchase Amount (USD): integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Review Rating: double (nullable = true)
 |-- Subscription Status: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Shipping Type: string (nullable = true)
 |-- Discount Applied: string (nullable = true)
 |-- Promo Code Used: string (nullable = true)
 |-- Previous Purchases: integer (nullable = true)
 |-- Preferred Payment Method: string (nullable = true)
 |-- Frequency of Purchases: string (nullable = true)



In [27]:
# Preview the first five rows of the raw data.
shopping.show(n=5)

+-----------+---+------+--------------+--------+---------------------+-------------+----+---------+------+-------------+-------------------+--------------+-------------+----------------+---------------+------------------+------------------------+----------------------+
|Customer ID|Age|Gender|Item Purchased|Category|Purchase Amount (USD)|     Location|Size|    Color|Season|Review Rating|Subscription Status|Payment Method|Shipping Type|Discount Applied|Promo Code Used|Previous Purchases|Preferred Payment Method|Frequency of Purchases|
+-----------+---+------+--------------+--------+---------------------+-------------+----+---------+------+-------------+-------------------+--------------+-------------+----------------+---------------+------------------+------------------------+----------------------+
|          1| 55|  Male|        Blouse|Clothing|                   53|     Kentucky|   L|     Gray|Winter|          3.1|                Yes|   Credit Card|      Express|             Yes|    

In [40]:
from pyspark.sql.functions import col

# Keep a subset of the original columns and give each one a short name for
# later use (original column name -> new name).
column_aliases = {
    "Customer ID": "id",
    "Age": "age",
    "Item Purchased": "item",
    "Category": "category",
    "Purchase Amount (USD)": "amount",
    "Location": "location",
    "Size": "size",
    "Color": "color",
    "Season": "season",
    "Review Rating": "rating",
}
df_filter = shopping.select(
    [col(source).alias(short) for source, short in column_aliases.items()]
)

df_filter.show(4)

+---+---+-------+--------+------+-------------+----+------+------+------+
| id|age|   item|category|amount|     location|size| color|season|rating|
+---+---+-------+--------+------+-------------+----+------+------+------+
|  1| 55| Blouse|Clothing|    53|     Kentucky|   L|  Gray|Winter|   3.1|
|  2| 19|Sweater|Clothing|    64|        Maine|   L|Maroon|Winter|   3.1|
|  3| 50|  Jeans|Clothing|    73|Massachusetts|   S|Maroon|Spring|   3.1|
|  4| 21|Sandals|Footwear|    90| Rhode Island|   M|Maroon|Spring|   3.5|
+---+---+-------+--------+------+-------------+----+------+------+------+
only showing top 4 rows



In [41]:
import os

# Database connection settings.
# The URL and credentials are read from environment variables so that real
# secrets never have to be typed into (and saved with) the notebook:
#   SHOPPING_JDBC_URL     e.g. jdbc:postgresql://<host>:<port>/<database>
#   SHOPPING_DB_USER      database user name
#   SHOPPING_DB_PASSWORD  database password
# The placeholder strings below are only fallbacks for when a variable is unset.
jdbc_url = os.environ.get(
    "SHOPPING_JDBC_URL", "jdbc:postgresql://<host>:<port>/<database>"
)
db_properties = {
    "user": os.environ.get("SHOPPING_DB_USER", "<username>"),
    "password": os.environ.get("SHOPPING_DB_PASSWORD", "<password>"),
    "driver": "org.postgresql.Driver",
}

# Write the renamed columns to the "shopping_trends" table over JDBC.
# mode("overwrite") overwrites any data already stored in that table.
(
    df_filter.write
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "shopping_trends")
    .option("user", db_properties["user"])
    .option("password", db_properties["password"])
    .option("driver", db_properties["driver"])
    .mode("overwrite")
    .save()
)

In [42]:
# Read the "shopping_trends" table back from the database, using the same
# connection settings as the write step.
jdbc_read_options = {
    "url": jdbc_url,
    "dbtable": "shopping_trends",
    "user": db_properties["user"],
    "password": db_properties["password"],
    "driver": db_properties["driver"],
}
read_df = spark.read.format("jdbc").options(**jdbc_read_options).load()

# Display the rows read back from the database.
read_df.show()

+---+---+----------+-----------+------+-------------+----+---------+------+------+
| id|age|      item|   category|amount|     location|size|    color|season|rating|
+---+---+----------+-----------+------+-------------+----+---------+------+------+
|  1| 55|    Blouse|   Clothing|    53|     Kentucky|   L|     Gray|Winter|   3.1|
|  2| 19|   Sweater|   Clothing|    64|        Maine|   L|   Maroon|Winter|   3.1|
|  3| 50|     Jeans|   Clothing|    73|Massachusetts|   S|   Maroon|Spring|   3.1|
|  4| 21|   Sandals|   Footwear|    90| Rhode Island|   M|   Maroon|Spring|   3.5|
|  5| 45|    Blouse|   Clothing|    49|       Oregon|   M|Turquoise|Spring|   2.7|
|  6| 46|  Sneakers|   Footwear|    20|      Wyoming|   M|    White|Summer|   2.9|
|  7| 63|     Shirt|   Clothing|    85|      Montana|   M|     Gray|  Fall|   3.2|
|  8| 27|    Shorts|   Clothing|    34|    Louisiana|   L| Charcoal|Winter|   3.2|
|  9| 26|      Coat|  Outerwear|    97|West Virginia|   L|   Silver|Summer|   2.6|
| 10

In [21]:
# Stop the SparkSession once the notebook's work is done.
spark.stop()