### Install PySpark

In [2]:
# One-time setup: uncomment to install PySpark into this kernel's environment
# (%pip targets the running kernel, unlike !pip).
# %pip install pyspark

### Import libraries

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
import os

### Set Java home

In [4]:
# Point PySpark at the local JDK install.
# A raw string keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\P", which newer Python versions
# warn about.
# NOTE: this path is machine-specific; adjust it to your own JDK location.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-18.0.2.1"

### Initialize the Spark context and session

In [5]:
# Spark configuration: application name, master URL, and the extra driver
# classpath entry (every file under C:/pyspark/).
conf = (
    SparkConf()
    .setAppName("Example")
    .setMaster("local")
    .set("spark.driver.extraClassPath", "C:/pyspark/*")
)

In [6]:
# Reuse the running SparkContext if there is one, otherwise start it from `conf`.
sc = SparkContext.getOrCreate(conf=conf)
# Obtain the session through the builder (the documented entry point) rather
# than calling the SparkSession constructor directly; getOrCreate() returns the
# existing session when this cell is re-run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [7]:
# Evaluate the session object so the notebook displays its representation.
spark

### Read CSV File

In [8]:
# Load the AdventureWorks sales extract: comma-delimited, first row is the header.
# No schema is supplied or inferred here.
df = (
    spark.read
    .option("delimiter", ",")
    .option("header", True)
    .csv(r"C:\Users\haq\OneDrive\Notebooks\data\AdvWorksData.csv")
)
df.show()

+---------------+------------------+--------------------+--------------+--------------+---------+------------+---------+---------+-------------+---------+
|productcategory|productsubcategory|             product| saleterritory|       Country|OrderDate|StandardCost|UnitPrice|ListPrice|OrderQuantity|    Sales|
+---------------+------------------+--------------------+--------------+--------------+---------+------------+---------+---------+-------------+---------+
|       Clothing|              Caps|        AWC Logo Cap|United Kingdom|United Kingdom|6/30/2012|      6.9223|   5.0136|   8.6442|           14|68.786592|
|    Accessories|             Locks|          Cable Lock|United Kingdom|United Kingdom|6/30/2012|     10.3125|       15|       25|            6|       90|
|       Clothing|            Gloves|Full-Finger Glove...|United Kingdom|United Kingdom|6/30/2012|     15.6709|   22.794|    37.99|            8|  182.352|
|       Clothing|            Gloves|Full-Finger Glove...|United Kingdo

In [9]:
# Print the column names and the types Spark assigned when reading the CSV.
df.printSchema()

root
 |-- productcategory: string (nullable = true)
 |-- productsubcategory: string (nullable = true)
 |-- product: string (nullable = true)
 |-- saleterritory: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- StandardCost: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- ListPrice: string (nullable = true)
 |-- OrderQuantity: string (nullable = true)
 |-- Sales: string (nullable = true)



### Common dataframe actions

In [11]:
# Keep the filtered DataFrame in `france`; show() only prints and returns None,
# so chaining it onto the assignment would leave `france` bound to None.
france = df.filter(df.saleterritory == "France")
france.show(truncate=False)

+---------------+------------------+------------------------------+-------------+-------+----------+------------+---------+---------+-------------+---------+
|productcategory|productsubcategory|product                       |saleterritory|Country|OrderDate |StandardCost|UnitPrice|ListPrice|OrderQuantity|Sales    |
+---------------+------------------+------------------------------+-------------+-------+----------+------------+---------+---------+-------------+---------+
|Components     |Handlebars        |HL Mountain Handlebars        |France       |France |6/30/2012 |53.3999     |65.6018  |109.3364 |1            |65.6018  |
|Clothing       |Gloves            |Full-Finger Gloves, L         |France       |France |9/30/2012 |15.6709     |22.794   |37.99    |3            |68.382   |
|Clothing       |Gloves            |Full-Finger Gloves, S         |France       |France |9/30/2012 |15.6709     |22.794   |37.99    |1            |22.794   |
|Bikes          |Mountain Bikes    |Mountain-200 Bla

In [12]:
# Keep only the category, territory, order-date and sales columns.
df1 = df.select('productcategory', 'saleterritory', 'OrderDate', 'Sales')
df1.show()

+---------------+--------------+---------+---------+
|productcategory| saleterritory|OrderDate|    Sales|
+---------------+--------------+---------+---------+
|       Clothing|United Kingdom|6/30/2012|68.786592|
|    Accessories|United Kingdom|6/30/2012|       90|
|       Clothing|United Kingdom|6/30/2012|  182.352|
|       Clothing|United Kingdom|6/30/2012| 317.5964|
|       Clothing|United Kingdom|6/30/2012|  159.558|
|       Clothing|United Kingdom|6/30/2012|   45.588|
|       Clothing|United Kingdom|6/30/2012|   22.794|
|       Clothing|United Kingdom|6/30/2012|   22.794|
|       Clothing|United Kingdom|6/30/2012|  42.3867|
|       Clothing|United Kingdom|6/30/2012| 113.0312|
|       Clothing|United Kingdom|6/30/2012|  42.3867|
|     Components|United Kingdom|6/30/2012|  826.164|
|     Components|United Kingdom|6/30/2012|  149.676|
|     Components|United Kingdom|6/30/2012| 1472.291|
|     Components|United Kingdom|6/30/2012| 736.1455|
|     Components|United Kingdom|6/30/2012| 744

In [13]:
# Count the rows recorded for each sales territory.
saleterritory = df.groupBy('saleterritory').count()
# show() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
saleterritory.show()

+--------------+-----+
| saleterritory|count|
+--------------+-----+
|       Germany| 1864|
|        France| 3530|
|     Northwest| 7872|
|     Southeast| 5937|
|       Central| 5812|
|        Canada|11444|
|     Southwest|13379|
|     Australia| 1713|
|United Kingdom| 3520|
|     Northeast| 5809|
+--------------+-----+

None


### Use Spark SQL to query data

In [15]:
# Expose the DataFrame to Spark SQL under the view name "sales",
# then select every row whose subcategory is 'Caps'.
df.createOrReplaceTempView("sales")
output = spark.sql(
    "SELECT * from sales where productsubcategory='Caps'"
)
output.show()

+---------------+------------------+------------+--------------+--------------+----------+------------+---------+---------+-------------+---------+
|productcategory|productsubcategory|     product| saleterritory|       Country| OrderDate|StandardCost|UnitPrice|ListPrice|OrderQuantity|    Sales|
+---------------+------------------+------------+--------------+--------------+----------+------------+---------+---------+-------------+---------+
|       Clothing|              Caps|AWC Logo Cap|United Kingdom|United Kingdom| 6/30/2012|      6.9223|   5.0136|   8.6442|           14|68.786592|
|       Clothing|              Caps|AWC Logo Cap|United Kingdom|United Kingdom| 9/30/2012|      6.9223|   5.1865|   8.6442|            2|   10.373|
|       Clothing|              Caps|AWC Logo Cap|United Kingdom|United Kingdom| 3/30/2013|      6.9223|   5.1865|   8.6442|            5|  25.9325|
|       Clothing|              Caps|AWC Logo Cap|United Kingdom|United Kingdom| 4/30/2013|      6.9223|   5.1865

In [16]:
# The CSV was read without a schema, so UnitPrice and OrderQuantity are string
# columns. Cast them explicitly so both comparisons are numeric instead of
# depending on Spark's implicit string-to-number coercion.
# NOTE(review): implicit coercion may compare UnitPrice as an integer and so
# drop fractional prices just above 20 — confirm against the Spark version in use.
output = spark.sql(
    "SELECT * FROM sales "
    "WHERE CAST(UnitPrice AS DOUBLE) > 20 "
    "AND CAST(OrderQuantity AS INT) >= 10"
)
output.show()

+---------------+------------------+--------------------+--------------+--------------+----------+------------+---------+---------+-------------+-----------+
|productcategory|productsubcategory|             product| saleterritory|       Country| OrderDate|StandardCost|UnitPrice|ListPrice|OrderQuantity|      Sales|
+---------------+------------------+--------------------+--------------+--------------+----------+------------+---------+---------+-------------+-----------+
|       Clothing|           Jerseys|Long-Sleeve Logo ...|United Kingdom|United Kingdom| 6/30/2012|     38.4923|   27.879|  48.0673|           14|  382.49988|
|       Clothing|            Tights|   Women's Tights, L|United Kingdom|United Kingdom| 6/30/2012|     30.9334|  43.4942|    74.99|           13| 554.116108|
|       Clothing|            Tights|   Women's Tights, S|United Kingdom|United Kingdom| 6/30/2012|     30.9334|  43.4942|    74.99|           13| 554.116108|
|       Clothing|             Vests|     Classic Ves

In [17]:
# Row count per product category, computed with Spark SQL over the "sales" view.
output = spark.sql(
    'SELECT COUNT(*) as total, productcategory from sales GROUP BY productcategory'
)
output.show()

+-----+---------------+
|total|productcategory|
+-----+---------------+
|24800|          Bikes|
|12260|       Clothing|
| 5098|    Accessories|
|18683|     Components|
|   39|           NULL|
+-----+---------------+



### Persist data to database

In [18]:
# Destination table (schema-qualified, quoted name) and target database.
dest_tbl = 'public."pyspark_sales_table"'
database = "AdventureWorks"
# Read the credentials from the environment (PGUID / PGPASS) so real secrets
# never need to be written into the notebook. The literals are demo-only
# fallbacks that keep the example runnable; do not use them outside a demo.
password = os.environ.get("PGPASS", "demopass")
user = os.environ.get("PGUID", "etl")

In [19]:
# Write the DataFrame to PostgreSQL over JDBC using save mode "overwrite".
# The connection settings come from the variables defined in the previous cell.
(
    df.write
    .mode("overwrite")
    .format("jdbc")
    .options(
        url=f"jdbc:postgresql://localhost:5432/{database}",
        dbtable=dest_tbl,
        user=user,
        password=password,
        driver="org.postgresql.Driver",
    )
    .save()
)