In [1]:
from time import sleep

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession. \
    builder. \
    appName("Data Sources"). \
    master("local"). \
    config("spark.jars", "data/jars/postgresql-42.2.19.jar"). \
    config("spark.sql.legacy.timeParserPolicy", "LEGACY"). \
    getOrCreate()


# config("spark.sql.warehouse.dir", "data/warehouse"). \
    

In [3]:
cars_df = spark.read.json("data/cars").cache()
cars_df.show()

assert(cars_df.count() != 0)



+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|      null|
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
|        12.0|        8|       304.0|       150|            16.0|       amc rebel sst|   USA|         3433|1970-01-01|
|        10.5|        8|       302.0|       140|

In [7]:
# Spark DSL 
american_cars_df = cars_df.\
    filter(col("Origin") == "Japan").\
    select(col("Name"))

american_cars_df.show()

+--------------------+
|                Name|
+--------------------+
|toyota corona mar...|
|        datsun pl510|
|        datsun pl510|
|       toyota corona|
| toyota corolla 1200|
|         datsun 1200|
|toyota corona har...|
|     mazda rx2 coupe|
|     datsun 510 (sw)|
|toyouta corona ma...|
|toyota corolla 16...|
|       toyota carina|
|          datsun 610|
|           maxda rx3|
|      toyota mark ii|
|         datsun b210|
| toyota corolla 1200|
|       toyota corona|
|          datsun 710|
|         honda civic|
+--------------------+
only showing top 20 rows



In [5]:
# store as a Spark table dont write data to storage EXTERNAL TABLE
#  DataFrame => SQL metastore
print("DataFrame => SQL metastore = EXTERNAL TABLE")
cars_df.createOrReplaceTempView("cars")

DataFrame => SQL metastore = EXTERNAL TABLE


In [6]:
# Spark SQL != Spark DSL
# run SQL queries on top of DFs known to Spark under a certain name
american_cars_df_v2 = spark.sql("SELECT Name FROM cars WHERE Origin = 'Japan'")
american_cars_df_v2.show()

+--------------------+
|                Name|
+--------------------+
|toyota corona mar...|
|        datsun pl510|
|        datsun pl510|
|       toyota corona|
| toyota corolla 1200|
|         datsun 1200|
|toyota corona har...|
|     mazda rx2 coupe|
|     datsun 510 (sw)|
|toyouta corona ma...|
|toyota corolla 16...|
|       toyota carina|
|          datsun 610|
|           maxda rx3|
|      toyota mark ii|
|         datsun b210|
| toyota corolla 1200|
|       toyota corona|
|          datsun 710|
|         honda civic|
+--------------------+
only showing top 20 rows



In [8]:
# DROPPING EXTERNAL TABLE DOES NOT DELETE DATA, ONLY IN METASTORE
print("DROPPING EXTERNAL TABLE")
spark.sql("DROP TABLE cars")
spark.sql("SELECT Name FROM cars WHERE Origin = 'Japan'")


DROPPING EXTERNAL TABLE


AnalysisException: Table or view not found: cars; line 1 pos 17;
'Project ['Name]
+- 'Filter ('Origin = Japan)
   +- 'UnresolvedRelation [cars], [], false


In [9]:
# We still can work with source DF and read data again
american_cars_df.show()
cars_df_again = spark.read.json("data/cars")
cars_df_again.show()

+--------------------+
|                Name|
+--------------------+
|toyota corona mar...|
|        datsun pl510|
|        datsun pl510|
|       toyota corona|
| toyota corolla 1200|
|         datsun 1200|
|toyota corona har...|
|     mazda rx2 coupe|
|     datsun 510 (sw)|
|toyouta corona ma...|
|toyota corolla 16...|
|       toyota carina|
|          datsun 610|
|           maxda rx3|
|      toyota mark ii|
|         datsun b210|
| toyota corolla 1200|
|       toyota corona|
|          datsun 710|
|         honda civic|
+--------------------+
only showing top 20 rows

+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|

In [10]:
# store DFs as Spark tables (files known to Spark) to Spark storage HDFS
#  DataFrame => SQL metastore + Spark storage
print("SAVING TO MANAGED TABLE")
cars_df.\
    write.\
    mode("overwrite").\
    saveAsTable("cars_managed_table")

# saveAsTable != save() or orc(...), parquet(....) 
# parquet("data/parquet"). \
#     save()

SAVING TO MANAGED TABLE


In [11]:
print("READ FROM MANAGED STORAGE 1")
assert(cars_df.count() != 0)
american_cars_df_v2 = spark.sql("SELECT * FROM cars_managed_table")
assert(american_cars_df_v2.count() != 0)
american_cars_df_v2.show()


READ FROM MANAGED STORAGE 1
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|      null|
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
|        12.0|        8|       304.0|       150|            16.0|       amc rebel sst|   USA|         3433|1970-01-01|
|        10.5|      

In [12]:
print("READ FROM MANAGED STORAGE 2")
cars_managed_df = spark.table("cars_managed_table")
# spark.table == spark.read.table
# cars_managed_df = spark.read.table("cars_managed_table")
assert (cars_managed_df.count() != 0)
cars_managed_df.show()


READ FROM MANAGED STORAGE 2
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|      null|
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
|        12.0|        8|       304.0|       150|            16.0|       amc rebel sst|   USA|         3433|1970-01-01|
|        10.5|      

In [13]:
# DROPPING MANGED TABLE Delete data from store
print("DROPPING MANGED TABLE")
spark.sql("DROP TABLE cars_managed_table")

DROPPING MANGED TABLE


DataFrame[]

In [14]:
# We can create and insert data inside Spark SQL
print("DDL DML INSIDE TABLE")
spark.sql("CREATE SCHEMA test")

spark.sql("CREATE TABLE test.students (name VARCHAR(64), address VARCHAR(64)) USING PARQUET PARTITIONED BY (student_id INT)")

spark.sql("INSERT INTO test.students VALUES('Bob Brown', '456 Taylor St, Cupertino', 222222),('Cathy Johnson', '789 Race Ave, Palo Alto', 333333)")

ddl_demo_df = spark.sql("SELECT * FROM test.students")
print("DDL_DEMO_DF TABLE")
assert (ddl_demo_df.count() != 0)
ddl_demo_df.show()


DDL DML INSIDE TABLE
DDL_DEMO_DF TABLE
+-------------+--------------------+----------+
|         name|             address|student_id|
+-------------+--------------------+----------+
|Cathy Johnson|789 Race Ave, Pal...|    333333|
|    Bob Brown|456 Taylor St, Cu...|    222222|
+-------------+--------------------+----------+



In [15]:
print("DROPPING MANGED TABLE")
spark.sql("DROP TABLE test.students")

 # https://spark.apache.org/docs/latest/sql-ref-syntax-dml-insert-into.html

DROPPING MANGED TABLE


DataFrame[]

Exersices:

1. show all employees and their max salary over time

2. show all employees who were never managers

3. for every employee, find the difference between their salary (current/latest) and the max salary of their department (departments table) (HARD)


In [None]:
# DOR THIS FROM PYCHARM PROJECT

driver = "org.postgresql.Driver"
url = "jdbc:postgresql://localhost:5432/spark"
user = "docker"
password = "docker"


def read_table(table_name):
    return spark.read. \
        format("jdbc"). \
        option("driver", driver). \
        option("url", url). \
        option("user", user). \
        option("password", password). \
        option("dbtable", "public." + table_name). \
        load()

employees_df = read_table("employees")
salaries_df = read_table("salaries")
dept_managers_df = read_table("dept_manager")
dept_emp_df = read_table("dept_emp")
departments_df = read_table("departments")

# save table names
employees_df.createOrReplaceTempView("employees")
salaries_df.createOrReplaceTempView("salaries")
dept_managers_df.createOrReplaceTempView("dept_manager")
dept_emp_df.createOrReplaceTempView("dept_emp")
departments_df.createOrReplaceTempView("departments")
