In [None]:
# Spark Session
from pyspark.sql import SparkSession

# Configure the builder step by step, then create (or reuse) the session.
# Hive support and the explicit warehouse directory are set here, before
# getOrCreate(), because the table cells later in the notebook rely on them.
_builder = (
    SparkSession.builder
    .appName("Spark SQL")
    .master("spark://spark-master:7077")
)
_builder = _builder.enableHiveSupport()
_builder = _builder.config("spark.sql.warehouse.dir", "/data/output/spark-warehouse")
spark = _builder.getOrCreate()

spark

In [None]:
# Read Employee data
# An explicit DDL schema is supplied instead of inferring one; note that dob
# is declared as a plain string, not a date.
_schema = (
    "first_name string, last_name string, job_title string, "
    "dob string, email string, phone string, "
    "salary double, department_id int"
)

# header=True: the first line of the CSV holds column names, not data.
emp = spark.read.csv(
    "/data/input/employee_records_skewed.csv",
    schema=_schema,
    header=True,
)

In [None]:
# Read DEPT CSV data
# Explicit DDL schema; department_id is the only non-string column and is the
# key the employee data is joined on later in the notebook.
_dept_schema = (
    "department_id int, department_name string, description string, "
    "city string, state string, country string"
)

# header=True: the first line of the CSV holds column names, not data.
dept = spark.read.csv(
    "/data/input/department_data.csv",
    schema=_dept_schema,
    header=True,
)

In [None]:
# Spark Catalog (Metadata) - in-memory/hive
# Reads the session setting that names the catalog implementation in use.
# The session in this notebook is built with enableHiveSupport(), so this is
# expected to report the hive catalog rather than the in-memory one
# (TODO confirm on the target cluster).

spark.conf.get("spark.sql.catalogImplementation")

In [None]:
# Show databases
# `db` is the DataFrame returned by the "show databases" statement;
# .show() prints it in the cell output.
db = spark.sql("show databases")
db.show()

In [None]:
# List what is currently registered in the `default` database.
_default_tables = spark.sql("show tables in default")
_default_tables.show()

In [None]:
# Register dataframes as temp views
# Each DataFrame is exposed to Spark SQL under the given view name;
# createOrReplaceTempView replaces any existing view of that name, so this
# cell is safe to re-run.
for _frame, _view_name in ((emp, "emp_view"), (dept, "dept_view")):
    _frame.createOrReplaceTempView(_view_name)


In [None]:
# Show tables/view in catalog
# Lists the entries of the `default` database again, now that emp_view and
# dept_view have been registered, using the same statement as the earlier
# listing cell so the two outputs can be compared.
spark.sql("show tables in default").show()

In [None]:
# View data from table
# Select every column of emp_view, keeping only rows with department_id 1.
# The query text is held in its own variable so it can be inspected or
# reused separately from the call that runs it.
_dept_one_sql = """
    select * from emp_view
    where department_id = 1
"""

emp_filtered = spark.sql(_dept_one_sql)

In [None]:
# Preview the employees selected for department_id = 1.
emp_filtered.show()

In [None]:
# Create a new column dob_year and register as temp view
# dob is read as a string column (see the employee schema), so date_format
# presumably relies on an implicit string-to-timestamp cast here; values that
# cannot be parsed would likely come back as null - TODO confirm the dob
# format in the input file.
_dob_year_sql = """
    select e.*, date_format(dob, 'yyyy') as dob_year from emp_view e
"""

emp_temp = spark.sql(_dob_year_sql)


In [None]:
# Expose the DataFrame carrying the derived dob_year column to Spark SQL
# under the name emp_temp_view (replacing any existing view of that name).
emp_temp.createOrReplaceTempView("emp_temp_view")

In [None]:
# Query the view by name to check that dob_year is available through SQL.
_emp_temp_rows = spark.sql("select * from emp_temp_view")
_emp_temp_rows.show()

In [None]:
# Join emp and dept - HINTs
# The /*+ BROADCAST(d) */ hint asks the optimizer to broadcast the dept side
# (alias d) of the join. It is a left outer join, so every emp row is kept
# and gains department_name, which is null when no department matches.
_emp_dept_join_sql = """
    select /*+ BROADCAST(d) */
    e.* , d.department_name
    from emp_view e left outer join dept_view d
    on e.department_id = d.department_id
"""

emp_final = spark.sql(_emp_dept_join_sql)

In [None]:
# Show emp data
# Previews the joined result: all employee columns plus department_name.

emp_final.show()

In [None]:
# Write the data as Table
# Saves the joined DataFrame as a Parquet-backed table named emp_final.
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an "already exists" error on any later run, because the session uses Hive
# support and the table therefore outlives the kernel (e.g. on Restart & Run
# All). Overwriting is safe here since emp_final is derived from the CSV-backed
# views, not from the table being replaced.

emp_final.write.format("parquet").mode("overwrite").saveAsTable("emp_final")

In [None]:
# Read the data from Table
# Queries the emp_final *table* by name through Spark SQL. This is distinct
# from the emp_final DataFrame variable, which merely shares the name.

emp_new = spark.sql("select * from emp_final")

In [None]:
# Preview the rows read back from the emp_final table.
emp_new.show()

In [None]:
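# Persist metadata - presumably already covered when the session was created
# (enableHiveSupport() plus spark.sql.warehouse.dir), so no extra step is run
# in this cell; TODO confirm.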



In [None]:
# Show details of metadata
# "describe extended" returns one row per column followed by the detailed
# table information (provider, location, ...). The default show() prints only
# 20 rows and cuts each value at 20 characters, which hides most of that
# detail, so request more rows and disable truncation.

spark.sql("describe extended emp_final").show(n=100, truncate=False)