In [0]:
from pyspark.sql.functions import col, date_format, lit

# Load Silver: only active employees are read, so every frame derived from
# `emp` below is already restricted to is_active = True.
emp = spark.sql("Select * from edl_hc_datamart.silver.employees WHERE is_active = True")

# DIM EMPLOYEE
# Project only the columns written to the Gold dimension table (intended to
# match the existing Delta table schema). `employee_id` is exposed under the
# dimension key name `employee_key`.
# No extra is_active filter is applied here: the WHERE clause above already
# enforces it, and `is_active` is not one of the projected columns.
dim_employee = emp.select(
    col("employee_id").alias("employee_key"),
    "first_name",
    "last_name",
    "email",
    "date_of_birth_fmt",
    "dept_code",
)

# Full refresh of the Gold dimension on every run.
dim_employee.write.format("delta").mode("overwrite").saveAsTable("edl_hc_datamart.gold.dim_employees")



In [0]:
# DIM DEPARTMENT
# `dept` is kept unfiltered on purpose: it is reused by name later in the
# notebook (the job-history fact join references it).
dept = spark.sql("Select * from edl_hc_datamart.silver.departments")

# Filter on is_active BEFORE projecting. The projection below keeps only
# dept_code and dept_name, so filtering afterwards would reference a column
# that is no longer part of the frame.
# Only the columns intended to match the existing Delta table schema are kept.
dim_department = (dept
    .filter(col("is_active") == True)
    .select(col("dept_code"), "dept_name"))

# Full refresh of the Gold dimension on every run.
dim_department.write.format("delta").mode("overwrite").saveAsTable("edl_hc_datamart.gold.dim_departments")



In [0]:
# DIM DATE (derived from job start dates)
# `jobs` is loaded with all columns because later cells reuse it by name.
jobs = spark.sql("SELECT * from edl_hc_datamart.silver.jobs")

# The is_active filter must run BEFORE the projection: once the frame is
# reduced to distinct `start_date` values, `is_active` is no longer available
# and the filter cannot be resolved.
# NOTE(review): assumes silver.jobs exposes an `is_active` column — confirm.
dim_date = (jobs
    .filter(col("is_active") == True)
    .select(col("start_date").alias("date_key"))
    .distinct()
    # Human-readable label, e.g. 05-Jan-2024.
    .withColumn("date_label", date_format(col("date_key"), "dd-MMM-yyyy")))

# Full refresh of the Gold dimension on every run.
dim_date.write.format("delta").mode("overwrite").saveAsTable("edl_hc_datamart.gold.dim_date")



In [0]:

# FACT JOB HISTORY
# Relies on `jobs`, `emp` and `dept` loaded by the earlier cells of this notebook.
# Jobs are left-joined to employees (on employee_id) and then to departments
# (via the employee's dept_code).
# NOTE(review): `dept` is the unfiltered departments frame, so dept_key may
# reference departments that the is_active-filtered dim_departments table
# does not contain — confirm this is intended.
fact_job_history = (jobs.alias("j")
    .join(emp.alias("e"), ["employee_id"], "left")
    .join(dept.alias("d"), col("d.dept_code") == col("e.dept_code"), "left")
    .select(
        col("j.employee_id").alias("employee_key"),
        col("j.position_id"),
        col("j.job_title"),
        col("j.start_date").alias("start_date_key"),
        col("j.end_date").alias("end_date_key"),
        col("j.status"),
        col("d.dept_code").alias("dept_key"),
    )
    # Keeps only jobs that matched an active employee: rows with no employee
    # match have a NULL e.is_active and are dropped, so the left join above
    # effectively behaves like an inner join.
    .filter(col("e.is_active") == True))

# saveAsTable() returns None, so its result is deliberately not bound to a name.
fact_job_history.write.format("delta").mode("overwrite").saveAsTable("edl_hc_datamart.gold.fact_job_history")



In [0]:
%sql
-- View over the Gold job-history fact table, limited to rows whose dept_key is 'ADM'.
CREATE OR REPLACE VIEW edl_hc_datamart.gold_views.employee_job_history_vw
WITH SCHEMA EVOLUTION
AS
SELECT *
FROM edl_hc_datamart.gold.fact_job_history
WHERE dept_key = 'ADM';

In [0]:
from pyspark.sql.functions import sum, avg, min, max
# NOTE: the import above rebinds the Python builtins `sum`, `min` and `max`
# to the Spark column functions for the rest of the notebook session.

# Aggregates salary statistics by department and job title.
#
# Steps:
# 1. Loads job and employee tables from the Silver layer.
# 2. Joins jobs with employees to associate department codes.
# 3. Groups by department and job title, calculating total, average, minimum,
#    and maximum salary.
# 4. Writes the aggregated results to the Gold layer as the
#    'salary_agg_by_dept_job' Delta table.

# Load the salary-related job columns under a dedicated name. Reusing the name
# `jobs` here would replace the full-schema `jobs` frame that earlier cells
# read start_date / end_date / status from, breaking them on re-run.
jobs_salary = spark.sql("SELECT employee_id, position_id, job_title, salary_amount FROM edl_hc_datamart.silver.jobs")
# NOTE(review): employees are not restricted to is_active here, unlike the
# employee dimension load — confirm that inactive employees should be included.
employees = spark.sql("SELECT employee_id, dept_code FROM edl_hc_datamart.silver.employees")

# Left join keeps every job row; jobs without a matching employee get a NULL dept_code.
jobs_with_dept = jobs_salary.join(employees, jobs_salary["employee_id"] == employees["employee_id"], "left")

# Aggregate salary by department and job title
salary_agg = (jobs_with_dept
    .groupBy("dept_code", "job_title")
    .agg(
        sum("salary_amount").alias("total_salary"),
        avg("salary_amount").alias("avg_salary"),
        min("salary_amount").alias("min_salary"),
        max("salary_amount").alias("max_salary")
    )
    .orderBy("dept_code", "job_title")
)

# Full refresh of the Gold aggregate on every run.
salary_agg.write.format("delta").mode("overwrite").saveAsTable("edl_hc_datamart.gold.salary_agg_by_dept_job")

In [0]:
%sql
-- View over the Gold salary aggregate table, limited to rows whose dept_code is 'ADM'.
CREATE OR REPLACE VIEW edl_hc_datamart.gold_views.salary_agg_by_dept_job_vw
WITH SCHEMA EVOLUTION
AS
SELECT *
FROM edl_hc_datamart.gold.salary_agg_by_dept_job
WHERE dept_code = 'ADM';