In [0]:
# Load the delimited landing files into DataFrames and persist each one to the
# bronze volume as a Delta dataset, replacing whatever was stored there before.
# Every file is read with a header row and with Spark-inferred column types.

# Employees (default comma separator)
df_emp = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/Volumes/edl_hc_mart/landing/landing/employees.csv")
)
(
    df_emp.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/edl_hc_mart/bronze/bronze/employees")
)

# Departments (tab-separated file, hence the explicit separator)
df_dept = (
    spark.read
    .options(header="true", sep="\t", inferSchema="true")
    .csv("/Volumes/edl_hc_mart/landing/landing/departments.tsv")
)
(
    df_dept.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/edl_hc_mart/bronze/bronze/departments")
)

# Attendance (default comma separator)
df_att = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/Volumes/edl_hc_mart/landing/landing/attendance.csv")
)
(
    df_att.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/edl_hc_mart/bronze/bronze/attendance")
)

In [0]:
# Ingest jobs.json from the landing volume using an explicit schema (nothing is
# inferred) and persist the result to the bronze volume as a Delta dataset.

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# All columns are nullable. employee_id is the only integer column; the rest,
# including both date columns, are declared as plain strings.
jobs_schema = StructType(
    [StructField("employee_id", IntegerType(), nullable=True)]
    + [
        StructField(column, StringType(), nullable=True)
        for column in ("position_id", "job_title", "start_date", "end_date", "status")
    ]
)

df_jobs = spark.read.json(
    "/Volumes/edl_hc_mart/landing/landing/jobs.json",
    schema=jobs_schema,
    mode="FAILFAST",  # malformed records raise an error instead of being tolerated
)

# Overwrite the bronze copy of the jobs data with the freshly read DataFrame.
(
    df_jobs.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/edl_hc_mart/bronze/bronze/jobs")
)

In [0]:
# Optional: uncomment the line below to visually inspect the jobs DataFrame.
# display(df_jobs)

In [0]:
# Expose the four loaded DataFrames to Spark SQL as temporary views, then show
# the row count of each view with a `select count(*)` query.

# Register each DataFrame under its view name (jobs first, as before).
for view_name, frame in (
    ("jobs", df_jobs),
    ("employees", df_emp),
    ("departments", df_dept),
    ("attendance", df_att),
):
    frame.createOrReplaceTempView(view_name)

# One count query per view; each result is rendered as its own output table.
for view_name in ("employees", "departments", "attendance", "jobs"):
    display(spark.sql(f"select count(*) from {view_name}"))

In [0]:
"""
Optional ad-hoc row counts for the loaded DataFrames.

Every count call below is commented out, so this cell computes nothing as written.
Uncomment a line to count the rows of the corresponding DataFrame.
"""

# df_jobs.count()
# df_emp.count()
# df_dept.count()
# df_att.count()

In [0]:

# Sanity check: parse jobs.json with the standard-library JSON parser and print
# the size of the top-level container. That equals the number of records only
# if the file holds a single JSON array -- TODO confirm the file layout.
import json

# Pass the encoding explicitly: without it, open() decodes using the driver's
# locale default, which is not guaranteed to be UTF-8.
with open("/Volumes/edl_hc_mart/landing/landing/jobs.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(len(data))
