## Ingesting data from Silver Layer

In [9]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *

StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 11, Finished, Available, Finished)

In [1]:
#Reading data from silver table
silver_df = spark.read.format("delta").table("SilverLakehouse.silver_table")

StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 3, Finished, Available, Finished)

## Analytical queries

In [12]:

# Query 1: Students who have not submitted Assignment_101

not_submitted_df = silver_df \
    .filter((col("AssignmentID") == "A101") & col("SubmissionID").isNull()) \
    .select("StudentID", "Name", "Email")

# Formatted JSON Output
not_submitted_result = {
    "status": "success",
    "message": "Students who have not submitted Assignment_101",
    "data": not_submitted_df.collect()
}

display(not_submitted_result)

StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 14, Finished, Available, Finished)

{'status': 'success',
 'message': 'Students who have not submitted Assignment_101',
 'data': [Row(StudentID='S1089', Name='Anthony Everett', Email='ewilson@example.com'),
  Row(StudentID='S1078', Name='Marie Gilbert', Email='umarshall@example.net'),
  Row(StudentID='S1048', Name='Deborah Rodriguez', Email='yreed@example.com'),
  Row(StudentID='S1071', Name='Marissa Webster', Email='jamessmith@example.org'),
  Row(StudentID='S1059', Name='Justin Lowery', Email='bryantlaurie@example.net'),
  Row(StudentID='S1015', Name='Valerie Gill', Email='harrellkenneth@example.net'),
  Row(StudentID='S1072', Name='Michael Elliott', Email='stevenscott@example.com'),
  Row(StudentID='S1007', Name='Daniel Adams', Email='lynchgeorge@example.net'),
  Row(StudentID='S1000', Name='Allison Hill', Email='donaldgarcia@example.net'),
  Row(StudentID='S1018', Name='Rose Spence', Email='davidalvarez@example.net'),
  Row(StudentID='S1082', Name='Justin Allen', Email='fbrewer@example.com'),
  Row(StudentID='S1087',

In [15]:
#Save dataframe to gold table
not_submitted_df.write.format("delta").mode("overwrite").saveAsTable("GoldLakehouse.gold_NotSubmitted")

StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 17, Finished, Available, Finished)

In [13]:
# Query 2: Submission rate per course (as a %)

# Total expected submissions per course
total_expected = silver_df.groupBy("CourseID", "CourseName").agg(count("*").alias("total_expected"))

# Actual submissions per course
actual_submissions = silver_df.filter(col("SubmissionID").isNotNull()) \
    .groupBy("CourseID").agg(count("*").alias("actual_submitted"))

# Join and calculate submission rate
submission_rate_df = total_expected.join(actual_submissions, "CourseID", "left") \
    .withColumn("actual_submitted", F.coalesce(col("actual_submitted"), lit(0))) \
    .withColumn("SubmissionRatePercent", round((col("actual_submitted") / col("total_expected")) * 100, 2)) \
    .select("CourseName", "SubmissionRatePercent")

# Format JSON Output
submission_rate_result = {
    "status": "success",
    "message": "Submission rate per course",
    "data": submission_rate_df.collect()
}

display(submission_rate_result)


StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 15, Finished, Available, Finished)

{'status': 'success',
 'message': 'Submission rate per course',
 'data': [Row(CourseName='Business-focused demand-driven flexibility', SubmissionRatePercent=8.62),
  Row(CourseName='Triple-buffered cohesive function', SubmissionRatePercent=11.36),
  Row(CourseName='Managed incremental solution', SubmissionRatePercent=8.54),
  Row(CourseName='Organic even-keeled contingency', SubmissionRatePercent=10.7),
  Row(CourseName='Customizable maximized capability', SubmissionRatePercent=12.57),
  Row(CourseName='Ergonomic multi-state hierarchy', SubmissionRatePercent=7.02),
  Row(CourseName='Automated logistical array', SubmissionRatePercent=12.9),
  Row(CourseName='Phased grid-enabled installation', SubmissionRatePercent=8.87)]}

In [16]:
#Save dataframe to gold table
submission_rate_df.write.format("delta").mode("overwrite").saveAsTable("GoldLakehouse.gold_SubmissionRate")

StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 18, Finished, Available, Finished)

In [14]:
# Query 3: Overdue assignment count per student

overdue_df = silver_df \
    .filter((col("SubmissionID").isNull()) & (col("DueDate") < current_date())) \
    .groupBy("StudentID", "Name") \
    .agg(count("*").alias("OverdueCount"))

# Format JSON Output
overdue_result = {
    "status": "success",
    "message": "Overdue assignment count per student",
    "data": overdue_df.collect()
}

display(overdue_result)

StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 16, Finished, Available, Finished)

{'status': 'success',
 'message': 'Overdue assignment count per student',
 'data': [Row(StudentID='S1095', Name='Rachel Weber', OverdueCount=18),
  Row(StudentID='S1038', Name='Grant Watts', OverdueCount=9),
  Row(StudentID='S1076', Name='Joseph Stanley', OverdueCount=12),
  Row(StudentID='S1078', Name='Marie Gilbert', OverdueCount=26),
  Row(StudentID='S1061', Name='Scott Cole', OverdueCount=5),
  Row(StudentID='S1005', Name='Juan Calderon', OverdueCount=16),
  Row(StudentID='S1036', Name='Joshua Turner', OverdueCount=16),
  Row(StudentID='S1020', Name='Jennifer Powers', OverdueCount=11),
  Row(StudentID='S1045', Name='Ashley Brennan', OverdueCount=12),
  Row(StudentID='S1071', Name='Marissa Webster', OverdueCount=17),
  Row(StudentID='S1029', Name='Rachel Mitchell', OverdueCount=11),
  Row(StudentID='S1023', Name='Carlos Walls', OverdueCount=11),
  Row(StudentID='S1027', Name='Krista Williams', OverdueCount=17),
  Row(StudentID='S1086', Name='Paige Carlson', OverdueCount=14),
  Row(S

In [17]:
#Save dataframe to gold table
overdue_df.write.format("delta").mode("overwrite").saveAsTable("GoldLakehouse.gold_Overdue")

StatementMeta(, 182160f5-f81d-487b-aeae-4a71abfb9635, 19, Finished, Available, Finished)