In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F


# Obtain the Spark session for this pipeline (reuses an existing session if one is active)
spark = (
    SparkSession.builder
    .appName("WaterQualityPipeline")
    .getOrCreate()
)

In [None]:
# Read the cleaned Silver-layer water quality dataset (Delta format)
silver_df = (
    spark.read
    .format("delta")
    .load("/mnt/datalake/silver/water_quality_cleaned")
)


In [None]:
# Preview the Silver data.
# NOTE(review): the positional 2 is presumably meant as a row limit — confirm that
# display() honours it; if not, silver_df.limit(2).display() expresses that intent.
silver_df.display(2)

Finding outliers (Mean_Value, Minimum_Value, Maximum_Value)


Outliers in the Minimum_Value column:

In [None]:
# Start the Gold DataFrame from the Silver data with exact duplicate rows removed
gold_df = silver_df.dropDuplicates()

In [None]:
# Row count after de-duplication
gold_df.count()

In [None]:
# Preview the de-duplicated data
gold_df.display()

In [None]:
# Flag outliers in the 'Minimum_Value' column with a z-score test.

# Compute the mean and the sample standard deviation in a single Spark job
# (one aggregation) instead of launching a separate job per statistic.
stats_row = gold_df.agg(
    F.mean("Minimum_Value").alias("mean"),
    F.stddev("Minimum_Value").alias("stddev"),
).first()
mean_val = stats_row["mean"]
stddev_val = stats_row["stddev"]

# The z-score is undefined when the standard deviation is missing or zero
# (e.g. empty input or a constant column). Emit NULL in that case rather than
# dividing by zero, which is an error when Spark's ANSI SQL mode is enabled.
if stddev_val:
    z_score_col = (F.col("Minimum_Value") - mean_val) / stddev_val
else:
    z_score_col = F.lit(None).cast("double")

# Calculate Z-Score and identify outliers: |z| > 3 -> 1, everything else
# (including NULL z-scores) -> 0
gold_df_with_zscore = gold_df.withColumn("z_score", z_score_col)
gold_df_with_outliers = gold_df_with_zscore.withColumn(
    "MinimumValue_outlier",
    F.when(F.abs(F.col("z_score")) > 3, 1).otherwise(0),
)


# Show results
gold_df_with_outliers.show()


In [None]:
# Rich display of the data including the z_score and MinimumValue_outlier columns
gold_df_with_outliers.display()

In [None]:
# Inspect only the rows flagged as outliers in the Minimum_Value column
outlier_rows = gold_df_with_outliers.filter(
    F.col("MinimumValue_outlier") == 1
)
print("Rows with Minimumvalue_outlier = 1:")
outlier_rows.display()



In [None]:
# Carry the flagged DataFrame forward as the input of the next outlier check
gold_df = gold_df_with_outliers

Outliers in the Maximum_Value column:

In [None]:
# Flag outliers in the 'Maximum_Value' column with a z-score test.

# Compute the mean and the sample standard deviation in a single Spark job
# (one aggregation) instead of launching a separate job per statistic.
stats_row = gold_df.agg(
    F.mean("Maximum_Value").alias("mean"),
    F.stddev("Maximum_Value").alias("stddev"),
).first()
mean_val = stats_row["mean"]
stddev_val = stats_row["stddev"]

# The z-score is undefined when the standard deviation is missing or zero
# (e.g. empty input or a constant column). Emit NULL in that case rather than
# dividing by zero, which is an error when Spark's ANSI SQL mode is enabled.
if stddev_val:
    z_score_col = (F.col("Maximum_Value") - mean_val) / stddev_val
else:
    z_score_col = F.lit(None).cast("double")

# Calculate Z-Score and identify outliers: |z| > 3 -> 1, everything else
# (including NULL z-scores) -> 0. An existing "z_score" column is replaced.
gold_df_with_zscore = gold_df.withColumn("z_score", z_score_col)
gold_df_with_outliers = gold_df_with_zscore.withColumn(
    "MaxValue_outlier",
    F.when(F.abs(F.col("z_score")) > 3, 1).otherwise(0),
)


# Show results
gold_df_with_outliers.show()


In [None]:
# Rich display of the data including the MaxValue_outlier column
gold_df_with_outliers.display()

In [None]:
# Inspect the rows flagged as outliers in the Maximum_Value column.
# The rows are only displayed here; nothing is removed from the DataFrame.
outlier_rows = gold_df_with_outliers.filter(
    F.col("MaxValue_outlier") == 1
)
print("Rows with MaxValue_outlier = 1:")
outlier_rows.display()

In [None]:
# Carry the flagged DataFrame forward as the input of the next outlier check
gold_df = gold_df_with_outliers

# Preview the data carried forward
gold_df.display()

Outliers in the Mean_Value column:

In [None]:
# Flag outliers in the 'Mean_Value' column with a z-score test.

# Compute the mean and the sample standard deviation in a single Spark job
# (one aggregation) instead of launching a separate job per statistic.
stats_row = gold_df.agg(
    F.mean("Mean_Value").alias("mean"),
    F.stddev("Mean_Value").alias("stddev"),
).first()
mean_val = stats_row["mean"]
stddev_val = stats_row["stddev"]

# The z-score is undefined when the standard deviation is missing or zero
# (e.g. empty input or a constant column). Emit NULL in that case rather than
# dividing by zero, which is an error when Spark's ANSI SQL mode is enabled.
if stddev_val:
    z_score_col = (F.col("Mean_Value") - mean_val) / stddev_val
else:
    z_score_col = F.lit(None).cast("double")

# Calculate Z-Score and identify outliers: |z| > 3 -> 1, everything else
# (including NULL z-scores) -> 0. An existing "z_score" column is replaced.
gold_df_with_zscore = gold_df.withColumn("z_score", z_score_col)
gold_df_with_outliers = gold_df_with_zscore.withColumn(
    "MeanValue_outlier",
    F.when(F.abs(F.col("z_score")) > 3, 1).otherwise(0),
)


# Show results
gold_df_with_outliers.show()


In [None]:
# Rich display of the data including the MeanValue_outlier column
gold_df_with_outliers.display()

In [None]:
# Check the rows with outliers in the Mean_Value column (inspection code below is currently disabled)

# Rows with outliers in the Mean_Value column
#outlier_rows = gold_df_with_outliers.filter(gold_df_with_outliers.MeanValue_outlier == 1)
#print("Rows with MeanValue_outlier = 1:")
#outlier_rows.display()

In [None]:
# Keep only the rows that are NOT flagged as Mean_Value outliers.
# NOTE(review): only MeanValue_outlier is used for filtering here; rows flagged by
# MinimumValue_outlier / MaxValue_outlier are kept — confirm this is intended.
is_not_mean_outlier = F.col("MeanValue_outlier") == 0
gold_df_no_outliers = gold_df_with_outliers.filter(is_not_mean_outlier)

# Show the results after removing outliers
print("DataFrame after removing outliers:")
gold_df_no_outliers.show()

In [None]:
# Continue the pipeline with the outlier-filtered DataFrame
gold_df = gold_df_no_outliers

In [None]:
# Preview the filtered data
gold_df.display()

In [None]:
# Row count after outlier filtering
gold_df.count()

In [None]:
# Remove the intermediate z_score helper column; the *_outlier flag columns are kept
gold_df = gold_df.drop("z_score")

In [None]:
# Preview the data after dropping z_score
gold_df.display()

In [None]:
# Split Sampling_Period into Start_Date (its first 10 characters) and End_Date
# (its last 10 characters; a negative start position counts from the end).
# NOTE(review): presumably Sampling_Period holds two 10-character dates — confirm
# the format. Both new columns are substrings (strings), not date-typed values.
sampling_period = col("Sampling_Period")
gold_df = (
    gold_df
    .withColumn("Start_Date", sampling_period.substr(1, 10))
    .withColumn("End_Date", sampling_period.substr(-10, 10))
)


In [None]:
# Persist the final DataFrame to the Gold layer as Delta, replacing any existing data
(
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/datalake/gold/water_quality_aggregated")
)

In [None]:
# Make sure the target database exists before a table is saved into it
spark.sql("CREATE DATABASE IF NOT EXISTS waterdb")


In [None]:
# Register the final DataFrame as table waterdb.gold_table, replacing any existing table.
# NOTE(review): the table is stored as CSV while the Gold path write uses Delta —
# confirm the CSV format is intentional.
(
    gold_df.write
    .format("csv")
    .mode("overwrite")
    .saveAsTable("waterdb.gold_table")
)