Loading the CSV file into a DataFrame so it can be inspected


In [None]:
# Read the raw CSV from the Bronze folder with the header option enabled,
# then render the resulting DataFrame in the notebook.
bronze_csv_path = "Files/Bronze/covid_data.csv"
df = spark.read.option("header", "true").csv(bronze_csv_path)
display(df)

Printing the schema

In [None]:
# Print the column names and data types of the bronze DataFrame.
df.printSchema()

Saving the bronze dataset as a Parquet file

In [None]:
# Persist the bronze DataFrame as parquet, replacing any previous output at this path.
df.write.parquet("Files/Bronze/BronzeDataset.parquet", mode="overwrite")

Saving the bronze dataset as a table

In [None]:
# Register the bronze DataFrame as the table "BronzeTable", replacing any existing contents.
df.write.saveAsTable("BronzeTable", mode="overwrite")

Creating our silver-layer DataFrame

In [None]:
# Start the silver layer from the parquet output written by the bronze step.
silver_df = spark.read.format("parquet").load("Files/Bronze/BronzeDataset.parquet")
display(silver_df)

Casting columns in our silver-layer DataFrame

In [None]:
from pyspark.sql.types import IntegerType
# Import only the functions this cell uses. A wildcard import of
# pyspark.sql.functions rebinds Python built-ins such as sum, min, max and
# round in the notebook namespace.
from pyspark.sql.functions import col, to_date

# Cast the three count columns to integers and parse "Last_Update" into a date.
# NOTE(review): the "M/d/yyyy" pattern assumes that is how dates are written in
# the source CSV -- confirm against the file.
silver_df = (
    silver_df
    .withColumn("Confirmed", col("Confirmed").cast(IntegerType()))
    .withColumn("Deaths", col("Deaths").cast(IntegerType()))
    .withColumn("Recovered", col("Recovered").cast(IntegerType()))
    .withColumn("Last_Update", to_date(col("Last_Update"), "M/d/yyyy"))
)

# Show the schema so the new column types can be checked.
silver_df.printSchema()

Storing the silver DataFrame as a Parquet file

In [None]:
# Persist the silver DataFrame as parquet, replacing any previous output at this path.
silver_df.write.parquet("Files/Silver/SilverDataset.parquet", mode="overwrite")

Storing the silver table

In [None]:
# Register the silver DataFrame as the table "SilverTable", replacing any existing contents.
silver_df.write.saveAsTable("SilverTable", mode="overwrite")

Loading the silver-layer data into the gold-layer DataFrame

In [None]:
# Start the gold layer from the parquet output written by the silver step.
gold_df = spark.read.format("parquet").load("Files/Silver/SilverDataset.parquet")
display(gold_df)

Calculating aggregate totals per year

In [None]:
from pyspark.sql.functions import year, sum as F_sum

# Columns taken from the silver data for the yearly roll-up.
yearly_input_cols = ["Last_Update", "Country_Region", "Confirmed", "Deaths", "Recovered"]

# Add a "Year" column derived from "Last_Update".
rows_with_year = gold_df.select(*yearly_input_cols).withColumn("Year", year("Last_Update"))

# Sums computed for each year group.
yearly_totals = [
    F_sum("Confirmed").alias("Total_Confirmed"),
    F_sum("Deaths").alias("Total_Deaths"),
    F_sum("Recovered").alias("Total_Recovered"),
]

# One row per year, ordered by year.
new_gold_df = rows_with_year.groupBy("Year").agg(*yearly_totals).orderBy("Year")

# Render the aggregated result in the notebook.
display(new_gold_df)

Saving the gold data as a Parquet file

In [None]:
# Persist the yearly aggregates as parquet, replacing any previous output at this path.
new_gold_df.write.parquet("Files/Gold/GoldDataset.parquet", mode="overwrite")

Saving the gold data as a table

In [None]:
# Register the yearly aggregates as the table "GoldTable", replacing any existing contents.
new_gold_df.write.saveAsTable("GoldTable", mode="overwrite")