## Bronze to Silver Layer
-------------------

##### Load mount points and helper functions via the %run magic command

In [0]:
%run "./1_Mount Point and Functions"

##### Back up the Bronze CSV files

In [0]:
# Run the backup helper before any Bronze file is read or cleaned further down.
# NOTE(review): the helper is not defined in this notebook; presumably it comes
# from the notebook loaded via %run above and copies the Bronze CSV files to a
# backup folder — confirm against its definition.
backup_bronze_csv_to_backup_folder()

##### Apply schemas, clean the data, and store the cleaned files in the Silver layer

In [0]:
# Discover the CSV files currently present in the Bronze layer.
file_name_list = list_csv_files_in_bronze(bronze_base_path)
print(f"File Name List : {file_name_list}")

# Per-file cleaning rules: file name -> (id columns, default values for nulls).
# The id columns are passed to filter_nulls() and remove_duplicates(); the
# defaults dict is passed to replace_nulls_with_defaults().
cleaning_rules = {
    "customers.csv": (
        ["customer_id"],
        {"city": "Unknown", "state": "Unknown", "zip": "Unknown"},
    ),
    "accounts.csv": (
        ["account_id", "customer_id"],
        {"balance": 0.0},
    ),
    "loans.csv": (
        ["loan_id", "customer_id"],
        {"loan_amount": 0.0},
    ),
    "transactions.csv": (
        ["transaction_id", "account_id"],
        {"transaction_amount": 0.0, "transaction_type": "Unknown"},
    ),
    "loan_payments.csv": (
        ["payment_id", "loan_id"],
        # NOTE(review): integer 0 here while the other amount defaults are 0.0;
        # left unchanged — confirm against the payment_amount column type.
        {"payment_amount": 0},
    ),
}

# Kept for backward compatibility: this list is initialised here but never
# populated in this cell.
cleaned_dfs = []

# Loop through each file
for file_name in file_name_list:

    # Reject unknown files *before* the schema lookup and the CSV read, so an
    # unexpected file in Bronze is skipped instead of reaching get_schema().
    if file_name not in cleaning_rules:
        print(f"Skipping unknown file: {file_name}")
        continue

    id_cols, defaults = cleaning_rules[file_name]

    # Get schema for the file
    schema = get_schema(file_name)

    # Read CSV with header and the explicit schema (no schema inference)
    df = (
        spark.read.format("csv")
        .option("header", True)
        .schema(schema)
        .load(f"{bronze_base_path}{file_name}")
    )

    # Apply transformations/data cleaning.
    # NOTE(review): the three helpers are defined outside this notebook; by
    # their names they drop rows with null ids, fill remaining nulls with the
    # defaults, and de-duplicate on the id columns — confirm.
    df_filtered = filter_nulls(df, id_cols)
    df_transformed = replace_nulls_with_defaults(df_filtered, defaults)
    df_final = remove_duplicates(df_transformed, id_cols)

    # Write the cleaned file to the Silver layer
    write_single_parquet_file(df_final, file_name, silver_base_path)


## Delta Table for Data Visualization
----------------------------------------

##### Inner Join Condition

In [0]:
# Load every cleaned Silver dataset into its own DataFrame
# (listed in the order in which they are joined below).
df_accounts = read_silver_layer_files("accounts")
df_customers = read_silver_layer_files("customers")
df_transactions = read_silver_layer_files("transactions")
df_loans = read_silver_layer_files("loans")
df_loan_payments = read_silver_layer_files("loan_payments")

# Build the combined frame one inner join at a time, starting from accounts.
# Joining on a column *name* requires that column on both sides of the join.
# NOTE(review): transactions and loan payments are both attached through the
# same account/customer chain, so their rows may multiply each other —
# confirm this fan-out is intended for the report.
df_combined = df_accounts.join(df_customers, on="customer_id", how="inner")
df_combined = df_combined.join(df_transactions, on="account_id", how="inner")
df_combined = df_combined.join(df_loans, on="customer_id", how="inner")
df_combined = df_combined.join(df_loan_payments, on="loan_id", how="inner")

# Preview (disabled):
#display(df_combined)

##### Select Required Columns

In [0]:
# Keep only the columns needed downstream and rename each one to its
# report-facing alias. Pairs are (source column, output alias); their order
# defines the column order of the result.
df_selected_columns = df_combined.select(
    *[
        col(source_name).alias(report_name)
        for source_name, report_name in (
            ("account_id", "Account_ID"),
            ("customer_id", "Customer_ID"),
            ("loan_id", "Loan_ID"),
            ("payment_id", "Payment_ID"),
            ("transaction_id", "Transaction_ID"),
            ("balance", "Account_Balance"),
            ("payment_amount", "Payment_Amount"),
            ("loan_amount", "Loan_Amount"),
            ("transaction_amount", "Transaction_Amount"),
            ("payment_date", "Payment_Date"),
            ("transaction_date", "Transaction_Date"),
        )
    ]
)
# Preview (disabled):
#display(df_selected_columns)

##### Store the output in Delta format in the Silver layer

In [0]:
# Persist the selected columns as a Delta dataset under the Silver base path,
# replacing whatever was written there by a previous run.
(
    df_selected_columns.write
    .format("delta")
    .mode("overwrite")
    .save(f"{silver_base_path}PowerBiDataSource")
)

In [0]:
%sql
-- Make sure the target schema exists so this cell also succeeds on a fresh
-- workspace; this is a no-op when the schema is already present.
CREATE SCHEMA IF NOT EXISTS hive_metastore.bankdb;

-- Register the Delta files as a table over an explicit storage location.
-- NOTE(review): the LOCATION is hard-coded, while the Python write cell builds
-- its path from silver_base_path — confirm both resolve to the same folder.
CREATE TABLE IF NOT EXISTS hive_metastore.bankdb.PowerBiDataSource
USING DELTA
LOCATION '/mnt/project2/silver/PowerBiDataSource';

In [0]:
%sql
--SELECT * FROM hive_metastore.bankdb.PowerBiDataSource;

## SCD Type 1 Logic

##### Run the Silver-to-Gold SCD Type 1 notebook via the %run magic command (currently commented out)

In [0]:
# %run "./3_Silver to Gold Layer SCD Type 1"