%md
##### Date Variable to check new data every day:

In [0]:
from datetime import datetime
# Resolve the run date once; the dated Bronze and backup folders below are derived from it.
todayDate = f"{datetime.today():%Y-%m-%d}"
print(f"Today's Date: {todayDate}")

# Break "YYYY-MM-DD" into the string parts used as folder names.
year, month, day = todayDate.split("-")

# Layer roots that do not depend on the run date.
silver_base_path = "/mnt/project2/silver/"
gold_base_path = "/mnt/project2/gold/"

# Date-partitioned locations: <root>/<year>/<month>/<day>/
bronze_base_path = f"/mnt/project2/bronze/{year}/{month}/{day}/"
backup_storage_path = f"/mnt/project2/client_backup_files/{year}/{month}/{day}/"

##### Mount Point Code:

In [0]:
# configs = {"fs.azure.account.auth.type": "OAuth",
#           "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
#           "fs.azure.account.oauth2.client.id": dbutils.secrets.get(scope="storage_accounts_scope",key="appid"),
#           "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="storage_accounts_scope",key="appsecret"),
#           "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/fcee7905-be7c-4a7c-b3f6-7c94700f97cb/oauth2/token"}

# # Optionally, you can add <directory-name> to the source URI of your mount point.
# dbutils.fs.mount(
#   source = "abfss://project2@adlsharpalvaghela.dfs.core.windows.net/",
#   mount_point = "/mnt/project2",
#   extra_configs = configs)

##### Import all necessary libraries:

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, IntegerType
from pyspark.sql.functions import col, when, lit, concat_ws, concat, col, current_timestamp, crc32
from delta.tables import DeltaTable

##### Read Silver Layer Files:

In [0]:
def read_silver_layer_files(file_name):
    """Read one Silver-layer Parquet file.

    Args:
        file_name: Base name of the file without extension; ".parquet" is
            appended and the result is resolved against the notebook-level
            `silver_base_path`.

    Returns:
        Whatever `spark.read.parquet` returns for that path.
    """
    parquet_path = f"{silver_base_path}{file_name}.parquet"
    return spark.read.parquet(parquet_path)

##### Write cleaned file to Silver Layer:

In [0]:
def write_single_parquet_file(df, file_name, silver_base_path):
    """Write `df` to the Silver layer as one `<name>.parquet` file.

    The DataFrame is coalesced to a single partition and written to a
    temporary `<name>_tmp/` folder; the resulting part file is then moved to
    `<silver_base_path><name>.parquet`. The temporary folder is removed
    afterwards, including when listing or moving fails, so a failed run does
    not leave a stray `_tmp` folder in the Silver layer.

    Args:
        df: DataFrame to persist.
        file_name: Source file name; every ".csv" occurrence is stripped to
            form the output name.
        silver_base_path: Destination folder. It is concatenated directly
            with the name, so it must end with "/".

    Raises:
        IndexError: If the temporary folder contains no ".parquet" file.
    """
    name = file_name.replace(".csv", "")
    temp_path = f"{silver_base_path}{name}_tmp/"
    final_file_path = f"{silver_base_path}{name}.parquet"

    try:
        # Write to temporary folder as one part file
        df.coalesce(1).write.mode("overwrite").parquet(temp_path)

        # Find the actual part-xxxx.parquet file (the folder also holds
        # non-parquet marker files, which are skipped here)
        part_files = [f.path for f in dbutils.fs.ls(temp_path) if f.name.endswith(".parquet")]
        if not part_files:
            raise IndexError(f"No .parquet part file found in {temp_path}")

        # Move and rename to final single file
        dbutils.fs.mv(part_files[0], final_file_path, True)
    finally:
        # Delete temp folder, whether or not the steps above succeeded
        dbutils.fs.rm(temp_path, recurse=True)

    print(f"{file_name} saved to silver layer: {final_file_path}")

##### Backup files Logic:

In [0]:
from py4j.protocol import Py4JJavaError

def path_exists(path):
    """Report whether `path` can be listed with `dbutils.fs.ls`.

    Returns True when the listing succeeds. If the listing raises
    Py4JJavaError, returns False only when the error text mentions
    "FileNotFoundException"; any other Py4JJavaError still yields True.
    """
    try:
        dbutils.fs.ls(path)
    except Py4JJavaError as e:
        # Only a missing path counts as "does not exist".
        if "FileNotFoundException" in str(e):
            return False
        return True
    else:
        return True

def backup_bronze_csv_to_backup_folder():
    """Copy today's Bronze CSV files into the dated backup folder.

    Every file directly under `bronze_base_path` whose name ends with ".csv"
    is copied to `backup_storage_path` under the same file name. A
    file-system copy is used rather than re-reading and re-writing the data
    with Spark, so the backup keeps the source file's content exactly as
    delivered (no re-quoting, no re-parsing of records) and needs no
    temporary folder.

    Prints one line per copied file, or a notice when the Bronze path does
    not exist. Returns None.
    """
    if not path_exists(bronze_base_path):
        print(f"No data found in Bronze path: {bronze_base_path}")
        return

    csv_names = [
        file.name
        for file in dbutils.fs.ls(bronze_base_path)
        if file.name.endswith(".csv")
    ]
    if not csv_names:
        # Nothing to back up; do not create an empty backup folder.
        return

    # Make sure the dated backup folder exists before copying into it.
    dbutils.fs.mkdirs(backup_storage_path)

    for file_name in csv_names:
        source_path = bronze_base_path + file_name
        final_target_path = f"{backup_storage_path}{file_name}"

        dbutils.fs.cp(source_path, final_target_path)

        print(f"Backed up {file_name} to {final_target_path}")

##### Assign schema for each file in Bronze:

In [0]:
# Define Schemas for each file

def get_schema(file_name):
    """Return the explicit schema for a known Bronze CSV file.

    Args:
        file_name: One of 'accounts.csv', 'customers.csv', 'loans.csv',
            'loan_payments.csv' or 'transactions.csv'.

    Returns:
        A StructType whose fields are all declared nullable.

    Raises:
        ValueError: If `file_name` is not one of the known files.
    """
    # (column name, type class) pairs per file, in column order.
    columns_by_file = {
        'accounts.csv': (
            ("account_id", IntegerType),
            ("customer_id", IntegerType),
            ("account_type", StringType),
            ("balance", DoubleType),
        ),
        'customers.csv': (
            ("customer_id", IntegerType),
            ("first_name", StringType),
            ("last_name", StringType),
            ("address", StringType),
            ("city", StringType),
            ("state", StringType),
            ("zip", StringType),
        ),
        'loans.csv': (
            ("loan_id", IntegerType),
            ("customer_id", IntegerType),
            ("loan_amount", DoubleType),
            ("interest_rate", DoubleType),
            ("loan_term", IntegerType),
        ),
        'loan_payments.csv': (
            ("payment_id", IntegerType),
            ("loan_id", IntegerType),
            ("payment_date", DateType),
            ("payment_amount", DoubleType),
        ),
        'transactions.csv': (
            ("transaction_id", IntegerType),
            ("account_id", IntegerType),
            ("transaction_date", DateType),
            ("transaction_amount", DoubleType),
            ("transaction_type", StringType),
        ),
    }

    column_specs = columns_by_file.get(file_name)
    if column_specs is None:
        raise ValueError(f"Unknown file name: {file_name}")

    # Instantiate a fresh type object per field, only for the requested file.
    return StructType([
        StructField(column_name, type_class(), True)
        for column_name, type_class in column_specs
    ])

##### Read CSV files in Bronze Folder:

In [0]:
def list_csv_files_in_bronze(bronze_path):
    """List the names of the ".csv" entries directly under `bronze_path`.

    Returns:
        A list of entry names (not full paths), in the order `dbutils.fs.ls`
        reports them.
    """
    csv_names = []
    for entry in dbutils.fs.ls(bronze_path):
        if entry.name.endswith(".csv"):
            csv_names.append(entry.name)
    return csv_names

##### Data Transformation Functions:

In [0]:
from pyspark.sql.functions import col, when, lit

#-------------------------------Filter Nulls--------------------------
def filter_nulls(df, id_cols):
    """Keep only the rows in which every column named in `id_cols` is non-null.

    One `filter` is applied per column, in the order given; with no columns
    the input DataFrame is returned as-is.
    """
    filtered = df
    for key_column in id_cols:
        is_present = col(key_column).isNotNull()
        filtered = filtered.filter(is_present)
    return filtered

#-------------------------------Replace Nulls--------------------------
def replace_nulls_with_defaults(df, transform_cols_defaults):
    """Substitute a default value for nulls in selected columns.

    Args:
        df: Input DataFrame.
        transform_cols_defaults: Mapping of column name to the literal that
            replaces null values in that column.

    Returns:
        The DataFrame after each listed column has been rewritten in turn;
        non-null values are kept unchanged.
    """
    result = df
    for column_name, fallback in transform_cols_defaults.items():
        is_missing = col(column_name).isNull()
        filled = when(is_missing, lit(fallback)).otherwise(col(column_name))
        result = result.withColumn(column_name, filled)
    return result

#-------------------------------Remove Duplicates--------------------------
def remove_duplicates(df, id_cols):
    """Drop rows that repeat the same combination of values in `id_cols`.

    Delegates to `df.dropDuplicates(id_cols)` and returns its result.
    """
    deduplicated = df.dropDuplicates(id_cols)
    return deduplicated


###### Other

In [0]:
# %sql
# drop table hive_metastore.bankdb.loan_payments;
# drop table hive_metastore.bankdb.loans;
# drop table hive_metastore.bankdb.customers;
# drop table hive_metastore.bankdb.accounts;
# drop table hive_metastore.bankdb.transactions;

In [0]:
#spark.sql('DROP DATABASE hive_metastore.bankdb')