In [116]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.sql import functions as F

In [4]:
spark = SparkSession \
    .builder \
    .appName("Exercise Data Pipeline with PySpark Week 6") \
    .getOrCreate()

In [5]:
spark

In [109]:
df_transaction = spark.read.csv("data/new_bank_transaction.csv", header=True, mode="DROPMALFORMED")

df_transaction.show(3, truncate = False, vertical = True)

-RECORD 0------------------------------
 TransactionID           | T642232     
 CustomerID              | C1010028    
 CustomerDOB             | 25/8/88     
 CustGender              | F           
 CustLocation            | DELHI       
 CustAccountBalance      | 296828.37   
 TransactionDate         | 29/8/16     
 TransactionTime         | 95212       
 TransactionAmount (INR) | 557         
-RECORD 1------------------------------
 TransactionID           | T87414      
 CustomerID              | C1010035    
 CustomerDOB             | 2/3/92      
 CustGender              | M           
 CustLocation            | MUMBAI      
 CustAccountBalance      | 7284.42     
 TransactionDate         | 1/8/16      
 TransactionTime         | 111917      
 TransactionAmount (INR) | 50          
-RECORD 2------------------------------
 TransactionID           | T560676     
 CustomerID              | C1010035_2  
 CustomerDOB             | 9/6/80      
 CustGender              | M           


In [None]:
directory = "data/new_bank_transaction.csv/"

df_ratings = spark.read.csv(directory + "part-*.csv", header=True)

df_ratings.show()

In [24]:
# init vars
DB_URL = "jdbc:postgresql://source_db:5432/source"
DB_TABLE = "marketing_campaign_deposit" 
DB_USER = "postgres"
DB_PASS = "postgres"

# set config
jdbc_url = DB_URL
table_name = DB_TABLE
connection_properties = {
    "user": DB_USER,
    "password": DB_PASS,
    "driver": "org.postgresql.Driver" # set driver postgres
}

In [35]:
df_marketing = spark \
              .read \
              .jdbc(url = jdbc_url,
                    table = table_name,
                    properties = connection_properties)

In [36]:
df_marketing.show(3, truncate = False, vertical = True)

-RECORD 0----------------------------------------
 loan_data_id       | 1                          
 age                | 58                         
 job                | management                 
 marital_id         | 1                          
 education_id       | 1                          
 default            | false                      
 balance            | $2143                      
 housing            | true                       
 loan               | false                      
 contact            | unknown                    
 day                | 5                          
 month              | may                        
 duration           | 261                        
 campaign           | 1                          
 pdays              | -1                         
 previous           | 0                          
 poutcome           | unknown                    
 subscribed_deposit | false                      
 created_at         | 2025-02-28 15:59:11.102813 


In [37]:
df_education = spark.read.jdbc(url=jdbc_url, table="education_status", properties=connection_properties)

In [38]:
df_education.printSchema()

root
 |-- education_id: integer (nullable = true)
 |-- value: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)



In [39]:
df_education.show(3, truncate = False, vertical = True)

-RECORD 0----------------------------------
 education_id | 1                          
 value        | tertiary                   
 created_at   | 2025-02-28 15:31:04.358235 
 updated_at   | 2025-02-28 15:31:04.358235 
-RECORD 1----------------------------------
 education_id | 2                          
 value        | secondary                  
 created_at   | 2025-02-28 15:31:04.358235 
 updated_at   | 2025-02-28 15:31:04.358235 
-RECORD 2----------------------------------
 education_id | 3                          
 value        | unknown                    
 created_at   | 2025-02-28 15:31:04.358235 
 updated_at   | 2025-02-28 15:31:04.358235 
only showing top 3 rows



In [40]:
df_marital = spark.read.jdbc(url=jdbc_url, table="marital_status", properties=connection_properties)

In [41]:
df_marital.printSchema()

root
 |-- marital_id: integer (nullable = true)
 |-- value: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)



In [44]:
df_marital.show(3, truncate = False, vertical = True)

-RECORD 0--------------------------------
 marital_id | 1                          
 value      | married                    
 created_at | 2025-02-28 15:31:01.502136 
 updated_at | 2025-02-28 15:31:01.502136 
-RECORD 1--------------------------------
 marital_id | 2                          
 value      | single                     
 created_at | 2025-02-28 15:31:01.502136 
 updated_at | 2025-02-28 15:31:01.502136 
-RECORD 2--------------------------------
 marital_id | 3                          
 value      | divorced                   
 created_at | 2025-02-28 15:31:01.502136 
 updated_at | 2025-02-28 15:31:01.502136 



In [117]:
# Empty list to store column expressions
null_counts = []

# Loop to iterate over each column in df_marketing
for c in df_marketing.columns:
    # Count the number of NULL values for column c
    null_count_expr = F.sum(F.col(c).isNull().cast("int")).alias(c)
    
    # Add the expression to the null_counts list
    null_counts.append(null_count_expr)

# Select the columns where the number of NULL values has been calculated
df_null_counts = df_marketing.select(null_counts)

# Display the result
df_null_counts.show(truncate=False, vertical=True)

-RECORD 0-------------------------
 loan_data_id               | 0   
 age                        | 0   
 job                        | 0   
 marital_id                 | 0   
 education_id               | 0   
 default                    | 0   
 balance                    | 0   
 housing                    | 0   
 loan                       | 0   
 contact                    | 0   
 day                        | 0   
 month                      | 0   
 duration                   | 0   
 campaign                   | 0   
 days_since_last_campaign   | 0   
 previous_campaign_contacts | 0   
 previous_campaign_outcome  | 0   
 subscribed_deposit         | 0   
 created_at                 | 0   
 updated_at                 | 0   
 duration_in_year           | 0   



In [79]:
df_marketing.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_marketing.columns]).show(truncate = False, vertical = True)

-RECORD 0-------------------------
 loan_data_id               | 0   
 age                        | 0   
 job                        | 0   
 marital_id                 | 0   
 education_id               | 0   
 default                    | 0   
 balance                    | 0   
 housing                    | 0   
 loan                       | 0   
 contact                    | 0   
 day                        | 0   
 month                      | 0   
 duration                   | 0   
 campaign                   | 0   
 days_since_last_campaign   | 0   
 previous_campaign_contacts | 0   
 previous_campaign_outcome  | 0   
 subscribed_deposit         | 0   
 created_at                 | 0   
 updated_at                 | 0   
 duration_in_year           | 0   



In [118]:
# Empty list to store column expressions
null_counts = []

# Loop to iterate over each column in df_education
for c in df_education.columns:
    # Count the number of NULL values for column c
    null_count_expr = F.sum(F.col(c).isNull().cast("int")).alias(c)
    
    # Add the expression to the null_counts list
    null_counts.append(null_count_expr)

# Select the columns where the number of NULL values has been calculated
df_null_counts = df_education.select(null_counts)

# Display the result
df_null_counts.show(truncate=False, vertical=True)

-RECORD 0-----------
 education_id | 0   
 value        | 0   
 created_at   | 0   
 updated_at   | 0   



In [80]:
df_education.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_education.columns]).show(truncate = False, vertical = True)

-RECORD 0-----------
 education_id | 0   
 value        | 0   
 created_at   | 0   
 updated_at   | 0   



In [119]:
# Empty list to store column expressions
null_counts = []

# Loop to iterate over each column in df_marital
for c in df_marital.columns:
    # Count the number of NULL values for column c
    null_count_expr = F.sum(F.col(c).isNull().cast("int")).alias(c)
    
    # Add the expression to the null_counts list
    null_counts.append(null_count_expr)

# Select the columns where the number of NULL values has been calculated
df_null_counts = df_marital.select(null_counts)

# Display the result
df_null_counts.show(truncate=False, vertical=True)

-RECORD 0---------
 marital_id | 0   
 value      | 0   
 created_at | 0   
 updated_at | 0   



In [81]:
df_marital.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_marital.columns]).show(truncate = False, vertical = True)

-RECORD 0---------
 marital_id | 0   
 value      | 0   
 created_at | 0   
 updated_at | 0   



In [120]:
# Empty list to store column expressions
null_counts = []

# Loop to iterate over each column in df_transaction
for c in df_transaction.columns:
    # Count the number of NULL values for column c
    null_count_expr = F.sum(F.col(c).isNull().cast("int")).alias(c)
    
    # Add the expression to the null_counts list
    null_counts.append(null_count_expr)

# Select the columns where the number of NULL values has been calculated
df_null_counts = df_transaction.select(null_counts)

# Display the result
df_null_counts.show(truncate=False, vertical=True)

-RECORD 0-----------------------
 TransactionID           | 0    
 CustomerID              | 0    
 CustomerDOB             | 0    
 CustGender              | 1100 
 CustLocation            | 151  
 CustAccountBalance      | 2369 
 TransactionDate         | 0    
 TransactionTime         | 0    
 TransactionAmount (INR) | 0    



In [180]:
df_transaction.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- CustomerDOB: string (nullable = true)
 |-- CustGender: string (nullable = true)
 |-- CustLocation: string (nullable = true)
 |-- CustAccountBalance: string (nullable = true)
 |-- TransactionDate: string (nullable = true)
 |-- TransactionTime: string (nullable = true)
 |-- TransactionAmount (INR): string (nullable = true)



### **2. Source to Target Mapping**
---


## Column Mapping

### Education Status
source : table education_status

target : table education_status

| Source Column   | Target Column   | Transformation                                   |
|----------------|----------------|---------------------------------------------------------|
| `education_id` | `education_id` | -                       |
| `value`        | `value`        | - |
| `created_at`   | `created_at`   | -           |
| `updated_at`   | `updated_at`   | -          |


### Marital Status
source : table marital_status

target : table marital_status
| Source Column   | Target Column   | Transformation |
|----------------|----------------|---------------|
| `marital_id`   | `marital_id`   | - |
| `value`        | `value`        | - |
| `created_at`   | `created_at`   | - |
| `updated_at`   | `updated_at`   | - |


### Marketing Campaign for Deposit
source : table marketing_campaign_deposit

target : table marketing_campaign_deposit
| Source Column              | Target Column                | Transformation                                      |
|----------------------------|-----------------------------|----------------------------------------------------|
| `loan_data_id`             | `loan_data_id`              | - |
| `age`                      | `age`                       | - |
| `job`                      | `job`                       | - |
| `marital_id`               | `marital_id`                | - |
| `education_id`             | `education_id`              | - |
| `"default"`                | `"default"`                 | - |
| `balance`                  | `balance`                   | Remove `$` sign and convert to `INT` |
| `housing`                  | `housing`                   | - |
| `loan`                     | `loan`                      | - |
| `contact`                  | `contact`                   | - |
| `"day"`                    | `"day"`                     | - |
| `"month"`                  | `"month"`                   | - |
| `duration`                 | `duration`                  | - |
| `duration`                 | `duration_in_year`          | duration divide by `365`, round down, and cast to `INT` |
| `campaign`                 | `campaign`                  | - |
| `pdays`                    | `days_since_last_campaign`  | Rename column |
| `previous`                 | `previous_campaign_contacts`| Rename column |
| `poutcome`                 | `previous_campaign_outcome` | Rename column |
| `subscribed_deposit`       | `subscribed_deposit`        | - |
| `created_at`               | `created_at`                | - |
| `updated_at`               | `updated_at`                | - |

### Customers
source : file new_bank_transaction.csv

target : table customers

| Source Column          | Target Column      | Transformation                                      |
|------------------------|-------------------|----------------------------------------------------|
| `CustomerID`          | `customer_id`      | Rename column |
| `CustomerDOB`         | `birth_date`       | Convert to `DATE` format (`d/M/yy`), adjust years if > 2025 |
| `CustGender`          | `gender`           | Rename column; Map `M` → `Male`, `F` → `Female`, others → `Other` |
| `CustLocation`        | `location`         | Rename column |
| `CustAccountBalance`  | `account_balance`  | Rename column, cast to decimal number |

### Transactions
source : file new_bank_transaction.csv

target : table transactions

| Source Column                 | Target Column      | Transformation                                                   |
|--------------------------------|-------------------|-----------------------------------------------------------------|
| `TransactionID`               | `transaction_id`  | Rename column |
| `CustomerID`                  | `customer_id`     | Rename column |
| `TransactionDate`             | `transaction_date` | Convert to `DATE` format (`d/M/yy`), adjust years if > 2025 |
| `TransactionTime`             | `transaction_time` | Convert to `HH:MM:SS` format |
| `TransactionAmount (INR)`     | `transaction_amount` | Rename column, cast to decimal number |


### **3. Code Testing**
---

## helper.py (logging & init spark)

In [88]:
import logging
import os
from pyspark.sql import SparkSession


def logging_process(log_file="script/log/info.log"):
    # Configure logging
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    logging.basicConfig(
        filename=log_file,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s"
    )
    logger = logging.getLogger()
    return logger


def init_spark_session():
    spark = SparkSession.builder.appName(
        "Exercise Data Pipeline Week_6"
    ).getOrCreate()

    return spark

----

## extract_data.py

In [89]:
import logging
import pyspark

logging_process()


def extract_data(
    data_name: str, format_data: str
) -> pyspark.sql.DataFrame:
    """
    Function to extract movie data in csv or database table

    Parameters
    ----------
    data_name (str): name of data or table of data sources
    format_data (str): format data of data sources, currently on csv or db

    Returns
    -------
    df (pyspark.sql.DataFrame): dataframe of data sources
    """
    # create spark session
    spark = init_spark_session()

    # set variable for database
    DB_URL = "jdbc:postgresql://source_db:5432/source"
    DB_USER = "postgres"
    DB_PASS = "postgres"

    # set config
    jdbc_url = DB_URL
    connection_properties = {
        "user": DB_USER,
        "password": DB_PASS,
        "driver": "org.postgresql.Driver" # set driver postgres
    }

    try:
        if format_data.lower() == "csv":
            logging.info(f"===== Start Extracting {data_name} data =====")

            df = spark.read.csv(f"data/{data_name}.csv", header=True)

            logging.info(f"===== Finish Extracting {data_name} data =====")

            return df

        elif format_data.lower() == "db":
            logging.info(f"===== Start Extracting {data_name} data =====")

            df = spark.read.jdbc(
                url=jdbc_url, table=data_name, properties=connection_properties
            )

            logging.info(f"===== Finish Extracting {data_name} data =====")

            return df

        else:
            raise Exception("Format data not supported yet")

    except Exception as e:
        logging.error("====== Failed to Extract Data ======")
        logging.error(e)

        raise Exception(e)

----

## transform.py

### a. convert_date.py

In [502]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import to_date, col, when, concat, substring, lit, lpad, regexp_extract

def convert_date_columns(df: DataFrame, table_name: str) -> DataFrame:
    """
    Function to convert date and time columns based on table name.
    
    Args:
        df: Input DataFrame
        table_name: Name of the table being processed
        
    Returns:
        DataFrame with converted date and time columns
    """
    if table_name == "transactions":
        # Convert TransactionDate from d/M/yy to YYYY/MM/DD
        df = df.withColumn("transaction_date", to_date(col("transaction_date"), "d/M/yy"))
        
        # Convert TransactionTime from HHMMSS to HH:MM:SS format using lpad
        df = df.withColumn("padded_time", lpad(col("transaction_time"), 6, "0"))
        
        # Convert to HH:MM:SS format
        df = df.withColumn(
            "transaction_time",
            concat(
                substring(col("padded_time"), 1, 2), lit(":"),
                substring(col("padded_time"), 3, 2), lit(":"),
                substring(col("padded_time"), 5, 2)
            )
        )
        
        # Drop the temporary column
        df = df.drop("padded_time")
        
    elif table_name == "customers":
        # Convert CustomerDOB with year > 25 check using regex for safer extraction
        df = df.withColumn(
            "birth_date",
            when(
                regexp_extract(col("birth_date"), "(\\d{1,2})/(\\d{1,2})/(\\d{2})", 3).cast("int") > 25,
                to_date(
                    concat(
                        regexp_extract(col("birth_date"), "(\\d{1,2})/(\\d{1,2})/", 1), 
                        lit("/"),
                        regexp_extract(col("birth_date"), "\\d{1,2}/(\\d{1,2})/", 1),
                        lit("/19"),
                        regexp_extract(col("birth_date"), "\\d{1,2}/\\d{1,2}/(\\d{2})", 1)
                    ),
                    "d/M/yyyy"
                )
            ).otherwise(to_date(col("birth_date"), "d/M/yy"))
        )
        
    return df


### b. casting_data.py

In [416]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, round, regexp_replace

def casting_data_types(df: DataFrame, table_name: str) -> DataFrame:
    """
    Function to cast data types based on table name.
    """
    casting_mappings = {
        "marketing_campaign_deposit": {
            "balance": ("int", "\\$"),
            "duration_in_year": ("int", None, "duration", 365)
        },
        "transactions": {
            "transaction_amount": "double"
        },
        "customers": {
            "account_balance": "double"
        }
    }
    
    if table_name in casting_mappings:
        for col_name, cast_info in casting_mappings[table_name].items():
            if isinstance(cast_info, tuple):
                if len(cast_info) == 2 and cast_info[1]:
                    df = df.withColumn(col_name, regexp_replace(col(col_name), cast_info[1], "").cast(cast_info[0]))
                elif len(cast_info) == 4:
                    df = df.withColumn(col_name, round(col(cast_info[2]) / cast_info[3]).cast(cast_info[0]))
            else:
                df = df.withColumn(col_name, col(col_name).cast(cast_info))
    
    return df


### c. select_column.py

In [499]:

import logging
import pyspark

logging_process()


def select_columns_process(
    df_result: pyspark.sql.DataFrame, table_name: str
) -> pyspark.sql.DataFrame:
    """
    Function that selects columns based on the table name from the list of columns.

    Parameters
    ----------
    df_result (pyspark.sql.DataFrame): Input DataFrame for the specific table.
    table_name (str): The name of the table used to select the appropriate columns.

    Returns
    -------
    pyspark.sql.DataFrame: DataFrame with selected columns.
    """
    try:
        logging.info(f"===== Start Selecting Data process for table {table_name} =====")

        # Define columns for each table
        table_columns = {
            "marital_status": ["marital_id", "value"],
            "education_status": ["education_id", "value"],
            "marketing_campaign_deposit": [
                "loan_data_id", "age", "job", "marital_id", "education_id", "default", "balance",
                "housing", "loan", "contact", "day", "month", "duration", "duration_in_year", 
                "campaign", "days_since_last_campaign", "previous_campaign_contacts", 
                "previous_campaign_outcome", "subscribed_deposit"
            ],
            "customers": [
                "customer_id", "birth_date", "gender", "location", "account_balance"
            ],
            "transactions": [
                "transaction_id", "customer_id", "transaction_date", "transaction_time", 
                "transaction_amount"
            ]
        }

        # Check if the table_name is in the dictionary and select the columns
        if table_name in table_columns:
            selected_cols = table_columns[table_name]
            df_result = df_result.select(*selected_cols)
        else:
            raise ValueError(f"Table name '{table_name}' not recognized!")

        logging.info(f"===== Finish Selecting Data process for table {table_name} =====")

        return df_result

    except Exception as e:
        logging.error(f"===== Failed Selecting Data process for table {table_name} =====")
        logging.error(e)
        raise Exception(e)


### d. clean_data.py

In [505]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, trim, lower, when

def clean_data(df: DataFrame, table_name: str) -> DataFrame:
    """
    Function to clean data before processing.
    """
    if table_name == "customers":
        df = df.withColumn("gender", lower(trim(col("gender"))))
        df = df.withColumn("gender", when(col("gender") == "m", "Male")
                           .when(col("gender") == "f", "Female")
                           .otherwise("Other"))
       
    return df


### e. rename_column.py

In [513]:
from pyspark.sql import DataFrame

def rename_columns(df: DataFrame, table_name: str) -> DataFrame:
    """
    Function to rename columns based on standardized naming convention.
    """
    rename_mappings = {
        "marketing_campaign_deposit": {
            "pdays": "days_since_last_campaign",
            "previous": "previous_campaign_contacts",
            "poutcome": "previous_campaign_outcome"
        },        
        "transactions": {
            "TransactionID": "transaction_id",
            "CustomerID": "customer_id",
            "TransactionDate": "transaction_date",
            "TransactionTime": "transaction_time",
            "TransactionAmount (INR)": "transaction_amount"
        },
        "customers": {
            "CustomerID": "customer_id",
            "CustomerDOB": "birth_date",
            "CustGender": "gender",
            "CustLocation": "location",
            "CustAccountBalance": "account_balance"
        }
    }
    
    if table_name in rename_mappings:
        for old_col, new_col in rename_mappings[table_name].items():
            df = df.withColumnRenamed(old_col, new_col)
    
    return df


### transform_data.py

In [510]:
import logging
from pyspark.sql import DataFrame


logging_process()

def transform_data(df: DataFrame, table_name: str) -> DataFrame:
    """
    Function to apply all transformation steps on the dataframe.
    """
    try:
        logging.info(f"===== Start Transforming Data for {table_name} =====")
        df = rename_columns(df, table_name)
        df = convert_date_columns(df, table_name)
        df = casting_data_types(df, table_name)
        df = select_columns_process(df, table_name)
        df = clean_data(df, table_name)
        logging.info(f"===== Finished Transforming Data for {table_name} =====")
        return df
    except Exception as e:
        logging.error(f"===== Failed to Transform Data for {table_name} =====")
        logging.error(e)
        raise


----

## load_data.py

In [476]:
import logging
import psycopg2
from helper.utils import logging_process
import pyspark

logging_process()

def load_data(df_result: pyspark.sql.DataFrame, table_name: str) -> None:
    """
    Function that dumps the result to the database using PySpark
    and maintains data integrity by truncating the table before loading new data.

    Parameters
    ----------
    df_result (pyspark.sql.DataFrame): Final result of pyspark dataframe.
    table_name (str): The target table name in the database where data needs to be loaded.
    """
    try:
        # Set variable for database
        DB_URL = "jdbc:postgresql://data_warehouse:5432/data_warehouse"
        DB_USER = "postgres"
        DB_PASS = "postgres"
        JDBC_URL = DB_URL
        connection_properties = {
            "user": DB_USER,
            "password": DB_PASS,
            "driver": "org.postgresql.Driver"  # Set driver postgres
        }

        logging.info("===== Start Load Data to the Database =====")

        # Step 1: Truncate the target table (ensure data integrity by removing old records before loading new data)
        # Connect to PostgreSQL using psycopg2 for executing non-query SQL (such as TRUNCATE)
        with psycopg2.connect(
            host="data_warehouse", dbname="data_warehouse", user=DB_USER, password=DB_PASS
        ) as conn:
            with conn.cursor() as cursor:
                truncate_sql = f"TRUNCATE TABLE {table_name} CASCADE"
                cursor.execute(truncate_sql)
                conn.commit()  # Ensure changes are committed
                logging.info(f"===== Truncated table {table_name} successfully =====")

        # Step 2: Load new data using the 'append' method
        df_result.write.jdbc(
            url=JDBC_URL,
            table=table_name,
            mode="append",  # Use append to add data to the table without deleting existing data
            properties=connection_properties,
        )

        logging.info("===== Finished Load Data to the Database =====")

    except Exception as e:
        logging.error(f"===== Failed Load Data to the Database for table {table_name} =====")
        logging.error(e)
        raise Exception(e)

----

## run_pyspark_pipeline.py

In [530]:
import logging


# Initialize logging
logging_process()

if __name__ == "__main__":
    logging.info("===== Start Banking Data Pipeline =====")

    try:
        # Extract data from CSV and database
        df_transactions = extract_data(data_name="new_bank_transaction", format_data="csv")
        df_customers = extract_data(data_name="new_bank_transaction", format_data="csv")
        df_marketing = extract_data(data_name="marketing_campaign_deposit", format_data="db")
        df_education = extract_data(data_name="education_status", format_data="db")
        df_marital = extract_data(data_name="marital_status", format_data="db")

        # Transform each dataset separately
        df_transactions = transform_data(df_transactions, "transactions")
        df_customers = transform_data(df_customers, "customers")
        df_marketing = transform_data(df_marketing, "marketing_campaign_deposit")
        df_education = transform_data(df_education, "education_status")
        df_marital = transform_data(df_marital, "marital_status")

        # Load each transformed dataset into the data warehouse
        load_data(df_education, table_name="education_status")
        load_data(df_marital, table_name="marital_status")
        load_data(df_customers, table_name="customers")
        load_data(df_transactions, table_name="transactions")
        load_data(df_marketing, table_name="marketing_campaign_deposit")

        logging.info("===== Finish Banking Data Pipeline =====")

    except Exception as e:
        logging.error("===== Data Pipeline Failed =====")
        logging.error(e)
        raise


-----

## Output Check

In [507]:
df_customers = extract_data(data_name="new_bank_transaction", format_data="csv")

In [508]:
df_customers = transform_data(df_customers, "customers")

In [509]:
load_data(df_customers, table_name="customers")

In [541]:
df_customers.show(1, truncate = False, vertical = True)

-RECORD 0---------------------
 customer_id     | C1010028   
 birth_date      | 1988-08-25 
 gender          | Female     
 location        | DELHI      
 account_balance | 296828.37  
only showing top 1 row



In [542]:
min_bod_date = df_customers.select("birth_date").agg({"birth_date": "min"}).collect()[0][0]
max_bod_date = df_customers.select("birth_date").agg({"birth_date": "max"}).collect()[0][0]

print(f"Minimum Birth Date: {min_bod_date}")
print(f"Maximum Birth Date: {max_bod_date}")

Minimum Birth Date: 1800-01-01
Maximum Birth Date: 2025-05-06


In [544]:
df_customers.select("gender").distinct().show(truncate = False)

+------+
|gender|
+------+
|Female|
|Other |
|Male  |
+------+



In [540]:
df_transactions.show(1, truncate = False, vertical = True)

-RECORD 0------------------------
 transaction_id     | T642232    
 customer_id        | C1010028   
 transaction_date   | 2016-08-29 
 transaction_time   | 09:52:12   
 transaction_amount | 557.0      
only showing top 1 row



In [535]:
min_tr_date = df_transactions.select("transaction_date").agg({"transaction_date": "min"}).collect()[0][0]
max_tr_date = df_transactions.select("transaction_date").agg({"transaction_date": "max"}).collect()[0][0]

print(f"Minimum Transaction Date: {min_tr_date}")
print(f"Maximum Transaction Date: {max_tr_date}")

Minimum Transaction Date: 2016-08-01
Maximum Transaction Date: 2016-10-21


In [536]:
min_tr_time = df_transactions.select("transaction_time").agg({"transaction_time": "min"}).collect()[0][0]
max_tr_time = df_transactions.select("transaction_time").agg({"transaction_time": "max"}).collect()[0][0]

print(f"Minimum Transaction Time: {min_tr_time}")
print(f"Maximum Transaction Time: {max_tr_time}")

Minimum Transaction Time: 00:00:00
Maximum Transaction Time: 23:59:59


In [539]:
min_tr_amount = df_transactions.select("transaction_amount").agg({"transaction_amount": "min"}).collect()[0][0]
max_tr_amount = df_transactions.select("transaction_amount").agg({"transaction_amount": "max"}).collect()[0][0]

print(f"Minimum Transaction Amount: {min_tr_amount}")
print(f"Maximum Transaction Amount: {max_tr_amount}")

Minimum Transaction Amount: 0.0
Maximum Transaction Amount: 1560034.99


In [545]:
df_marketing.show(1, truncate = False, vertical = True)

-RECORD 0--------------------------------
 loan_data_id               | 1          
 age                        | 58         
 job                        | management 
 marital_id                 | 1          
 education_id               | 1          
 default                    | false      
 balance                    | 2143       
 housing                    | true       
 loan                       | false      
 contact                    | unknown    
 day                        | 5          
 month                      | may        
 duration                   | 261        
 duration_in_year           | 1          
 campaign                   | 1          
 days_since_last_campaign   | -1         
 previous_campaign_contacts | 0          
 previous_campaign_outcome  | unknown    
 subscribed_deposit         | false      
only showing top 1 row



In [546]:
df_marketing.select("previous_campaign_outcome").distinct().show(truncate = False)

+-------------------------+
|previous_campaign_outcome|
+-------------------------+
|success                  |
|unknown                  |
|other                    |
|failure                  |
+-------------------------+

