In [8]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import os

# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("SparkSQLExample")
    .getOrCreate()
)

# Preview every CSV in the raw data folder: file name, first rows, column names.
folder_path = "data"
for f in os.listdir(folder_path):
    if f.endswith('.csv'):
        # Build the path from folder_path so the listing and the read always
        # point at the same directory (the read previously hard-coded "data/").
        df = spark.read.csv(os.path.join(folder_path, f), header=True, inferSchema=True)
        print(f)
        df.show()
        print(df.columns)

features_attributes.csv
+-----------+-----------------+---+-----------+-------------+-------------+
|Customer_ID|             Name|Age|        SSN|   Occupation|snapshot_date|
+-----------+-----------------+---+-----------+-------------+-------------+
| CUS_0x1000|   Alistair Barrf| 18|913-74-1218|       Lawyer|   2023-05-01|
| CUS_0x1009|           Arunah| 26|063-67-6938|     Mechanic|   2025-01-01|
| CUS_0x100b|         Shirboni| 19|  #F%$D@*&8|Media_Manager|   2024-03-01|
| CUS_0x1011|        Schneyerh| 44|793-05-8223|       Doctor|   2023-11-01|
| CUS_0x1013|         Cameront| 44|930-49-9615|     Mechanic|   2023-12-01|
| CUS_0x1015|          Holtono| 27|810-97-7024|   Journalist|   2023-08-01|
| CUS_0x1018|      Felsenthalq| 15|731-19-8119|   Accountant|   2023-11-01|
| CUS_0x1026|          Josephv| 52|500-62-9044|      Manager|   2023-10-01|
| CUS_0x102d| Neil Chatterjeex| 31|692-71-7552| Entrepreneur|   2024-01-01|
| CUS_0x102e|            Rhysn| 26|  #F%$D@*&8|    Scientist|   

In [68]:
# Load the customer attributes file and expose it to Spark SQL as `attributes`.
attributes = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/features_attributes.csv")
)
attributes.createOrReplaceTempView('attributes')

print(attributes.columns)

# Inspect every snapshot row recorded for one sample customer.
attributes_sql = """
    SELECT * FROM attributes
    WHERE Customer_ID = "CUS_0x1000"
    ORDER BY Customer_ID, snapshot_date
    """
query = spark.sql(attributes_sql)

query.show()

['Customer_ID', 'Name', 'Age', 'SSN', 'Occupation', 'snapshot_date']
+-----------+--------------+---+-----------+----------+-------------+
|Customer_ID|          Name|Age|        SSN|Occupation|snapshot_date|
+-----------+--------------+---+-----------+----------+-------------+
| CUS_0x1000|Alistair Barrf| 18|913-74-1218|    Lawyer|   2023-05-01|
+-----------+--------------+---+-----------+----------+-------------+



In [None]:
# Load the clickstream features and expose them to Spark SQL as `clickstream`.
clickstream = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/feature_clickstream.csv")
)
clickstream.createOrReplaceTempView('clickstream')


print(clickstream.columns)

# Rows per snapshot_date for one sample customer.
# NOTE: despite the alias, `distinct_entries` is a plain row count for each
# (Customer_ID, snapshot_date) group, not a COUNT(DISTINCT ...).
clickstream_sql = """
    SELECT
        Customer_ID,
        snapshot_date,
        COUNT(snapshot_date) as distinct_entries
    FROM clickstream
    WHERE Customer_ID = "CUS_0x1000"
    GROUP BY Customer_ID, snapshot_date
    ORDER BY Customer_ID, snapshot_date
    """
query = spark.sql(clickstream_sql)

query.show()

In [81]:
# Load the financial features and expose them to Spark SQL as `financials`.
financials = spark.read.csv("data/features_financials.csv", header=True, inferSchema=True)
financials.createOrReplaceTempView('financials')

print(financials.columns)

# List customers that appear more than once in the financials table.
# The query previously read from `attributes`, so it never inspected the table
# loaded in this cell and only worked if the `attributes` view had been
# registered by another cell. GROUP BY / HAVING returns the same
# (Customer_ID, instances) rows as the former DISTINCT + window-count form.
query = spark.sql(
    """
    SELECT
        Customer_ID,
        COUNT(*) AS instances
    FROM financials
    GROUP BY Customer_ID
    HAVING COUNT(*) > 1
    """
)

query.show(1000)

['Customer_ID', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance', 'snapshot_date']
+-----------+---------+
|Customer_ID|instances|
+-----------+---------+
+-----------+---------+



In [55]:
import os

# Collect the base names (extension stripped) of every CSV in the data folder.
folder_path = "data"
csv_files = []
for entry in os.listdir(folder_path):
    if entry.endswith('.csv'):
        # Drop the trailing four characters, i.e. the ".csv" suffix.
        csv_files.append(entry[:-4])

csv_files

['features_attributes',
 'features_financials',
 'feature_clickstream',
 'lms_loan_daily']

In [58]:
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# Read the bronze copy of the customer attributes table.
bronze_directory = "datamart/bronze/"

file_name = "features_attributes/features_attributes.csv"
filepath = bronze_directory + file_name
df = spark.read.csv(filepath, header=True, inferSchema=True)
df.show()

# SSN is assumed to have xxx-xx-xxxx format
ssn_pattern = r"^\d{3}-\d{2}-\d{4}$"
# Placeholder strings that stand for "no occupation given". NULL is handled
# separately below: `x IN (..., NULL)` evaluates to NULL rather than TRUE, so
# listing None here would flag a missing occupation as known.
invalid_occ = ["_______", ""]
# Assumed reasonable age is 15 to 100 years old (inclusive)
min_age, max_age = 15, 100

occupation_trimmed = F.trim(F.col("Occupation"))
# Age is compared as an integer. With ANSI mode off, values that do not parse
# (e.g. "40_") cast to NULL and are therefore flagged as invalid.
age_int = F.col("Age").cast(IntegerType())

# Single cleaning pipeline. The earlier version built df_clean step by step and
# then rebuilt it from the raw `df`, silently discarding those steps. Rows are
# kept and flagged (0/1) rather than filtered out.
df_clean = (
    df
    # Remove white spaces on Name column so shared names are counted together
    .withColumn("Name", F.trim(F.col("Name")))
    .withColumn("ssn_valid", F.when(F.col("SSN").rlike(ssn_pattern), 1).otherwise(0))
    .withColumn(
        "occupation_known",
        F.when(occupation_trimmed.isNull() | occupation_trimmed.isin(invalid_occ), 0).otherwise(1),
    )
    .withColumn("age_valid", F.when(age_int.between(min_age, max_age), 1).otherwise(0))
)

# Count how many rows share each name, then drop the identifying columns.
name_counts = df_clean.groupBy("Name").agg(F.count("*").alias("name_shared_count"))
df_with_name_count = (
    df_clean
    .join(name_counts, on="Name", how="left")
    .withColumn("is_name_shared", F.when(F.col("name_shared_count") > 1, 1).otherwise(0))
    .drop("Name", "SSN")
)


+-----------+-----------------+---+-----------+-------------+-------------+
|Customer_ID|             Name|Age|        SSN|   Occupation|snapshot_date|
+-----------+-----------------+---+-----------+-------------+-------------+
| CUS_0x1000|   Alistair Barrf| 18|913-74-1218|       Lawyer|   2023-05-01|
| CUS_0x1009|           Arunah| 26|063-67-6938|     Mechanic|   2025-01-01|
| CUS_0x100b|         Shirboni| 19|  #F%$D@*&8|Media_Manager|   2024-03-01|
| CUS_0x1011|        Schneyerh| 44|793-05-8223|       Doctor|   2023-11-01|
| CUS_0x1013|         Cameront| 44|930-49-9615|     Mechanic|   2023-12-01|
| CUS_0x1015|          Holtono| 27|810-97-7024|   Journalist|   2023-08-01|
| CUS_0x1018|      Felsenthalq| 15|731-19-8119|   Accountant|   2023-11-01|
| CUS_0x1026|          Josephv| 52|500-62-9044|      Manager|   2023-10-01|
| CUS_0x102d| Neil Chatterjeex| 31|692-71-7552| Entrepreneur|   2024-01-01|
| CUS_0x102e|            Rhysn| 26|  #F%$D@*&8|    Scientist|   2024-04-01|
| CUS_0x1032

In [31]:
# Frequency of each SSN value, most common first (surfaces placeholder values).
df_clean.groupBy("SSN").count().orderBy(F.desc("count")).show()

+-----------+-----+
|        SSN|count|
+-----------+-----+
|  #F%$D@*&8|  644|
|706-59-9144|    1|
|508-88-3060|    1|
|564-87-3414|    1|
|118-02-3131|    1|
|808-03-9422|    1|
|595-90-1107|    1|
|571-04-7703|    1|
|154-09-5858|    1|
|942-65-9443|    1|
|218-40-1399|    1|
|296-46-9959|    1|
|579-02-1738|    1|
|738-19-2481|    1|
|136-53-7387|    1|
|368-07-2548|    1|
|916-37-8123|    1|
|945-83-2116|    1|
|347-65-8366|    1|
|214-44-4143|    1|
+-----------+-----+
only showing top 20 rows



In [45]:
# Show every row recorded under one sample name.
df.filter(df["Name"] == "Wahbap").show()

+-----------+------+---+-----------+----------+-------------+
|Customer_ID|  Name|Age|        SSN|Occupation|snapshot_date|
+-----------+------+---+-----------+----------+-------------+
| CUS_0x1032|Wahbap|40_|620-58-8045|    Lawyer|   2023-08-01|
| CUS_0x508c|Wahbap| 49|212-99-0909|  Engineer|   2024-07-01|
| CUS_0x6f81|Wahbap| 17|075-05-3919|    Doctor|   2025-01-01|
+-----------+------+---+-----------+----------+-------------+



In [39]:
# The 100 most frequent names, to see how often a name is shared.
df_clean.groupBy("Name").count().orderBy(F.col("count").desc()).show(100)

+--------------------+-----+
|                Name|count|
+--------------------+-----+
|              Langep|    6|
|            Jessicad|    6|
|              Stevex|    6|
|                Kimj|    5|
|             Danielz|    5|
|       Jessica Wohlt|    5|
|              Jonesb|    5|
|               Nicko|    5|
|       Valetkevitchv|    5|
|            Raymondr|    5|
|   Jonathan Stempelr|    5|
|         Phil Wahbag|    5|
|     Lucia Mutikanip|    5|
|               Phila|    5|
|Caroline Valetkev...|    5|
|                Huwk|    5|
|            Michaelr|    5|
|    Ryan Vlastelicad|    5|
|       Strupczewskid|    5|
|         Tim Hepherj|    4|
|     Katharina Barty|    4|
|            Patrickt|    4|
|              Jasonb|    4|
|           Jonathany|    4|
|        Anna Driveri|    4|
|  Deepa Seetharamanc|    4|
|  Dhanya Skariachanh|    4|
|               Paulr|    4|
|          Rothackern|    4|
|           Mutikanie|    4|
|           ra Alperq|    4|
|       Nick E

In [62]:
# Read back the cleaned silver attributes table and preview it.
# Note: this rebinds `df`, which earlier cells used for the bronze table.
df = spark.read.format("parquet").load("datamart/silver/silver_attributes_cleaned.parquet")
df.show()

+-----------+----+-------------+-------------+---------+----------------+---------+-----------------+--------------+
|Customer_ID| Age|   Occupation|snapshot_date|ssn_valid|occupation_known|age_valid|name_shared_count|is_name_shared|
+-----------+----+-------------+-------------+---------+----------------+---------+-----------------+--------------+
| CUS_0x1000|  18|       Lawyer|   2023-05-01|        1|               1|        1|                2|             1|
| CUS_0x1009|  26|     Mechanic|   2025-01-01|        1|               1|        1|                2|             1|
| CUS_0x100b|  19|Media_Manager|   2024-03-01|        0|               1|        1|                1|             0|
| CUS_0x1011|  44|       Doctor|   2023-11-01|        1|               1|        1|                2|             1|
| CUS_0x1013|  44|     Mechanic|   2023-12-01|        1|               1|        1|                1|             0|
| CUS_0x1015|  27|   Journalist|   2023-08-01|        1|        