In [67]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SparkSQLExample") \
    .getOrCreate()

clickstream = spark.read.csv("data/feature_clickstream.csv", header=True, inferSchema=True)
clickstream.createOrReplaceTempView('clickstream')


print(clickstream.columns)

query = spark.sql(
    """
    SELECT
        Customer_ID,
        snapshot_date,
        COUNT(snapshot_date) as distinct_entries
    FROM clickstream
    WHERE Customer_ID = "CUS_0x1000"
    GROUP BY Customer_ID, snapshot_date
    ORDER BY Customer_ID, snapshot_date
    """
)

query.show()

['fe_1', 'fe_2', 'fe_3', 'fe_4', 'fe_5', 'fe_6', 'fe_7', 'fe_8', 'fe_9', 'fe_10', 'fe_11', 'fe_12', 'fe_13', 'fe_14', 'fe_15', 'fe_16', 'fe_17', 'fe_18', 'fe_19', 'fe_20', 'Customer_ID', 'snapshot_date']
+-----------+-------------+----------------+
|Customer_ID|snapshot_date|distinct_entries|
+-----------+-------------+----------------+
| CUS_0x1000|   2023-01-01|               1|
| CUS_0x1000|   2023-02-01|               1|
| CUS_0x1000|   2023-03-01|               1|
| CUS_0x1000|   2023-04-01|               1|
| CUS_0x1000|   2023-05-01|               1|
| CUS_0x1000|   2023-06-01|               1|
| CUS_0x1000|   2023-07-01|               1|
| CUS_0x1000|   2023-08-01|               1|
| CUS_0x1000|   2023-09-01|               1|
| CUS_0x1000|   2023-10-01|               1|
| CUS_0x1000|   2023-11-01|               1|
| CUS_0x1000|   2023-12-01|               1|
| CUS_0x1000|   2024-01-01|               1|
| CUS_0x1000|   2024-02-01|               1|
| CUS_0x1000|   2024-03-01|    

In [68]:
attributes = spark.read.csv("data/features_attributes.csv", header=True, inferSchema=True)
attributes.createOrReplaceTempView('attributes')

print(attributes.columns)

query = spark.sql(
    """
    SELECT * FROM attributes
    WHERE Customer_ID = "CUS_0x1000"
    ORDER BY Customer_ID, snapshot_date
    """
)

query.show()

['Customer_ID', 'Name', 'Age', 'SSN', 'Occupation', 'snapshot_date']
+-----------+--------------+---+-----------+----------+-------------+
|Customer_ID|          Name|Age|        SSN|Occupation|snapshot_date|
+-----------+--------------+---+-----------+----------+-------------+
| CUS_0x1000|Alistair Barrf| 18|913-74-1218|    Lawyer|   2023-05-01|
+-----------+--------------+---+-----------+----------+-------------+



In [81]:
financials = spark.read.csv("data/features_financials.csv", header=True, inferSchema=True)
financials.createOrReplaceTempView('financials')

print(financials.columns)

query = spark.sql(
    """
    SELECT *
    FROM
        (
        SELECT
        DISTINCT Customer_ID,
        COUNT(*) OVER(PARTITION BY Customer_ID) as instances
        FROM attributes
        ) a
    WHERE instances > 1
    """
)

query.show(1000)

['Customer_ID', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance', 'snapshot_date']
+-----------+---------+
|Customer_ID|instances|
+-----------+---------+
+-----------+---------+



In [55]:
import os

folder_path = "data"
csv_files = [f[:-4] for f in os.listdir(folder_path) if f.endswith('.csv')]

csv_files

['features_attributes',
 'features_financials',
 'feature_clickstream',
 'lms_loan_daily']