In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window

In [0]:
spark = SparkSession.builder.appName("HomeCreditRiskML").getOrCreate()

In [0]:
application_train = spark.table("hive_metastore.default.application_train")
credit_card_balance = spark.table("hive_metastore.default.credit_card_balance")
pos_cash_balance = spark.table("hive_metastore.default.pos_cash_balance")
installments_payments = spark.table("hive_metastore.default.installments_payments")
application_train.createOrReplaceTempView("application_train_temp")
installments_payments.createOrReplaceTempView("installments_payments_temp")
pos_cash_balance.createOrReplaceTempView("pos_cash_balance_temp")
credit_card_balance.createOrReplaceTempView("credit_card_balance_temp")
 

In [0]:
pos_cash_balance_reduced = pos_cash_balance.alias("a") \
    .join(application_train.alias("b"), on="SK_ID_CURR", how="inner") \
    .withColumn("NAME_CONTRACT_TYPE", F.lit("Cash loans")) \
    .select("a.*", "NAME_CONTRACT_TYPE") 

In [0]:
if pos_cash_balance_reduced.count()<pos_cash_balance.count():
    print(True)
else:
    print(False)

True


In [0]:
credit_card_balance_reduced = credit_card_balance.alias("a") \
    .join(application_train.alias("b"), on="SK_ID_CURR", how="inner") \
    .withColumn("NAME_CONTRACT_TYPE", F.lit("Revolving loans")) \
    .select("a.*", "NAME_CONTRACT_TYPE") 

In [0]:
if credit_card_balance_reduced.count()<credit_card_balance.count():
    print(True)
else:
    print(False)

True


In [0]:
sql_query="""
Select distinct a.*, CASE 
        WHEN c.SK_ID_PREV IS NOT NULL AND c.SK_ID_CURR IS NOT NULL THEN 'Cash loans'
        WHEN d.SK_ID_PREV IS NOT NULL AND d.SK_ID_CURR IS NOT NULL THEN 'Revolving loans'
        ELSE NULL
    END AS NAME_CONTRACT_TYPE 
     from (
SELECT 
    a.* 
   
FROM 
    installments_payments_temp a 
JOIN 
    application_train_temp b 
ON 
    a.SK_ID_CURR = b.SK_ID_CURR ) a
LEFT JOIN 
    pos_cash_balance_temp c 
ON 
    a.SK_ID_PREV = c.SK_ID_PREV and a.SK_ID_CURR = c.SK_ID_CURR
LEFT JOIN 
    credit_card_balance_temp d 
ON 
    a.SK_ID_PREV = d.SK_ID_PREV  and a.SK_ID_CURR = d.SK_ID_CURR
"""
installments_payments_reduced=spark.sql(sql_query)

In [0]:
sql_query="""
Select a.SK_ID_CURR, d.SK_ID_CURR from
    pos_cash_balance_temp a
JOIN 
    credit_card_balance_temp d 
ON 
    a.SK_ID_PREV = d.SK_ID_PREV  and a.SK_ID_CURR = d.SK_ID_CURR
"""
temp=spark.sql(sql_query)
temp.show( 10)

+----------+----------+
|SK_ID_CURR|SK_ID_CURR|
+----------+----------+
+----------+----------+



In [0]:
if installments_payments_reduced.count()<installments_payments.count():
    print(True)
else:
    print(False,installments_payments.count(),installments_payments_reduced.count())

True


In [0]:
posh_cash_balance_reduced.show(10)
                                      

+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+------------------+
|SK_ID_PREV|SK_ID_CURR|MONTHS_BALANCE|CNT_INSTALMENT|CNT_INSTALMENT_FUTURE|NAME_CONTRACT_STATUS|SK_DPD|SK_DPD_DEF|NAME_CONTRACT_TYPE|
+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+------------------+
|   1897619|    143146|           -81|           6.0|                  4.0|              Active|     0|         0|        Cash loans|
|   2682972|    182522|           -85|          12.0|                  9.0|              Active|     0|         0|        Cash loans|
|   1307675|    272606|           -86|           6.0|                  4.0|              Active|     0|         0|        Cash loans|
|   1772094|    242128|           -85|          12.0|                  8.0|              Active|     0|         0|        Cash loans|
|   1305733|    181302|           -81|           6.0|         

In [0]:
pos_cash_features = pos_cash_balance_reduced.groupBy("SK_ID_CURR", "SK_ID_PREV").agg(
   F.max("MONTHS_BALANCE").alias("amtf_months_balance_max"),  # Most recent snapshot
    F.min("MONTHS_BALANCE").alias("amtf_months_balance_min"),  # Earliest snapshot
    F.avg("MONTHS_BALANCE").alias("amtf_months_balance_avg"),  # Average snapshot
    F.stddev("MONTHS_BALANCE").alias("amtf_months_balance_stddev"),  # Stddev of snapshots
    (F.max("MONTHS_BALANCE") - F.min("MONTHS_BALANCE") + 1).alias("amtf_loan_duration_months"),  # Loan duration

    # Installment Features
    F.max("CNT_INSTALMENT").alias("amtf_cnt_instalment_max"),  # Max number of installments
    F.min("CNT_INSTALMENT").alias("amtf_cnt_instalment_min"),  # Min number of installments
    F.avg("CNT_INSTALMENT").alias("amtf_cnt_instalment_avg"),  # Avg number of installments
    F.stddev("CNT_INSTALMENT").alias("amtf_cnt_instalment_stddev"),  # Stddev of installments
    F.min("CNT_INSTALMENT_FUTURE").alias("amtf_cnt_instalment_future_min"),  # Min future installments
    F.avg("CNT_INSTALMENT_FUTURE").alias("amtf_cnt_instalment_future_avg"),  # Avg future installments
    F.max("CNT_INSTALMENT_FUTURE").alias("amtf_cnt_instalment_future_max"),  # Max future installments
    F.stddev("CNT_INSTALMENT_FUTURE").alias("amtf_cnt_instalment_future_stddev"),  # Stddev of future installments
    (F.sum("CNT_INSTALMENT") - F.sum("CNT_INSTALMENT_FUTURE")).alias("amtf_instalments_completed"),  # Completed installments
    (F.sum("CNT_INSTALMENT_FUTURE") / F.sum("CNT_INSTALMENT")).alias("amtf_future_installment_ratio"),  # Remaining ratio

    # Delinquency and Default Risk Features
    F.max("SK_DPD").alias("amtf_max_dpd"),  # Max days past due
    F.avg("SK_DPD").alias("amtf_avg_dpd"),  # Average days past due
    F.stddev("SK_DPD").alias("amtf_stddev_dpd"),  # Stddev of days past due
    F.sum(F.when(F.col("SK_DPD") > 0, 1).otherwise(0)).alias("amtf_overdue_count"),  # Count of overdue records
    F.max("SK_DPD_DEF").alias("amtf_max_dpd_def"),  # Max days past due with deferrals
    F.avg("SK_DPD_DEF").alias("amtf_avg_dpd_def"),  # Average days past due with deferrals
    F.stddev("SK_DPD_DEF").alias("amtf_stddev_dpd_def"),  # Stddev of days past due with deferrals
    F.sum(F.when(F.col("SK_DPD_DEF") > 0, 1).otherwise(0)).alias("amtf_deferral_count"),  # Count of deferrals
    (F.sum(F.when(F.col("SK_DPD_DEF") > 0, 1).otherwise(0)) / F.count("*")).alias("amtf_deferral_proportion"),  # Proportion of deferrals

    # Contract Status Features
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Active", 1).otherwise(0)).alias("amtf_active_status_count"),  # Active status count
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Completed", 1).otherwise(0)).alias("amtf_completed_status_count"),  # Completed status count
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Approved", 1).otherwise(0)).alias("amtf_approved_status_count"),  # Approved status count
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Cancelled", 1).otherwise(0)).alias("amtf_cancelled_status_count"),  # Cancelled status count
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Demand", 1).otherwise(0)).alias("amtf_demand_status_count"),  # Demand status count

    # Derived Ratios
    (F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Active", 1).otherwise(0)) / F.count("*")).alias("amtf_active_status_proportion"),  # Active proportion
    (F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Completed", 1).otherwise(0)) / F.count("*")).alias("amtf_completed_status_proportion"),  # Completed proportion

    # Status Duration Features
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Active", F.col("MONTHS_BALANCE")).otherwise(0)).alias("amtf_active_status_duration"),  # Active duration
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Completed", F.col("MONTHS_BALANCE")).otherwise(0)).alias("amtf_completed_status_duration")  # Completed duration
)
pos_cash_features_final = pos_cash_balance_reduced.select("SK_ID_CURR", "SK_ID_PREV", "NAME_CONTRACT_TYPE").distinct() \
    .join(pos_cash_features, on=["SK_ID_CURR", "SK_ID_PREV"], how="inner")
pos_cash_features_final.show(truncate=False)


+----------+----------+------------------+-----------------------+-----------------------+-----------------------+--------------------------+-------------------------+-----------------------+-----------------------+-----------------------+--------------------------+------------------------------+------------------------------+------------------------------+---------------------------------+--------------------------+-----------------------------+------------+--------------------+-------------------+------------------+----------------+--------------------+-------------------+-------------------+------------------------+------------------------+---------------------------+--------------------------+---------------------------+------------------------+-----------------------------+--------------------------------+---------------------------+------------------------------+
|SK_ID_CURR|SK_ID_PREV|NAME_CONTRACT_TYPE|amtf_months_balance_max|amtf_months_balance_min|amtf_months_balance_avg|amtf

In [0]:
pos_cash_balance_reduced.select("SK_ID_CURR", "SK_ID_PREV").distinct().count()

800337

In [0]:
pos_cash_features_final.count()

800337

In [0]:
credit_card_balance_reduced.show()

+----------+----------+--------------+-----------+-----------------------+------------------------+--------------------+--------------------------+------------------------+-----------------------+-------------------+-------------------------+------------------------+-------------+--------------------+------------------------+--------------------+--------------------------+------------------------+-------------------------+--------------------+------+----------+------------------+
|SK_ID_CURR|SK_ID_PREV|MONTHS_BALANCE|AMT_BALANCE|AMT_CREDIT_LIMIT_ACTUAL|AMT_DRAWINGS_ATM_CURRENT|AMT_DRAWINGS_CURRENT|AMT_DRAWINGS_OTHER_CURRENT|AMT_DRAWINGS_POS_CURRENT|AMT_INST_MIN_REGULARITY|AMT_PAYMENT_CURRENT|AMT_PAYMENT_TOTAL_CURRENT|AMT_RECEIVABLE_PRINCIPAL|AMT_RECIVABLE|AMT_TOTAL_RECEIVABLE|CNT_DRAWINGS_ATM_CURRENT|CNT_DRAWINGS_CURRENT|CNT_DRAWINGS_OTHER_CURRENT|CNT_DRAWINGS_POS_CURRENT|CNT_INSTALMENT_MATURE_CUM|NAME_CONTRACT_STATUS|SK_DPD|SK_DPD_DEF|NAME_CONTRACT_TYPE|
+----------+----------+-------

In [0]:

from pyspark.sql import functions as F

# Aggregating features for credit_card_balance_reduced
credit_card_features = credit_card_balance_reduced.groupBy("SK_ID_CURR", "SK_ID_PREV", "NAME_CONTRACT_TYPE").agg(
    # Amount-Financed Features
    F.max("AMT_BALANCE").alias("amtf_amt_balance_max"),  # Max balance
    F.min("AMT_BALANCE").alias("amtf_amt_balance_min"),  # Min balance
    F.avg("AMT_BALANCE").alias("amtf_amt_balance_avg"),  # Avg balance
    F.stddev("AMT_BALANCE").alias("amtf_amt_balance_stddev"),  # Stddev of balance
    F.sum("AMT_BALANCE").alias("amtf_amt_balance_sum"),  # Total balance

    F.max("AMT_CREDIT_LIMIT_ACTUAL").alias("amtf_credit_limit_max"),  # Max credit limit
    F.min("AMT_CREDIT_LIMIT_ACTUAL").alias("amtf_credit_limit_min"),  # Min credit limit
    F.avg("AMT_CREDIT_LIMIT_ACTUAL").alias("amtf_credit_limit_avg"),  # Avg credit limit
    (F.sum("AMT_BALANCE") / F.sum("AMT_CREDIT_LIMIT_ACTUAL")).alias("amtf_credit_utilization_ratio"),  # Avg credit utilization
    (F.max("AMT_BALANCE") / F.max("AMT_CREDIT_LIMIT_ACTUAL")).alias("amtf_credit_utilization_max"),  # Max credit utilization

    # Payment Features
    F.max("AMT_PAYMENT_CURRENT").alias("amtf_amt_payment_current_max"),  # Max payment
    F.min("AMT_PAYMENT_CURRENT").alias("amtf_amt_payment_current_min"),  # Min payment
    F.avg("AMT_PAYMENT_CURRENT").alias("amtf_amt_payment_current_avg"),  # Avg payment
    F.sum("AMT_PAYMENT_CURRENT").alias("amtf_amt_payment_current_sum"),  # Total payment
    F.max("AMT_PAYMENT_TOTAL_CURRENT").alias("amtf_amt_payment_total_current_max"),  # Max total payments
    F.sum("AMT_PAYMENT_TOTAL_CURRENT").alias("amtf_amt_payment_total_current_sum"),  # Total of all payments

    # Receivable Features
    F.max("AMT_RECEIVABLE_PRINCIPAL").alias("amtf_receivable_principal_max"),  # Max principal receivable
    F.min("AMT_RECEIVABLE_PRINCIPAL").alias("amtf_receivable_principal_min"),  # Min principal receivable
    F.avg("AMT_RECEIVABLE_PRINCIPAL").alias("amtf_receivable_principal_avg"),  # Avg principal receivable
    F.sum("AMT_RECIVABLE").alias("amtf_amt_receivable_sum"),  # Total receivable amount
    F.avg("AMT_RECIVABLE").alias("amtf_amt_receivable_avg"),  # Avg receivable amount

    # Drawing Features
    F.sum("AMT_DRAWINGS_ATM_CURRENT").alias("amtf_amt_drawings_atm_sum"),  # Total ATM drawings
    F.sum("AMT_DRAWINGS_POS_CURRENT").alias("amtf_amt_drawings_pos_sum"),  # Total POS drawings
    F.sum("AMT_DRAWINGS_OTHER_CURRENT").alias("amtf_amt_drawings_other_sum"),  # Total other drawings
    F.sum("AMT_DRAWINGS_CURRENT").alias("amtf_amt_drawings_total_sum"),  # Total drawings
    (F.sum("AMT_DRAWINGS_ATM_CURRENT") / F.sum("AMT_DRAWINGS_CURRENT")).alias("amtf_atm_to_total_drawings_ratio"),  # ATM-to-total ratio
    (F.sum("AMT_DRAWINGS_POS_CURRENT") / F.sum("AMT_DRAWINGS_CURRENT")).alias("amtf_pos_to_total_drawings_ratio"),  # POS-to-total ratio

    # Drawing Counts
    F.avg("CNT_DRAWINGS_ATM_CURRENT").alias("amtf_cnt_drawings_atm_avg"),  # Avg ATM withdrawals
    F.avg("CNT_DRAWINGS_POS_CURRENT").alias("amtf_cnt_drawings_pos_avg"),  # Avg POS transactions
    F.avg("CNT_DRAWINGS_CURRENT").alias("amtf_cnt_drawings_total_avg"),  # Avg total withdrawals

    # DPD Features
    F.max("SK_DPD").alias("amtf_dpd_max"),  # Max days past due
    F.avg("SK_DPD").alias("amtf_dpd_avg"),  # Avg days past due
    F.sum(F.when(F.col("SK_DPD") > 0, 1).otherwise(0)).alias("amtf_dpd_overdue_count"),  # Count of overdue records
    (F.sum(F.when(F.col("SK_DPD") > 0, 1).otherwise(0)) / F.count("*")).alias("amtf_dpd_proportion"),  # Proportion of overdue records
    F.max("SK_DPD_DEF").alias("amtf_dpd_def_max"),  # Max DPD with deferrals
    F.avg("SK_DPD_DEF").alias("amtf_dpd_def_avg"),  # Avg DPD with deferrals

    # Contract Status Features
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Active", 1).otherwise(0)).alias("amtf_active_contract_count"),  # Count Active contracts
    F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Completed", 1).otherwise(0)).alias("amtf_completed_contract_count"),  # Count Completed contracts
    (F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Active", 1).otherwise(0)) / F.count("*")).alias("amtf_active_status_proportion"),  # Active proportion
    (F.sum(F.when(F.col("NAME_CONTRACT_STATUS") == "Completed", 1).otherwise(0)) / F.count("*")).alias("amtf_completed_status_proportion")  # Completed proportion
)

# Joining with NAME_CONTRACT_TYPE
credit_card_features_final = credit_card_balance_reduced.select("SK_ID_CURR", "SK_ID_PREV", "NAME_CONTRACT_TYPE").distinct() \
    .join(credit_card_features, on=["SK_ID_CURR", "SK_ID_PREV", "NAME_CONTRACT_TYPE"], how="inner")

# Show the final DataFrame
credit_card_features_final.show(truncate=False)



+----------+----------+------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+---------------------+---------------------+---------------------+-----------------------------+---------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------------+----------------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------+-----------------------+-------------------------+-------------------------+---------------------------+---------------------------+--------------------------------+--------------------------------+-------------------------+-------------------------+---------------------------+------------+--------------------+----------------------+--------------------+----------------+--------------------+--------------------------+-----

In [0]:
credit_card_balance_reduced.select("SK_ID_CURR", "SK_ID_PREV").distinct().count()

87452

In [0]:
credit_card_features_final.count()

87452

In [0]:
installments_payments_reduced.show()

+----------+----------+----------------------+---------------------+---------------+------------------+--------------+-----------+------------------+
|SK_ID_PREV|SK_ID_CURR|NUM_INSTALMENT_VERSION|NUM_INSTALMENT_NUMBER|DAYS_INSTALMENT|DAYS_ENTRY_PAYMENT|AMT_INSTALMENT|AMT_PAYMENT|NAME_CONTRACT_TYPE|
+----------+----------+----------------------+---------------------+---------------+------------------+--------------+-----------+------------------+
|   1000004|    260094|                   1.0|                    6|         -712.0|            -752.0|       3391.11|    3391.11|        Cash loans|
|   1000004|    260094|                   2.0|                    7|         -682.0|            -695.0|     13176.495|  13176.495|        Cash loans|
|   1000004|    260094|                   1.0|                    5|         -742.0|            -752.0|       3391.11|    3391.11|        Cash loans|
|   1000004|    260094|                   1.0|                    2|         -832.0|            -851

In [0]:
# Aggregate features for installments_payments_reduced
installments_features = installments_payments_reduced.groupBy("SK_ID_CURR", "SK_ID_PREV", "NAME_CONTRACT_TYPE").agg(
    # Amount-Financed Features
    F.sum("AMT_INSTALMENT").alias("amtf_amt_instalment_total"),                # Total amount due
    F.avg("AMT_INSTALMENT").alias("amtf_amt_instalment_avg"),                  # Average amount due
    F.max("AMT_INSTALMENT").alias("amtf_amt_instalment_max"),                  # Max amount due
    F.min("AMT_INSTALMENT").alias("amtf_amt_instalment_min"),                  # Min amount due
    F.sum("AMT_PAYMENT").alias("amtf_amt_payment_total"),                      # Total amount paid
    F.avg("AMT_PAYMENT").alias("amtf_amt_payment_avg"),                        # Average amount paid
    F.max("AMT_PAYMENT").alias("amtf_amt_payment_max"),                        # Max amount paid
    F.min("AMT_PAYMENT").alias("amtf_amt_payment_min"),                        # Min amount paid
    F.stddev("AMT_PAYMENT").alias("amtf_amt_payment_stddev"),                  # Stddev of payment amounts
    (F.sum("AMT_PAYMENT") / F.sum("AMT_INSTALMENT")).alias("amtf_payment_to_instalment_ratio"),  # Payment-to-installment ratio

    # Timing Features
    F.sum(F.when(F.col("DAYS_ENTRY_PAYMENT") > 0, 1).otherwise(0)).alias("amtf_late_payment_count"),  # Count of late payments
    F.avg(F.when(F.col("DAYS_ENTRY_PAYMENT") > 0, F.col("DAYS_ENTRY_PAYMENT")).otherwise(None)).alias("amtf_avg_days_late"),  # Avg days late
    F.max(F.when(F.col("DAYS_ENTRY_PAYMENT") > 0, F.col("DAYS_ENTRY_PAYMENT")).otherwise(None)).alias("amtf_max_days_late"),  # Max days late
    F.min(F.when(F.col("DAYS_ENTRY_PAYMENT") > 0, F.col("DAYS_ENTRY_PAYMENT")).otherwise(None)).alias("amtf_min_days_late"),  # Min days late
    F.sum(F.when(F.col("DAYS_ENTRY_PAYMENT") < 0, 1).otherwise(0)).alias("amtf_early_payment_count"),  # Count of early payments
    F.avg(F.when(F.col("DAYS_ENTRY_PAYMENT") < 0, F.col("DAYS_ENTRY_PAYMENT")).otherwise(None)).alias("amtf_avg_days_early"),  # Avg days early
    F.max(F.when(F.col("DAYS_ENTRY_PAYMENT") < 0, F.col("DAYS_ENTRY_PAYMENT")).otherwise(None)).alias("amtf_max_days_early"),  # Max days early
    F.min(F.when(F.col("DAYS_ENTRY_PAYMENT") < 0, F.col("DAYS_ENTRY_PAYMENT")).otherwise(None)).alias("amtf_min_days_early"),  # Min days early
    F.stddev("DAYS_ENTRY_PAYMENT").alias("amtf_days_entry_payment_stddev"),    # Stddev of payment timing

    # Installment-Specific Features
    F.countDistinct("NUM_INSTALMENT_VERSION").alias("amtf_distinct_version_count"),  # Distinct installment plan versions
    F.countDistinct("NUM_INSTALMENT_NUMBER").alias("amtf_distinct_installment_count"),  # Distinct installment numbers
    F.sum(F.when(F.col("AMT_PAYMENT") >= F.col("AMT_INSTALMENT"), 1).otherwise(0)).alias("amtf_exact_or_over_payment_count"),  # Count of exact or overpayments
    F.sum(F.when(F.col("AMT_PAYMENT") < F.col("AMT_INSTALMENT"), 1).otherwise(0)).alias("amtf_underpayment_count"),  # Count of underpayments
    F.sum(F.when(F.col("AMT_PAYMENT") < F.col("AMT_INSTALMENT"), F.col("AMT_INSTALMENT") - F.col("AMT_PAYMENT")).otherwise(0)).alias("amtf_underpayment_amount_total"),  # Total underpayment amount

    # Days-Installment Features
    F.max("DAYS_INSTALMENT").alias("amtf_days_installment_max"),               # Latest installment due date
    F.min("DAYS_INSTALMENT").alias("amtf_days_installment_min"),               # Earliest installment due date
    F.avg("DAYS_INSTALMENT").alias("amtf_days_installment_avg"),               # Avg days installment due

    # Loan Type-Specific Aggregates
    F.sum(F.when(F.col("NAME_CONTRACT_TYPE") == "Cash loans", "AMT_PAYMENT").otherwise(0)).alias("amtf_cash_loans_total_payment"),  # Total payment for cash loans
    F.sum(F.when(F.col("NAME_CONTRACT_TYPE") == "Revolving loans", "AMT_PAYMENT").otherwise(0)).alias("amtf_revolving_loans_total_payment"),  # Total payment for revolving loans
    F.count(F.when(F.col("NAME_CONTRACT_TYPE") == "Cash loans", 1).otherwise(None)).alias("amtf_cash_loans_count"),  # Count for cash loans
    F.count(F.when(F.col("NAME_CONTRACT_TYPE") == "Revolving loans", 1).otherwise(None)).alias("amtf_revolving_loans_count")  # Count for revolving loans
)

# Add additional features if necessary
installments_features = installments_features.withColumn(
    "amtf_late_payment_proportion",
    F.col("amtf_late_payment_count") / F.col("amtf_distinct_installment_count")  # Proportion of late payments
).withColumn(
    "amtf_early_payment_proportion",
    F.col("amtf_early_payment_count") / F.col("amtf_distinct_installment_count")  # Proportion of early payments
)

# Final DataFrame
installments_features_final = installments_payments_reduced.select("SK_ID_CURR", "SK_ID_PREV", "NAME_CONTRACT_TYPE").distinct() \
    .join(installments_features, on=["SK_ID_CURR", "SK_ID_PREV", "NAME_CONTRACT_TYPE"], how="inner")

# Show the final result
installments_features_final.show(truncate=False)


+----------+----------+------------------+-------------------------+-----------------------+-----------------------+-----------------------+----------------------+--------------------+--------------------+--------------------+-----------------------+--------------------------------+-----------------------+------------------+------------------+------------------+------------------------+-------------------+-------------------+-------------------+------------------------------+---------------------------+-------------------------------+--------------------------------+-----------------------+------------------------------+-------------------------+-------------------------+-------------------------+-----------------------------+----------------------------------+---------------------+--------------------------+----------------------------+-----------------------------+
|SK_ID_CURR|SK_ID_PREV|NAME_CONTRACT_TYPE|amtf_amt_instalment_total|amtf_amt_instalment_avg|amtf_amt_instalment_max|amtf

In [0]:
installments_payments_reduced.select("SK_ID_CURR", "SK_ID_PREV").distinct().count()

853344

In [0]:
installments_features_final.count()

853066

In [0]:
# Perform full outer join for all three tables on SK_ID_CURR and SK_ID_PREV
combined_features = pos_cash_features_final.alias("pos") \
    .join(credit_card_features_final.alias("cc"), ["SK_ID_CURR", "SK_ID_PREV"], "full_outer") \
    .join(installments_features_final.alias("inst"), ["SK_ID_CURR", "SK_ID_PREV"], "full_outer")

# Fill missing values (optional, based on requirements)
combined_amtf_features = combined_features.fillna(0)  # Replace NaN with 0 for numeric columns

# Show the combined result
combined_amtf_features.show(truncate=False)

# Count the number of rows and columns to verify completeness
print(f"Total Rows: {combined_amtf_features.count()}")
print(f"Total Features (Columns): {len(combined_amtf_features.columns)}")