In [0]:
from pyspark.sql import functions as F

In [0]:
# Read the bureau base table and register it as a temp view of the same name
# so it can also be queried from SQL.
bu_base_1 = spark.table("default.bureau_base")
bu_base_1.createOrReplaceTempView("bu_base_1")

In [0]:
# Quick look at two rows of the base data.
display(bu_base_1.limit(2))

In [0]:
# Row-level engineered features on the bureau data.
#
# NOTE(review): the ratio columns below divide by AMT_CREDIT_SUM or by the
# per-type mean. With Spark's non-ANSI behaviour a zero denominator gives NULL;
# with spark.sql.ansi.enabled=true it raises instead -- confirm the cluster
# setting before relying on NULLs here.
bu_base_1 = (
    bu_base_1
    # ENDDATE_DIFF = DAYS_ENDDATE_FACT - DAYS_CREDIT_ENDDATE
    .withColumn("ENDDATE_DIFF", F.col("DAYS_ENDDATE_FACT") - F.col("DAYS_CREDIT_ENDDATE"))
    # DAYS_CREDIT_PLAN = DAYS_CREDIT_ENDDATE - DAYS_CREDIT
    .withColumn("DAYS_CREDIT_PLAN", F.col("DAYS_CREDIT_ENDDATE") - F.col("DAYS_CREDIT"))
    # Re-run safety: if this cell already ran, the mean column is present and
    # the join below would add a second copy, making later references to it
    # ambiguous. drop() is a no-op when the column does not exist.
    .drop("MEAN_AMT_CREDIT_BY_CREDIT_TYPE")
)

# Mean credit amount per CREDIT_TYPE
mean_amt_credit = bu_base_1.groupBy("CREDIT_TYPE").agg(
    F.mean("AMT_CREDIT_SUM").alias("MEAN_AMT_CREDIT_BY_CREDIT_TYPE")
)

# Attach the per-type mean to every row (left join keeps all bureau rows)
bu_base_1 = bu_base_1.join(mean_amt_credit, on="CREDIT_TYPE", how="left")

bu_base_1 = (
    bu_base_1
    # Credit amount relative to the mean for its credit type
    .withColumn(
        "AMT_CREDIT_SUM_RATIO",
        F.col("AMT_CREDIT_SUM") / F.col("MEAN_AMT_CREDIT_BY_CREDIT_TYPE"),
    )
    # Outstanding debt as a fraction of the credit amount
    .withColumn(
        "AMT_CREDIT_DEBT_PERC",
        F.col("AMT_CREDIT_SUM_DEBT") / F.col("AMT_CREDIT_SUM"),
    )
    # Outstanding debt minus the credit amount
    .withColumn(
        "AMT_CREDIT_DEBT_DIFF",
        F.col("AMT_CREDIT_SUM_DEBT") - F.col("AMT_CREDIT_SUM"),
    )
    # Annuity as a fraction of the credit amount.
    # NOTE(review): this column is not listed in agg_funcs, so it never reaches
    # the aggregated feature table -- confirm whether that is intended.
    .withColumn(
        "AMT_CREDIT_ANNUITY_PERC",
        F.col("AMT_ANNUITY") / F.col("AMT_CREDIT_SUM"),
    )
)



In [0]:
# Aggregation spec for the bureau data: column name -> list of
# pyspark.sql.functions names applied to that column by perform_aggregation.
# Key order determines the column order of the aggregated output.
agg_funcs = {
    # --- columns taken directly from the base table ---
    "DAYS_CREDIT": ["count", "mean"],
    "CREDIT_DAY_OVERDUE": ["mean", "sum"],
    "DAYS_CREDIT_ENDDATE": ["mean"],
    "DAYS_ENDDATE_FACT": ["mean"],
    "AMT_CREDIT_MAX_OVERDUE": ["mean", "sum"],
    "CNT_CREDIT_PROLONG": ["mean", "sum"],
    "AMT_CREDIT_SUM": ["mean", "sum"],
    "AMT_CREDIT_SUM_DEBT": ["mean", "sum"],
    "AMT_CREDIT_SUM_LIMIT": ["mean", "sum"],
    "AMT_CREDIT_SUM_OVERDUE": ["mean", "sum"],
    "CREDIT_TYPE": ["countDistinct"],  # number of distinct credit types
    "DAYS_CREDIT_UPDATE": ["mean"],
    "AMT_ANNUITY": ["mean", "sum"],
    # --- engineered columns added in the feature-engineering cell ---
    "ENDDATE_DIFF": ["mean"],
    "AMT_CREDIT_SUM_RATIO": ["mean", "max"],
    "DAYS_CREDIT_PLAN": ["mean", "sum"],
    "AMT_CREDIT_DEBT_PERC": ["mean", "min", "max"],
    "AMT_CREDIT_DEBT_DIFF": ["mean", "sum"],
}



In [0]:
def perform_aggregation(df, agg_funcs, prefix, group_col="SK_ID_CURR"):
    """Aggregate ``df`` per ``group_col`` according to an aggregation spec.

    Args:
        df: Spark DataFrame containing ``group_col`` and every column named
            in ``agg_funcs``.
        agg_funcs: Mapping of column name -> list of function names. Each
            name is looked up on ``pyspark.sql.functions`` (e.g. ``"mean"``,
            ``"sum"``, ``"countDistinct"``).
        prefix: String prepended to every output column name.
        group_col: Column to group by. Defaults to ``"SK_ID_CURR"``.

    Returns:
        A DataFrame with one row per ``group_col`` value and one column per
        (column, function) pair, named ``f"{prefix}{column}_{function}"``.

    Raises:
        AttributeError: If a function name does not exist on
            ``pyspark.sql.functions``.
    """
    # Every supported name, including "countDistinct", resolves directly on F,
    # so a single getattr lookup covers all cases.
    agg_exprs = [
        getattr(F, func)(col_name).alias(f"{prefix}{col_name}_{func}")
        for col_name, funcs in agg_funcs.items()
        for func in funcs
    ]
    return df.groupBy(group_col).agg(*agg_exprs)

In [0]:
# Aggregates over all bureau rows; output columns carry the "b_" prefix.
bu_base_2 = perform_aggregation(df=bu_base_1, agg_funcs=agg_funcs, prefix="b_")

In [0]:
# Quick look at two rows of the aggregated features.
display(bu_base_2.limit(2))

In [0]:
# Print the column names and types of the aggregated features.
bu_base_2.printSchema()

In [0]:
# Subsets of bureau rows to aggregate separately:
# Spark filter expression -> prefix for the resulting feature columns.
# NOTE(review): "b_mortage_" looks like a misspelling of "mortgage"; it is kept
# as-is because it becomes part of the persisted column names.
filters = dict([
    # by credit status
    ("CREDIT_ACTIVE == 'Active'", "b_active_"),
    ("CREDIT_ACTIVE == 'Closed'", "b_closed_"),
    # by credit type
    ("CREDIT_TYPE == 'Consumer credit'", "b_consumer_"),
    ("CREDIT_TYPE == 'Credit card'", "b_credit_"),
    ("CREDIT_TYPE == 'Car loan'", "b_car_"),
    ("CREDIT_TYPE == 'Mortgage'", "b_mortage_"),
    ("CREDIT_TYPE == 'Microloan'", "b_micro_"),
    # by DAYS_CREDIT threshold
    ("DAYS_CREDIT >= -720", "b_720_"),
    ("DAYS_CREDIT >= -365", "b_365_"),
])

In [0]:
# For each subset in `filters`, aggregate only the matching bureau rows and
# left-join the result onto the overall features, so SK_ID_CURR values with no
# rows in a subset get NULLs for that subset's columns.
for filter_expr, prefix in filters.items():
    filtered_agg = perform_aggregation(bu_base_1.filter(filter_expr), agg_funcs, prefix)
    # Re-run safety: if this cell already ran, bu_base_2 still holds this
    # subset's columns and joining again would duplicate their names.
    # drop() ignores names that are not present, so the first run is unaffected.
    stale_cols = [c for c in filtered_agg.columns if c != "SK_ID_CURR"]
    bu_base_2 = bu_base_2.drop(*stale_cols).join(filtered_agg, on="SK_ID_CURR", how="left")

In [0]:
# Quick look at two rows after the filtered aggregates were joined in.
display(bu_base_2.limit(2))

In [0]:
# Print the schema after the filtered aggregates were joined in.
bu_base_2.printSchema()

In [0]:
# Register the features as a temp view for the %sql cell that persists them.
bu_base_2.createOrReplaceTempView("bu_base_2")
# Keep the row count as the cell's last expression so it is actually shown;
# anywhere else in the cell the result is computed and then discarded.
bu_base_2.count()

In [0]:
%sql
-- Persist the level-1 bureau features from the bu_base_2 temp view.
-- NOTE(review): drop + create leaves a window with no table; if this is a Delta
-- table, CREATE OR REPLACE TABLE may be preferable -- confirm the table format.
DROP TABLE IF EXISTS default.bu_features_level1_new;
CREATE TABLE default.bu_features_level1_new AS
SELECT * FROM bu_base_2;

## Final Merge of Level 1 and Level 2 Features 

In [0]:
# Load the level-2 features.
# NOTE(review): unlike the other tables in this notebook the name is not
# schema-qualified, and nothing here creates it -- confirm it resolves to a
# persisted table on a fresh session.
bu_features_level2 = spark.table("bu_features_level2")

In [0]:
# Print the column names and types of the level-2 features.
bu_features_level2.printSchema()

In [0]:
# Row count of the level-2 features (triggers a Spark job).
bu_features_level2.count()

In [0]:
# Final join 
bu_features_final2 = bu_base_2.join(bu_features_level2, on="SK_ID_CURR", how="inner")

In [0]:
# Print the schema of the merged feature set.
bu_features_final2.printSchema()

In [0]:
# Preview five merged rows, then register the frame as a temp view for the
# %sql cell that persists it.
display(bu_features_final2.limit(5))
bu_features_final2.createOrReplaceTempView("bu_features_final2")

In [0]:
%sql
-- Persist the merged level-1 + level-2 features from the bu_features_final2 temp view.
DROP TABLE IF EXISTS default.bu_features_final2;
CREATE TABLE default.bu_features_final2 AS
SELECT * FROM bu_features_final2;