The objective of this notebook is to prepare the final dataset for meta-model training. This dataset combines the residuals and predicted probabilities of all the baseline models with selected features from the different feature categories.

In [0]:
# Base application table; it is the left side of the SK_ID_CURR joins
# performed later in this notebook.
appl_train = spark.read.table('hive_metastore.default.application_train')

# Vintage ("vin") features.
vin_appl_train = spark.read.table('hive_metastore.default.vin_finalized_application_train')
vin_prev = spark.read.table('hive_metastore.default.ay_vin_prev_pred')
vin_bur = spark.read.table('hive_metastore.default.ay_vin_bur_pred')

# "amtf" features.
amtf_pos_cash = spark.read.table('hive_metastore.default.amtf_pos_cash_final_prediction_data2')
amtf_cc = spark.read.table('hive_metastore.default.amtf_cc_final_pred_df')

# "deq" features.
deq_df = spark.read.table('hive_metastore.default.bu_final_prediction_data2')

In [0]:
# Print the row count of every input table, in load order.
# Each count() is a Spark action, so this triggers one job per table.
for _label, _frame in (
    ("appl_train count:", appl_train),
    ("vin_appl_train count:", vin_appl_train),
    ("vin_prev count:", vin_prev),
    ("vin_bur count:", vin_bur),
    ("amtf_pos_cash count:", amtf_pos_cash),
    ("amtf_cc count:", amtf_cc),
    ("deq_df count:", deq_df),
):
    print(_label, _frame.count())

In [0]:
# Remove the TARGET column from vin_appl_train.
# NOTE(review): presumably this avoids a second TARGET column once the table
# is joined onto appl_train -- confirm.
vin_appl_train = vin_appl_train.drop("TARGET")

# Preview two rows of the trimmed table.
display(vin_appl_train.limit(2))

In [0]:
# Rename the probability column of amtf_cc to Prediction_Prob_cc.
# NOTE(review): presumably this prevents a column-name clash with
# amtf_pos_cash in the later join -- confirm. withColumnRenamed is a silent
# no-op when the existing column is not present.
amtf_cc = amtf_cc.withColumnRenamed(
    existing="Prediction_Probability_pos_cash",
    new="Prediction_Prob_cc",
)


In [0]:
# Preview two rows of the "deq" feature table.
display(deq_df.limit(2))

In [0]:
# Start from the application table and left-join every feature table onto it
# by SK_ID_CURR, one after another, so all appl_train rows are retained.
# NOTE(review): a right-hand table with several rows per SK_ID_CURR would
# multiply rows here; the distinct-vs-total count later in the notebook is
# the check for that.
result_df = appl_train
for _features in (
    vin_appl_train,
    vin_prev,
    vin_bur,
    amtf_pos_cash,
    amtf_cc,
    deq_df,
):
    result_df = result_df.join(_features, on="SK_ID_CURR", how="left")

# Preview two rows of the joined result.
display(result_df.limit(2))

In [0]:
# Expose the joined DataFrame to SQL cells under the view name "result_df".
result_df.createOrReplaceTempView(name="result_df")

In [0]:
%sql
-- Distinct SK_ID_CURR values next to the total row count of the joined view.
SELECT
  COUNT(DISTINCT SK_ID_CURR) AS cust_count,
  COUNT(*) AS row_count
FROM
  result_df

In [0]:
%python
# Shape of the joined dataset: count() is a Spark action that evaluates the
# join, while the column list comes from the DataFrame schema.
num_rows = result_df.count()
num_columns = len(result_df.columns)

# Report rows first, then columns.
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

In [0]:
%sql
-- Persist the joined dataset (temp view result_df) as a managed table.
-- A single CREATE OR REPLACE statement is used instead of DROP followed by
-- CREATE, so the previous version of the table is not lost if the
-- CREATE ... AS SELECT fails part-way. This assumes the target is a Delta
-- table, the Databricks default format.
-- NOTE(review): the inputs are read from hive_metastore.default, while this
-- name has no catalog prefix and resolves against the session's current
-- catalog -- confirm both point at the same place.
create or replace table default.ultimate_final_dataset as
select
  *
from
  result_df;

In [0]:
# Show the joined dataset in the notebook output.
result_df.display()