<a href="https://colab.research.google.com/github/halimcan/Home-Credit-Default-Project/blob/train_test_full_branch9/aggregated_master_tables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import gc
import os

BASE = "/content/drive/MyDrive/HomeCredit/Aggregated_tables"


In [3]:
import polars as pl
import os

for file in os.listdir(BASE):
    if file.endswith(".csv"):
        csv_path = f"{BASE}/{file}"
        pq_path = csv_path.replace(".csv", ".parquet")
        print(f"→ Converting: {file}  -->  {pq_path}")

        df = pl.read_csv(csv_path)
        df.write_parquet(pq_path)

→ Converting: application_train.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/application_train.parquet
→ Converting: bureau.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/bureau.parquet
→ Converting: application_test.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/application_test.parquet
→ Converting: bureau_agg.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/bureau_agg.parquet
→ Converting: pos_agg.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/pos_agg.parquet
→ Converting: installments_agg.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/installments_agg.parquet
→ Converting: cc_agg.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/cc_agg.parquet
→ Converting: previous_agg.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/previous_agg.parquet
→ Converting: bureau_bal_loan.csv  -->  /content/drive/MyDrive/HomeCredit/Aggregated_tables/bureau_bal_loan.parquet


In [4]:
import polars as pl
import polars.selectors as cs

# Raw datas
bureau = pl.read_csv(f"{BASE}/bureau.csv")
balance = pl.read_csv(f"{BASE}/bureau_bal_loan.csv")

# SK_ID_CURR add
bb = balance.join(
    bureau.select(["SK_ID_BUREAU", "SK_ID_CURR"]),
    on="SK_ID_BUREAU",
    how="left"
)

# Selecting numerical columns
numeric_cols = bb.select(cs.numeric()).columns

# Aggregation list
agg_exprs = []
for col in numeric_cols:
    agg_exprs.extend([
        pl.col(col).mean().alias(f"{col}_MEAN"),
        pl.col(col).min().alias(f"{col}_MIN"),
        pl.col(col).max().alias(f"{col}_MAX"),
        pl.col(col).sum().alias(f"{col}_SUM")
    ])

# SK_ID_CURR aggregation
bb_agg_curr = (
    bb.group_by("SK_ID_CURR")
      .agg(agg_exprs + [pl.len().alias("BB_COUNT")])
)

# Save as parquet
bb_agg_curr.write_parquet(f"{BASE}/bureau_bal_loan.parquet")





In [5]:
train = pl.scan_parquet(f"{BASE}/application_train.parquet")
bureau_agg = pl.scan_parquet(f"{BASE}/bureau_agg.parquet")
previous_agg = pl.scan_parquet(f"{BASE}/previous_agg.parquet")
pos_agg = pl.scan_parquet(f"{BASE}/pos_agg.parquet")
installments_agg = pl.scan_parquet(f"{BASE}/installments_agg.parquet")
cc_agg = pl.scan_parquet(f"{BASE}/cc_agg.parquet")
bureau_bal_loan = pl.scan_parquet(f"{BASE}/bureau_bal_loan.parquet")


In [6]:
train_merged = (
    train
    .join(bureau_agg, on="SK_ID_CURR", how="left", suffix="_bur")
    .join(bureau_bal_loan, on="SK_ID_CURR", how="left", suffix="_bal")
    .join(previous_agg, on="SK_ID_CURR", how="left", suffix="_prev")
    .join(pos_agg, on="SK_ID_CURR", how="left", suffix="_pos")
    .join(installments_agg, on="SK_ID_CURR", how="left", suffix="_ins")
    .join(cc_agg, on="SK_ID_CURR", how="left", suffix="_cc")
)


In [7]:
train_final = train_merged.collect()



In [8]:
train_final.write_parquet(f"{BASE}/train_full.parquet")

In [9]:
train_final.shape


(307511, 567)

In [10]:
train_final.group_by("SK_ID_CURR").count().sort("count", descending=True).head(20)


  train_final.group_by("SK_ID_CURR").count().sort("count", descending=True).head(20)


SK_ID_CURR,count
i64,u32
398903,1
180228,1
353420,1
172981,1
324210,1
…,…
362479,1
365641,1
428516,1
319588,1


# Test_application merge

In [11]:
import polars as pl


# ---------------------------------------------
# 1) Parquet dosyalarını load et
# ---------------------------------------------
test  = pl.scan_parquet(f"{BASE}/application_test.parquet")

bureau_agg       = pl.scan_parquet(f"{BASE}/bureau_agg.parquet")
previous_agg     = pl.scan_parquet(f"{BASE}/previous_agg.parquet")
pos_agg          = pl.scan_parquet(f"{BASE}/pos_agg.parquet")
installments_agg = pl.scan_parquet(f"{BASE}/installments_agg.parquet")
cc_agg           = pl.scan_parquet(f"{BASE}/cc_agg.parquet")
bureau_bal_loan  = pl.scan_parquet(f"{BASE}/bureau_bal_loan.parquet")  # aggregation already done


# ---------------------------------------------
# 2) TEST JOIN PIPELINE  (train ile birebir aynı sıralama)
# ---------------------------------------------
test_merged = (
    test
    .join(bureau_agg,        on="SK_ID_CURR", how="left", suffix="_bur")
    .join(bureau_bal_loan,   on="SK_ID_CURR", how="left", suffix="_bal")
    .join(previous_agg,      on="SK_ID_CURR", how="left", suffix="_prev")
    .join(pos_agg,           on="SK_ID_CURR", how="left", suffix="_pos")
    .join(installments_agg,  on="SK_ID_CURR", how="left", suffix="_ins")
    .join(cc_agg,            on="SK_ID_CURR", how="left", suffix="_cc")
)

# ---------------------------------------------
# 3) Final test dataframe
# ---------------------------------------------
test_final = test_merged.collect()
test_final.write_parquet(f"{BASE}/test_full.parquet")

# Sonuç
print("TEST FINAL SHAPE:", test_final.shape)


TEST FINAL SHAPE: (48744, 566)


In [12]:
# ================================================================
# 5. TRAIN/TEST COLUMN ALIGNMENT CHECK
# ================================================================
train_cols = set(train.columns)
test_cols  = set(test.columns)

print("Columns in TRAIN but not TEST:", train_cols - test_cols)
print("Columns in TEST but not TRAIN:", test_cols - train_cols)

Columns in TRAIN but not TEST: {'TARGET'}
Columns in TEST but not TRAIN: set()


  train_cols = set(train.columns)
  test_cols  = set(test.columns)


In [13]:

# ================================================================
# 6. SAVE MERGED DATASETS
# ================================================================
#train.to_csv(f"{BASE}/master_train.csv", index=False)
#test.to_csv(f"{BASE}/master_test.csv", index=False)

#print("Saved master_train.csv and master_test.csv successfully.")