In [None]:
# Import the libraries used throughout the notebook
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Authenticate the current Colab user with Google Cloud so that the
# BigQuery client created later in the notebook can pick up the credentials.
from google.colab import auth
auth.authenticate_user()


In [None]:
# big query add-on installation

!pip install --quiet google-cloud-bigquery
from google.cloud import bigquery


In [None]:
# Create the BigQuery client for the project referenced by the query below
client = bigquery.Client(project="homecredit-478707")

# Fetch the complete bureau_balance table from BigQuery
query = """
SELECT *
FROM `homecredit-478707.Homecredit_Tables.bureau_balance`
"""
# Submit the query, then materialise its result as a pandas DataFrame
query_job = client.query(query)
bureau_balance = query_job.to_dataframe()

In [None]:
# Preview the first five rows of the raw table (head() defaults to n=5)
bureau_balance.head()

# Primary Key Check

In [None]:
# Count rows that repeat an earlier (SK_ID_BUREAU, MONTHS_BALANCE) pair;
# a result of 0 means the pair uniquely identifies every row.
pk_columns = ["SK_ID_BUREAU", "MONTHS_BALANCE"]
duplicates = bureau_balance.duplicated(subset=pk_columns).sum()

print("Duplicate PK rows:", duplicates)
print("Total rows:", len(bureau_balance))


In [None]:
import pandas as pd
import numpy as np

# ============================================================
# STEP 1 – Encode STATUS and create monthly delinquency flags
# bureau_balance: one row per month per bureau credit (SK_ID_BUREAU)
# ============================================================

# Ordinal severity scale for each STATUS code
status_map = {
    "X": -1,   # Status unknown (kept distinct from every real status)
    "C": 0,    # Closed account (no current delinquency)
    "0": 0,    # No days past due
    "1": 1,    # 1–30 days past due
    "2": 2,    # 31–60 days past due
    "3": 3,    # 61–90 days past due
    "4": 4,    # 91–120 days past due
    "5": 5     # 120+ days past due, sold or written off
}

# NOTE(review): because "X" maps to -1, months with unknown status pull the
# mean (and min) of STATUS_NUM below zero once the column is aggregated per
# loan. The flags below are unaffected. Confirm this is the intended treatment.

# Translate the codes with a dictionary lookup. Series.map avoids the
# downcasting FutureWarning that Series.replace emits on recent pandas
# versions, and it turns any code missing from status_map into NaN so it can
# be reported explicitly instead of slipping through to astype(int).
status_num = bureau_balance["STATUS"].map(status_map)

unmapped_rows = status_num.isna()
if unmapped_rows.any():
    unexpected_codes = bureau_balance.loc[unmapped_rows, "STATUS"].unique().tolist()
    raise ValueError(
        f"Unexpected STATUS values (not in status_map): {unexpected_codes}"
    )

# Numeric STATUS, used for severity, worst status, mean risk level, etc.
bureau_balance["STATUS_NUM"] = status_num.astype(int)

# Delinquency flags (one 0/1 value per month)

# 1 if any positive delay (STATUS 1–5), 0 otherwise
bureau_balance["BB_LATE_FLAG"] = (bureau_balance["STATUS_NUM"] > 0).astype(int)

# 1 if STATUS in {3, 4, 5} → serious delinquency
bureau_balance["BB_HEAVY_DELINQ_FLAG"] = (bureau_balance["STATUS_NUM"] >= 3).astype(int)

# 1 if default-level status (STATUS == "5")
bureau_balance["BB_DEFAULT_FLAG"] = (bureau_balance["STATUS"] == "5").astype(int)

# 1 if the month is on time or the account is closed (STATUS "0" or "C")
bureau_balance["BB_GOOD_MONTH_FLAG"] = bureau_balance["STATUS"].isin(["0", "C"]).astype(int)


# ============================================================
# STEP 2 – Aggregate at loan level (SK_ID_BUREAU)
# Each bureau credit (loan) → one row summarizing its full history
# ============================================================

# Every monthly flag is summarised the same way:
# mean = share of months with the flag set, sum = number of such months.
flag_columns = [
    "BB_LATE_FLAG",
    "BB_HEAVY_DELINQ_FLAG",
    "BB_DEFAULT_FLAG",
    "BB_GOOD_MONTH_FLAG",
]

loan_agg_dict = {
    # average, highest and lowest encoded status over the loan's history
    "STATUS_NUM": ["mean", "max", "min"],
    **{flag: ["mean", "sum"] for flag in flag_columns},
    # earliest and latest month present in the history
    "MONTHS_BALANCE": ["min", "max"],
}

bureau_bal_loan = bureau_balance.groupby("SK_ID_BUREAU").agg(loan_agg_dict)

# Collapse each (source column, statistic) pair into one upper-case name.
# NOTE(review): the flag columns already start with "BB_", so they come out as
# "BB_BB_..." (e.g. BB_BB_LATE_FLAG_MEAN). Left unchanged here because renaming
# would alter the output schema; confirm downstream usage before fixing.
bureau_bal_loan.columns = [
    f"BB_{source}_{stat}".upper() for source, stat in bureau_bal_loan.columns
]

# Turn SK_ID_BUREAU back into a regular column
bureau_bal_loan = bureau_bal_loan.reset_index()

# One row per SK_ID_BUREAU with delinquency history summaries
print("Loan-level bureau_balance shape:", bureau_bal_loan.shape)

In [None]:
# Preview the first three loan-level rows
bureau_bal_loan.head(n=3)

In [None]:
# Write the loan-level summary to a CSV file in the working directory.
# NOTE(review): index=True also writes the row index as an unnamed first
# column; if the index is just a row counter, readers of the file will see it
# as "Unnamed: 0". Kept as-is to preserve the file layout; confirm with
# downstream consumers before switching to index=False.
output_path = "bureau_bal_loan.csv"
bureau_bal_loan.to_csv(output_path, index=True)