In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Authenticate the current Colab user via the google.colab helper.
# This runs before the BigQuery client is created further down; presumably the
# client relies on these credentials (verify for non-Colab environments).
from google.colab import auth
auth.authenticate_user()


In [None]:
# big query add-on installation

!pip install --quiet google-cloud-bigquery
from google.cloud import bigquery

In [None]:
# Open a BigQuery client bound to the project that hosts the bureau table.
client = bigquery.Client(project="homecredit-478707")

# Fetch every column of the loan-level bureau table and materialise the
# result as a pandas DataFrame.
query = """
SELECT *
FROM `homecredit-478707.Homecredit_Tables.bureau`
"""
bureau_job = client.query(query)
bureau = bureau_job.to_dataframe()

In [None]:
# Overview of the raw bureau table: column dtypes and non-null counts.
bureau.info()

In [None]:
# Summary statistics of the bureau columns.
bureau.describe()

# Primary Key Check

In [None]:
# Primary-key validation: every bureau record must carry its own SK_ID_BUREAU.
# The expectation is enforced with an assertion so that a violation stops the
# run instead of silently inflating the per-customer loan counts built from
# SK_ID_BUREAU in the aggregation cells.
sk_id_bureau_is_unique = bureau['SK_ID_BUREAU'].is_unique
assert sk_id_bureau_is_unique, "SK_ID_BUREAU is not unique: duplicate bureau records found"
sk_id_bureau_is_unique  # still shown as the cell output (True)

In [None]:
# A customer can own several bureau records, so SK_ID_CURR is expected to repeat.
bureau['SK_ID_CURR'].is_unique    # Must be false (normal)

## Credit Behavior Insights (Domain EDA)

These are very important for credit scoring:

Percentage of loans in debt → powerful predictor

Percentage of loans at their credit limit → indicates high risk

Presence of any active loans → very important signal


The value counts below tell us which categories could be encoded.

In [None]:
# Frequency of each CREDIT_ACTIVE (credit status) value across all bureau rows.
bureau['CREDIT_ACTIVE'].value_counts()



In [None]:
# Frequency of each CREDIT_TYPE value across all bureau rows.
bureau['CREDIT_TYPE'].value_counts()

In [None]:
# Compare the number of bureau rows with the number of distinct customers.
# Nothing is reported when the SK_ID_CURR column is absent.
has_customer_id = 'SK_ID_CURR' in bureau.columns
if has_customer_id:
    unique_customers = bureau['SK_ID_CURR'].nunique()
    total_rows = len(bureau)
    print(f"Total bureau rows: {total_rows}, Unique customers: {unique_customers}")

## Most critical EDA output

In [None]:

# How many external loans does each customer have?
# groupby(...).size() counts the bureau rows belonging to each SK_ID_CURR.

bureau.groupby('SK_ID_CURR').size()

# The same per-customer count is produced by the aggregation cells as bureau_total_loans.


In [None]:
# Share of missing values per column, largest first.
#
# Findings recorded from this output:
# - AMT_ANNUITY, AMT_CREDIT_MAX_OVERDUE, DAYS_ENDDATE_FACT and AMT_CREDIT_SUM_LIMIT
#   have high missing value ratios.
# - These are loan-level gaps, which is pretty normal.
# - mean, min, max aggregations could still work.
# - count of non-nulls + missing rate could be strong features.
missing_share = bureau.isna().mean()
missing_share.sort_values(ascending=False)



In [None]:
# Row-level missing-value indicators (1 = value missing, 0 = present).
# They are intended as extra signal for the downstream model (LightGBM per the
# author's note) and are later averaged per customer into *_MISSING_ratio features.
missing_cols = [
    "AMT_ANNUITY",               # 71% missing -> create flag
    "AMT_CREDIT_MAX_OVERDUE",    # 65% missing -> create flag
    "DAYS_ENDDATE_FACT",         # ~37% missing -> create flag
    "AMT_CREDIT_SUM_LIMIT",      # ~34% missing -> create flag (credit card limit)
    "AMT_CREDIT_SUM_DEBT",       # ~15% missing -> create flag
    "DAYS_CREDIT_ENDDATE"        # ~6% missing -> optional flag
]

for col in missing_cols:
    # Columns that are not in the table are reported and skipped.
    if col not in bureau.columns:
        print(f"Warning: {col} not in bureau columns, skipping missing flag creation.")
        continue
    is_missing = bureau[col].isna()
    bureau[col + "_MISSING"] = is_missing.astype(int)

In [None]:
# Small-fill for a nearly complete numeric column.
#
# AMT_CREDIT_SUM is almost complete (0.0008 missing), so its few gaps are filled
# with the column median to avoid NaNs in the aggregations.
#
# The column is coerced to numeric first: this cell runs before the notebook's
# generic to_numeric conversion, and taking a median of an object-typed column
# can fail. Values that cannot be parsed become NaN and are filled like any
# other gap.
if 'AMT_CREDIT_SUM' in bureau.columns:
    bureau['AMT_CREDIT_SUM'] = pd.to_numeric(bureau['AMT_CREDIT_SUM'], errors='coerce')
    n_missing_amt_sum = bureau['AMT_CREDIT_SUM'].isna().sum()
    if n_missing_amt_sum > 0:
        median_val = bureau['AMT_CREDIT_SUM'].median()
        # fill with median (safe because missing is negligible)
        bureau['AMT_CREDIT_SUM'] = bureau['AMT_CREDIT_SUM'].fillna(median_val)
        print(f"Filled {n_missing_amt_sum} missing AMT_CREDIT_SUM with median = {median_val:.2f}")

In [None]:
# Make sure the day and amount columns are numeric even if they were loaded as
# objects; otherwise numeric aggregations could skip them.
# errors='coerce' turns values that cannot be parsed into NaN.
to_numeric_cols = [
    "DAYS_CREDIT", "DAYS_CREDIT_ENDDATE", "DAYS_ENDDATE_FACT",
    "AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT", "AMT_CREDIT_SUM_LIMIT",
    "AMT_CREDIT_MAX_OVERDUE", "AMT_ANNUITY", "CNT_CREDIT_PROLONG",
    "DAYS_CREDIT_UPDATE"
]
for col in to_numeric_cols:
    # Columns missing from the table are silently ignored.
    if col not in bureau.columns:
        continue
    converted = pd.to_numeric(bureau[col], errors='coerce')
    bureau[col] = converted

## Credit Behavior Profiling

CREDIT_ACTIVE → types.

CREDIT_TYPE → types

From the loan-level data, I derived customer-level summary features:

Total debt → bureau_total_debt

Maximum overdue amount → bureau_max_overdue_amount

Number of active loans → bureau_active_credits

In [None]:
# Feature engineering: statistics to compute per SK_ID_CURR.
#
# Maps each loan-level source column to the list of aggregations applied to it.
# Every value is a list, so the grouped result gets two-level
# (column, statistic) column labels.

# The four DAYS_* columns all receive the same min / max / mean summary.
_DAY_STATS = ['min', 'max', 'mean']
_DAY_COLUMNS = (
    'DAYS_CREDIT',
    'DAYS_CREDIT_ENDDATE',
    'DAYS_ENDDATE_FACT',
    'DAYS_CREDIT_UPDATE',
)

agg_funcs = {
    # number of external credits per customer
    'SK_ID_BUREAU': ['count'],
    # credit amounts
    'AMT_CREDIT_SUM': ['sum', 'mean', 'max'],
    'AMT_CREDIT_SUM_DEBT': ['sum', 'mean'],
    # average credit limit (mostly for cards)
    'AMT_CREDIT_SUM_LIMIT': ['mean'],
    # worst overdue amount
    'AMT_CREDIT_MAX_OVERDUE': ['max'],
    'AMT_ANNUITY': ['mean'],
    # days / durations
    **{day_col: list(_DAY_STATS) for day_col in _DAY_COLUMNS},
    # prolongation counts
    'CNT_CREDIT_PROLONG': ['sum', 'max'],
}

In [None]:
# Keep only the aggregation specs whose source column exists in `bureau`,
# so the groupby aggregation cannot fail with a KeyError.
agg_funcs_filtered = {}
for source_col, stats in agg_funcs.items():
    if source_col in bureau.columns:
        agg_funcs_filtered[source_col] = stats

In [None]:
# Collapse the loan-level rows to one row per customer (SK_ID_CURR), applying
# the per-column statistics kept in agg_funcs_filtered.
bureau_agg = bureau.groupby('SK_ID_CURR').agg(agg_funcs_filtered)

In [None]:
# Peek at the first aggregated rows (columns are still two-level at this point).
bureau_agg.head(3)

In [None]:
# Flatten the two-level (column, statistic) labels into 'COLUMN_statistic' names.
# The MultiIndex guard makes the cell safe to re-run: on already-flat string
# labels, '_'.join(col) would join the individual characters of each name.
if isinstance(bureau_agg.columns, pd.MultiIndex):
    bureau_agg.columns = ['_'.join(col).strip() for col in bureau_agg.columns.values]

In [None]:
# Map the flattened 'COLUMN_statistic' labels to readable feature names.
rename_map = {
    # loan counts
    'SK_ID_BUREAU_count': 'bureau_total_loans',
    # credit amounts
    'AMT_CREDIT_SUM_sum': 'bureau_total_credit_amount',
    'AMT_CREDIT_SUM_mean': 'bureau_mean_credit_amount',
    'AMT_CREDIT_SUM_max': 'bureau_max_credit_amount',
    # outstanding debt
    'AMT_CREDIT_SUM_DEBT_sum': 'bureau_total_debt',
    'AMT_CREDIT_SUM_DEBT_mean': 'bureau_mean_debt',
    # limits, overdue amounts, annuities
    'AMT_CREDIT_SUM_LIMIT_mean': 'bureau_mean_credit_limit',
    'AMT_CREDIT_MAX_OVERDUE_max': 'bureau_max_overdue_amount',
    'AMT_ANNUITY_mean': 'bureau_mean_annuity',
    # DAYS_CREDIT
    'DAYS_CREDIT_min': 'bureau_earliest_credit_days',
    'DAYS_CREDIT_max': 'bureau_latest_credit_days',
    'DAYS_CREDIT_mean': 'bureau_mean_days_credit',
    # DAYS_CREDIT_ENDDATE
    'DAYS_CREDIT_ENDDATE_min': 'bureau_min_credit_enddate',
    'DAYS_CREDIT_ENDDATE_max': 'bureau_max_credit_enddate',
    'DAYS_CREDIT_ENDDATE_mean': 'bureau_mean_credit_enddate',
    # DAYS_ENDDATE_FACT
    'DAYS_ENDDATE_FACT_min': 'bureau_min_enddate_fact',
    'DAYS_ENDDATE_FACT_max': 'bureau_max_enddate_fact',
    'DAYS_ENDDATE_FACT_mean': 'bureau_mean_enddate_fact',
    # DAYS_CREDIT_UPDATE
    'DAYS_CREDIT_UPDATE_min': 'bureau_min_days_credit_update',
    'DAYS_CREDIT_UPDATE_max': 'bureau_max_days_credit_update',
    'DAYS_CREDIT_UPDATE_mean': 'bureau_mean_days_credit_update',
    # prolongations
    'CNT_CREDIT_PROLONG_sum': 'bureau_total_prolongs',
    'CNT_CREDIT_PROLONG_max': 'bureau_max_prolongs'
}

# Restrict the mapping to labels that are actually present, then apply it.
present_renames = {}
for old_label, new_label in rename_map.items():
    if old_label in bureau_agg.columns:
        present_renames[old_label] = new_label
rename_map = present_renames
bureau_agg = bureau_agg.rename(columns=rename_map)

In [None]:
# Additional aggregated indicators: per-customer number of non-closed and closed credits.
#
# - bureau_active_credits counts rows whose CREDIT_ACTIVE is anything other than
#   'Closed' (so every non-'Closed' status is included, not only 'Active').
# - bureau_closed_credits counts rows whose CREDIT_ACTIVE equals 'Closed'.
#
# The boolean masks are summed per SK_ID_CURR directly instead of going through
# groupby.apply with a Python lambda per customer. The results are written as
# columns (aligned on the SK_ID_CURR index) rather than joined, so re-running
# this cell overwrites the features instead of failing on overlapping columns.

if 'CREDIT_ACTIVE' in bureau.columns:
    customer_key = bureau['SK_ID_CURR']
    act = (
        (bureau['CREDIT_ACTIVE'] != 'Closed')
        .groupby(customer_key)
        .sum()
        .rename('bureau_active_credits')
    )
    closed = (
        (bureau['CREDIT_ACTIVE'] == 'Closed')
        .groupby(customer_key)
        .sum()
        .rename('bureau_closed_credits')
    )
    bureau_agg['bureau_active_credits'] = act
    bureau_agg['bureau_closed_credits'] = closed

In [None]:
# Peek at the aggregated frame after adding the active / closed credit counts.
bureau_agg.head(2)

In [None]:
# Fraction of each customer's bureau loans whose AMT_CREDIT_MAX_OVERDUE is > 0.
# A missing overdue amount compares as False, so such loans count as
# "not overdue" in the numerator while still counting in the denominator.
# The feature is written as a column (aligned on the SK_ID_CURR index) rather
# than joined, so re-running this cell overwrites it instead of failing on an
# overlapping column name.
if 'AMT_CREDIT_MAX_OVERDUE' in bureau.columns:
    overdue_flag = (bureau['AMT_CREDIT_MAX_OVERDUE'] > 0).astype(int)
    overdue_frac = overdue_flag.groupby(bureau['SK_ID_CURR']).mean().rename('bureau_frac_loans_with_overdue')
    bureau_agg['bureau_frac_loans_with_overdue'] = overdue_frac

In [None]:
# Aggregate the row-level *_MISSING flags: for every customer, the fraction of
# bureau records in which the field is missing.
# e.g. AMT_ANNUITY_MISSING_ratio = fraction of that customer's bureau records missing annuity.
# Each ratio is written as a column (aligned on the SK_ID_CURR index) rather
# than joined, so re-running this cell overwrites the features instead of
# failing on overlapping column names.
for col in missing_cols:
    flag_col = col + "_MISSING"
    # Flags that were never created (source column absent) are skipped.
    if flag_col in bureau.columns:
        ratio_col = flag_col + "_ratio"
        pct_missing = bureau.groupby('SK_ID_CURR')[flag_col].mean().rename(ratio_col)
        bureau_agg[ratio_col] = pct_missing

In [None]:
# Derived ratio: total debt / total credit amount per customer.
#
# A zero credit total makes the ratio undefined. Adding a tiny epsilon to the
# denominator would instead yield a finite but enormous value (debt / 1e-9)
# that the later inf -> NaN cleanup cannot detect, so a zero denominator is
# masked to NaN and the ratio is NaN for those customers.
if 'bureau_total_debt' in bureau_agg.columns and 'bureau_total_credit_amount' in bureau_agg.columns:
    total_credit = bureau_agg['bureau_total_credit_amount'].replace(0, np.nan)
    bureau_agg['bureau_debt_to_credit_ratio'] = bureau_agg['bureau_total_debt'] / total_credit


In [None]:
# Average loan age (in days), approximated by the mean DAYS_CREDIT per customer.
# NOTE(review): this column is an exact copy of bureau_mean_days_credit. If
# DAYS_CREDIT is stored as negative day offsets (presumably the case for this
# dataset -- confirm), the "age" is negative and a sign flip may be intended.
if 'bureau_mean_days_credit' in bureau_agg.columns:
    bureau_agg['bureau_avg_loan_age_days'] = bureau_agg['bureau_mean_days_credit']

In [None]:
# Final cleanup: turn +inf / -inf values (e.g. produced by divisions) into NaN.
# No NaN is filled here; that happens in the following cell.
bureau_agg = bureau_agg.replace([np.inf, -np.inf], np.nan)


In [None]:
# Zero-fill the features whose share of missing values is below 5%;
# features with more missing values keep their NaNs.
na_share = bureau_agg.isna().mean()
fill_zero_cols = [feature for feature in bureau_agg.columns if na_share[feature] < 0.05]
zero_filled = bureau_agg[fill_zero_cols].fillna(0)
bureau_agg[fill_zero_cols] = zero_filled

In [None]:
# Output: report the size of the customer-level feature table.
print("Aggregated bureau features shape:", bureau_agg.shape)
# Save the features to bureau_agg.csv (this runs on every execution of the cell);
# index=True writes the index, i.e. the SK_ID_CURR group key, as well.
bureau_agg.to_csv("bureau_agg.csv", index=True)

# bureau_agg is now customer-level features to merge into master table by SK_ID_CURR

When merging into the application table, use a left join on SK_ID_CURR so that customers without bureau records are kept: `app_train = app_train.merge(bureau_agg, on='SK_ID_CURR', how='left')`


Bureau Dataset – Key Points for Feature Engineering

Loan-level dataset → not customer-level

Each client has multiple external credit records

Classical EDA (histograms, outliers, correlations) is not meaningful

Focus on key tasks to support feature engineering:

Primary key validation (SK_ID_BUREAU uniqueness)

Missing value structure analysis (patterns, counts, predictive missingness)

Credit behavior profiling (CREDIT_ACTIVE, CREDIT_TYPE, overdue counts)

Aggregated customer-level features created:

Total debt (bureau_total_debt)

Total number of external loans (bureau_total_loans) and number of active loans (bureau_active_credits)

Maximum overdue amounts (bureau_max_overdue_amount)

Why We Do Not Apply Full Classical EDA on the Bureau Dataset

The bureau dataset represents loan-level historical credit records from external institutions.
Each customer (SK_ID_CURR) can have multiple bureau records (SK_ID_BUREAU).
Therefore, this table is not customer-level but loan-level: one row per external credit record.

# Why This Approach Works

After aggregating the bureau dataset by SK_ID_CURR, the model receives customer-level summary features such as:

bureau_total_loans

bureau_total_debt

bureau_mean_credit_limit

bureau_max_overdue_amount

These features carry real predictive power and integrate cleanly into the training dataset.