## Part 4: Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_clean = pd.read_csv('application_train_cleaned.csv')
print(f"Loaded cleaned data: {df_clean.shape}")

Loaded cleaned data: (307511, 81)


In [5]:
print("="*60)
print("Quick Correlation Check with TARGET")
print("="*60)

# correlation with TARGET

numeric_df = df_clean.select_dtypes(include=[np.number])


target_corr = numeric_df.corr()['TARGET'].sort_values(ascending = False)
print("\n Top 15 Positively Correlated:")
print(target_corr.head(15))

print("\nTop 15 Negatively Correlated:")
print(target_corr.tail(15))

Quick Correlation Check with TARGET

 Top 15 Positively Correlated:
TARGET                         1.000000
DAYS_BIRTH                     0.078239
REGION_RATING_CLIENT_W_CITY    0.060893
REGION_RATING_CLIENT           0.058899
DAYS_LAST_PHONE_CHANGE         0.055218
DAYS_ID_PUBLISH                0.051457
REG_CITY_NOT_WORK_CITY         0.050994
FLAG_EMP_PHONE                 0.045982
REG_CITY_NOT_LIVE_CITY         0.044395
FLAG_DOCUMENT_3                0.044346
DAYS_REGISTRATION              0.041975
LIVE_CITY_NOT_WORK_CITY        0.032518
DEF_30_CNT_SOCIAL_CIRCLE       0.032394
DEF_60_CNT_SOCIAL_CIRCLE       0.031401
FLAG_WORK_PHONE                0.028524
Name: TARGET, dtype: float64

Top 15 Negatively Correlated:
AMT_REQ_CREDIT_BUREAU_MON    -0.014794
AMT_INCOME_TOTAL             -0.022463
FLAG_PHONE                   -0.023806
HOUR_APPR_PROCESS_START      -0.024166
FLAG_DOCUMENT_6              -0.028602
AMT_CREDIT                   -0.029467
TOTALAREA_MODE               -0.030214

In [8]:
weak_features = target_corr[abs(target_corr) < 0.01].index.tolist()
print(f"\n Weak features (|corr| < 0.01): {len(weak_features)}")
print("Consider dropping these later")
print(weak_features)


 Weak features (|corr| < 0.01): 34
Consider dropping these later
['OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS', 'REG_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_LIVE_REGION', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_21', 'LIVE_REGION_NOT_WORK_REGION', 'AMT_REQ_CREDIT_BUREAU_DAY', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_20', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_12', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_7', 'FLAG_EMAIL', 'SK_ID_CURR', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_17', 'YEARS_BEGINEXPLUATATION_MODE', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_9', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BEGINEXPLUATATION_MEDI', 'AMT_REQ_CREDIT_BUREAU_QRT', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_14', 'AMT_ANNUITY']


In [9]:
print("\n")
print("="* 60)
print("Feature Engineering")
print("="* 60)



Feature Engineering


In [11]:
df_features = df_clean.copy()

# 1: Domain Specific Ratios and Calculations

# Credit to Income Ratio 
df_features["CREDIT_INCOME_RATIO"] = df_features["AMT_CREDIT"] / df_features["AMT_INCOME_TOTAL"]

# Annuity to Income Ratio (payment burden)
df_features["ANNUITY_INCOME_RATIO"] = df_features["AMT_ANNUITY"] / df_features["AMT_INCOME_TOTAL"]

# Credits to Good Price Difference 
df_features["CREDIT_GOODS_DIFF"] = df_features["AMT_CREDIT"] - df_features["AMT_GOODS_PRICE"]

# Goods Price to Credit Ratio
df_features["GOODS_PRICE_RATIO"] = df_features["AMT_GOODS_PRICE"] / df_features["AMT_CREDIT"]

# Income per Family Member
if "CNT_FAM_MEMBERS" in df_features.columns:
    df_features["INCOME_PER_PERSON"] = df_features["AMT_INCOME_TOTAL"] / df_features["CNT_FAM_MEMBERS"]
    
print("\n Created Features for Ratios")

# 2: Time-based features (converting days to years)
if "DAYS_EMPLOYED" in df_features.columns:
    df_features["EMPLOYED_YEARS"] = abs(df_features["DAYS_EMPLOYED"]) / 365 

if "DAYS_BIRTH" in df_features.columns:
    df_features["AGE_YEARS"] = abs(df_features["DAYS_BIRTH"]) / 365 

print("\n Created Time-based Features")

# 3: Based on Skewed features from part 3 analysis, applying log transform

skewed_features = ["AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE"]

for col in skewed_features:
    if col in df_features.columns:
        df_features[f"{col}_LOG"] = np.log1p(df_features[col]) # as log(1+x) will handle zeros
        
print("\n Applied Log Transformations to Skewed Features")

# 4: Binning Continuous Feature

# age group

if "AGE_YEARS" in df_features.columns:
    df_features["AGE_GROUP"] = pd.cut(
        df_features["AGE_YEARS"],
        bins = [0, 25, 35, 45, 55, 100],
        labels = ["Young", "Adult", "Middle", "Senior", "Elderly"]
    )
    

print("\n Created 1 binned feature") # categorical feature


# 5: based on my functional understanding that an applicant is young and their credit risk ask is higher than median
if "AGE_YEARS" in df_features.columns and "AMT_CREDIT" in df_features.columns:
    df_features["YOUNG_HIGH_CREDIT"] = (
        (df_features["AGE_YEARS"] < 30) &
        (df_features["AMT_CREDIT"] > df_features["AMT_CREDIT"].median())
    ).astype(int)
    
    

original_features = df_clean.shape[1]
new_features = df_features.shape[1]
added_features = new_features - original_features

print(f"\n Original Fatures: {original_features}")
print(f"\n New Fatures: {new_features}")
print(f"\n Features Added: {added_features}")

new_features_names = [col for col in df_features.columns if col not in df_clean.columns]
print(f"\n New Features Created: {len(new_features_names)}:")
for feature in new_features_names:
    print(f"-> {feature}")



 Created Features for Ratios

 Created Time-based Features

 Applied Log Transformations to Skewed Features

 Created 1 binned feature

 Original Fatures: 81

 New Fatures: 94

 Features Added: 13

 New Features Created: 13:
-> CREDIT_INCOME_RATIO
-> ANNUITY_INCOME_RATIO
-> CREDIT_GOODS_DIFF
-> GOODS_PRICE_RATIO
-> INCOME_PER_PERSON
-> EMPLOYED_YEARS
-> AGE_YEARS
-> AMT_INCOME_TOTAL_LOG
-> AMT_CREDIT_LOG
-> AMT_ANNUITY_LOG
-> AMT_GOODS_PRICE_LOG
-> AGE_GROUP
-> YOUNG_HIGH_CREDIT


In [12]:
df_features.to_csv("application_train_featured.csv")
print("Saved to 'application_train_featured.csv'")
print(f"Shape: {df_features.shape}")

Saved to 'application_train_featured.csv'
Shape: (307511, 94)


In [13]:
# Next: Model Development
