# Feature Engineering - Loan Payback

This notebook builds feature-engineered versions of the train and test datasets and saves them for modeling.


In [1]:
# Imports
import pandas as pd
import numpy as np
import os

# Notebook display settings: never truncate columns, show at most 100 rows.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', 100,
)



## 1) Load raw data


In [2]:
# Input locations (raw CSVs) and output locations (processed CSVs written later).
DATA_DIR = '../data'
PROC_DIR = os.path.join(DATA_DIR, 'processed')
RAW_TRAIN, RAW_TEST = (
    os.path.join(DATA_DIR, name) for name in ('train.csv', 'test.csv')
)
PROC_TRAIN, PROC_TEST = (
    os.path.join(PROC_DIR, name) for name in ('train_fe.csv', 'test_fe.csv')
)

# Read both raw tables into memory.
train_df, test_df = (pd.read_csv(path) for path in (RAW_TRAIN, RAW_TEST))

# Quick sanity check: shapes of both frames, then a peek at the training rows.
print(train_df.shape, test_df.shape)
train_df.head()


(593994, 13) (254569, 12)


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


## 2) Feature engineering function
`add_features` adds the engineered columns to a copy of the input frame and returns that copy; the input frame itself is left unchanged.


In [3]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the engineered loan features appended.

    The input frame is not modified. The following columns are read:
    ``grade_subgrade``, ``employment_status``, ``loan_amount``,
    ``annual_income``, ``interest_rate`` and ``debt_to_income_ratio``.

    Columns added:

    * ``grade`` -- first character of ``grade_subgrade``.
    * ``subgrade_num`` -- remainder of ``grade_subgrade`` cast to ``int``.
    * ``grade_score`` -- ordinal encoding of ``grade`` (A=6 ... F=1); any
      other grade maps to NaN.
    * ``has_stable_income`` -- 1 if ``employment_status`` is Employed,
      Self-employed or Retired, else 0.
    * ``is_unemployed`` -- 1 if ``employment_status`` is Unemployed, else 0.
    * ``loan_to_income_ratio`` -- ``loan_amount / annual_income``.
    * ``estimated_monthly_payment`` -- ``loan_amount * interest_rate / 100 / 12``,
      i.e. one month of simple interest on the full principal (this is not
      an amortized installment).
    * ``payment_burden`` -- twelve of those monthly amounts divided by
      ``annual_income``.
    * ``risk_score`` -- ``debt_to_income_ratio + interest_rate / 100``.

    Rows whose ``annual_income`` is exactly 0 get NaN (rather than +/-inf)
    in the two income-based ratios, so the saved features never contain
    infinities.

    Parameters
    ----------
    df : pd.DataFrame
        Raw loan records containing the columns listed above.

    Returns
    -------
    pd.DataFrame
        A new frame with all original columns plus the engineered ones.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If the part of ``grade_subgrade`` after the first character cannot
        be converted to ``int``.
    """
    out = df.copy()

    # Extract grade and subgrade
    out['grade'] = out['grade_subgrade'].str[0]
    out['subgrade_num'] = out['grade_subgrade'].str[1:].astype(int)
    grade_map = {'A': 6, 'B': 5, 'C': 4, 'D': 3, 'E': 2, 'F': 1}
    out['grade_score'] = out['grade'].map(grade_map)

    # Employment indicators
    stable_statuses = ['Employed', 'Self-employed', 'Retired']
    out['has_stable_income'] = out['employment_status'].isin(stable_statuses).astype(int)
    out['is_unemployed'] = (out['employment_status'] == 'Unemployed').astype(int)

    # Mask zero income to NaN so the divisions below yield NaN instead of inf.
    income = out['annual_income'].where(out['annual_income'] != 0)

    # Derived ratios
    out['loan_to_income_ratio'] = out['loan_amount'] / income
    out['estimated_monthly_payment'] = out['loan_amount'] * (out['interest_rate'] / 100 / 12)
    out['payment_burden'] = (out['estimated_monthly_payment'] * 12) / income
    out['risk_score'] = out['debt_to_income_ratio'] + (out['interest_rate'] / 100)

    return out


## 3) Apply and save


In [4]:
# Engineer features for both splits (train first, then test).
train_fe, test_fe = add_features(train_df), add_features(test_df)

print("Train with features:", train_fe.shape)
print("Test with features:", test_fe.shape)

# Ensure the output folder exists, then write each frame without its index.
os.makedirs(PROC_DIR, exist_ok=True)
for frame, path in ((train_fe, PROC_TRAIN), (test_fe, PROC_TEST)):
    frame.to_csv(path, index=False)

print("Saved:")
for path in (PROC_TRAIN, PROC_TEST):
    print("  ", path)

train_fe.head()


Train with features: (593994, 22)
Test with features: (254569, 21)
Saved:
   ../data/processed/train_fe.csv
   ../data/processed/test_fe.csv


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back,grade,subgrade_num,grade_score,has_stable_income,is_unemployed,loan_to_income_ratio,estimated_monthly_payment,payment_burden,risk_score
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0,C,3,4,1,0,0.086094,28.802918,0.011769,0.2207
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0,D,3,3,1,0,0.207757,49.452377,0.026842,0.2952
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0,C,5,4,1,0,0.34308,138.308553,0.033485,0.1946
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0,F,1,1,1,0,0.099929,62.823273,0.016089,0.226
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0,D,1,3,1,0,0.477883,103.669192,0.048792,0.1551
