# Stage 09 — Homework Starter Notebook

In the lecture, we learned how to create engineered features. Now it’s your turn to apply those ideas to your own project data.

In [1]:
import pandas as pd
import numpy as np
# Example synthetic data (replace with your project dataset).
# Each entry maps a column name to the (mean, standard deviation) of its normal
# draw. The dict order is the order of the random draws, so reordering the
# entries would change the generated numbers.
np.random.seed(0)
n = 100
_synthetic_spec = {
    'income': (60000, 15000),
    'monthly_spend': (2000, 600),
    'credit_score': (680, 50),
}
df = pd.DataFrame({
    column: np.random.normal(mean, std, n).astype(int)
    for column, (mean, std) in _synthetic_spec.items()
})
df.head()

Unnamed: 0,income,monthly_spend,credit_score
0,86460,3129,661
1,66002,1191,668
2,74681,1237,734
3,93613,2581,712
4,88013,1296,712


In [2]:
from __future__ import annotations

In [3]:
# NOTE(review): absolute, machine-specific path. Adjust it (ideally to a path
# relative to the project) before running this notebook on another machine.
DATA_PATH = "/Users/hitakshikulhare/bootcamp_hitakshi_kulhare/project/data/german.data-numeric"

# The file is whitespace-delimited and has no header row.
df1 = pd.read_csv(DATA_PATH, sep=r"\s+", header=None)

# Column names for each supported file width (21 or 25 columns).
# NOTE(review): the previews later in this notebook show credit_amount values
# between 1 and 5, which suggests these names may not line up with the actual
# column order of this file. TODO confirm against the dataset documentation.
_GERMAN_COLS_BY_WIDTH = {
    21: [
        "checking_status", "duration_months", "credit_history", "purpose", "credit_amount",
        "savings_status", "employment_since", "installment_rate_pct", "personal_status_sex", "other_debtors",
        "residence_since", "property_magnitude", "age_years", "other_installment_plans", "housing",
        "number_credits", "job", "people_liable", "telephone", "foreign_worker", "target"
    ],
    25: [
        "checking_status", "duration_months", "credit_history", "purpose", "credit_amount",
        "savings_status", "employment_since", "installment_rate_pct", "personal_status_sex", "other_debtors",
        "residence_since", "property_magnitude", "age_years", "other_installment_plans", "housing",
        "number_existing_credits", "job", "people_liable_maintenance", "telephone", "foreign_worker",
        "extra_indicator_1", "extra_indicator_2", "extra_indicator_3", "extra_indicator_4",
        "target"
    ],
}

# Fail with an explicit message instead of pandas' generic length-mismatch
# error when the file has a width we have no names for.
if df1.shape[1] not in _GERMAN_COLS_BY_WIDTH:
    raise ValueError(
        f"Expected 21 or 25 columns in {DATA_PATH!r}, found {df1.shape[1]}"
    )
df1.columns = _GERMAN_COLS_BY_WIDTH[df1.shape[1]]

In [4]:
# Standard names for the 21-column layout (20 attributes followed by the
# target), in positional order. Used to label frames that were read without a
# header row.
DEFAULT_COLS_21 = [
    "checking_status",
    "duration_months",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings_status",
    "employment_since",
    "installment_rate_pct",
    "personal_status_sex",
    "other_debtors",
    "residence_since",
    "property_magnitude",
    "age_years",
    "other_installment_plans",
    "housing",
    "number_credits",
    "job",
    "people_liable",
    "telephone",
    "foreign_worker",
    "target",
]

In [5]:
def ensure_german_columns(df1: pd.DataFrame) -> pd.DataFrame:
    """Give a still-unnamed 21-column frame the standard German Credit names.

    Parameters
    ----------
    df1 : pd.DataFrame
        Frame to inspect. It is never modified.

    Returns
    -------
    pd.DataFrame
        A renamed copy of ``df1`` when it has exactly 21 columns whose labels
        are the integers 0..20 or all start with ``"Unnamed"``; otherwise
        ``df1`` itself, unchanged.
    """
    if df1.shape[1] != 21:
        return df1

    cols = list(df1.columns)
    # Columns count as unnamed when they are the default integer labels 0..20
    # or placeholder labels such as 'Unnamed: 3'.
    looks_unnamed = set(cols) == set(range(21)) or all(
        str(c).startswith("Unnamed") for c in cols
    )
    if looks_unnamed:
        # Copy the argument itself (not the unrelated global ``df``) so the
        # caller's frame keeps its original labels.
        df1 = df1.copy()
        df1.columns = DEFAULT_COLS_21
    return df1

## TODO: Implement at least 2 engineered features here

In [6]:
def _assign_default_cols(df1: pd.DataFrame) -> pd.DataFrame:
    if df1.shape[1] == 21 and (df1.columns.astype(str).tolist() == list(range(21)) or set(df1.columns) == set(range(21)) or all(str(c).startswith("Unnamed") for c in df1.columns)):
        df1 = df1.copy()
        df1.columns = DEFAULT_COLS_21
    return df1

In [7]:
def _coerce_numeric(df1: pd.DataFrame, cols):
    for c in cols:
        if c in df1.columns:
            df1[c] = pd.to_numeric(df1[c], errors="coerce")
    return df1

In [8]:
def add_sme_features(df1: pd.DataFrame, long_residence_threshold: int = 4) -> pd.DataFrame:
    """Return a copy of ``df1`` with three engineered credit features added.

    Added columns:

    * ``amt_per_month`` - ``credit_amount / duration_months``; a zero duration
      yields NaN.
    * ``installment_burden_index`` - ``installment_rate_pct / 100`` multiplied
      by ``amt_per_month``.
    * ``is_long_resident`` - 1 where ``residence_since`` is at least
      ``long_residence_threshold``, else 0.

    Parameters
    ----------
    df1 : pd.DataFrame
        Input frame. It is never modified; unnamed 21-column frames are first
        given the default column names.
    long_residence_threshold : int, default 4
        Minimum ``residence_since`` value flagged as a long resident.

    Returns
    -------
    pd.DataFrame
        New frame containing the input columns plus the three features.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    out = _assign_default_cols(df1)
    if out is df1:
        # No rename took place, so this is still the caller's object. Copy it
        # so the columns added below never leak into the caller's frame.
        out = df1.copy()

    needed = ["credit_amount", "duration_months", "installment_rate_pct", "residence_since"]
    missing = [name for name in needed if name not in out.columns]
    if missing:
        raise KeyError(f"add_sme_features requires columns {missing}")
    out = _coerce_numeric(out, needed)

    # Avoid divide-by-zero: a zero duration becomes NaN instead of inf
    duration = out["duration_months"].replace(0, np.nan)
    out["amt_per_month"] = out["credit_amount"] / duration

    # Scale monthly burden by declared installment % of disposable income
    out["installment_burden_index"] = (out["installment_rate_pct"] / 100.0) * out["amt_per_month"]

    # Stability signal from longer residence
    out["is_long_resident"] = (out["residence_since"] >= long_residence_threshold).astype(int)

    return out

In [9]:
def binarize_target(df1: pd.DataFrame, target_col: str = "target") -> pd.Series:
    """Map the target column to 0/1 integer labels.

    The column is first coerced to numeric, so labels stored as strings
    (e.g. ``"1"``/``"2"``) are handled the same way as numeric ones.

    * values in ``{1, 2}``: returns 1 where the value is 2, else 0.
    * values within ``{0, 1}``: returned as integers; missing values become 0.
    * anything else, including labels that cannot be parsed as numbers:
      returns an all-NaN float Series indexed like ``df1``, signalling that the
      encoding was not recognised.

    Parameters
    ----------
    df1 : pd.DataFrame
        Frame holding the target column.
    target_col : str, default "target"
        Name of the target column.

    Returns
    -------
    pd.Series
        Integer 0/1 labels, or an all-NaN float Series for unknown encodings.
    """
    raw = df1[target_col]
    y = pd.to_numeric(raw, errors="coerce")
    unrecognised = pd.Series(index=df1.index, dtype=float)

    # Values that were present but could not be parsed mean the encoding is
    # unknown; do not let them fall through and be silently labelled as 0.
    if (y.isna() & raw.notna()).any():
        return unrecognised

    uniques = set(pd.unique(y.dropna()))
    if uniques == {1, 2}:
        return (y == 2).astype(int)
    if uniques <= {0, 1}:
        return y.fillna(0).astype(int)
    return unrecognised

In [10]:
# try:
#     df1
# except NameError:
#     # Adjust path to your dataset if different
#     DATA_PATH = Path("/Users/hitakshikulhare/bootcamp_hitakshi_kulhare/project/data/german.data-numeric")
#     if DATA_PATH.exists():
#         df1 = pd.read_csv(DATA_PATH, sep=r"\s+", header=None, engine="python")
#     else:
#         # As a fallback, create a tiny demo df with 21 columns
#         demo = {i: [1,2,3] for i in range(21)}
#         df1 = pd.DataFrame(demo)

# # Ensure correct column names (handles 0..20 case)
# df1 = ensure_german_columns(df1)
# df1.head()

In [11]:
# Engineer the features on a copy so that df1 itself is not modified by this call.
df1_fe = add_sme_features(df1.copy())
# Preview the first rows, including the engineered feature columns.
df1_fe.head()

Unnamed: 0,checking_status,duration_months,credit_history,purpose,credit_amount,savings_status,employment_since,installment_rate_pct,personal_status_sex,other_debtors,...,telephone,foreign_worker,extra_indicator_1,extra_indicator_2,extra_indicator_3,extra_indicator_4,target,amt_per_month,installment_burden_index,is_long_resident
0,1,6,4,12,5,5,3,4,1,67,...,0,0,1,0,0,1,1,0.833333,0.033333,0
1,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,2,0.020833,0.000417,0
2,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,1,0,1,0.083333,0.0025,0
3,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,1,1,0.02381,0.000952,0
4,1,24,3,49,1,3,3,4,4,53,...,0,0,0,0,0,1,2,0.041667,0.001667,0


### Rationale for Feature 1
Explain why this feature may help a model. Reference your EDA.

- Captures repayment stress: `amt_per_month` (credit_amount divided by duration_months) approximates the borrower's monthly repayment obligation. A higher value indicates a heavier financial burden, which may correlate with higher default risk.
- Normalizes loan size by time: Instead of looking at raw credit_amount or duration_months separately, this ratio standardizes repayment pressure across loans of different sizes and durations.
- EDA insight: From the exploratory analysis, we saw that applicants with high loan amounts and shorter repayment durations often had higher default rates. This feature encodes that relationship directly.
- Improves model interpretability: The feature makes intuitive sense to financial decision-makers, as monthly burden is a familiar concept in credit risk assessment.

In [12]:
# Feature 1: approximate monthly repayment = credit amount spread over the loan duration.
# A zero duration is mapped to NaN first so the division yields a missing value
# instead of inf (the same guard add_sme_features applies).
df1['amt_per_month'] = df1['credit_amount'] / df1['duration_months'].replace(0, np.nan)
df1[['credit_amount', 'duration_months', 'amt_per_month']].head()

Unnamed: 0,credit_amount,duration_months,amt_per_month
0,5,6,0.833333
1,1,48,0.020833
2,1,12,0.083333
3,1,42,0.02381
4,1,24,0.041667


### Rationale for Feature 2
Explain why this feature may help a model. Reference your EDA.

- Measures affordability: `debt_burden_ratio` (amt_per_month divided by installment_rate_pct) approximates how heavy the monthly repayment is compared to the borrower’s declared installment capacity (installment_rate_pct, which reflects the percentage of disposable income).
- Flags high-risk cases: A high ratio indicates that the borrower’s monthly obligation is large relative to income, suggesting a higher risk of default.
- Combines multiple signals: It synthesizes loan characteristics (credit_amount, duration_months) with borrower financial capacity (installment_rate_pct), giving the model a more holistic view.
- EDA insight: Exploratory analysis suggested that defaults were more common among borrowers with high monthly payments relative to their income proxy. Encoding this relationship explicitly may improve model performance.

In [13]:
# Feature 2: monthly repayment relative to the declared installment rate.
# A zero installment rate is mapped to NaN so the ratio is missing rather than inf.
df1['debt_burden_ratio'] = df1['amt_per_month'] / df1['installment_rate_pct'].replace(0, np.nan)