In [12]:
# libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm
import joblib
import xgboost
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import StackingRegressor    
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")  # Suppresses all warnings

In [2]:
# Load the tab-separated default-risk dataset and preview its first rows.
# NOTE(review): the location below is an absolute, machine-specific path, so this
# cell only runs where that exact file exists.
dataset_path = r"D:/End-to-end-Default-Risk-Pred-mlProject/notebook/data/default.csv"
df = pd.read_csv(dataset_path, sep="\t")
df.head()

Unnamed: 0,credit_score,income,loan_amount,loan_term,interest_rate,debt_to_income_ratio,employment_years,savings_balance,age,default_risk_score
0,810,107410,11924,48,7.97,43.29,32,27181,58,7634.543366
1,418,37482,19291,24,6.94,11.01,33,15089,43,6249.833059
2,724,85641,39501,36,8.59,37.11,0,97459,33,2148.11799
3,444,73331,25714,36,13.09,33.39,18,2413,48,4979.385344
4,440,46723,35651,36,8.3,46.21,6,9716,42,2993.85195


In [3]:
# Vertical split: X holds the predictor columns, the target is separated out.
# Step 1 removes the target column (raises KeyError if it is missing, as before).
# Step 2 removes 'Unnamed: 0' when present: in the preview it holds 0, 1, 2, ...,
# i.e. it looks like a row index that was written into the file, not a real
# predictor. Leaving it in would feed an arbitrary row number to every model.
# errors='ignore' keeps the cell working if the file is later loaded without it.
X = (
    df
    .drop(columns=['default_risk_score'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
X.head()

Unnamed: 0,credit_score,income,loan_amount,loan_term,interest_rate,debt_to_income_ratio,employment_years,savings_balance,age
0,810,107410,11924,48,7.97,43.29,32,27181,58
1,418,37482,19291,24,6.94,11.01,33,15089,43
2,724,85641,39501,36,8.59,37.11,0,97459,33
3,444,73331,25714,36,13.09,33.39,18,2413,48
4,440,46723,35651,36,8.3,46.21,6,9716,42


In [4]:
# Target variable: the 'default_risk_score' column, previewed via its first five values
y = df.loc[:, 'default_risk_score']
y.head()

0    7634.543366
1    6249.833059
2    2148.117990
3    4979.385344
4    2993.851950
Name: default_risk_score, dtype: float64

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the dimensions of each partition, in the order train (X, y) then test (X, y)
for shape_label, partition in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(shape_label, partition.shape)

X_train shape: (640, 9)
y_train shape: (640,)
X_test shape: (160, 9)
y_test shape: (160,)


In [7]:
# Standardise the features: the scaler learns its statistics from the training
# set only, and those same statistics are then applied to both partitions.
# NOTE: X_train and X_test are overwritten here, so re-running this cell on its
# own would scale data that has already been scaled.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [8]:
# Level-one estimators, as (name, estimator) pairs in a fixed order:
# decision tree, random forest, k-nearest neighbours, linear regression.
base_models = list(
    dict(
        dt=DecisionTreeRegressor(random_state=42),
        rf=RandomForestRegressor(n_estimators=100, random_state=42),
        knn=KNeighborsRegressor(n_neighbors=5),
        lr=LinearRegression(),
    ).items()
)

# Level-two model (meta-learner) that combines the level-one predictions
meta_learner = LinearRegression()

In [9]:
# Assemble the stacked ensemble from the base models and the meta-learner.
stacking_options = dict(
    cv=5,        # 5-fold cross-validation generates the meta-features
    n_jobs=-1,   # use every available CPU core
    verbose=1,   # optional progress output
)
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_learner,
    **stacking_options,
)

In [10]:
# Fit the stacked ensemble on the training partition ...
fitted_stack = stacking_regressor.fit(X_train, y_train)

# ... then score the held-out rows with it
y_pred = fitted_stack.predict(X_test)

In [13]:
# Score the held-out predictions: R² and root-mean-squared error
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print both metrics to four decimal places, RMSE first
for metric_line in (
    f"Stacking Regressor - RMSE: {rmse:.4f}",
    f"Stacking Regressor - R²: {r2:.4f}",
):
    print(metric_line)



Stacking Regressor - RMSE: 10.0692
Stacking Regressor - R²: 1.0000


In [14]:
# Candidate estimators to compare, keyed by display name. The stacked ensemble
# is evaluated alongside fresh instances of each of its base learners.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('KNN', KNeighborsRegressor(n_neighbors=5)),
    ('Stacking Regressor', stacking_regressor),
])

# Shuffled 5-fold splitter with a fixed seed, shared by every candidate
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold R² for each candidate, kept for later inspection
results = {}
for name in models:
    model = models[name]
    cv_scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv)
    results[name] = cv_scores

    # Summarise as mean R² with a two-standard-deviation spread
    mean_r2 = cv_scores.mean()
    spread = cv_scores.std() * 2
    print(f"{name}: R² = {mean_r2:.4f} (+/- {spread:.4f})")

Linear Regression: R² = 1.0000 (+/- 0.0000)
Decision Tree: R² = 0.9992 (+/- 0.0002)
Random Forest: R² = 0.9995 (+/- 0.0001)
KNN: R² = 0.8431 (+/- 0.0235)
Stacking Regressor: R² = 1.0000 (+/- 0.0000)


In [23]:
# Spot-check the fitted ensemble on the first five rows of the test partition
data = X_test[:5]
predictions = stacking_regressor.predict(data)

# Round each prediction to three decimals for display
rounded_predictions = list(map(lambda value: round(value, 3), predictions.tolist()))
print("Sample predictions:", rounded_predictions)


Sample predictions: [3202.73, 6659.657, 6394.361, 3023.829, 7796.921]
