# Loan Approval Prediction Competition
The goal for this competition is to predict whether an applicant is approved for a loan.

Submissions are evaluated on the area under the ROC curve (AUC), computed from the predicted probabilities and the ground-truth targets.

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate, KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

### Data Overview and Feature Exploration

Before we explore which models may be suitable or perform well, let's establish what data we have and what format it is in.

In [None]:
# Read the training data and the test set from the shared data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/merged_dataset.csv', '../data/test.csv')
)

# Summary statistics of the training frame.
train_df.describe()

In [None]:
# Quick look at the raw applicant age column.
train_df.loc[:, 'person_age']

In [4]:
def engineer_features(df, income_median=None):
    """Return a copy of ``df`` with the engineered model features added.

    The input frame is never modified. The ``id`` column is dropped when
    present; ``person_home_ownership`` and ``loan_intent`` are one-hot
    encoded; ``loan_grade`` and ``cb_person_default_on_file`` are replaced by
    numeric encodings. Log, squared, threshold-indicator and interaction
    features are appended for the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan data. Must contain the columns read below (``person_age``,
        ``person_income``, ``person_home_ownership``, ``person_emp_length``,
        ``loan_intent``, ``loan_grade``, ``loan_amnt``, ``loan_int_rate``,
        ``loan_percent_income``, ``cb_person_default_on_file`` and
        ``cb_person_cred_hist_length``).
    income_median : float, optional
        Cut-off used for the ``high_income`` indicator. When omitted, the
        median of ``df['person_income']`` is used, so the threshold then
        depends on the frame passed in. Pass the training median when
        transforming a test/validation frame to apply the same cut-off.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns.
    """
    df = df.copy()
    if 'id' in df.columns:
        df = df.drop(columns='id')

    # Age features.
    df['age_squared'] = df['person_age'] ** 2
    df['is_young'] = (df['person_age'] < 30).astype(int)

    # Income features. The threshold can be supplied by the caller so that
    # several frames are binarised against the same value.
    df['income_log'] = np.log1p(df['person_income'])
    if income_median is None:
        income_median = df['person_income'].median()
    df['high_income'] = (df['person_income'] > income_median).astype(int)

    # dtype=int keeps the one-hot columns in the same 0/1 integer form as the
    # other indicator columns, regardless of the pandas default.
    df = pd.get_dummies(df, columns=['person_home_ownership'], prefix='home_ownership', dtype=int)

    # Employment length features.
    df['emp_length_log'] = np.log1p(df['person_emp_length'])
    df['is_experienced'] = (df['person_emp_length'] > 5).astype(int)

    df = pd.get_dummies(df, columns=['loan_intent'], prefix='intent', dtype=int)

    # Ordinal encoding of the grade; grades outside A-G become NaN.
    grade_order = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    df['loan_grade_ordinal'] = df['loan_grade'].map(grade_order)
    df = df.drop(columns='loan_grade')

    # Loan amount and interest rate features.
    df['loan_amnt_log'] = np.log1p(df['loan_amnt'])
    df['loan_amnt_to_income'] = df['loan_amnt'] / df['person_income']

    df['int_rate_squared'] = df['loan_int_rate'] ** 2

    df['high_percent_income'] = (df['loan_percent_income'] > 0.2).astype(int)

    # Encode the default flag with a fixed mapping so the code assigned to a
    # label does not depend on which labels happen to occur in this frame
    # (a per-frame fitted encoder maps a frame containing only 'Y' to 0).
    # NOTE(review): assumes the flag values are 'N'/'Y' - confirm against the data.
    default_flag = df['cb_person_default_on_file']
    known_flags = {'N': 0, 'Y': 1}
    if default_flag.isin(list(known_flags)).all():
        df['default_history_encoded'] = default_flag.map(known_flags).astype(int)
    else:
        # Unexpected labels: fall back to integer codes over the sorted unique
        # labels of this frame (missing values get -1).
        df['default_history_encoded'] = pd.factorize(default_flag, sort=True)[0]
    df = df.drop(columns='cb_person_default_on_file')

    # Credit history features.
    df['cred_hist_length_log'] = np.log1p(df['cb_person_cred_hist_length'])
    df['<5years_credit_history'] = (df['cb_person_cred_hist_length'] < 5).astype(int)

    # Interaction terms built from the features created above.
    df['income_emp_length_interaction'] = df['income_log'] * df['emp_length_log']
    df['loan_amnt_int_rate_interaction'] = df['loan_amnt_log'] * df['loan_int_rate']

    return df


In [None]:
engineered_train = engineer_features(train_df)
engineered_test = engineer_features(test_df)

# The one-hot columns are derived from the categories present in each frame, so
# the test frame can end up with missing, extra or differently ordered columns.
# Align it to the training features: columns absent from the test frame are
# filled with 0 and columns never seen in training are dropped.
train_feature_columns = engineered_train.columns.drop('loan_status')
engineered_test = engineered_test.reindex(columns=train_feature_columns, fill_value=0)

engineered_train.head()

#### Is the data clean? Are there any errors or bias?
The null counts below show that there are no null values in the dataset. Next, let's look at the spread of the data to check for any bias.


In [None]:
# Per-column count of missing values in each raw frame.
null_counts_train = train_df.isna().sum()
null_counts_test = test_df.isna().sum()

# Put both counts side by side, one labelled column per data set.
null_counts_combined = pd.concat(
    [null_counts_train, null_counts_test],
    axis=1,
    keys=['Train', 'Test'],
)

null_counts_combined


Looking at how the loan outcomes are distributed, we can see that far more loans are rejected than accepted, which means:
- An evaluation metric such as accuracy is likely to be misleading: predicting "rejected" for every loan would already score an accuracy of 85.7%.
- This imbalance in the dataset means that stratified sampling may be useful if cross-validation is used.
- ROC AUC is a good starting point for the evaluation metric, as it is also the metric chosen by the competition.

In [None]:
# Number of training rows per target class.
loan_status_column = train_df['loan_status']
loan_status_column.value_counts()

In [8]:
# sns.pairplot(data=train_df, hue='loan_status')

First, we try PCA to see whether there is linear structure in the features, i.e. whether a few principal components explain most of the variance.

In [None]:
# Standardise the features before PCA, which is sensitive to feature scale.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(engineered_train.drop('loan_status', axis=1))

# Keep one component per feature so the full variance spectrum can be inspected.
pca = PCA(n_components=len(engineered_train.columns) - 1)
pca_result = pca.fit_transform(df_scaled)

explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance Ratio by each component: {explained_variance}")

# Both panels share a 1-based x axis so that position k means "k components";
# plotting the cumulative curve against the 0-based index would shift it by one
# relative to the axis label and to the bar chart.
component_numbers = np.arange(1, len(explained_variance) + 1)

fig, (ax_cumulative, ax_individual) = plt.subplots(1, 2, figsize=(16, 9))

ax_cumulative.plot(component_numbers, np.cumsum(explained_variance),
                   linestyle='-', color='b', label='Cumulative Variance Explained')
ax_cumulative.axhline(y=0.95, color='red', linestyle='--', label='95% Variance Threshold')
ax_cumulative.set_xlabel('Number of Principal Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance')
ax_cumulative.set_ylim(0, 1)
ax_cumulative.set_xlim(1, len(explained_variance))
ax_cumulative.set_title('Explained Variance by Principal Components')
ax_cumulative.legend()

ax_individual.bar(x=component_numbers, height=explained_variance)
ax_individual.set_xlabel('Number of Principal Components')
ax_individual.set_ylabel('Explained Variance Ratio')
ax_individual.set_ylim(0, 1)
ax_individual.set_title('Explained Variance Ratio per Component')

fig.tight_layout()
plt.show()


#### Gradient Boosting (e.g., XGBoost, LightGBM, CatBoost):
- Pros: Great for structured/tabular data, handles non-linear relationships very well, and has built-in feature selection. Generally, boosted trees perform better than Random Forests on complex tasks.
- Cons: Requires more tuning and is more computationally intensive.

In [None]:
# Separate the target from the engineered features.
y = engineered_train['loan_status']
X = engineered_train.drop('loan_status', axis=1)

# Fit the scaler on the training features, then standardise them with it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

def objective(space):
    """Hyperopt objective: cross-validated ROC AUC of an XGBoost classifier.

    Parameters
    ----------
    space : dict
        One sampled point of the search space with the keys ``'eta'``,
        ``'max_depth'``, ``'subsample'``, ``'colsample_bytree'``, ``'alpha'``,
        ``'lambda'`` and ``'n_estimators'``.

    Returns
    -------
    dict
        ``'loss'`` is the negated mean validation AUC (hyperopt minimises),
        ``'status'`` is ``STATUS_OK``, and ``'train_auc'`` / ``'test_auc'``
        are the mean AUC over the training / validation folds.
    """
    # Use the documented scikit-learn wrapper parameter names rather than the
    # native booster aliases: the aliases are only accepted through **kwargs,
    # and cross_validate clones the estimator through the scikit-learn API.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': space['eta'],
        'max_depth': int(space['max_depth']),        # sampled as float
        'subsample': space['subsample'],
        'colsample_bytree': space['colsample_bytree'],
        'reg_alpha': space['alpha'],
        'reg_lambda': space['lambda'],
        'n_estimators': int(space['n_estimators']),  # sampled as float
        'random_state': 42,
    }

    model = xgb.XGBClassifier(**params)

    # Stratify the folds so every fold keeps the class ratio of the imbalanced
    # target; the fixed seed gives every trial the same splits.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    cv_results = cross_validate(model, X_scaled, y, cv=skf, scoring='roc_auc', return_train_score=True)
    mean_test_auc = np.mean(cv_results['test_score'])
    mean_train_auc = np.mean(cv_results['train_score'])

    return {'loss': -mean_test_auc, 'status': STATUS_OK, 'train_auc': mean_train_auc, 'test_auc': mean_test_auc}

# Both regularisation strengths are searched on the same log scale.
reg_bounds = (np.log(1e-8), np.log(1.0))

# Search space handed to hyperopt; the labels match the keys read by `objective`.
space = {
    # learning rate, log scale between 0.001 and 0.1
    'eta': hp.loguniform('eta', np.log(1e-3), np.log(1e-1)),
    # sampled in steps of 1 (cast to int by the consumer)
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'alpha': hp.loguniform('alpha', *reg_bounds),
    'lambda': hp.loguniform('lambda', *reg_bounds),
    # sampled in steps of 50 (cast to int by the consumer)
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
}

# Run the TPE search; `trials` records the result dict of every evaluation.
trials = Trials()
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=200,
                        trials=trials,
                        # Seed the sampler so the search is reproducible.
                        # hyperopt >= 0.2.7 expects a numpy Generator here;
                        # older releases take np.random.RandomState(42).
                        rstate=np.random.default_rng(42))

# The stored loss is the negated validation AUC, so flip the sign back.
print("Best Hyperparameters:", best_hyperparams)
print("Best Test AUC:", -trials.best_trial['result']['loss'])
print("Corresponding Train AUC:", trials.best_trial['result']['train_auc'])


# Rebuild the classifier configuration from the best search result, using the
# documented scikit-learn wrapper parameter names rather than the native
# booster aliases that are only accepted through **kwargs.
best_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': best_hyperparams['eta'],
    'max_depth': int(best_hyperparams['max_depth']),        # quniform yields floats
    'subsample': best_hyperparams['subsample'],
    'colsample_bytree': best_hyperparams['colsample_bytree'],
    'reg_alpha': best_hyperparams['alpha'],
    'reg_lambda': best_hyperparams['lambda'],
    'n_estimators': int(best_hyperparams['n_estimators']),  # quniform yields floats
    'random_state': 42,
}

# Refit on the complete scaled training set with the selected configuration.
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_scaled, y)


# Pair every training column with the importance the fitted model assigned to it.
feature_names = X.columns
feature_importance = final_model.feature_importances_

# Highest importance first; ties are ordered by feature name, also descending.
ranking = sorted(
    range(len(feature_names)),
    key=lambda position: (feature_importance[position], feature_names[position]),
    reverse=True,
)
for position in ranking:
    print(f"{feature_names[position]}: {feature_importance[position]}")

In [None]:
# Scale the engineered test features with the scaler fitted on the training features.
X_test = engineered_test
X_test_scaled = scaler.transform(X_test)

# Column 1 of predict_proba is the probability of the model's second class
# (presumably loan_status == 1 - confirm against final_model.classes_).
y_pred_prob = final_model.predict_proba(X_test_scaled)[:, 1]

# Submission frame: the original test ids next to the predicted probabilities.
predictions_df = pd.DataFrame({
    'id': test_df['id'],
    'loan_status': y_pred_prob
})

# Create the output folder if needed so the write cannot fail on a missing
# directory, and report the path that was actually written.
submission_path = Path('../submission/loan_predictions.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
predictions_df.to_csv(submission_path, index=False)

print(f"Predictions saved to '{submission_path}'")
