# Loan Approval Coursework - Machine Learning Analysis

## Part A: Loan Approval Status Prediction (Classification)
## Part B: Maximum Loan Amount Prediction (Regression)

## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Classification models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Regression models
from sklearn.tree import DecisionTreeRegressor, plot_tree

# Model evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Warnings
import warnings
# NOTE(review): this silences every warning category for the whole session,
# so deprecation, convergence and scoring warnings raised by later cells
# will not be shown.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Seeds NumPy's global generator; the scikit-learn calls in later cells
# additionally pass random_state=42 explicitly.
np.random.seed(42)

## Load Dataset

In [None]:
# Read the raw loan-approval CSV into the working DataFrame `df`
DATA_PATH = Path('../data/loan_approval_data.csv')

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(
    DATA_PATH,
    low_memory=False,
)
print(f"Dataset shape: {df.shape}")

# Last expression of the cell: the notebook renders the first rows
df.head()

In [None]:
# Map raw CSV headers onto human-readable column labels.
# DataFrame.rename ignores keys that are not present in the frame, so a
# mismatch here only surfaces later as a KeyError on the new label.
# NOTE(review): 'emplyment_length' presumably mirrors a misspelt header in
# the CSV - confirm against the file before "correcting" it.
rename_map = dict([
    ('id', 'ID'),
    ('age', 'Age'),
    ('Sex', 'Sex'),
    ('Education_Qualifications', 'Education Qualifications'),
    ('income', 'Income'),
    ('home_ownership', 'Home Ownership'),
    ('emplyment_length', 'Employment Length'),
    ('loan_intent', 'Loan Intent'),
    ('loan_amount', 'Loan Amount'),
    ('loan_interest_rate', 'Loan Interest Rate'),
    ('loan_income_ratio', 'Loan-to-Income Ratio (LTI)'),
    ('payment_default_on_file', 'Payment Default on File'),
    ('credit_history_length', 'Credit History Length'),
    ('loan_approval_status', 'Loan Approval Status'),
    ('max_allowed_loan', 'Maximum Loan Amount'),
    ('Credit_Application_Acceptance', 'Credit Application Acceptance'),
])

# Rebind `df` to the relabelled frame
df = df.rename(rename_map, axis='columns')

# Show the resulting column labels
print(list(df.columns))

In [None]:
# Data quality overview for retained variables and target.
# `retained_vars` is declared here because this cell runs before the Task 2
# cells that declare the same list; without it a top-to-bottom run of the
# notebook stops with NameError on the first use below.
retained_vars = [
    'Education Qualifications', 'Income', 'Home Ownership', 'Employment Length',
    'Loan Intent', 'Loan Amount', 'Loan Interest Rate', 'Loan-to-Income Ratio (LTI)',
    'Payment Default on File', 'Credit History Length', 'Loan Approval Status',
    'Maximum Loan Amount'
]

# Count of missing entries per retained column
print("Missing values per column:\n")
print(df[retained_vars].isnull().sum())

# Rows that are exact repeats of an earlier row across the retained columns
print("\nDuplicate rows in retained set:")
print(df[retained_vars].duplicated().sum())

# Raw label frequencies of the classification target, NaN included
print("\nUnique values in Loan Approval Status:")
print(df['Loan Approval Status'].value_counts(dropna=False))

## Task 2: Data Understanding

# Columns kept for the analysis: ten predictors followed by the two targets
retained_vars = [
    'Education Qualifications',
    'Income',
    'Home Ownership',
    'Employment Length',
    'Loan Intent',
    'Loan Amount',
    'Loan Interest Rate',
    'Loan-to-Income Ratio (LTI)',
    'Payment Default on File',
    'Credit History Length',
    'Loan Approval Status',
    'Maximum Loan Amount',
]

# Independent copy so later edits to df_retained do not touch `df`
df_retained = df.loc[:, retained_vars].copy()

# include='all' also summarises the non-numeric columns
print("Basic statistics for retained variables:\n")
print(df_retained.describe(include='all'))

print("\nVariable Types:\n")
print(df_retained.dtypes)

## Task 2: Data Understanding

In [None]:
# Statistical description of retained variables
# (describe() with default arguments summarises the numeric columns only)
retained_vars = [
    'Education Qualifications',
    'Income',
    'Home Ownership',
    'Employment Length',
    'Loan Intent',
    'Loan Amount',
    'Loan Interest Rate',
    'Loan-to-Income Ratio (LTI)',
    'Payment Default on File',
    'Credit History Length',
    'Loan Approval Status',
    'Maximum Loan Amount',
]

# Independent copy of the retained columns
df_retained = df.loc[:, retained_vars].copy()

print(df_retained.describe())
print("\nVariable Types:")
print(df_retained.dtypes)

In [None]:
## Task 3: Data Preparation

# Read-only scan for data quality issues in the retained columns
retained_view = df[retained_vars]

print("Missing values across retained variables:\n")
print(retained_view.isna().sum())

print("\nDuplicate rows considering retained variables:")
print(retained_view.duplicated().sum())

# dropna=False keeps missing labels visible in the tally
print("\nLoan Approval Status raw categories:\n")
print(df['Loan Approval Status'].value_counts(dropna=False))

In [None]:
# TASK 3.b: Standardise Payment Default on File to binary values

print("Before cleaning - Payment Default on File:")
print(df['Payment Default on File'].value_counts(dropna=False))

# Standardise to 'Y' and 'N' only
df['Payment Default on File'] = df['Payment Default on File'].replace({
    'YES': 'Y',
    'NO': 'N'
})

# Impute missing values with mode (most frequent value).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: the in-place form operates on an
# intermediate object, raises a chained-assignment warning in pandas 2.x and
# leaves `df` unchanged when Copy-on-Write is enabled.
mode_default = df['Payment Default on File'].mode()[0]
df['Payment Default on File'] = df['Payment Default on File'].fillna(mode_default)

print("\nAfter cleaning - Payment Default on File:")
print(df['Payment Default on File'].value_counts())

In [None]:
# TASK 3.c: Handle missing values in Loan Interest Rate

print(f"Missing values in Loan Interest Rate: {df['Loan Interest Rate'].isnull().sum()}")

# Impute with median (robust to outliers).
# Assign the filled Series back to the column: fillna(..., inplace=True) on
# df[col] is chained assignment, which warns in pandas 2.x and does not
# modify `df` when Copy-on-Write is enabled.
median_rate = df['Loan Interest Rate'].median()
df['Loan Interest Rate'] = df['Loan Interest Rate'].fillna(median_rate)

print(f"After imputation: {df['Loan Interest Rate'].isnull().sum()} missing values")

In [None]:
# TASK 3.d: Verify no missing values remain in retained variables

# Re-declared so the check can be run on its own
retained_vars = [
    'Education Qualifications',
    'Income',
    'Home Ownership',
    'Employment Length',
    'Loan Intent',
    'Loan Amount',
    'Loan Interest Rate',
    'Loan-to-Income Ratio (LTI)',
    'Payment Default on File',
    'Credit History Length',
    'Loan Approval Status',
    'Maximum Loan Amount',
]

# Per-column count of missing entries after the imputation steps
remaining_missing = df[retained_vars].isna().sum()

print("Final check - Missing values in retained variables:")
print(remaining_missing)
print(f"\nDataset shape after cleaning: {df.shape}")

## Task 4: Modelling - Classification

# TASK 4.a: Algorithm details table
# This information will be used in the report:
# Algorithm | Type | Learnable Parameters | Hyperparameters | Package
# NB | Parametric | Class priors, feature means/variances | var_smoothing | sklearn.naive_bayes.GaussianNB
# LR | Parametric | Coefficients (weights), intercept | C, penalty, solver | sklearn.linear_model.LogisticRegression
# RF | Non-parametric | Split rules at nodes | n_estimators, max_depth, min_samples_split | sklearn.ensemble.RandomForestClassifier

In [None]:
# TASK 4.b: Prepare categorical features only for classification

# Categorical predictors taken from the retained variables
categorical_features = [
    'Education Qualifications',
    'Home Ownership',
    'Loan Intent',
    'Payment Default on File',
]

# Target: the approval label, copied so edits cannot leak back into df
y = df['Loan Approval Status'].copy()

# Feature frame restricted to the categorical predictors
X_cat = df[categorical_features].copy()

# One-hot encode; drop_first=True removes the first level of every feature
X_encoded = pd.get_dummies(X_cat, drop_first=True)

print("Feature names used for classification:")
print(list(X_encoded.columns))
print(f"\nFeature matrix shape: {X_encoded.shape}")
print(f"Target variable shape: {y.shape}")
print(f"\nTarget distribution:\n{y.value_counts()}")

# TASK 4.b.ii: 80:20 train-test split.
# Justification: a standard ratio that leaves enough data for training while
# keeping a test set large enough for reliable evaluation (Géron, 2019).

# TASK 4.b.iv: reproducibility and stratification settings
split_options = {
    'test_size': 0.2,     # 80:20 split
    'random_state': 42,   # same partition on every run
    'stratify': y,        # keep class proportions equal in train and test
}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

# Relative class frequencies in each partition
for heading, labels in (
    ("\nTraining set class distribution:", y_train),
    ("\nTest set class distribution:", y_test),
):
    print(heading)
    print(labels.value_counts(normalize=True))

In [None]:
# Fit the three classifiers on the training partition.
# Each estimator's fit() returns the estimator itself, so construction and
# training are chained.

# 1. Naive Bayes
nb_model = GaussianNB().fit(X_train, y_train)
print("Naive Bayes model trained successfully")

# 2. Logistic Regression (max_iter raised to 1000)
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
print("Logistic Regression model trained successfully")

# 3. Random Forest (default hyperparameters, fixed seed)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print("Random Forest model trained successfully")

## Task 5: Evaluation - Classification

# TASK 5.a: Test-set predictions and confusion matrices for all models

# Predicted labels for the held-out partition
y_pred_nb = nb_model.predict(X_test)
y_pred_lr = lr_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Confusion matrices: rows are actual classes, columns are predicted classes
cm_nb = confusion_matrix(y_test, y_pred_nb)
cm_lr = confusion_matrix(y_test, y_pred_lr)
cm_rf = confusion_matrix(y_test, y_pred_rf)

print("Naive Bayes - Test Confusion Matrix:")
print(cm_nb)
# classes_ gives the label order used for the matrix rows/columns
print("\nClass labels:", nb_model.classes_)

divider = "\n" + "="*50
for title, matrix in (
    ("Logistic Regression - Test Confusion Matrix:", cm_lr),
    ("Random Forest - Test Confusion Matrix:", cm_rf),
):
    print(divider)
    print(title)
    print(matrix)

In [None]:
# TASK 5.b: Calculate test performance metrics for all models

# Helper function to calculate all metrics
def calculate_metrics(y_true, y_pred, y_proba=None, pos_label='Declined', classes=None):
    """Calculate classification metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Actual class labels.
    y_pred : array-like
        Predicted class labels.
    y_proba : array-like of shape (n_samples, n_classes), optional
        Output of ``predict_proba``. When given, AUC-ROC is added.
    pos_label : label, default 'Declined'
        Class treated as positive for Recall, Precision, F-Score and AUC-ROC.
    classes : sequence, optional
        The fitted model's ``classes_``. Used to locate the ``y_proba`` column
        belonging to ``pos_label``. When omitted, column 1 is used, which is
        the original behaviour and matches the alphabetical order
        ['Approved', 'Declined'].

    Returns
    -------
    dict
        Keys 'Accuracy', 'Recall', 'Precision', 'F-Score', plus 'AUC-ROC'
        when ``y_proba`` is supplied.

    Raises
    ------
    ValueError
        If ``classes`` is given and does not contain ``pos_label``.
    """
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred, pos_label=pos_label),
        'Precision': precision_score(y_true, y_pred, pos_label=pos_label),
        'F-Score': f1_score(y_true, y_pred, pos_label=pos_label)
    }

    # AUC-ROC requires probability scores
    if y_proba is not None:
        # Pick the probability column of the positive class rather than
        # assuming its position whenever the class order is known.
        pos_col = 1 if classes is None else list(classes).index(pos_label)

        # Convert to binary (positive class = 1, everything else = 0)
        y_binary = (np.asarray(y_true) == pos_label).astype(int)
        metrics['AUC-ROC'] = roc_auc_score(y_binary, np.asarray(y_proba)[:, pos_col])

    return metrics

# Class-probability estimates on the test partition (needed for AUC-ROC)
y_proba_nb = nb_model.predict_proba(X_test)
y_proba_lr = lr_model.predict_proba(X_test)
y_proba_rf = rf_model.predict_proba(X_test)

# One metrics dictionary per model
metrics_nb = calculate_metrics(y_test, y_pred_nb, y_proba_nb)
metrics_lr = calculate_metrics(y_test, y_pred_lr, y_proba_lr)
metrics_rf = calculate_metrics(y_test, y_pred_rf, y_proba_rf)

# Side-by-side table: metrics as rows, models as columns
results_df = pd.DataFrame(dict(NB=metrics_nb, LR=metrics_lr, RF=metrics_rf))

print("Test Performance Metrics (All Models):")
print(results_df.round(4))

# TASK 5.c: Identify best model based on success criteria
# Success criteria: High Recall and Precision for "Declined" class

# Report text explaining which metrics drive the model choice
selection_notes = (
    "Based on the metrics:",
    "- USE: Recall, Precision, F-Score (focused on Declined class)",
    "- USE: AUC-ROC (overall discriminative ability)",
    "- DO NOT USE: Accuracy alone (due to class imbalance)",
    "\nBest model selection will prioritize Recall and Precision for Declined predictions.",
)
print("\n".join(selection_notes))

# TASK 5.d: Check for overfitting/underfitting
# Training accuracy is compared with test accuracy for every model

print("Checking model fit (Training vs Test scores):\n")

# Predictions on the data each model was trained on
y_train_pred_nb = nb_model.predict(X_train)
y_train_pred_lr = lr_model.predict(X_train)
y_train_pred_rf = rf_model.predict(X_train)

# Accuracy on the training partition
train_acc_nb = accuracy_score(y_train, y_train_pred_nb)
train_acc_lr = accuracy_score(y_train, y_train_pred_lr)
train_acc_rf = accuracy_score(y_train, y_train_pred_rf)

# Accuracy on the test partition (reuses the earlier test predictions)
test_acc_nb = accuracy_score(y_test, y_pred_nb)
test_acc_lr = accuracy_score(y_test, y_pred_lr)
test_acc_rf = accuracy_score(y_test, y_pred_rf)

for model_label, train_acc, test_acc in (
    ("Naive Bayes", train_acc_nb, test_acc_nb),
    ("Logistic Regression", train_acc_lr, test_acc_lr),
    ("Random Forest", train_acc_rf, test_acc_rf),
):
    print(f"{model_label} - Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")

# Reading guide for the numbers above
print("\n".join((
    "\nInterpretation:",
    "- If train >> test: Overfitting (model memorizes training data)",
    "- If train ≈ test and both low: Underfitting (model too simple)",
    "- If train ≈ test and both high: Good fit",
)))

In [None]:
# TASK 5.e: Hyperparameter tuning with GridSearchCV
# Tuning Random Forest (assuming it's the best model)

# Imported here so the cell is self-contained; recall_score itself is
# already imported at the top of the notebook.
from sklearn.metrics import make_scorer

print("Performing hyperparameter tuning on Random Forest...\n")

# Define parameter grid (3 * 4 * 3 * 3 = 108 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Recall has to be scored for the 'Declined' label explicitly. The built-in
# 'recall' scorer uses pos_label=1, which is not a valid label for the string
# target, so scoring fails for every candidate; with warnings silenced the
# failure is invisible and the "best" parameters are meaningless.
declined_recall = make_scorer(recall_score, pos_label='Declined')

# TASK 5.e.i: GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring=declined_recall,  # Optimize for recall on Declined class
    n_jobs=-1,
    verbose=1
)

print("K-folds used: 5")
print("This will take a moment...\n")

# Search is run on the training partition only; the test set stays unseen
grid_search.fit(X_train, y_train)

print("\nTuning complete!")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# TASK 5.e.ii-v: Compare the tuned Random Forest with the original one

section_rule = "="*60

# Best estimator found by the grid search, plus its test-set outputs
rf_tuned = grid_search.best_estimator_
y_pred_rf_tuned = rf_tuned.predict(X_test)
y_proba_rf_tuned = rf_tuned.predict_proba(X_test)
cm_rf_tuned = confusion_matrix(y_test, y_pred_rf_tuned)
metrics_rf_tuned = calculate_metrics(y_test, y_pred_rf_tuned, y_proba_rf_tuned)

# --- 5.e.ii: hyperparameters -------------------------------------------
print("TASK 5.e.ii: Original vs Tuned Hyperparameters")
print(section_rule)
print("Original RF hyperparameters:")
# The original forest was built with library defaults for these settings
for default_line in (
    "  n_estimators: 100 (default)",
    "  max_depth: None (default)",
    "  min_samples_split: 2 (default)",
    "  min_samples_leaf: 1 (default)",
):
    print(default_line)
print("\nTuned RF hyperparameters:")
tuned_params = grid_search.best_params_
for param in tuned_params:
    print(f"  {param}: {tuned_params[param]}")

# --- 5.e.iii: confusion matrices ---------------------------------------
print("\n" + section_rule)
print("TASK 5.e.iii: Confusion Matrix Comparison")
print(section_rule)
print("Original RF Confusion Matrix:")
print(cm_rf)
print("\nTuned RF Confusion Matrix:")
print(cm_rf_tuned)

# --- 5.e.iv: metrics ----------------------------------------------------
print("\n" + section_rule)
print("TASK 5.e.iv: Performance Metrics Comparison")
print(section_rule)
comparison_df = pd.DataFrame({
    'Original RF': metrics_rf,
    'Tuned RF': metrics_rf_tuned,
})
print(comparison_df.round(4))

# --- 5.e.v: effect on Declined recall ----------------------------------
print("\n" + section_rule)
print("TASK 5.e.v: Impact of Tuning")
print(section_rule)
recall_before = metrics_rf['Recall']
recall_after = metrics_rf_tuned['Recall']
if recall_after > recall_before:
    print("✓ Tuning IMPROVED the model's ability to detect Declined applications")
elif recall_after < recall_before:
    print("✗ Tuning REDUCED the model's ability to detect Declined applications")
else:
    print("= Tuning had NO CHANGE on Recall for Declined applications")

In [None]:
## Task 1: Domain Understanding - Regression

# Part B models approved loans only, so keep just those rows
approved_mask = df['Loan Approval Status'] == 'Approved'
df_approved = df[approved_mask].copy()

n_total = len(df)
n_approved = len(df_approved)
print(f"Total records: {n_total}")
print(f"Approved loans: {n_approved}")
print(f"Percentage approved: {n_approved/n_total*100:.2f}%")

# Predictors for regression: every retained variable except the two targets
regression_features = [
    'Education Qualifications',
    'Income',
    'Home Ownership',
    'Employment Length',
    'Loan Intent',
    'Loan Amount',
    'Loan Interest Rate',
    'Loan-to-Income Ratio (LTI)',
    'Payment Default on File',
    'Credit History Length',
]

# Shape of predictors plus the regression target
print(f"\nDimensions of regression dataset: {df_approved[regression_features + ['Maximum Loan Amount']].shape}")
print("\nFeatures for regression modelling:")
for position, feat in enumerate(regression_features, start=1):
    print(f"{position}. {feat}")

In [None]:
## Task 2: Data Understanding - Regression

# Numeric predictors followed by the regression target
numerical_features = [
    'Income',
    'Employment Length',
    'Loan Amount',
    'Loan Interest Rate',
    'Loan-to-Income Ratio (LTI)',
    'Credit History Length',
    'Maximum Loan Amount',
]

# 3 x 3 grid gives nine panels; seven are used, the rest are removed below
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
axes = axes.flatten()

# One histogram per variable (missing values dropped before binning)
for ax, feature in zip(axes, numerical_features):
    ax.hist(df_approved[feature].dropna(), bins=50, edgecolor='black')
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

# Delete the panels that received no histogram
for spare_ax in axes[len(numerical_features):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

print("Summary statistics for regression features:")
print(df_approved[numerical_features].describe())

## Task 3: Data Preprocessing - Regression

# TASK 3.a: Investigate need for scaling

print("Checking ranges and scales of numerical features:\n")

# Keep only the rows of describe() that show range and spread
numeric_summary = df_approved[numerical_features].describe()
scale_check = numeric_summary.loc[['min', 'max', 'mean', 'std']]
print(scale_check)

# The two columns quoted in the recommendation text
income_col = df_approved['Income']
lti_col = df_approved['Loan-to-Income Ratio (LTI)']

banner = "="*60
print("\n" + banner)
print("RECOMMENDATION:")
print(banner)
print("\n".join((
    "Decision Tree regressors are SCALE-INVARIANT (they use splits, not distances).",
    "Therefore, scaling is NOT required for this task.",
    "\nHowever, if using distance-based algorithms (e.g., KNN, SVM),",
    "scaling would be essential due to the vastly different ranges:",
)))
print(f"  - Income ranges from {income_col.min():,.0f} to {income_col.max():,.0f}")
print(f"  - LTI ratio ranges from {lti_col.min():.2f} to {lti_col.max():.2f}")

In [None]:
## Task 4: Modelling - Regression

# TASK 4.a: Why Decision Tree for financial prediction?
print("\n".join((
    "Benefits of Decision Tree Regressor:",
    "- Interpretable: Easy to explain decisions to financial analysts",
    "- Non-linear: Captures complex relationships in loan data",
    "- No scaling needed: Works directly with different feature ranges\n",
)))

# TASK 4.b: Feature sets for the two regression models

# Column groups shared by both models
numeric_features_only = [
    'Income',
    'Employment Length',
    'Loan Amount',
    'Loan Interest Rate',
    'Loan-to-Income Ratio (LTI)',
    'Credit History Length',
]
categorical_reg_features = [
    'Education Qualifications',
    'Home Ownership',
    'Loan Intent',
    'Payment Default on File',
]

# Regression target (approved loans only)
y_reg = df_approved['Maximum Loan Amount'].copy()

# Model 1 (DT1): numeric predictors only
X_dt1 = df_approved[numeric_features_only].copy()

print("DT1 - Numeric features only:")
print(f"Features: {list(X_dt1.columns)}")
print(f"Shape: {X_dt1.shape}\n")

# Model 2 (DT2): numeric predictors plus one-hot encoded categoricals
X_dt2_num = df_approved[numeric_features_only].copy()
X_dt2_cat = df_approved[categorical_reg_features].copy()

# drop_first=True removes the first level of each categorical feature
X_dt2_cat_encoded = pd.get_dummies(X_dt2_cat, drop_first=True)

# Numeric columns first, encoded categorical columns after them
X_dt2 = pd.concat([X_dt2_num, X_dt2_cat_encoded], axis='columns')

print("DT2 - All features (numeric + categorical):")
print(f"Features: {list(X_dt2.columns)}")
print(f"Shape: {X_dt2.shape}")

In [None]:
# Build and train Decision Tree regression models

# Create the 80:20 train/test partitions in this cell. The training calls
# below previously relied on variables that are only created by a split cell
# located at the very end of the notebook, so a top-to-bottom run stopped
# with NameError on X_train_dt1. Both calls use the same test_size and
# random_state on frames built from the same rows, so DT1 and DT2 are trained
# and evaluated on identical records; the later split cell reproduces exactly
# these partitions.
X_train_dt1, X_test_dt1, y_train_reg, y_test_reg = train_test_split(
    X_dt1, y_reg, test_size=0.2, random_state=42
)
X_train_dt2, X_test_dt2, y_train_reg2, y_test_reg2 = train_test_split(
    X_dt2, y_reg, test_size=0.2, random_state=42
)

# DT1: Numeric features only (unpruned tree, fixed seed)
dt1_model = DecisionTreeRegressor(random_state=42)
dt1_model.fit(X_train_dt1, y_train_reg)
print("DT1 model trained successfully (numeric features only)")

# DT2: All features (numeric + one-hot encoded categorical)
dt2_model = DecisionTreeRegressor(random_state=42)
dt2_model.fit(X_train_dt2, y_train_reg2)
print("DT2 model trained successfully (all features)")

## Task 5: Evaluation - Regression

In [None]:
# TASK 5.a: Calculate test performance metrics

def _regression_scores(actual, predicted):
    """Return (MSE, MAE, R-squared) for one set of predictions."""
    return (
        mean_squared_error(actual, predicted),
        mean_absolute_error(actual, predicted),
        r2_score(actual, predicted),
    )

# Test-set predictions from both trees
y_pred_dt1 = dt1_model.predict(X_test_dt1)
y_pred_dt2 = dt2_model.predict(X_test_dt2)

# Error and fit measures per model
mse_dt1, mae_dt1, r2_dt1 = _regression_scores(y_test_reg, y_pred_dt1)
mse_dt2, mae_dt2, r2_dt2 = _regression_scores(y_test_reg2, y_pred_dt2)

# One row per metric, one column per model
regression_results = pd.DataFrame({
    'Metric': ['MSE', 'MAE', 'R-Square'],
    'DT1 (Numeric)': [mse_dt1, mae_dt1, r2_dt1],
    'DT2 (All Features)': [mse_dt2, mae_dt2, r2_dt2],
})

rule = "="*60
print("TASK 5.a: Test Performance Metrics")
print(rule)
print(regression_results.to_string(index=False))

print("\n" + rule)
print("METRIC SELECTION:")
print(rule)
print("\n".join((
    "USE: R-Square - Explains how well features predict maximum loan amount",
    "DO NOT USE: MSE alone - Large values hard to interpret in original units",
    "DO NOT USE: MAE alone - Doesn't show proportion of variance explained",
)))

In [None]:
# TASK 5.b: Caveats of R-Square

# Report text: limitations to keep in mind when relying on R-squared
r2_caveats = (
    "1. R² can be artificially inflated by adding more features",
    "2. R² doesn't indicate if predictions are biased (systematic errors)",
    "3. R² near 1.0 might indicate overfitting if train/test scores differ greatly",
    "4. R² alone doesn't reveal if residuals meet regression assumptions",
)

print("TASK 5.b: Caveats of R-Square Metric")
print("="*60)
print("\n".join(r2_caveats))

In [None]:
# TASK 5.c: Select best model

print("TASK 5.c: Best Model Selection")
print("="*60)

# DT2 is chosen only when its test R² is strictly higher; on a tie the
# simpler DT1 is kept.
dt2_wins = r2_dt2 > r2_dt1

if dt2_wins:
    best_name, best_model = "DT2", dt2_model
    X_train_best, X_test_best, y_test_best = X_train_dt2, X_test_dt2, y_test_reg2

    print("Best Model: DT2 (All Features)")
    print(f"  R² Score: {r2_dt2:.4f}")
    print("\nJustification:")
    print("DT2 has higher R², meaning it explains more variance in maximum loan amounts.")
    print("Categorical features (Home Ownership, Loan Intent, etc.) add predictive value.")
else:
    best_name, best_model = "DT1", dt1_model
    X_train_best, X_test_best, y_test_best = X_train_dt1, X_test_dt1, y_test_reg

    print("Best Model: DT1 (Numeric Only)")
    print(f"  R² Score: {r2_dt1:.4f}")
    print("\nJustification:")
    print("DT1 performs as well as or better than DT2 with fewer features (simpler model).")

In [None]:
# TASK 5.d: Rebuild best model with pre-pruning (max_depth=4)

print("TASK 5.d: Pruning the Best Model")
print("="*60)

# Pick the partitions and the unpruned score that belong to the winner
if best_name == "DT2":
    prune_X_train, prune_y_train = X_train_dt2, y_train_reg2
    prune_X_test, prune_y_test = X_test_dt2, y_test_reg2
    r2_unpruned = r2_dt2
else:
    prune_X_train, prune_y_train = X_train_dt1, y_train_reg
    prune_X_test, prune_y_test = X_test_dt1, y_test_reg
    r2_unpruned = r2_dt1

# Same seed as the unpruned trees; only the depth limit differs
pruned_model = DecisionTreeRegressor(max_depth=4, random_state=42)
pruned_model.fit(prune_X_train, prune_y_train)
y_pred_pruned = pruned_model.predict(prune_X_test)
r2_pruned = r2_score(prune_y_test, y_pred_pruned)

print(f"Original {best_name} R² Score: {r2_unpruned:.4f}")
print(f"Pruned {best_name} R² Score (max_depth=4): {r2_pruned:.4f}")

# Equal scores count as "maintained"
if r2_pruned < r2_unpruned:
    print("\nImpact: Pruning DECREASED performance (simpler but less accurate)")
else:
    print("\nImpact: Pruning MAINTAINED or IMPROVED performance (better generalization)")

In [None]:
# Draw the depth-limited tree

# Column labels of the winning feature set, shown at the split nodes
tree_feature_names = list(X_train_best.columns)
tree_title = f'Pruned Decision Tree ({best_name}, max_depth=4)'

plt.figure(figsize=(20, 10))
plot_tree(
    pruned_model,
    feature_names=tree_feature_names,
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title(tree_title, fontsize=16)
plt.tight_layout()
plt.show()

print("Tree structure is now limited to 4 levels for easier interpretation.")

In [None]:
# TASK 5.e: Predict maximum loan amount for client 60256

print("TASK 5.e: Prediction for Client 60256")
print("="*60)

# Client details (numeric inputs, used by both DT1 and DT2)
client_data = {
    'Income': 57000,
    'Employment Length': 15,
    'Loan Amount': 25700,
    'Loan Interest Rate': 23.0,
    'Loan-to-Income Ratio (LTI)': 0.10,
    'Credit History Length': 35
}

# If DT2 was best, add categorical features
if best_name == "DT2":
    client_data.update({
        'Education Qualifications': 'Unknown',
        'Home Ownership': 'Rent',
        'Loan Intent': 'Medical',
        'Payment Default on File': 'N'
    })

    # Create DataFrame
    client_df = pd.DataFrame([client_data])

    client_cat = client_df[['Education Qualifications', 'Home Ownership', 'Loan Intent', 'Payment Default on File']]
    client_num = client_df[numeric_features_only]

    # Encode WITHOUT drop_first. On a single-row frame every feature has
    # exactly one level, so drop_first=True would discard all dummy columns
    # and the client would be scored as the baseline category of every
    # feature regardless of the values entered above.
    client_cat_encoded = pd.get_dummies(client_cat)

    # Align with the training layout: dummies the model knows are kept,
    # training columns the client does not have are filled with 0, and
    # columns absent from training (the dropped baseline level) are removed.
    # NOTE(review): a category never seen in training is also removed here
    # and is therefore treated as the baseline - confirm the client's values
    # occur in the training data.
    client_encoded = pd.concat([client_num, client_cat_encoded], axis=1).reindex(
        columns=X_train_dt2.columns, fill_value=0
    )
else:
    # DT1 - numeric only
    client_df = pd.DataFrame([client_data])
    client_encoded = client_df[numeric_features_only]

# Predict using pruned model
predicted_max_loan = pruned_model.predict(client_encoded)[0]

print(f"Client ID: 60256")
print(f"Predicted Maximum Loan Amount: £{predicted_max_loan:,.2f}")
print("\nNote: This prediction uses the pruned model (max_depth=4)")

---
# COURSEWORK COMPLETE
---

## Summary

### Part A: Classification (Loan Approval Prediction)
- **Models Built:** Naive Bayes, Logistic Regression, Random Forest
- **Best Model:** Selected based on Recall and Precision for "Declined" class
- **Tuning:** GridSearchCV with 5-fold cross-validation
- **Key Metrics:** Recall, Precision, F-Score, AUC-ROC

### Part B: Regression (Maximum Loan Amount Prediction)
- **Models Built:** DT1 (numeric features), DT2 (all features)
- **Best Model:** Selected based on R² score
- **Pruning:** Applied max_depth=4 for interpretability
- **Prediction:** Client 60256 maximum loan amount estimated

## Next Steps for Student Report
1. Take screenshots of all outputs (statistical tables, plots, confusion matrices, metrics)
2. Paste screenshots into report document
3. Add brief interpretations for each task
4. Complete the summary tables as shown in coursework instructions
5. Submit report (max 23 pages) and this .ipynb file

In [None]:
# TASK 4.b.i: Train-test split with reproducibility
# The names produced here (X_train_dt1, X_test_dt2, y_train_reg, ...) are the
# ones used by the regression training and evaluation cells.

# Shared settings: identical test_size and seed give both feature sets the
# same row partition
split_kwargs = dict(test_size=0.2, random_state=42)

# DT1 (numeric only)
X_train_dt1, X_test_dt1, y_train_reg, y_test_reg = train_test_split(
    X_dt1, y_reg, **split_kwargs
)

# DT2 (all features)
X_train_dt2, X_test_dt2, y_train_reg2, y_test_reg2 = train_test_split(
    X_dt2, y_reg, **split_kwargs
)

print("TASK 4.b.i: Reproducibility ensured with random_state=42")
print("\nTASK 4.b.ii: Dataset dimensions:\n")

# Partition sizes and feature lists for both models
print("DT1 (Numeric only):")
print(f"  Training set: {X_train_dt1.shape}")
print(f"  Test set: {X_test_dt1.shape}")
print(f"  Features: {list(X_train_dt1.columns)}\n")

print("DT2 (All features):")
print(f"  Training set: {X_train_dt2.shape}")
print(f"  Test set: {X_test_dt2.shape}")
print(f"  Features: {list(X_train_dt2.columns)}")