<a href="https://colab.research.google.com/github/hpink97/loan_default_predictor/blob/main/03_ml_preprocessing_and_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install bayesian-optimization
#!pip install miceforest --no-cache-dir
import pandas as pd
import numpy as np 
import gc #free up memory
import matplotlib.pyplot as plt
import miceforest as mf ##forest based imputation


import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.impute import KNNImputer
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, f1_score, auc
from sklearn.preprocessing import RobustScaler,StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the modelling table from CSV; this raw frame is copied into `df` before any cleaning.
df_raw = pd.read_csv('all_data_merged.csv')


In [None]:
# Work on a copy so that df_raw itself is not modified by the steps that edit df.
df = df_raw.copy()
# Display (n_rows, n_columns) of the working frame.
df.shape

(307511, 200)

In [None]:
# Names of the features whose `deviance_reduction` in the logistic-regression
# results file exceeds 0.15.
lr_results = pd.read_csv('logistic_regression_results.csv')
strong_signal = lr_results['deviance_reduction'].gt(0.15)
important_features = lr_results.loc[strong_signal, 'feature']

In [None]:
# Percentage of missing values per column, keeping only columns with at least one gap.
na_count = df.isna().sum()
na_perc = (na_count / len(df)) * 100
missing_data = na_perc.loc[na_count.gt(0)]
# Show the incomplete columns that are also listed in important_features.
missing_data[missing_data.index.isin(important_features)]


amt_goods_price              0.090403
own_car_age                 65.990810
ext_source_1                56.381073
ext_source_2                 0.214626
ext_source_3                19.825307
apartments_avg              50.749729
elevators_avg               53.295980
floorsmax_avg               49.760822
floorsmin_avg               67.848630
livingarea_avg              50.193326
apartments_mode             50.749729
elevators_mode              53.295980
floorsmax_mode              49.760822
floorsmin_mode              67.848630
livingarea_mode             50.193326
apartments_medi             50.749729
elevators_medi              53.295980
floorsmax_medi              49.760822
floorsmin_medi              67.848630
livingarea_medi             50.193326
totalarea_mode              48.268517
def_30_cnt_social_circle     0.332021
def_60_cnt_social_circle     0.332021
days_last_phone_change       0.000325
dtype: float64

In [None]:
## Applicants who do not own a car get a high car age: the 95th percentile of the observed ages
old_car_age = df['own_car_age'].quantile(0.95)  # quantile() ignores NaN by default
no_car = df['flag_own_car'].eq('N')
df.loc[no_car, 'own_car_age'] = old_car_age


In [None]:
# Median-impute the incomplete columns that are NOT in important_features.
manual_fill = missing_data.loc[~missing_data.index.isin(important_features)]
for col in manual_fill.index:
    median_value = df[col].median()
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`: that form is
    # chained assignment on a temporary and does not update `df` under pandas Copy-on-Write.
    df[col] = df[col].fillna(median_value)

In [None]:
# Forest-based MICE imputation of the gaps that remain in the int64/float64 columns.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# `target` is the label and must not inform the imputed features.
df_numeric = df[numeric_cols].drop(columns=['target'])
# `sk_id_curr` is presumably a row identifier (it is dropped from the feature matrix
# later on), so it is kept out of the imputation model as well.
df_numeric = df_numeric.drop(columns=['sk_id_curr'], errors='ignore')

kernal = mf.ImputationKernel(
  df_numeric,
  random_state=42
)

# Run the MICE algorithm for 2 iterations
kernal.mice(iterations=2,n_estimators=40)

# Return the completed dataset.
df_numeric_imputed = kernal.complete_data()



In [None]:
# Overwrite the corresponding columns of df with their imputed versions.
df[df_numeric_imputed.columns]=df_numeric_imputed

In [None]:
# Sanity check: list every column that still contains missing values.
is_na = df.isna().sum()
is_na.loc[is_na.gt(0)]

Series([], dtype: int64)

In [None]:
# Separate the label from the features; the sk_id_curr column is not used as a feature.
y = df['target']
X = df.drop(columns=['target', 'sk_id_curr'])

# Columns with exactly two distinct values are treated as binary.
binary_cols = [name for name, n_unique in X.nunique().items() if n_unique == 2]

# Label encode each binary column, storing its fitted encoder for later use.
label_encoders = {}
for col in binary_cols:
    label_encoder = LabelEncoder().fit(X[col])
    label_encoders[col] = label_encoder
    X[col] = label_encoder.transform(X[col])




In [None]:
# One-hot encode every column that still has the `object` dtype.
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']
X = pd.get_dummies(X, columns=categorical_cols)
X

Unnamed: 0,name_contract_type,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_registration,...,"wallsmaterial_mode_Stone, brick",wallsmaterial_mode_Wooden,wallsmaterial_mode_other,emergencystate_mode_No,emergencystate_mode_Yes,emergencystate_mode_other,age_group_30-45,age_group_45-60,age_group_60+,age_group_under 30
0,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-3648.0,...,1,0,0,1,0,0,0,0,0,1
1,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-1186.0,...,0,0,0,1,0,0,0,1,0,0
2,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,-4260.0,...,0,0,1,0,0,1,0,1,0,0
3,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,-9833.0,...,0,0,1,0,0,1,0,1,0,0
4,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,-4311.0,...,0,0,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,-8456.0,...,1,0,0,1,0,0,0,0,0,1
307507,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,-4388.0,...,1,0,0,1,0,0,0,1,0,0
307508,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,-6737.0,...,0,0,0,1,0,0,1,0,0,0
307509,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,-2562.0,...,1,0,0,1,0,0,1,0,0,0


In [None]:
# Standardise every int64/float64 feature column with a single fitted StandardScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split that
# follows, so test-set statistics influence the scaling - consider fitting on the
# training split only.
numeric_cols = [name for name, dtype in X.dtypes.items() if dtype in ('int64', 'float64')]

scaler = StandardScaler().fit(X[numeric_cols])
X[numeric_cols] = scaler.transform(X[numeric_cols])

In [None]:
# Summary statistics of the scaled numeric features, one row per feature.
# The quartiles are computed inline under their real names; the previous intermediate
# variables called them percentile_5 / percentile_95 although they held the 0.25 / 0.75
# quantiles.
scaled_data = X[numeric_cols]

summary_stats = pd.DataFrame({
    'Mean': scaled_data.mean(),
    'Median': scaled_data.median(),
    'Standard Deviation': scaled_data.std(),
    'percentile_25': scaled_data.quantile(0.25),
    'percentile_75': scaled_data.quantile(0.75),
    'Minimum': scaled_data.min(),
    'Maximum': scaled_data.max(),
})

# Display the summary statistics
summary_stats

Unnamed: 0,Mean,Median,Standard Deviation,percentile_25,percentile_75,Minimum,Maximum
name_contract_type,-5.388378e-17,-0.324395,1.000002,-0.324395,-0.324395,-0.324395,3.082659
flag_own_car,1.280086e-17,-0.717914,1.000002,-0.717914,1.392925,-0.717914,1.392925
flag_own_realty,4.556553e-17,0.664531,1.000002,-1.504820,0.664531,-1.504820,0.664531
cnt_children,-3.207148e-17,-0.577538,1.000002,-0.577538,0.807273,-0.577538,25.733871
amt_income_total,-4.116379e-17,-0.091294,1.000002,-0.237421,0.142129,-0.603687,492.703449
...,...,...,...,...,...,...,...
rejected_median_downpayment_rate,-2.213579e-17,-0.216425,1.000002,-0.216425,-0.216425,-0.216425,20.828977
rejected_prop_w_downpayment,1.261601e-17,-0.239761,1.000002,-0.239761,-0.239761,-0.239761,5.332712
most_recent_rejected,-2.079563e-17,0.442702,1.000002,-0.034854,0.460147,-5.911627,0.460147
accepted_rate,6.132400e-17,0.252321,1.000002,-0.731877,0.908453,-2.372208,0.908453


In [None]:
## Make train/test splits
# stratify=y gives both splits the same share of positives; with only ~8% positives an
# unstratified split lets the class rate drift between the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25, random_state=1, stratify=y
)

print(f"{X_train.shape[0]} training samples and {X_test.shape[0]} testing samples")
print(f"{y_train.sum()} ({y_train.mean()*100:.3f}%) positives in training set, {y_test.sum()} ({y_test.mean()*100:.3f}%) positives in testing set")


230633 training samples and 76878 testing samples
18565 (8.050%) positives in training set, 6260 (8.143)% positives in testing set


In [None]:
import pickle as pkl

# Cache of the train/test split. If data.pkl exists it is loaded and REPLACES the split
# currently in memory (delete the file to regenerate it); otherwise the current split is
# written so that later runs can reuse it. Previously the dump was commented out, so a
# fresh run without data.pkl failed with FileNotFoundError.
# Unpickling can execute arbitrary code - only load a data.pkl that you created yourself.
try:
    with open('data.pkl', 'rb') as f:
        X_train, X_test, y_train, y_test = pkl.load(f)
except FileNotFoundError:
    with open('data.pkl', 'wb') as f:
        pkl.dump((X_train, X_test, y_train, y_test), f)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Class weights to counter the imbalance: label 1 is the rare class (about 8% of rows,
# roughly one positive per 11-12 negatives), so it is the one that must be up-weighted.
# The previous {0: 12, 1: 1} did the opposite and amplified the majority class.
class_weights = {0: 1,
                 1: 12}

# n_jobs=-1 builds the trees on all CPU cores; with a fixed random_state the fitted
# forest is the same as with a single job, only faster.
rf_classifier = RandomForestClassifier(n_estimators=120, random_state=42,
                                       class_weight=class_weights,
                                       n_jobs=-1)

# Train the model
rf_classifier.fit(X_train, y_train)

# Extract feature importances
feature_importances = rf_classifier.feature_importances_


KeyboardInterrupt: ignored

In [None]:


# Get feature importances from the random forest model
feature_importances = rf_classifier.feature_importances_

# Take the names from the frame the model was fitted on. X_train may have been loaded
# from the pickle cache, so X.columns is not guaranteed to line up with the importances.
feature_names = X_train.columns

# Sort feature importances and feature names in descending order
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# Bar plot of the 35 most important features
top_n = 35
fig, ax = plt.subplots(figsize=(8, 12))
ax.barh(sorted_feature_names[:top_n], sorted_feature_importances[:top_n])
# barh draws the first bar at the bottom; flip the axis so the most important is on top.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importances (Sorted)')
fig.tight_layout()
plt.show()

In [None]:
def evaluate_performance(model, X, y_true):
    """
    Evaluate model performance for binary classification and plot visualizations.

    The decision threshold is picked on the supplied data by maximising Youden's J
    statistic (TPR - FPR) along the ROC curve. Precision, recall and F1 are reported
    for the hard predictions at that threshold. Because the threshold is chosen on the
    same data it is scored on, those three numbers are optimistic; ROC-AUC does not
    depend on the threshold.

    Parameters:
        model: Trained binary classification model exposing ``predict_proba``.
        X (array-like): Input features.
        y_true (array-like): True binary labels (0 = negative, 1 = positive).

    Returns:
        None. A figure with the precision-recall and ROC curves is shown and the
        metrics are printed.
    """
    y_true = np.asarray(y_true)

    # Predict probabilities for the positive class
    y_pred_prob = model.predict_proba(X)[:, 1]

    # Curves used for the two plots
    precision, recall, _ = precision_recall_curve(y_true, y_pred_prob)
    fpr, tpr, thresholds_roc = roc_curve(y_true, y_pred_prob)
    roc_auc = roc_auc_score(y_true, y_pred_prob)

    # Threshold that maximises Youden's J = TPR - FPR
    j_stat = tpr - fpr
    index_for_best_threshold = np.argmax(j_stat)
    best_threshold = thresholds_roc[index_for_best_threshold]
    y_pred = np.where(y_pred_prob >= best_threshold, 1, 0)

    # Precision / recall of the hard predictions at the chosen threshold.
    # (Indexing the PR-curve arrays at position 1, as was done before, reports an
    # arbitrary point of the curve that is unrelated to this threshold.)
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    predicted_positives = np.sum(y_pred == 1)
    actual_positives = np.sum(y_true == 1)
    # Guard the degenerate cases (nothing predicted positive / no positives present).
    precision_at_threshold = true_positives / predicted_positives if predicted_positives else 0.0
    recall_at_threshold = true_positives / actual_positives if actual_positives else 0.0
    f1 = f1_score(y_true, y_pred)

    # Create subplots
    fig, axs = plt.subplots(1, 2, figsize=(12, 6))

    # Plot Precision-Recall curve
    axs[0].plot(recall, precision, marker='.')
    axs[0].set_xlabel('Recall')
    axs[0].set_ylabel('Precision')
    axs[0].set_title('Precision-Recall Curve')
    axs[0].grid(True)

    # Plot ROC curve (dashed diagonal = random classifier)
    axs[1].plot(fpr, tpr, marker='.')
    axs[1].plot([0, 1], [0, 1], 'k--')
    axs[1].set_xlabel('False Positive Rate')
    axs[1].set_ylabel('True Positive Rate')
    axs[1].set_title('ROC Curve (AUC = {:.2f})'.format(roc_auc))
    axs[1].grid(True)

    # Adjust spacing between subplots
    plt.tight_layout()

    # Show the plot
    plt.show()

    # Print performance metrics
    print('Threshold: {:.4f}'.format(best_threshold))
    print('Precision: {:.4f}'.format(precision_at_threshold))
    print('Recall: {:.4f}'.format(recall_at_threshold))
    print('F1 Score: {:.4f}'.format(f1))
    print('ROC-AUC: {:.4f}'.format(roc_auc))


# Evaluate the random forest on the test split (X_test / y_test).
evaluate_performance(rf_classifier, X_test, y_test)

In [None]:
# Define the objective function for Bayesian optimization
def xgb_objective(n_estimators, gamma, scale_pos_weight,colsample_bynode,
                  learning_rate, max_depth, subsample, colsample_bytree):
    """
    Mean 5-fold cross-validated ROC-AUC of an XGBClassifier on the training split.

    The optimizer proposes every hyperparameter as a float, so integer-valued ones are
    cast and the sampling fractions are clamped to [0, 1] before the model is built.

    Parameters:
        n_estimators: Number of boosting rounds (truncated to int).
        gamma: Passed to XGBClassifier as ``gamma``.
        scale_pos_weight: Passed to XGBClassifier as ``scale_pos_weight``.
        colsample_bynode: Column fraction per node, clamped to [0, 1].
        learning_rate: Passed to XGBClassifier as ``learning_rate``.
        max_depth: Maximum tree depth (truncated to int).
        subsample: Row fraction per tree, clamped to [0, 1].
        colsample_bytree: Column fraction per tree, clamped to [0, 1].

    Returns:
        float: Mean ROC-AUC over the folds. Higher is better, and
        ``BayesianOptimization.maximize`` maximises this value, so it is returned
        as-is rather than negated.
    """
    # Convert hyperparameters to appropriate types
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    learning_rate = float(learning_rate)
    subsample = max(min(float(subsample), 1), 0)
    colsample_bytree = max(min(float(colsample_bytree), 1), 0)
    # Clamp colsample_bynode from its OWN value (it used to be overwritten with
    # colsample_bytree, so the parameter was never actually explored).
    colsample_bynode = max(min(float(colsample_bynode), 1), 0)

    # Create the XGBClassifier model with the specified hyperparameters
    model = xgb.XGBClassifier(
        n_estimators=n_estimators,
        gamma=gamma,
        scale_pos_weight=scale_pos_weight,
        colsample_bynode=colsample_bynode,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42
    )

    # Cross-validate on the training split only, so the held-out test rows never
    # influence the choice of hyperparameters.
    roc_auc = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()

    return roc_auc

# Define the parameter ranges for Bayesian optimization
pbounds = {
    'n_estimators':(100, 5000),
    'gamma':(0.2, 1),
    'scale_pos_weight':(1,50),
    'colsample_bynode' : (0.3,1),
    'learning_rate': (0.001, 0.1),
    'max_depth': (3, 14),
    'subsample': (0.25, 1),
    'colsample_bytree': (0.25, 1)
}

# Perform Bayesian optimization (maximises the value returned by xgb_objective)
optimizer = BayesianOptimization(
    f=xgb_objective,
    pbounds=pbounds,
    random_state=42,
    verbose=2
)
optimizer.maximize(init_points=10, n_iter=30)

# Get the best hyperparameters and the corresponding (maximum) mean ROC AUC score
best_params = optimizer.max['params']
best_score = optimizer.max['target']




In [None]:
# Train the final XGBClassifier model with the best hyperparameters.
# All eight tuned hyperparameters are passed through; previously n_estimators, gamma,
# scale_pos_weight and colsample_bynode were dropped, so the fitted model silently used
# library defaults for them. The optimizer works on floats, hence the int() casts for
# the two integer-valued parameters.
best_model = xgb.XGBClassifier(
    n_estimators=int(best_params['n_estimators']),
    gamma=best_params['gamma'],
    scale_pos_weight=best_params['scale_pos_weight'],
    colsample_bynode=best_params['colsample_bynode'],
    learning_rate=best_params['learning_rate'],
    max_depth=int(best_params['max_depth']),
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    random_state=42
)
# Fit on the training split only so that X_test / y_test remain an untouched hold-out
# for evaluating this model.
best_model.fit(X_train, y_train)