In [None]:
import joblib
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Shared settings for this cell: the label column and the seed reused by the
# split and the undersampler.
TARGET_COLUMN = 'loan_status'
RANDOM_STATE = 12345

# Read the raw loan records
data = pd.read_csv('/content/loan_data.csv')

# Separate the predictors from the label
X = data.drop(columns=TARGET_COLUMN)
y = data[TARGET_COLUMN]

# Columns stored with the pandas `object` dtype are treated as categorical
categorical_columns = [name for name, dtype in X.dtypes.items() if dtype == 'object']

# One-hot encode the categorical columns; every other column passes through
# untouched. Categories not seen during fit are encoded as all zeros.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_columns)],
    remainder='passthrough',
)

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Fit the encoder on the training rows only, then apply it to both splits
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Balance the training classes by randomly undersampling
rus = RandomUnderSampler(random_state=RANDOM_STATE, sampling_strategy='auto')
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_transformed, y_train)

# Show how many rows of each class remain after undersampling
resampled_counts = pd.Series(y_train_resampled).value_counts()
print("Class distribution after undersampling:", resampled_counts, sep="\n")

# Helper: binary classification metrics derived from the confusion matrix
def _binary_metrics(y_true, y_pred, labels=None):
    """Compute accuracy, recall, precision and specificity for a binary problem.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        labels: Optional class labels (negative class first, positive class
            second) forwarded to ``confusion_matrix`` so the matrix keeps a
            fixed 2x2 layout.

    Returns:
        Tuple ``(accuracy, recall, precision, specificity)``. Any ratio whose
        denominator is zero is reported as 0.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=labels).ravel()

    def ratio(numerator, denominator):
        # Same zero-denominator convention for every metric
        return numerator / denominator if denominator > 0 else 0

    accuracy = ratio(tp + tn, tp + tn + fp + fn)
    recall = ratio(tp, tp + fn)  # Sensitivity (True Positive Rate)
    precision = ratio(tp, tp + fp)
    specificity = ratio(tn, tn + fp)  # True Negative Rate
    return accuracy, recall, precision, specificity


# Define a function to train and evaluate the model
def train_and_evaluate(X_train_data, y_train_data, X_test_data, y_test_data, feature_names=None):
    """Select features, tune a random forest, print metrics and return the model.

    Steps:
      1. RFECV (3-fold, accuracy) with a balanced random forest picks a feature subset.
      2. GridSearchCV (3-fold, accuracy) tunes a random forest on that subset.
      3. Metrics, classification reports and confusion matrices are printed
         for the training and test data.

    Args:
        X_train_data: Preprocessed training feature matrix.
        y_train_data: Training labels (binary).
        X_test_data: Preprocessed test feature matrix with the same columns.
        y_test_data: Test labels.
        feature_names: Optional names of the columns of ``X_train_data``. When
            omitted, they are read from the module-level ``preprocessor``.

    Returns:
        A fitted ``Pipeline`` of (feature selector, tuned random forest). It
        takes the same columns as ``X_train_data`` and applies the feature
        selection itself, so ``predict`` can be called directly on the
        preprocessor's output.
    """
    # Initialize the base RandomForestClassifier
    rf_base = RandomForestClassifier(random_state=12345, class_weight='balanced', n_estimators=100)

    # Use RFECV for feature selection
    rfecv = RFECV(estimator=rf_base, step=1, cv=3, scoring='accuracy', n_jobs=-1)
    X_train_selected = rfecv.fit_transform(X_train_data, y_train_data)
    X_test_selected = rfecv.transform(X_test_data)

    # Get feature names after one-hot encoding (fall back to the global preprocessor)
    if feature_names is None:
        feature_names = preprocessor.get_feature_names_out()

    # Filter selected features using RFECV support mask
    selected_features = np.asarray(feature_names)[rfecv.support_]

    print("\nSelected Features:")
    print(selected_features)

    # Print the number of selected features
    print(f"\nOptimal number of features: {rfecv.n_features_}")

    # Define the parameter grid for Random Forest
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [10, 15, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'criterion': ['gini'],
        'class_weight': [None, 'balanced']
    }

    # Initialize GridSearchCV with feature-selected data
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=12345),
        param_grid=param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )

    grid_search.fit(X_train_selected, y_train_data)

    # Print the best parameters
    print("Best parameters from GridSearchCV:", grid_search.best_params_)

    # GridSearchCV (refit=True by default) has already refit a forest with the
    # best parameters and the same random_state on the full training data, so
    # reuse it instead of training an identical model a second time.
    rf_model = grid_search.best_estimator_

    # Predict on training and test sets
    y_train_pred = rf_model.predict(X_train_selected)
    y_test_pred = rf_model.predict(X_test_selected)

    splits = (
        ("Training", y_train_data, y_train_pred),
        ("Test", y_test_data, y_test_pred),
    )

    # Compute and print training and test metrics
    for split_name, y_true, y_pred in splits:
        accuracy, recall, precision, specificity = _binary_metrics(
            y_true, y_pred, labels=rf_model.classes_
        )
        print(f"\n{split_name} Set Metrics:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Recall (Sensitivity): {recall:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Specificity: {specificity:.4f}")

    # Print classification reports
    for split_name, y_true, y_pred in splits:
        print(f"\nClassification Report ({split_name} Set):")
        print(classification_report(y_true, y_pred))

    # Print confusion matrices for both training and test sets
    for split_name, y_true, y_pred in splits:
        print(f"\nConfusion Matrix ({split_name} Set):")
        print(confusion_matrix(y_true, y_pred))

    # The forest only understands the RFECV-selected columns. Bundle the fitted
    # selector with it so the returned object accepts the full preprocessed
    # matrix; otherwise predict() fails with a feature-count mismatch whenever
    # RFECV dropped at least one column.
    return Pipeline([('feature_selection', rfecv), ('classifier', rf_model)])

# Baseline: fit and score on the training data with its original class balance
print("\n--- Results for Original (Imbalanced) Class Distribution ---")
rf_model_imbalanced = train_and_evaluate(
    X_train_data=X_train_transformed,
    y_train_data=y_train,
    X_test_data=X_test_transformed,
    y_test_data=y_test,
)

# Variant: fit on the undersampled training data, score on the same test set
print("\n--- Results for Undersampled Class Distribution ---")
rf_model_undersampled = train_and_evaluate(
    X_train_data=X_train_resampled,
    y_train_data=y_train_resampled,
    X_test_data=X_test_transformed,
    y_test_data=y_test,
)

# Persist both fitted models to disk
joblib.dump(value=rf_model_imbalanced, filename='/content/random_forest_model_imbalanced.pkl')
joblib.dump(value=rf_model_undersampled, filename='/content/random_forest_model_undersampled.pkl')

Class distribution after undersampling:
loan_status
0    8000
1    8000
Name: count, dtype: int64

--- Results for Original (Imbalanced) Class Distribution ---

Selected Features:
['cat__person_gender_female' 'cat__person_education_Bachelor'
 'cat__person_home_ownership_MORTGAGE' 'cat__person_home_ownership_OWN'
 'cat__person_home_ownership_RENT' 'cat__loan_intent_DEBTCONSOLIDATION'
 'cat__loan_intent_EDUCATION' 'cat__loan_intent_HOMEIMPROVEMENT'
 'cat__loan_intent_MEDICAL' 'cat__loan_intent_VENTURE'
 'cat__previous_loan_defaults_on_file_No'
 'cat__previous_loan_defaults_on_file_Yes' 'remainder__person_age'
 'remainder__person_income' 'remainder__person_emp_exp'
 'remainder__loan_amnt' 'remainder__loan_int_rate'
 'remainder__loan_percent_income' 'remainder__cb_person_cred_hist_length'
 'remainder__credit_score']

Optimal number of features: 20
Best parameters from GridSearchCV: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, '

['/content/random_forest_model_undersampled.pkl']

In [None]:
# Gather the reference values first, then write every artifact to disk.
# Column names produced by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()
# Distinct target values, in order of first appearance in `y`
class_labels = y.unique()

# Fitted preprocessor
joblib.dump(value=preprocessor, filename='/content/preprocessor.pkl')

# Feature names after preprocessing
joblib.dump(value=feature_names, filename='/content/feature_names.pkl')

# Categorical column names (for reference)
joblib.dump(value=categorical_columns, filename='/content/categorical_columns.pkl')

# Class labels (for reference)
joblib.dump(value=class_labels, filename='/content/class_labels.pkl')

['/content/class_labels.pkl']

In [None]:
!pip install flask joblib pandas scikit-learn



In [None]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd

# Locations of the artifacts written by the training cells
MODEL_PATH = '/content/random_forest_model_undersampled.pkl'
PREPROCESSOR_PATH = '/content/preprocessor.pkl'

# Restore the classifier trained on the undersampled data
model = joblib.load(MODEL_PATH)

# Restore the fitted preprocessor that turns raw request rows into model inputs
preprocessor = joblib.load(PREPROCESSOR_PATH)

# Create the Flask application object
app = Flask(__name__)

# Define a route for predictions
@app.route('/predict', methods=['POST'])
def predict():
    """Score one or more loan records posted as JSON.

    Accepted request bodies:
      * a list of record objects: ``[{"col": value, ...}, ...]``
      * a column-oriented object: ``{"col": [v1, v2, ...], ...}``
      * a single record object of scalars: ``{"col": value, ...}``

    Returns:
        ``{"predictions": [...]}`` with HTTP 200 on success,
        ``{"error": "..."}`` with HTTP 400 when the payload cannot be parsed
        or preprocessed, and ``{"error": "..."}`` with HTTP 500 when
        prediction itself fails.
    """
    # silent=True yields None for a missing/invalid JSON body instead of raising
    input_data = request.get_json(silent=True)
    if input_data is None:
        return jsonify({'error': 'Request body must be valid JSON.'}), 400

    # A flat object of scalars is a single record; pandas needs it wrapped in a
    # list, otherwise DataFrame() raises "If using all scalar values, you must
    # pass an index".
    if isinstance(input_data, dict) and not any(
        isinstance(value, (list, dict)) for value in input_data.values()
    ):
        input_data = [input_data]

    try:
        try:
            # Convert input data into a DataFrame and apply the fitted preprocessor
            input_df = pd.DataFrame(input_data)
            input_transformed = preprocessor.transform(input_df)
        except (ValueError, KeyError, TypeError) as e:
            # Malformed payload (wrong shape, missing columns, bad types)
            return jsonify({'error': str(e)}), 400

        # NOTE(review): the loaded model must accept the preprocessor's full
        # output; a model fitted on a reduced feature subset will fail here.
        predictions = model.predict(input_transformed)
    except Exception:
        # Keep the details in the server log; do not leak internals to the client
        app.logger.exception('Prediction failed')
        return jsonify({'error': 'Internal server error during prediction.'}), 500

    # Return predictions as JSON
    return jsonify({'predictions': predictions.tolist()})

# Start Flask's built-in server on port 5000, listening on every network
# interface, when this code runs as the main module.
if __name__ == '__main__':
    app.run(port=5000, host='0.0.0.0')

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
