In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from tabulate import tabulate
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# Function to create JSON-serializable dictionaries
def make_serializable(obj):
    """Fallback converter for ``json.dump(..., default=make_serializable)``.

    ``json`` calls this only for objects it cannot encode natively.

    Args:
        obj: The object the JSON encoder could not handle.

    Returns:
        A JSON-encodable equivalent: nested lists for NumPy arrays, native
        ``int``/``float``/``bool`` for NumPy scalars, a list of row dicts for
        DataFrames, a list of values for Series, and ``str(obj)`` for
        anything else.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # NumPy integer/bool scalars are not subclasses of the Python builtins, so
    # without these branches they would be written as strings ("3", "True").
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, pd.DataFrame):
        return obj.to_dict(orient='records')
    if isinstance(obj, pd.Series):
        return obj.tolist()
    # Last resort: keep the dump from failing on unknown types.
    return str(obj)

# Function to display table from results data
def display_results_table(results, table_name, headers):
    """Print a list of result records as a table.

    Args:
        results: Iterable of dicts, one per table row.
        table_name: Title printed above the table.
        headers: Mapping of ``result`` key -> column title; its order defines
            the column order. Every key must be present in every result.
    """
    def _format_cell(value):
        # The "pretty" table format turns off tabulate's number parsing, so
        # ``floatfmt`` has no effect there; round floats ourselves instead.
        return f"{value:.4f}" if isinstance(value, float) else value

    print(f"\n{table_name} Results:")
    table_data = [
        [_format_cell(result[key]) for key in headers]
        for result in results
    ]
    print(tabulate(table_data, headers=list(headers.values()), tablefmt="pretty"))

# Function to display feature importance from dictionary
def display_feature_importance(feature_importance_dict, title):
    """Print the five highest-ranked features for each model.

    Args:
        feature_importance_dict: Mapping of model name -> data accepted by
            ``pd.DataFrame`` (the callers in this file pass lists of record
            dicts that are already sorted by descending importance).
        title: Label printed in the section heading.
    """
    print(f"\n{title} Feature Importance:")
    for model_name, importance_data in feature_importance_dict.items():
        top = pd.DataFrame(importance_data).head(5).copy()
        # The "pretty" table format turns off tabulate's number parsing, so
        # ``floatfmt`` has no effect there; round float columns ourselves.
        for col in top.select_dtypes(include=[np.floating]).columns:
            top[col] = top[col].map("{:.4f}".format)
        print(f"\nTop 5 Features for {model_name}:")
        print(tabulate(top, headers='keys', tablefmt="pretty"))

# Function to display hyperparameters
def display_hyperparameters(hyperparams_dict, title):
    """Print the selected hyperparameters of every model.

    Args:
        hyperparams_dict: Mapping of model name -> either a plain message
            string (printed verbatim) or a dict of parameter name -> value
            (printed as a two-column table).
        title: Label printed in the section heading.
    """
    print(f"\n{title} Best Hyperparameters:")
    for model_name in hyperparams_dict:
        chosen = hyperparams_dict[model_name]
        print(f"\n{model_name}:")
        if isinstance(chosen, str):
            # Models without a search grid carry a message instead of a dict
            print(chosen)
            continue
        rows = [[param, value] for param, value in chosen.items()]
        print(tabulate(rows, headers=['Parameter', 'Value'], tablefmt="pretty"))

# Function to display confusion matrix
def display_confusion_matrix(cm):
    """Render a confusion matrix as a text table.

    Args:
        cm: Two-dimensional matrix (nested lists or array); rows are labelled
            as actual classes and columns as predicted classes.

    Returns:
        The formatted table as a string (nothing is printed here).
    """
    df_cm = pd.DataFrame(cm)
    # Derive the labels from the matrix shape so that multiclass (or
    # single-class) matrices do not fail; a 2x2 input yields the same
    # 'Actual 0/1' / 'Predicted 0/1' labels as before.
    # NOTE(review): labels are positional indices, which presumably match the
    # class values 0..k-1 — confirm against the target encoding.
    df_cm.index = [f'Actual {i}' for i in range(df_cm.shape[0])]
    df_cm.columns = [f'Predicted {i}' for i in range(df_cm.shape[1])]
    return tabulate(df_cm, headers='keys', tablefmt="pretty")

# Function to display PCA parameters
def display_pca_parameters(pca_params):
    """Print the number of PCA components and the variance they retain.

    Args:
        pca_params: Dict with an ``"n_components"`` entry and an
            ``"explained_variance_ratio"`` iterable of per-component ratios.
    """
    print("\nPCA Parameters:")
    component_count = pca_params["n_components"]
    # Total retained variance is the sum of the per-component ratios
    retained = sum(pca_params['explained_variance_ratio'])
    rows = [
        ["Number of Components", component_count],
        ["Explained Variance", f"{retained*100:.2f}%"],
    ]
    print(tabulate(rows, headers=['Parameter', 'Value'], tablefmt="pretty"))

# Function to create confusion matrix and get classification metrics
def get_classification_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, confusion_matrix)``;
        precision, recall and F1 use the ``'weighted'`` average.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # The three averaged metrics share one call signature
    precision, recall, f1 = (
        scorer(y_true, y_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    cm = confusion_matrix(y_true, y_pred)
    return accuracy, precision, recall, f1, cm

# Function to get regression metrics
def get_regression_metrics(y_true, y_pred):
    """Score regression predictions against the true targets.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(rmse, mae, r2)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    # RMSE is reported instead of MSE
    return (
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Function to calculate feature importance
def get_feature_importance(model, feature_names, pca_components=None):
    """Rank features by the importance a fitted model assigns to them.

    The importance is taken from ``model.feature_importances_`` when present,
    otherwise from the absolute value of ``model.coef_`` (averaged over the
    first axis when ``coef_`` is two-dimensional).

    Args:
        model: Fitted estimator exposing ``feature_importances_`` or ``coef_``.
        feature_names: Names of the original features, one per column.
        pca_components: Optional PCA loading matrix with one row per component
            and one column per original feature. When given, the model is
            assumed to have been trained on the PCA-transformed data and the
            per-component importance is projected back to the original
            features.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns sorted by
        descending importance, or ``None`` if the model exposes neither
        attribute.
    """
    if hasattr(model, 'feature_importances_'):
        importance = np.asarray(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        magnitudes = np.abs(model.coef_)
        importance = magnitudes.mean(axis=0) if magnitudes.ndim > 1 else magnitudes
    else:
        return None

    if pca_components is not None:
        # Weight each feature's absolute loading by the component importance.
        # Using signed loadings would let positive and negative contributions
        # cancel and understate features that load on several components.
        importance = np.dot(importance, np.abs(pca_components))

    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    })
    return feature_importance.sort_values('Importance', ascending=False)

# Main function to run all experiments and save progress
def run_experiments(loan_data_path, real_estate_path, sample_fraction=0.3, random_state=42):
    """Run the classification and regression experiments and record the results.

    Classification (loan data): base classifiers (Table 1), a comparison of
    scaling methods (Table 2), grid search with 10-fold CV on the best scaler
    (Table 3) and the tuned models on PCA-reduced data (Table 4).
    Regression (real-estate data): base regressors (Regression Table 1) and
    grid search with 10-fold CV (Regression Table 2).

    Side effects: prints progress bars and result tables, and rewrites
    ``progress.json`` in the current working directory after every model
    (or after every scaler for Table 2).

    Args:
        loan_data_path: CSV path of the loan data; must contain a
            ``loan_status`` column, which is used as the target.
        real_estate_path: CSV path of the real-estate data; must contain the
            ``No`` and ``Y house price of unit area`` columns.
        sample_fraction: Fraction of each dataset (after dropping rows with
            missing values) that is sampled for the experiments.
        random_state: Seed for sampling, splitting, CV folds and the models
            that accept one.

    Returns:
        The ``progress`` dict holding the results of every table.
    """
    # Local import so this function does not depend on the file's import block
    from sklearn.base import clone

    progress = {
        "Table_1": {"Results": [], "Feature_Importance": {}},
        "Table_2": {"Results": []},
        "Table_3": {"Results": [], "Best_Hyperparameters": {}, "Feature_Importance": {}},
        "Table_4": {"Results": [], "Feature_Importance": {}, "PCA_Parameters": {}},
        "Regression_Table_1": {"Results": [], "Feature_Importance": {}},
        "Regression_Table_2": {"Results": [], "Best_Hyperparameters": {}, "Feature_Importance": {}}
    }

    def save_progress():
        # Checkpoint everything collected so far; the file is overwritten
        with open("progress.json", "w") as json_file:
            json.dump(progress, json_file, indent=4, default=make_serializable)

    def strip_step_prefix(params, prefix):
        # Turn pipeline-style keys ('classifier__C') into estimator keys ('C').
        # Models without a grid store a message string instead of a dict.
        if isinstance(params, str):
            return {}
        return {k[len(prefix):]: v for k, v in params.items() if k.startswith(prefix)}

    # Load and preprocess loan data for classification
    print("Loading and preprocessing loan data for classification...")
    loan_data = pd.read_csv(loan_data_path)
    loan_data = loan_data.dropna()

    # Sample fraction of the dataset
    loan_data = loan_data.sample(frac=sample_fraction, random_state=random_state)
    print(f"Using {len(loan_data)} samples ({sample_fraction*100}% of original dataset)")

    # Convert categorical features to numerical
    cat_cols = ['person_gender', 'person_education', 'person_home_ownership',
                'loan_intent', 'previous_loan_defaults_on_file']
    for col in cat_cols:
        if col in loan_data.columns:
            loan_data[col] = loan_data[col].astype('category').cat.codes

    # Prepare data for modeling
    X = loan_data.drop(['loan_status'], axis=1)
    y = loan_data['loan_status']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    print(f"Train set: {X_train.shape[0]} samples, Test set: {X_test.shape[0]} samples")

    # Define classifiers
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
        'Decision Tree': DecisionTreeClassifier(random_state=random_state),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'Gradient Boosting': GradientBoostingClassifier(random_state=random_state),
        'K-Nearest Neighbors': KNeighborsClassifier(),
        'Naive Bayes': GaussianNB()
    }

    # TABLE 1: Base classifiers without scaling
    print("\n--- TABLE 1: Base classifiers without scaling ---")
    for name in tqdm(classifiers.keys(), desc="Training base classifiers"):
        classifier = classifiers[name]
        # The measured time covers both fitting and predicting
        start_time = time.time()
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        train_time = time.time() - start_time
        accuracy, precision, recall, f1, cm = get_classification_metrics(y_test, y_pred)

        # Store results
        result = {
            "Classifier": name,
            "Accuracy": float(accuracy),
            "Precision": float(precision),
            "Recall": float(recall),
            "F1-Score": float(f1),
            "Time(s)": float(train_time),
            "Confusion_Matrix": cm.tolist()
        }
        progress["Table_1"]["Results"].append(result)

        # Calculate feature importance if available
        if name not in ['K-Nearest Neighbors', 'Naive Bayes']:
            feature_imp = get_feature_importance(classifier, X.columns)
            if feature_imp is not None:
                progress["Table_1"]["Feature_Importance"][name] = feature_imp.to_dict(orient='records')

        # Save progress after each model
        save_progress()

    # Display Table 1 results in console
    display_results_table(
        progress["Table_1"]["Results"],
        "TABLE 1: Base classifiers without scaling",
        {"Classifier": "Classifier", "Accuracy": "Accuracy", "Precision": "Precision",
         "Recall": "Recall", "F1-Score": "F1-Score", "Time(s)": "Time (s)"}
    )

    # Display feature importance
    display_feature_importance(progress["Table_1"]["Feature_Importance"], "TABLE 1")

    # Display confusion matrices
    print("\nTABLE 1: Confusion Matrices:")
    for result in progress["Table_1"]["Results"]:
        print(f"\nConfusion Matrix for {result['Classifier']}:")
        print(display_confusion_matrix(result["Confusion_Matrix"]))

    # TABLE 2: Testing different scaling methods
    print("\n--- TABLE 2: Testing different scaling methods ---")
    scalers = {
        'L1 Normalization': Normalizer(norm='l1'),
        'L2 Normalization': Normalizer(norm='l2'),
        'Min-Max Scaling': MinMaxScaler(),
        'Std Scaling': StandardScaler()
    }

    best_scaler = None
    best_scaler_name = None
    # Start below any reachable accuracy so a scaler is always selected
    best_accuracy = float("-inf")

    for scaler_name in tqdm(scalers.keys(), desc="Testing scaling methods"):
        scaler = scalers[scaler_name]
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        for classifier_name in tqdm(classifiers.keys(), desc=f"Training with {scaler_name}", leave=False):
            classifier = classifiers[classifier_name]
            classifier.fit(X_train_scaled, y_train)
            y_pred = classifier.predict(X_test_scaled)
            accuracy, precision, recall, f1, cm = get_classification_metrics(y_test, y_pred)

            # Store results
            result = {
                "Scaling_Method": scaler_name,
                "Classifier": classifier_name,
                "Accuracy": float(accuracy),
                "Precision": float(precision),
                "Recall": float(recall),
                "F1-Score": float(f1),
                "Confusion_Matrix": cm.tolist()
            }
            progress["Table_2"]["Results"].append(result)

            # Track best scaler performance
            # NOTE(review): the scaler is chosen by its test-set accuracy, so
            # later test scores that use it are optimistically biased.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_scaler = scaler
                best_scaler_name = scaler_name

        # Save progress after each scaler
        save_progress()

    # Display Table 2 results in console
    display_results_table(
        progress["Table_2"]["Results"],
        "TABLE 2: Scaling methods comparison",
        {"Scaling_Method": "Scaling Method", "Classifier": "Classifier", "Accuracy": "Accuracy",
         "Precision": "Precision", "Recall": "Recall", "F1-Score": "F1-Score"}
    )

    print(f"\nBest Scaling Method: {best_scaler_name} (Accuracy: {best_accuracy:.4f})")

    # Apply best scaler for future use
    print(f"\nApplying best scaler ({best_scaler_name}) to the data...")
    X_train_best_scaled = best_scaler.fit_transform(X_train)
    X_test_best_scaled = best_scaler.transform(X_test)

    # TABLE 3: Grid search with 10-fold CV using best scaler
    print("\n--- TABLE 3: Grid search with 10-fold CV using best scaler ---")
    param_grids = {
        'Logistic Regression': {
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__solver': ['liblinear', 'saga']
        },
        'Decision Tree': {
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10]
        },
        'Random Forest': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20]
        },
        'Gradient Boosting': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2]
        },
        'K-Nearest Neighbors': {
            'classifier__n_neighbors': [3, 5, 7, 9],
            'classifier__weights': ['uniform', 'distance']
        },
        'Naive Bayes': {}  # No hyperparameters to tune
    }

    best_models_table3 = {}
    kf = KFold(n_splits=10, shuffle=True, random_state=random_state)

    for name in tqdm(classifiers.keys(), desc="Performing grid search with CV"):
        classifier = classifiers[name]
        # Create pipeline with best scaler. clone() keeps the constructor
        # arguments (e.g. Normalizer(norm='l1')); re-instantiating the class
        # would silently fall back to its defaults.
        pipeline = Pipeline([
            ('scaler', clone(best_scaler)),
            ('classifier', classifier)
        ])

        if name == 'Naive Bayes':
            # For Naive Bayes, just do cross-validation
            cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='accuracy')
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            accuracy, precision, recall, f1, cm = get_classification_metrics(y_test, y_pred)

            # Store results
            result = {
                "Classifier": name,
                "CV_Accuracy": float(np.mean(cv_scores)),
                "Test_Accuracy": float(accuracy),
                "Precision": float(precision),
                "Recall": float(recall),
                "F1-Score": float(f1),
                "Confusion_Matrix": cm.tolist()
            }
            progress["Table_3"]["Results"].append(result)
            progress["Table_3"]["Best_Hyperparameters"][name] = "No hyperparameters to tune"
            best_models_table3[name] = pipeline
        else:
            # Grid search with CV for other classifiers
            grid_search = GridSearchCV(
                pipeline,
                param_grids[name],
                cv=kf,
                scoring='accuracy',
                n_jobs=-1
            )
            grid_search.fit(X_train, y_train)
            y_pred = grid_search.predict(X_test)
            accuracy, precision, recall, f1, cm = get_classification_metrics(y_test, y_pred)

            # Store results
            result = {
                "Classifier": name,
                "CV_Accuracy": float(grid_search.best_score_),
                "Test_Accuracy": float(accuracy),
                "Precision": float(precision),
                "Recall": float(recall),
                "F1-Score": float(f1),
                "Confusion_Matrix": cm.tolist()
            }
            progress["Table_3"]["Results"].append(result)
            progress["Table_3"]["Best_Hyperparameters"][name] = grid_search.best_params_
            best_models_table3[name] = grid_search.best_estimator_

            # Calculate feature importance for the best model
            if name not in ['K-Nearest Neighbors', 'Naive Bayes']:
                try:
                    feature_imp = get_feature_importance(grid_search.best_estimator_.named_steps['classifier'], X.columns)
                    if feature_imp is not None:
                        progress["Table_3"]["Feature_Importance"][name] = feature_imp.to_dict(orient='records')
                except Exception as e:
                    # Best effort: report the failure instead of hiding it
                    print(f"Error calculating feature importance for {name}: {e}")

        # Save progress after each model
        save_progress()

    # Display Table 3 results in console
    display_results_table(
        progress["Table_3"]["Results"],
        "TABLE 3: Grid search with 10-fold CV",
        {"Classifier": "Classifier", "CV_Accuracy": "CV Accuracy", "Test_Accuracy": "Test Accuracy",
         "Precision": "Precision", "Recall": "Recall", "F1-Score": "F1-Score"}
    )

    # Display best hyperparameters
    display_hyperparameters(progress["Table_3"]["Best_Hyperparameters"], "TABLE 3")

    # Display feature importance
    display_feature_importance(progress["Table_3"]["Feature_Importance"], "TABLE 3")

    # Display confusion matrices
    print("\nTABLE 3: Confusion Matrices:")
    for result in progress["Table_3"]["Results"]:
        print(f"\nConfusion Matrix for {result['Classifier']}:")
        print(display_confusion_matrix(result["Confusion_Matrix"]))

    # TABLE 4: PCA with best scaling method
    print("\n--- TABLE 4: PCA with best scaling method ---")
    pca = PCA(n_components=0.95)  # Keep 95% of variance
    X_train_pca = pca.fit_transform(X_train_best_scaled)
    X_test_pca = pca.transform(X_test_best_scaled)
    print(f"Reduced from {X_train.shape[1]} to {X_train_pca.shape[1]} features")

    # Store PCA parameters
    progress["Table_4"]["PCA_Parameters"] = {
        "n_components": int(pca.n_components_),
        "explained_variance_ratio": pca.explained_variance_ratio_.tolist(),
        "singular_values": pca.singular_values_.tolist()
    }

    # Display PCA parameters
    display_pca_parameters(progress["Table_4"]["PCA_Parameters"])

    for name in tqdm(classifiers.keys(), desc="Training with PCA-transformed data"):
        # Use best parameters from Table 3: start from an unfitted copy of the
        # base classifier (same max_iter / random_state) and apply the tuned
        # values on top of it.
        tuned_params = strip_step_prefix(
            progress["Table_3"]["Best_Hyperparameters"][name], 'classifier__'
        )
        model = clone(classifiers[name]).set_params(**tuned_params)

        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        accuracy, precision, recall, f1, cm = get_classification_metrics(y_test, y_pred)

        # Store results
        result = {
            "Classifier": name,
            "Accuracy": float(accuracy),
            "Precision": float(precision),
            "Recall": float(recall),
            "F1-Score": float(f1),
            "Confusion_Matrix": cm.tolist()
        }
        progress["Table_4"]["Results"].append(result)

        # Calculate feature importance for PCA models
        if name not in ['K-Nearest Neighbors', 'Naive Bayes']:
            try:
                # For PCA, we need to map back to original features
                feature_imp = get_feature_importance(model, X.columns, pca.components_)
                if feature_imp is not None:
                    progress["Table_4"]["Feature_Importance"][name] = feature_imp.to_dict(orient='records')
            except Exception as e:
                print(f"Error calculating feature importance for {name}: {e}")

        # Save progress after each model
        save_progress()

    # Display Table 4 results in console
    display_results_table(
        progress["Table_4"]["Results"],
        "TABLE 4: PCA with best scaling method",
        {"Classifier": "Classifier", "Accuracy": "Accuracy", "Precision": "Precision",
         "Recall": "Recall", "F1-Score": "F1-Score"}
    )

    # Display feature importance
    display_feature_importance(progress["Table_4"]["Feature_Importance"], "TABLE 4")

    # Display confusion matrices
    print("\nTABLE 4: Confusion Matrices:")
    for result in progress["Table_4"]["Results"]:
        print(f"\nConfusion Matrix for {result['Classifier']}:")
        print(display_confusion_matrix(result["Confusion_Matrix"]))

    # REGRESSION ANALYSIS
    print("\n\n======== REGRESSION ANALYSIS ========\n")
    print("Loading and preprocessing real estate data for regression...")
    real_estate = pd.read_csv(real_estate_path)
    real_estate = real_estate.dropna()

    # Sample fraction of the dataset
    real_estate = real_estate.sample(frac=sample_fraction, random_state=random_state)
    print(f"Using {len(real_estate)} samples ({sample_fraction*100}% of original dataset)")

    # Prepare data for modeling
    X_reg = real_estate.drop(['No', 'Y house price of unit area'], axis=1)
    y_reg = real_estate['Y house price of unit area']
    X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=random_state)
    print(f"Train set: {X_reg_train.shape[0]} samples, Test set: {X_reg_test.shape[0]} samples")

    # Define regressors
    regressors = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(random_state=random_state),
        'Lasso Regression': Lasso(random_state=random_state),
        'ElasticNet': ElasticNet(random_state=random_state),
        'Decision Tree': DecisionTreeRegressor(random_state=random_state),
        'Random Forest': RandomForestRegressor(random_state=random_state)
    }

    # REGRESSION TABLE 1: Base regressors
    print("\n--- REGRESSION TABLE 1: Base regressors ---")
    for name in tqdm(regressors.keys(), desc="Training base regressors"):
        regressor = regressors[name]
        # The measured time covers both fitting and predicting
        start_time = time.time()
        regressor.fit(X_reg_train, y_reg_train)
        y_reg_pred = regressor.predict(X_reg_test)
        train_time = time.time() - start_time
        rmse, mae, r2 = get_regression_metrics(y_reg_test, y_reg_pred)

        # Store results
        result = {
            "Regressor": name,
            "RMSE": float(rmse),
            "MAE": float(mae),
            "R²": float(r2),
            "Time(s)": float(train_time)
        }
        progress["Regression_Table_1"]["Results"].append(result)

        # Calculate feature importance if available
        feature_imp = get_feature_importance(regressor, X_reg.columns)
        if feature_imp is not None:
            progress["Regression_Table_1"]["Feature_Importance"][name] = feature_imp.to_dict(orient='records')

        # Save progress after each model
        save_progress()

    # Display Regression Table 1 results in console
    display_results_table(
        progress["Regression_Table_1"]["Results"],
        "REGRESSION TABLE 1: Base regressors",
        {"Regressor": "Regressor", "RMSE": "RMSE", "MAE": "MAE", "R²": "R²", "Time(s)": "Time (s)"}
    )

    # Display feature importance
    display_feature_importance(progress["Regression_Table_1"]["Feature_Importance"], "REGRESSION TABLE 1")

    # REGRESSION TABLE 2: Grid search with 10-fold CV
    print("\n--- REGRESSION TABLE 2: Grid search with 10-fold CV ---")
    reg_param_grids = {
        'Linear Regression': {},  # No hyperparameters to tune
        'Ridge Regression': {
            'regressor__alpha': [0.01, 0.1, 1, 10, 100]
        },
        'Lasso Regression': {
            'regressor__alpha': [0.01, 0.1, 1, 10, 100]
        },
        'ElasticNet': {
            'regressor__alpha': [0.01, 0.1, 1, 10, 100],
            'regressor__l1_ratio': [0.1, 0.5, 0.9]
        },
        'Decision Tree': {
            'regressor__max_depth': [None, 10, 20, 30],
            'regressor__min_samples_split': [2, 5, 10]
        },
        'Random Forest': {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_depth': [None, 10, 20]
        }
    }

    kf = KFold(n_splits=10, shuffle=True, random_state=random_state)

    for name in tqdm(regressors.keys(), desc="Performing grid search for regressors"):
        regressor = regressors[name]
        pipeline = Pipeline([('regressor', regressor)])

        if name == 'Linear Regression':
            # Linear Regression doesn't have hyperparameters to tune
            cv_scores = cross_val_score(pipeline, X_reg_train, y_reg_train, cv=kf,
                                      scoring='neg_mean_squared_error')
            pipeline.fit(X_reg_train, y_reg_train)
            y_reg_pred = pipeline.predict(X_reg_test)
            rmse, mae, r2 = get_regression_metrics(y_reg_test, y_reg_pred)

            # Store results (scores are negated MSE, hence the sign flip)
            result = {
                "Regressor": name,
                "CV_RMSE": float(np.sqrt(-np.mean(cv_scores))),
                "Test_RMSE": float(rmse),
                "Test_MAE": float(mae),
                "Test_R²": float(r2)
            }
            progress["Regression_Table_2"]["Results"].append(result)
            progress["Regression_Table_2"]["Best_Hyperparameters"][name] = "No hyperparameters to tune"
        else:
            grid_search = GridSearchCV(
                pipeline,
                reg_param_grids[name],
                cv=kf,
                scoring='neg_mean_squared_error',
                n_jobs=-1
            )
            grid_search.fit(X_reg_train, y_reg_train)
            y_reg_pred = grid_search.predict(X_reg_test)
            rmse, mae, r2 = get_regression_metrics(y_reg_test, y_reg_pred)

            # Store results (best_score_ is negated MSE, hence the sign flip)
            result = {
                "Regressor": name,
                "CV_RMSE": float(np.sqrt(-grid_search.best_score_)),
                "Test_RMSE": float(rmse),
                "Test_MAE": float(mae),
                "Test_R²": float(r2)
            }
            progress["Regression_Table_2"]["Results"].append(result)
            progress["Regression_Table_2"]["Best_Hyperparameters"][name] = grid_search.best_params_

            # Calculate feature importance for the best model
            try:
                feature_imp = get_feature_importance(grid_search.best_estimator_.named_steps['regressor'], X_reg.columns)
                if feature_imp is not None:
                    progress["Regression_Table_2"]["Feature_Importance"][name] = feature_imp.to_dict(orient='records')
            except Exception as e:
                # Best effort: report the failure instead of hiding it
                print(f"Error calculating feature importance for {name}: {e}")

        # Save progress after each model
        save_progress()

    # Display Regression Table 2 results in console
    display_results_table(
        progress["Regression_Table_2"]["Results"],
        "REGRESSION TABLE 2: Grid search with 10-fold CV",
        {"Regressor": "Regressor", "CV_RMSE": "CV RMSE", "Test_RMSE": "Test RMSE",
         "Test_MAE": "Test MAE", "Test_R²": "Test R²"}
    )

    # Display best hyperparameters
    display_hyperparameters(progress["Regression_Table_2"]["Best_Hyperparameters"], "REGRESSION TABLE 2")

    # Display feature importance
    display_feature_importance(progress["Regression_Table_2"]["Feature_Importance"], "REGRESSION TABLE 2")

    return progress

# Entry point if running as a script
if __name__ == "__main__":
    # Input CSV files, resolved relative to the current working directory
    loan_data_path, real_estate_path = 'loan_data.csv', 'Real-estate.csv'

    # Execute every experiment with the default sampling fraction and seed
    progress = run_experiments(
        loan_data_path=loan_data_path,
        real_estate_path=real_estate_path,
    )

    print("All experiments completed and saved to progress.json")


Loading and preprocessing loan data for classification...
Using 13500 samples (30.0% of original dataset)
Train set: 10800 samples, Test set: 2700 samples

--- TABLE 1: Base classifiers without scaling ---


Training base classifiers: 100%|██████████| 6/6 [00:05<00:00,  1.04it/s]



TABLE 1: Base classifiers without scaling Results:
+---------------------+--------------------+--------------------+--------------------+--------------------+---------------------+
|     Classifier      |      Accuracy      |     Precision      |       Recall       |      F1-Score      |      Time (s)       |
+---------------------+--------------------+--------------------+--------------------+--------------------+---------------------+
| Logistic Regression | 0.8792592592592593 | 0.8792592592592593 | 0.8792592592592593 | 0.8792592592592593 | 1.4425785541534424  |
|    Decision Tree    | 0.8762962962962964 | 0.8794726374598245 | 0.8762962962962964 | 0.8776308978479803 | 0.0583651065826416  |
|    Random Forest    | 0.9207407407407407 | 0.9192506112486195 | 0.9207407407407407 | 0.9187391344824295 | 1.3788549900054932  |
|  Gradient Boosting  | 0.9159259259259259 | 0.9141592968259635 | 0.9159259259259259 | 0.9138306535753372 |  2.503533124923706  |
| K-Nearest Neighbors | 0.819629629629

Testing scaling methods:   0%|          | 0/4 [00:00<?, ?it/s]
Training with L1 Normalization:   0%|          | 0/6 [00:00<?, ?it/s][A
Training with L1 Normalization:  33%|███▎      | 2/6 [00:00<00:00,  9.15it/s][A
Training with L1 Normalization:  50%|█████     | 3/6 [00:03<00:04,  1.38s/it][A
Training with L1 Normalization:  67%|██████▋   | 4/6 [00:09<00:06,  3.21s/it][A
Training with L1 Normalization:  83%|████████▎ | 5/6 [00:09<00:02,  2.17s/it][A
Testing scaling methods:  25%|██▌       | 1/4 [00:09<00:29,  9.94s/it]
Training with L2 Normalization:   0%|          | 0/6 [00:00<?, ?it/s][A
Training with L2 Normalization:  33%|███▎      | 2/6 [00:00<00:00, 12.69it/s][A
Training with L2 Normalization:  67%|██████▋   | 4/6 [00:10<00:05,  2.94s/it][A
Training with L2 Normalization:  83%|████████▎ | 5/6 [00:10<00:02,  2.12s/it][A
Testing scaling methods:  50%|█████     | 2/4 [00:20<00:20, 10.13s/it]
Training with Min-Max Scaling:   0%|          | 0/6 [00:00<?, ?it/s][A
Training w


TABLE 2: Scaling methods comparison Results:
+------------------+---------------------+--------------------+--------------------+--------------------+--------------------+
|  Scaling Method  |     Classifier      |      Accuracy      |     Precision      |       Recall       |      F1-Score      |
+------------------+---------------------+--------------------+--------------------+--------------------+--------------------+
| L1 Normalization | Logistic Regression | 0.7911111111111111 | 0.7832175200278649 | 0.7911111111111111 | 0.7370912806213931 |
| L1 Normalization |    Decision Tree    |        0.88        | 0.8793725868725869 |        0.88        | 0.879671204231698  |
| L1 Normalization |    Random Forest    | 0.9111111111111111 | 0.9091558589357904 | 0.9111111111111111 | 0.9085063503538883 |
| L1 Normalization |  Gradient Boosting  | 0.9103703703703704 | 0.9083086993970715 | 0.9103703703703704 | 0.9079870563821182 |
| L1 Normalization | K-Nearest Neighbors | 0.8177777777777778 | 0

Performing grid search with CV: 100%|██████████| 6/6 [05:45<00:00, 57.56s/it]



TABLE 3: Grid search with 10-fold CV Results:
+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     Classifier      |    CV Accuracy     |   Test Accuracy    |     Precision      |       Recall       |      F1-Score      |
+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| Logistic Regression | 0.8897222222222222 | 0.8844444444444445 | 0.8833926454370002 | 0.8844444444444445 | 0.8838688218141167 |
|    Decision Tree    | 0.9080555555555556 | 0.9018518518518519 | 0.8992776283808447 | 0.9018518518518519 | 0.8990762172462616 |
|    Random Forest    | 0.9222222222222223 | 0.9203703703703704 | 0.9188014315857419 | 0.9203703703703704 |  0.91859516456305  |
|  Gradient Boosting  | 0.9249074074074073 | 0.9225925925925926 | 0.9211184783336682 | 0.9225925925925926 | 0.9209671821287903 |
| K-Nearest Neighbors | 0.889351851851852  | 0.882

Training with PCA-transformed data: 100%|██████████| 6/6 [00:21<00:00,  3.59s/it]



TABLE 4: PCA with best scaling method Results:
+---------------------+--------------------+--------------------+--------------------+--------------------+
|     Classifier      |      Accuracy      |     Precision      |       Recall       |      F1-Score      |
+---------------------+--------------------+--------------------+--------------------+--------------------+
| Logistic Regression |        0.88        | 0.8797437252069712 |        0.88        | 0.8798694602896007 |
|    Decision Tree    | 0.8677777777777778 | 0.8634108983465202 | 0.8677777777777778 | 0.864570090177342  |
|    Random Forest    | 0.8933333333333333 | 0.8903870716510903 | 0.8933333333333333 | 0.8908508840028213 |
|  Gradient Boosting  | 0.892962962962963  | 0.8909266289783265 | 0.892962962962963  | 0.8916488095388704 |
| K-Nearest Neighbors | 0.8811111111111111 | 0.8774147854147853 | 0.8811111111111111 | 0.8781481929413358 |
|     Naive Bayes     | 0.8618518518518519 | 0.8558244000557992 | 0.8618518518518519 |  

Training base regressors: 100%|██████████| 6/6 [00:00<00:00, 16.49it/s]



REGRESSION TABLE 1: Base regressors Results:
+-------------------+--------------------+-------------------+---------------------+-----------------------+
|     Regressor     |        RMSE        |        MAE        |         R²          |       Time (s)        |
+-------------------+--------------------+-------------------+---------------------+-----------------------+
| Linear Regression | 10.424409933130125 | 7.034129722255646 | 0.2801434596539847  | 0.022675514221191406  |
| Ridge Regression  | 9.508211147175013  | 7.217743884488114 | 0.40111887306800165 | 0.005811929702758789  |
| Lasso Regression  |  9.58400225056513  | 7.219431286672506 | 0.3915333128090954  | 0.0052068233489990234 |
|    ElasticNet     | 9.549598665114218  | 7.181152842500624 | 0.39589388428243555 |  0.00445866584777832  |
|   Decision Tree   | 11.216487863854711 | 7.303999999999999 | 0.16659371053877214 | 0.004698038101196289  |
|   Random Forest   | 8.958242827824217  | 6.369933714285715 | 0.4683954944123905 

Performing grid search for regressors: 100%|██████████| 6/6 [00:18<00:00,  3.03s/it]


REGRESSION TABLE 2: Grid search with 10-fold CV Results:
+-------------------+--------------------+--------------------+--------------------+---------------------+
|     Regressor     |      CV RMSE       |     Test RMSE      |      Test MAE      |       Test R²       |
+-------------------+--------------------+--------------------+--------------------+---------------------+
| Linear Regression | 8.490477379129779  | 10.424409933130125 | 7.034129722255646  | 0.2801434596539847  |
| Ridge Regression  | 8.591346645860101  | 9.833963958856085  |  7.08641647474216  | 0.3593804078238685  |
| Lasso Regression  | 8.540316031991022  | 9.986616215508757  | 7.0640211927901975 | 0.3393374144170136  |
|    ElasticNet     | 8.710321982928066  | 9.549598665114218  | 7.181152842500624  | 0.39589388428243555 |
|   Decision Tree   | 9.464176532964052  | 8.965080804583048  | 6.534336507936507  | 0.46758361942975024 |
|   Random Forest   | 7.6937409376709995 | 8.981587559071642  | 6.402131587301589  | 0




In [None]:
import json
import pandas as pd
import numpy as np

# Load progress.json, the results file whose "Table_4" section is read below
with open('progress.json', 'r') as file:
    data = json.load(file)

# Pair every Table 4 (PCA models) classifier with its accuracy
table4_results = [
    (entry["Classifier"], entry["Accuracy"])
    for entry in data["Table_4"]["Results"]
]

# A single max() call yields both the name and the accuracy of the winner;
# on ties max() keeps the first entry it encounters.
best_model_name, best_accuracy = max(table4_results, key=lambda pair: pair[1])
print(f"Using best model from Table 4: {best_model_name} with accuracy {best_accuracy:.4f}")

# Feature-importance records for the winning model (empty list when absent)
feature_importance = data["Table_4"]["Feature_Importance"].get(best_model_name, [])
if feature_importance:
    # Highest importance first
    feature_importance = sorted(
        feature_importance,
        key=lambda record: record["Importance"],
        reverse=True,
    )
    print("\nTop 5 features by importance:")
    for rank, feature in enumerate(feature_importance[:5], start=1):
        print(f"{rank}. {feature['Feature']}: {feature['Importance']:.4f}")

# Helper to read one numeric answer from the user
def _ask_number(prompt):
    """Show *prompt* and return the reply converted with ``float``.

    Raises:
        ValueError: If the reply is not a valid float literal.
    """
    return float(input(prompt))


# Helper to read one categorical answer from the user
def _ask_choice(prompt, options, default_key):
    """Show *prompt* and return the integer code for the chosen category.

    The reply is stripped and lower-cased before the lookup, so answers such
    as " Male " or "YES" still match. An unrecognized reply falls back to
    ``options[default_key]``; a warning is printed so the substitution is
    visible instead of silently changing the outcome.
    """
    reply = input(prompt).strip().lower()
    if reply in options:
        return options[reply]
    print(f"Warning: unrecognized value {reply!r}; defaulting to {default_key!r}.")
    return options[default_key]


# Helper to turn the collected answers into a rule-based risk score
def _compute_risk_score(previous_loan_defaults, loan_amnt, loan_percent_income,
                        loan_int_rate, person_income, credit_score):
    """Return the integer risk score for one application.

    The score is a sum of fixed, hard-coded penalties and bonuses; no trained
    model and no data loaded from disk is consulted here.
    """
    score = 0

    # Previous loan defaults (major risk factor)
    if previous_loan_defaults == 1:
        score += 5

    # Loan amount
    if loan_amnt > 25000:
        score += 3
    elif loan_amnt > 15000:
        score += 2
    elif loan_amnt > 5000:
        score += 1

    # Loan as a fraction of income
    if loan_percent_income > 0.5:
        score += 3
    elif loan_percent_income > 0.3:
        score += 2
    elif loan_percent_income > 0.2:
        score += 1

    # Loan interest rate
    if loan_int_rate > 15:
        score += 2
    elif loan_int_rate > 10:
        score += 1

    # Income (high income lowers the score)
    if person_income < 30000:
        score += 2
    elif person_income < 60000:
        score += 1
    elif person_income > 100000:
        score -= 1

    # Credit score (a high score lowers the risk)
    if credit_score < 580:
        score += 2
    elif credit_score < 670:
        score += 1
    elif credit_score > 740:
        score -= 1

    return score


# Function to predict loan approval based on user input
def predict_loan_approval():
    """Interactively collect a loan application and score it with fixed rules.

    The user is prompted on stdin for every field. Categorical answers are
    matched case-insensitively after stripping surrounding whitespace;
    unrecognized answers fall back to a default with a printed warning.

    Returns:
        A 4-tuple ``(prediction, confidence, loan_application, risk_score)``:
        ``prediction`` is 0 (approved) or 1 (not approved), ``confidence`` is
        a float capped at 0.95, ``loan_application`` is a one-row DataFrame of
        the encoded answers and ``risk_score`` is the integer rule score.
        All four are ``None`` when input could not be read or parsed.
    """
    print("\n===== LOAN APPROVAL PREDICTION =====")

    try:
        # Collect all inputs from user matching loan_data.csv columns
        person_age = _ask_number("Enter person's age: ")

        person_gender = _ask_choice(
            "Enter gender (male/female): ",
            {"male": 1, "female": 0},
            "male",
        )

        person_education = _ask_choice(
            "Enter education level (high school/associate/bachelor/master/doctorate): ",
            {
                "high school": 0, "associate": 1, "bachelor": 2,
                "master": 3, "doctorate": 4,
            },
            "bachelor",
        )

        person_income = _ask_number("Enter annual income ($): ")
        person_emp_length = _ask_number("Enter employment length (years): ")

        person_home_ownership = _ask_choice(
            "Enter home ownership (rent/own/mortgage/other): ",
            {"rent": 0, "own": 1, "mortgage": 2, "other": 3},
            "rent",
        )

        loan_amnt = _ask_number("Enter loan amount ($): ")

        loan_intent = _ask_choice(
            "Enter loan intent (personal/education/medical/venture/home improvement/debt consolidation): ",
            {
                "personal": 0, "education": 1, "medical": 2, "venture": 3,
                "home improvement": 4, "debt consolidation": 5,
            },
            "personal",
        )

        loan_int_rate = _ask_number("Enter loan interest rate (%): ")
        loan_percent_income = _ask_number("Enter loan percent of income (0-1): ")
        cb_person_cred_hist_length = _ask_number("Enter credit history length (years): ")
        credit_score = _ask_number("Enter credit score (300-850): ")

        previous_loan_defaults = _ask_choice(
            "Previous loan defaults? (yes/no): ",
            {"yes": 1, "no": 0},
            "no",
        )

        # Create a DataFrame with user inputs
        loan_application = pd.DataFrame({
            'person_age': [person_age],
            'person_gender': [person_gender],
            'person_education': [person_education],
            'person_income': [person_income],
            'person_emp_exp': [person_emp_length],
            'person_home_ownership': [person_home_ownership],
            'loan_amnt': [loan_amnt],
            'loan_intent': [loan_intent],
            'loan_int_rate': [loan_int_rate],
            'loan_percent_income': [loan_percent_income],
            'cb_person_cred_hist_length': [cb_person_cred_hist_length],
            'credit_score': [credit_score],
            'previous_loan_defaults_on_file': [previous_loan_defaults],
        })

        risk_score = _compute_risk_score(
            previous_loan_defaults=previous_loan_defaults,
            loan_amnt=loan_amnt,
            loan_percent_income=loan_percent_income,
            loan_int_rate=loan_int_rate,
            person_income=person_income,
            credit_score=credit_score,
        )

        # Make prediction (0 = approved, 1 = not approved)
        threshold = 4
        prediction = 1 if risk_score > threshold else 0
        # Confidence grows with the distance from the threshold, capped at 0.95
        confidence = min(abs(risk_score - threshold) / 5 + 0.5, 0.95)

        return prediction, confidence, loan_application, risk_score

    except ValueError:
        print("Error: Please enter valid numerical values.")
        return None, None, None, None
    except Exception as e:
        # Interactive boundary: report any other failure (e.g. stdin is
        # unavailable) and signal it to the caller with an all-None result.
        print(f"An error occurred: {e}")
        return None, None, None, None

# Run the interactive questionnaire; all four values are None when it failed
prediction, confidence, loan_application, risk_score = predict_loan_approval()

# Print the report only when the questionnaire produced a result
if prediction is not None:
    # prediction == 1 means the application was rejected
    verdict = 'NOT APPROVED' if prediction == 1 else 'APPROVED'

    summary_section = [
        "\n===== LOAN APPLICATION SUMMARY =====",
        loan_application.to_string(index=False),
    ]
    # NOTE(review): predict_loan_approval() scores with fixed rule thresholds
    # and never calls a trained model, so the "based on the ... model" line
    # below may overstate what was used -- confirm the intended wording.
    result_section = [
        "\n===== PREDICTION RESULT =====",
        f"Risk Score: {risk_score:.2f}",
        f"Loan Status Prediction: {verdict}",
        f"Confidence: {confidence:.2f}",
        f"\nThis prediction is based on the {best_model_name} model (accuracy: {best_accuracy:.4f}) from PCA analysis",
    ]

    for report_line in summary_section + result_section:
        print(report_line)


Using best model from Table 4: Random Forest with accuracy 0.8933

Top 5 features by importance:
1. loan_amnt: 0.2621
2. loan_percent_income: 0.1573
3. person_income: 0.1468
4. loan_int_rate: 0.1419
5. person_home_ownership: 0.0812

===== LOAN APPROVAL PREDICTION =====
Enter person's age: 34
Enter gender (male/female): male
Enter education level (high school/associate/bachelor/master/doctorate): master
Enter annual income ($): 45000
Enter employment length (years): 5
Enter home ownership (rent/own/mortgage/other): rent
Enter loan amount ($): 300000
Enter loan intent (personal/education/medical/venture/home improvement/debt consolidation): education
Enter loan interest rate (%): 3.2
Enter loan percent of income (0-1): 1
Enter credit history length (years): 3
Enter credit score (300-850): 350
Previous loan defaults? (yes/no): no

===== LOAN APPLICATION SUMMARY =====
 person_age  person_gender  person_education  person_income  person_emp_exp  person_home_ownership  loan_amnt  loan_intent 