The file paths in this code have been changed from the absolute paths used in the original version to relative paths, so some path-related bugs may remain. Please check the paths before running.

Please note that the cleaned-data and sample-generation parts (1-3 and 1-4) only need to be executed once.

In [None]:
import pandas as pd
import os

# Raw input table and the columns (target first, then covariates) to keep.
file_path = "sampleddata/USKSAT_OpenRefined.csv"
selected_features = [
    "Ksat_cmhr",
    "Db",
    "Clay",
    "VFS",
    "MS",
    "OC",
    "Silt",
    "COS",
    "FS",
    "Depth.cm_Top",
    "VCOS",
]

# Load the table, restrict it to the selected columns, drop every row that
# contains a NaN, and renumber the remaining rows from 0.
df = pd.read_csv(file_path)
df = df[selected_features].dropna().reset_index(drop=True)

# Build the output path: same directory, "_cleaned" inserted before the extension.
file_dir, file_name = os.path.split(file_path)
file_name_wo_ext, file_ext = os.path.splitext(file_name)
new_file_name = file_name_wo_ext + "_cleaned" + file_ext
new_file_path = os.path.join(file_dir, new_file_name)

# Write the cleaned table; the row index is stored as the first CSV column.
df.to_csv(new_file_path)

The cells from here on can be run independently of the cells above. This is the sampling section.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import skfuzzy as fuzz
import os
from joblib import Parallel, delayed

In [None]:
# Specify the file path
file_path = "sampleddata/USKSAT_OpenRefined_cleaned.csv"

# Read the CSV file. The cleaned file stores the row index as its first
# column, so it is loaded back as the index (as the model-construction cells
# do). Without index_col=0 that column would come back as an extra
# "Unnamed: 0" data column and be treated as a covariate by the sampling code.
df = pd.read_csv(file_path, index_col=0)

Fuzzy c-means clustering sampling

Type 2: The number of clusters equals the number of samples to draw (the sample size)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import skfuzzy as fuzz
import os
from joblib import Parallel, delayed

# Sampling design: the target sample sizes and how many replicate sets to
# generate for each of them.
sample_sizes = [1000, 5000, 10000]
num_samples_per_size = 20

# Covariates only: the target column Ksat_cmhr is excluded from the data used
# for sampling.
df_nonKsat = df.drop("Ksat_cmhr", axis=1)

# Fit the scaler on the covariates and standardize them.
scaler = StandardScaler().fit(df_nonKsat)
df_scaled = scaler.transform(df_nonKsat)


# Define the function to generate sample sets
def generate_samples(sample_size, random_state):
    """
    Select rows of ``df`` by fuzzy c-means clustering with one row per cluster.

    The module-level ``df_scaled`` array is clustered into ``sample_size``
    clusters. Each cluster then claims the row with the highest membership in
    that cluster. When the preferred row is already held by another cluster,
    the cluster with the strictly higher membership keeps it and the losing
    cluster looks for a replacement.

    Parameters:
    - sample_size: number of clusters to form (one row is requested per cluster)
    - random_state: value passed to ``fuzz.cluster.cmeans`` as ``seed``

    Returns:
    A copy of the rows of ``df`` whose labels are in the selected index set.
    The rows are listed in the iteration order of that set.
    """
    n_clusters = sample_size  # Set the number of clusters equal to the sample size

    # Perform fuzzy c-means clustering. Only the second return value (u) is
    # kept; below it is indexed as u[cluster, row]. The positional 2 is the
    # third argument of cmeans (the fuzziness exponent m in scikit-fuzzy).
    _, u, _, _, _, _, _ = fuzz.cluster.cmeans(
        df_scaled.T,
        n_clusters,
        2,
        error=0.005,
        maxiter=1000,
        init=None,
        seed=random_state,
    )

    # Initialize bookkeeping
    sampled_indices = set()  # row positions claimed by any cluster
    sampled_cluster = np.full(
        df_scaled.shape[0], -1
    )  # -1 means not sampled, others indicate the corresponding cluster
    sample_status = np.full(
        (df_scaled.shape[0], n_clusters), 0
    )  # 0: not sampled, 1: sampled, 2: reclassified to another cluster after being sampled
    remaining_samples_per_cluster = np.ones(
        n_clusters, dtype=int
    )  # Only 1 sample is taken from each cluster

    def resample_cluster(
        cluster_idx,
        sampled_indices,
        sampled_cluster,
        sample_status,
        remaining_samples_per_cluster,
    ):
        """
        Find a replacement row for a cluster whose row was taken by another cluster.

        Candidates are the rows whose status for this cluster is still 0, tried
        in order of decreasing membership. The function returns after the first
        successful claim. All array/set arguments are modified in place.

        Parameters:
        - cluster_idx: index of the current cluster
        - sampled_indices: set of sampled indices
        - sampled_cluster: cluster assignment for each sample
        - sample_status: status of each sample in each cluster
        - remaining_samples_per_cluster: remaining sample count for each cluster
        """
        # NOTE(review): every steal below triggers a nested call for the
        # losing cluster; the recursion depth depends on the data — confirm it
        # stays below Python's recursion limit for the largest sample size.
        # Select points this cluster has not held before (status 0)
        cluster_indices_j = np.where(sample_status[:, cluster_idx] == 0)[0]
        # Sort by membership, highest first
        memberships_j = u[cluster_idx, cluster_indices_j]
        sorted_indices_j = cluster_indices_j[np.argsort(-memberships_j)]
        # Sample
        for idx_j in sorted_indices_j:
            if remaining_samples_per_cluster[cluster_idx] <= 0:
                break
            if sample_status[idx_j, cluster_idx] == 0:
                if sampled_cluster[idx_j] == -1:
                    # If the sample has not been sampled, sample it directly
                    sampled_indices.add(idx_j)
                    sampled_cluster[idx_j] = cluster_idx
                    sample_status[idx_j, cluster_idx] = 1  # Sampled
                    remaining_samples_per_cluster[cluster_idx] -= 1
                    return
                else:
                    # If the sample has already been sampled by another cluster, handle membership conflict
                    original_cluster = sampled_cluster[idx_j]
                    # Take the row over only when this cluster's membership is strictly higher
                    if u[cluster_idx, idx_j] > u[original_cluster, idx_j]:
                        sampled_cluster[idx_j] = cluster_idx
                        sample_status[idx_j, original_cluster] = (
                            2  # Reclassified to another cluster after being sampled
                        )
                        sample_status[idx_j, cluster_idx] = 1  # Sampled
                        remaining_samples_per_cluster[cluster_idx] -= 1
                        remaining_samples_per_cluster[original_cluster] += 1
                        # Resample the cluster from which the sample was removed
                        resample_cluster(
                            original_cluster,
                            sampled_indices,
                            sampled_cluster,
                            sample_status,
                            remaining_samples_per_cluster,
                        )
                        return

    # Sample based on membership and handle membership conflicts
    for i in range(n_clusters):
        cluster_indices = np.arange(df_scaled.shape[0])  # Select all points
        if len(cluster_indices) > 0:
            # Sort by membership, highest first
            memberships = u[i, cluster_indices]
            sorted_indices = cluster_indices[np.argsort(-memberships)]
            # Sample
            for idx in sorted_indices:
                # Stop as soon as cluster i holds its one row
                if remaining_samples_per_cluster[i] <= 0:
                    break
                if sampled_cluster[idx] == -1:
                    sampled_indices.add(idx)
                    sampled_cluster[idx] = i
                    sample_status[idx, i] = 1  # Sampled
                    remaining_samples_per_cluster[i] -= 1
                else:
                    # If the sample has already been sampled by another cluster, handle membership conflict
                    current_cluster = sampled_cluster[idx]
                    if u[i, idx] > u[current_cluster, idx]:
                        sampled_cluster[idx] = i
                        sample_status[idx, current_cluster] = (
                            2  # Reclassified to another cluster after being sampled
                        )
                        sample_status[idx, i] = 1  # Sampled
                        remaining_samples_per_cluster[i] -= 1
                        remaining_samples_per_cluster[current_cluster] += 1
                        # Resample the cluster from which the sample was removed
                        resample_cluster(
                            current_cluster,
                            sampled_indices,
                            sampled_cluster,
                            sample_status,
                            remaining_samples_per_cluster,
                        )

    # Get the sampled data (label-based lookup of the collected indices in df)
    sampled_df = df.loc[list(sampled_indices)].copy()
    return sampled_df


# Directory that receives the generated sample-set CSV files
save_path = "sampleddata/combined_samples_Ksat/"

# Create the save directory if it does not exist (no error if it already does)
os.makedirs(save_path, exist_ok=True)


# Worker executed once per (sample size, replicate) pair
def process_sample(sample_size, i):
    """
    Generate one FCM sample set and write it to ``save_path`` as a CSV file.

    Parameters:
    - sample_size: number of rows to sample
    - i: zero-based replicate number; the seed is ``42 + i`` and the file name
      uses ``i + 1``
    """
    random_state = 42 + i
    target_file = os.path.join(
        save_path, f"FCMtp2sampled_data_{sample_size}_set_{i+1}.csv"
    )
    # The row index is written too, so the sampled rows remain identifiable.
    generate_samples(sample_size, random_state).to_csv(target_file, index=True)
    print(
        f"Generated sample set {i+1} for sample size {sample_size} with random state {random_state}"
    )


# Queue one task per (sample size, replicate) pair and run them with joblib
# on six workers.
fcm_tasks = [
    delayed(process_sample)(sample_size, i)
    for sample_size in sample_sizes
    for i in range(num_samples_per_size)
]
Parallel(n_jobs=6, verbose=50)(fcm_tasks)

print("All sample sets generated and saved.")

CLHS: conditioned Latin hypercube sampling

In [None]:
from clhs import clhs
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
import os

# Sampling design: target sample sizes and number of replicate sets per size.
sample_sizes = [1000, 5000, 10000]
num_samples_per_size = 20

# Output directory for the sample-set CSV files; created when missing.
save_path = "sampleddata/combined_samples_Ksat/"
os.makedirs(save_path, exist_ok=True)

# Fit the scaler on the covariates and standardize them.
scaler = StandardScaler().fit(df_nonKsat)
df_scaled = scaler.transform(df_nonKsat)


# Worker executed once per (sample size, replicate) pair
def parallel_clhs(sample_size, i, seed):
    """
    Draw one cLHS sample set and write it to ``save_path`` as a CSV file.

    Parameters:
    - sample_size: number of rows requested from ``clhs``
    - i: zero-based replicate number (the file name uses ``i + 1``)
    - seed: value used to seed NumPy's global random generator before sampling
    """
    np.random.seed(seed)
    # clhs returns a mapping; "sample_indices" holds the selected row positions.
    chosen_positions = clhs(df_scaled, sample_size)["sample_indices"]
    target_file = os.path.join(
        save_path, f"clhs_sampled_data_{sample_size}_set_{i+1}.csv"
    )
    df.iloc[chosen_positions].to_csv(target_file)
    print(f"Generated sample set {i+1} for sample size {sample_size}")


# Use joblib for parallel sampling.
# Each replicate gets the fixed seed 42 + i, the same convention the FCM cell
# uses. Seeds drawn from the unseeded global generator would change on every
# run and could repeat, giving two identical "replicate" sets of one size.
Parallel(n_jobs=5, verbose=30)(
    delayed(parallel_clhs)(size, i, 42 + i)
    for size in sample_sizes
    for i in range(num_samples_per_size)
)

print("All sample sets generated and saved.")

The cells from here on can be run independently of the cells above. This is the model construction section.

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import resample

# Target column and the covariate columns as named in the cleaned dataset.
target = "Ksat_cmhr"
covariates = [
    "Db",
    "Clay",
    "VFS",
    "MS",
    "OC",
    "Silt",
    "COS",
    "FS",
    "Depth.cm_Top",
    "VCOS",
]
# Column names used for the FSCS sample files: identical except that the dot
# in the depth column is an underscore ("Depth_cm_Top").
covariates_FSCS = [name.replace(".", "_") for name in covariates]

# Sample sizes and the file-name prefixes of the sampling methods to load.
sample_levels = [1000, 5000, 10000]
sampling_methods = [
    "BalancedSampling_sampled_data",
    "clhs_sampled_data",
    "FCMtp2sampled_data",
    "FSCS_sampled_data",
]

# Full cleaned dataset, indexed by its first CSV column, with the target
# replaced by its natural logarithm.
class_df = pd.read_csv(
    "sampleddata/USKSAT_OpenRefined_cleaned.csv",
    index_col=0,
)
class_df[target] = np.log(class_df[target])

# dataset name -> (X_train, y_train, X_test, y_test)
train_test_data = {}

# For every (level, replicate, method) combination whose sample file exists,
# use the sampled rows as the training set and all remaining rows of
# class_df as the test set.
for level in sample_levels:
    for t in range(1, 21):
        for method in sampling_methods:
            sample_file = (
                f"sampleddata/combined_samples_Ksat/{method}_{level}_set_{t}.csv"
            )

            if not os.path.exists(sample_file):
                print(f"File {sample_file} does not exist. Skipping...")
                continue

            # The first CSV column holds the row labels of the sampled rows.
            sample_data = pd.read_csv(sample_file, index_col=0)
            # Natural-log transform of the target, as applied to class_df.
            sample_data[target] = np.log(sample_data[target])

            # FSCS files use "Depth_cm_Top"; every other method uses
            # "Depth.cm_Top". Both are renamed to "DT".
            if method == "FSCS_sampled_data":
                X_train = sample_data[covariates_FSCS].rename(
                    columns={"Depth_cm_Top": "DT"}
                )
            else:
                X_train = sample_data[covariates].rename(
                    columns={"Depth.cm_Top": "DT"}
                )
            y_train = sample_data[target].rename("Ks")
            train_indices = sample_data.index

            # Test set: every row of the full dataset that was not sampled.
            test_data = class_df.drop(train_indices)
            X_test = test_data[covariates].rename(columns={"Depth.cm_Top": "DT"})
            y_test = test_data[target].rename("Ks")

            train_test_data[f"{method}_{level}_set_{t}"] = (
                X_train,
                y_train,
                X_test,
                y_test,
            )

# Generate the simple-random-sampling (SRS) subsets: 20 replicates per level,
# seeded with 42 + t.
for level in sample_levels:
    for t in range(1, 21):
        # NOTE(review): sklearn.utils.resample draws with replacement unless
        # replace=False is passed, so a training set may contain repeated
        # rows — confirm this is intended for the SRS baseline.
        srs_sample = resample(class_df, n_samples=level, random_state=42 + t)
        train_indices = srs_sample.index

        # Test set: every row of the full dataset that was not drawn.
        test_data = class_df.drop(train_indices)

        X_train = srs_sample[covariates].rename(columns={"Depth.cm_Top": "DT"})
        y_train = srs_sample[target].rename("Ks")
        X_test = test_data[covariates].rename(columns={"Depth.cm_Top": "DT"})
        y_test = test_data[target].rename("Ks")

        train_test_data[f"SRS_sampled_df_{level}_set_{t}"] = (
            X_train,
            y_train,
            X_test,
            y_test,
        )

print("All training and testing sets generated and saved.")

In [None]:
import gc

# Drop the intermediate objects left over from building train_test_data (the
# dictionary itself is kept), then ask the garbage collector to reclaim them.
del class_df, sample_data, X_train, y_train, X_test, y_test
del train_indices, test_data, srs_sample

gc.collect()

In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
from joblib import Parallel, delayed


# Define the objective function for Optuna hyperparameter optimization
def objective(trial, X_train, y_train):
    """
    Objective function for Optuna hyperparameter optimization.

    Parameters:
    trial: Optuna Trial object, used to define the hyperparameter search space and record results.
    X_train: Training feature data.
    y_train: Training label data.

    Returns:
    The mean of the 10-fold cross-validation scores, where each score is the
    negated RMSE (scikit-learn's "neg_root_mean_squared_error"). Larger is
    better, so the study has to maximize this value.
    """

    # Define the hyperparameter search space
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)  # Number of trees
    max_depth = trial.suggest_int("max_depth", 2, 32, log=True)  # Maximum tree depth
    min_samples_split = trial.suggest_int(
        "min_samples_split", 2, 16
    )  # Minimum samples required to split an internal node
    min_samples_leaf = trial.suggest_int(
        "min_samples_leaf", 1, 16
    )  # Minimum samples required at a leaf node
    max_features = trial.suggest_categorical(
        "max_features", ["sqrt", "log2"]
    )  # Number of features to consider at each split

    # Create the random forest regressor
    reg = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,  # Fixed random seed for reproducibility
    )

    # "rmse" is not a scorer name scikit-learn recognizes. The RMSE scorer is
    # exposed in negated form so that higher scores are always better.
    scores = cross_val_score(
        reg, X_train, y_train, cv=10, scoring="neg_root_mean_squared_error"
    )
    return np.mean(scores)


# Hyperparameter optimization for one dataset (run in parallel over datasets)
def optimize_and_evaluate(key, X_train, y_train, X_test, y_test):
    """
    Run the Optuna hyperparameter search for one dataset.

    Parameters:
    key: Name of the dataset.
    X_train: Training feature data.
    y_train: Training label data.
    X_test: Test feature data (kept for interface compatibility; not used).
    y_test: Test label data (kept for interface compatibility; not used).

    Returns:
    A dict with the keys "dataset" (the given key) and "best_params" (the
    hyperparameters of the best trial).
    """

    # The study maximizes the value returned by objective().
    study = optuna.create_study(direction="maximize", sampler=TPESampler())

    # 100 trials, evaluated by up to 6 concurrent Optuna workers. When this
    # function itself runs inside a joblib pool, the two levels of
    # parallelism multiply.
    study.optimize(
        lambda trial: objective(trial, X_train, y_train), n_trials=100, n_jobs=6
    )

    # No final model is fitted here: the previous version trained one with the
    # best parameters and then discarded it without returning or saving it.
    return {
        "dataset": key,
        "best_params": study.best_params,
    }


# Tune every dataset in train_test_data with joblib (six workers); each
# stored tuple is unpacked into the worker's four data arguments.
results = Parallel(n_jobs=6, verbose=30)(
    delayed(optimize_and_evaluate)(name, *splits)
    for name, splits in train_test_data.items()
)

# One row per dataset, built from the returned dictionaries.
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Save the tuning results table to CSV without the row index
results_df.to_csv("rfKsatresults.csv", index=False)

The cells from here on can be run independently of the cells above. This is the comparative analysis section, which includes SHAP analysis and model performance evaluation.

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import resample

# Target column and the covariate columns as named in the cleaned dataset.
target = "Ksat_cmhr"
covariates = [
    "Db",
    "Clay",
    "VFS",
    "MS",
    "OC",
    "Silt",
    "COS",
    "FS",
    "Depth.cm_Top",
    "VCOS",
]
# Column names used for the FSCS sample files: identical except that the dot
# in the depth column is an underscore ("Depth_cm_Top").
covariates_FSCS = [name.replace(".", "_") for name in covariates]

# Sample sizes and the file-name prefixes of the sampling methods to load.
sample_levels = [1000, 5000, 10000]
sampling_methods = [
    "BalancedSampling_sampled_data",
    "clhs_sampled_data",
    "FCMtp2sampled_data",
    "FSCS_sampled_data",
]

# Full cleaned dataset, indexed by its first CSV column, with the target
# replaced by its natural logarithm.
class_df = pd.read_csv(
    "sampleddata/USKSAT_OpenRefined_cleaned.csv",
    index_col=0,
)
class_df[target] = np.log(class_df[target])

# dataset name -> (X_train, y_train, X_test, y_test)
train_test_data = {}

# For every (level, replicate, method) combination whose sample file exists,
# use the sampled rows as the training set and all remaining rows of
# class_df as the test set.
for level in sample_levels:
    for t in range(1, 21):
        for method in sampling_methods:
            sample_file = (
                f"sampleddata/combined_samples_Ksat/{method}_{level}_set_{t}.csv"
            )

            if not os.path.exists(sample_file):
                print(f"File {sample_file} does not exist. Skipping...")
                continue

            # The first CSV column holds the row labels of the sampled rows.
            sample_data = pd.read_csv(sample_file, index_col=0)
            # Natural-log transform of the target, as applied to class_df.
            sample_data[target] = np.log(sample_data[target])

            # FSCS files use "Depth_cm_Top"; every other method uses
            # "Depth.cm_Top". Both are renamed to "DT".
            if method == "FSCS_sampled_data":
                X_train = sample_data[covariates_FSCS].rename(
                    columns={"Depth_cm_Top": "DT"}
                )
            else:
                X_train = sample_data[covariates].rename(
                    columns={"Depth.cm_Top": "DT"}
                )
            y_train = sample_data[target].rename("Ks")
            train_indices = sample_data.index

            # Test set: every row of the full dataset that was not sampled.
            test_data = class_df.drop(train_indices)
            X_test = test_data[covariates].rename(columns={"Depth.cm_Top": "DT"})
            y_test = test_data[target].rename("Ks")

            train_test_data[f"{method}_{level}_set_{t}"] = (
                X_train,
                y_train,
                X_test,
                y_test,
            )

# Generate the simple-random-sampling (SRS) subsets: 20 replicates per level,
# seeded with 42 + t.
for level in sample_levels:
    for t in range(1, 21):
        # NOTE(review): sklearn.utils.resample draws with replacement unless
        # replace=False is passed, so a training set may contain repeated
        # rows — confirm this is intended for the SRS baseline.
        srs_sample = resample(class_df, n_samples=level, random_state=42 + t)
        train_indices = srs_sample.index

        # Test set: every row of the full dataset that was not drawn.
        test_data = class_df.drop(train_indices)

        X_train = srs_sample[covariates].rename(columns={"Depth.cm_Top": "DT"})
        y_train = srs_sample[target].rename("Ks")
        X_test = test_data[covariates].rename(columns={"Depth.cm_Top": "DT"})
        y_test = test_data[target].rename("Ks")

        train_test_data[f"SRS_sampled_df_{level}_set_{t}"] = (
            X_train,
            y_train,
            X_test,
            y_test,
        )

print("All training and testing sets generated and saved.")

In [None]:
import gc

# Drop the intermediate objects left over from building train_test_data (the
# dictionary itself is kept), then ask the garbage collector to reclaim them.
del class_df, sample_data, X_train, y_train, X_test, y_test
del train_indices, test_data, srs_sample

gc.collect()

In [None]:
# Read the saved tuning results. Every cell comes back from the CSV as plain
# text or a number, not as a Python object.
results_df = pd.read_csv("rfKsatresults.csv")

In [None]:
import joblib
import os
from sklearn.ensemble import RandomForestRegressor
from joblib import Parallel, delayed
import fasttreeshap

# Create a dictionary to store models
models = {}

# Directory that holds the pickled models and SHAP outputs
save_path = "codepart/RFKsatpkl"
# Ensure the save path exists (no error if it already does)
os.makedirs(save_path, exist_ok=True)


def is_file_valid(filename):
    """
    Report whether ``filename`` can be opened and loaded with joblib.

    Any exception raised while opening or loading is printed and turned into
    a ``False`` result; the loaded object itself is discarded.
    """
    try:
        with open(filename, "rb") as handle:
            joblib.load(handle)
    except Exception as e:
        print(f"File {filename} is invalid: {e}")
        return False
    return True


def process_dataset(index, row):
    """
    Train (or reload) the random forest for one dataset and compute its SHAP values.

    Parameters:
    - index: row label of ``row`` in the results table (not used in the body)
    - row: results-table row providing "dataset" and "best_params"

    Files are written under ``save_path``. Artifacts that already exist and
    load cleanly are reused instead of being recomputed.
    """
    import ast

    dataset = row["dataset"]

    best_params = row["best_params"]
    # After a CSV round trip the hyperparameters arrive as the text form of a
    # dict, which cannot be unpacked with **. Parse it with literal_eval,
    # which only accepts Python literals (unlike eval).
    if isinstance(best_params, str):
        best_params = ast.literal_eval(best_params)

    # Get the corresponding dataset
    X_train, y_train, X_test, y_test = train_test_data[dataset]

    # File paths
    model_filename = os.path.join(save_path, f"{dataset}_model.pkl")
    shap_values_filename_v2 = os.path.join(save_path, f"{dataset}_shap_values_v2.pkl")
    shap_interaction_values_filename_v1 = os.path.join(
        save_path, f"{dataset}_shap_interaction_values_v1.pkl"
    )
    shap_explainer_v2_filename = os.path.join(
        save_path, f"{dataset}_shap_explainer_v2.pkl"
    )
    shap_explainer_v1_filename = os.path.join(
        save_path, f"{dataset}_shap_explainer_v1.pkl"
    )

    # Check if all files already exist and are valid
    if (
        os.path.exists(model_filename)
        and is_file_valid(model_filename)
        and os.path.exists(shap_values_filename_v2)
        and is_file_valid(shap_values_filename_v2)
        and os.path.exists(shap_interaction_values_filename_v1)
        and is_file_valid(shap_interaction_values_filename_v1)
    ):
        print(f"{dataset} has already been processed, loading files directly.")
        rg = joblib.load(model_filename)
        shap_values_v2 = joblib.load(shap_values_filename_v2)
        shap_interaction_values_v1 = joblib.load(shap_interaction_values_filename_v1)

    else:
        if os.path.exists(model_filename) and is_file_valid(model_filename):
            rg = joblib.load(model_filename)
        else:
            # Train the model with the best hyperparameters
            rg = RandomForestRegressor(**best_params, random_state=42)
            rg.fit(X_train, y_train)

        # Save the trained model
        joblib.dump(rg, model_filename, compress=9)

        if os.path.exists(shap_values_filename_v2) and is_file_valid(
            shap_values_filename_v2
        ):
            shap_values_v2 = joblib.load(shap_values_filename_v2)
        else:
            # Perform SHAP analysis, use X_train to analyze feature importance and interaction changes
            shap_explainer_v2 = fasttreeshap.TreeExplainer(
                rg, algorithm="v2", n_jobs=-1
            )
            shap_values_v2 = shap_explainer_v2(X_train).values
            # Save SHAP values
            joblib.dump(shap_values_v2, shap_values_filename_v2, compress=9)
            # Save SHAP explainer
            joblib.dump(shap_explainer_v2, shap_explainer_v2_filename, compress=9)

        if os.path.exists(shap_interaction_values_filename_v1) and is_file_valid(
            shap_interaction_values_filename_v1
        ):
            shap_interaction_values_v1 = joblib.load(
                shap_interaction_values_filename_v1
            )
        else:
            shap_explainer_v1 = fasttreeshap.TreeExplainer(
                rg, algorithm="v1", n_jobs=-1
            )
            # NOTE(review): the cell appears to be cut off here. The line
            # below only reads a name that has not been assigned in this
            # branch, so it raises UnboundLocalError. The computation and
            # saving of the interaction values (and whatever the function
            # returned) still have to be restored from the original notebook.
            shap_interaction_values_v1

In [None]:
import joblib
import os
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_log_error
from joblib import Parallel, delayed

# Directory that holds the pickled models
save_path = "codepart/RFKsatpkl"
# Ensure the save path exists (no error if it already does)
os.makedirs(save_path, exist_ok=True)


def root_mean_squared_log_error(true, pred):
    """
    Return the root mean squared logarithmic error between ``true`` and ``pred``.

    Both inputs are shifted by 1 before taking the natural logarithm. The
    result is the square root of the mean squared difference of those logs.
    """
    log_gap = np.log(true + 1) - np.log(pred + 1)
    return np.sqrt(np.mean(np.square(log_gap)))


def calculate_metrics(index, row):
    """
    Score the saved model of one dataset on that dataset's test set.

    Parameters:
    - index: row label, passed back unchanged so results can be matched up
    - row: results-table row; only its "dataset" entry is read

    Returns:
    ``(index, r2, rmsle)``, or ``(index, None, None)`` when no model file
    exists for the dataset.
    """
    dataset = row["dataset"]

    # Only the test half of the stored split is needed here.
    _, _, X_test, y_test = train_test_data[dataset]

    model_filename = os.path.join(save_path, f"{dataset}_model.pkl")

    # Nothing to score without a saved model.
    if not os.path.exists(model_filename):
        print(f"Model file for {dataset} does not exist, skipping.")
        return index, None, None

    print(f"{dataset} has already been processed, loading file directly.")
    model = joblib.load(model_filename)
    y_pred = model.predict(X_test)

    # NOTE(review): the test target was log-transformed when train_test_data
    # was built, so RMSLE applies log(y + 1) to values that are already logs
    # and yields NaN for any value <= -1 — confirm this metric is intended.
    r2 = r2_score(y_test, y_pred)
    rmsle_loss = root_mean_squared_log_error(y_test, y_pred)
    return index, r2, rmsle_loss


# Score every row of results_df in parallel, using all available cores.
results = Parallel(n_jobs=-1, verbose=50)(
    delayed(calculate_metrics)(row_label, record)
    for row_label, record in results_df.iterrows()
)

# Copy the scores back into results_df. Rows without a model file are left
# untouched.
for index, r2, rmsle_value in results:
    if r2 is None:
        continue
    results_df.loc[index, "r2"] = r2
    results_df.loc[index, "rmsle"] = rmsle_value

print("R² and RMSLE scores for all datasets have been calculated.")

In [None]:
# Save the results table, now including the r2 and rmsle columns, without the row index
results_df.to_csv("rfKsatresults_all.csv", index=False)

The cells from here on can be run independently of the cells above. This is the figure-drawing section.

In [None]:
import pandas as pd

# Read the saved results table (dataset names, best parameters, r2, rmsle)
results_df = pd.read_csv("rfKsatresults_all.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create a new variable to store extracted sampling methods and dataset levels.
# Dataset names look like "<method>_..._<level>_set_<t>": the method is the
# first "_"-separated token and the level is the third from the end.
results_with_methods = results_df.copy()
results_with_methods["sampling_method"] = results_with_methods["dataset"].apply(
    lambda x: x.split("_")[0]
)
results_with_methods["dataset_level"] = results_with_methods["dataset"].apply(
    lambda x: x.split("_")[-3]
)

# Left-to-right category order on the x-axis, and the tick label shown for
# each category. Passing the order to seaborn explicitly keeps each label on
# its own method even if a method is missing from the results or the rows are
# stored in a different order.
method_order = ["BalancedSampling", "clhs", "FCMtp2sampled", "FSCS", "SRS"]
sampling_methods = [
    "Balance\nSampling",
    "CLHS",
    "FCM\nClu = level",
    "FSCS",
    "SRS",
]

# Set font size
plt.rcParams.update({"font.size": 16})

# Draw boxplots
plt.figure(figsize=(10, 16))  # Adjust figure width

# Draw boxplot for R2
plt.subplot(2, 1, 1)
sns.boxplot(
    x="sampling_method",
    y="r2",
    hue="dataset_level",
    data=results_with_methods,
    order=method_order,
    width=0.6,
)
plt.title("R2 Boxplot", fontsize=20)
plt.xlabel("Sampling Method", fontsize=18, labelpad=20)
plt.ylabel("R2", fontsize=18, labelpad=20)
plt.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
plt.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)
plt.gca().yaxis.set_major_locator(plt.MultipleLocator(0.05))
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(0.01))
plt.legend(title="Dataset Level", loc="lower left")  # Add legend
plt.xticks(ticks=range(len(sampling_methods)), labels=sampling_methods)

# Draw boxplot for RMSLE
plt.subplot(2, 1, 2)
sns.boxplot(
    x="sampling_method",
    y="rmsle",
    hue="dataset_level",
    data=results_with_methods,
    order=method_order,
)
plt.title("RMSLE Boxplot", fontsize=20)
plt.xlabel("Sampling Method", fontsize=18, labelpad=20)
plt.ylabel("Test RMSLE", fontsize=18, labelpad=20)
plt.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
plt.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)
plt.gca().yaxis.set_major_locator(plt.MultipleLocator(0.05))
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(0.01))
plt.gca().invert_yaxis()  # Invert y-axis so that lower (better) errors are at the top
plt.legend(title="Dataset Level", loc="lower left")  # Add legend
plt.xticks(ticks=range(len(sampling_methods)), labels=sampling_methods)

plt.tight_layout()
plt.savefig("RFKsatresults_plot.jpg", format="jpg", dpi=800, bbox_inches="tight")
plt.show()

In [None]:
# Median R² and RMSLE for every (sampling method, dataset level) pair, with
# the two grouping keys kept as ordinary columns.
median_metrics = results_with_methods.groupby(
    ["sampling_method", "dataset_level"], as_index=False
)[["r2", "rmsle"]].median()

# Show the medians and store them. The same file name is written again once
# the percentage differences have been merged in.
print(median_metrics)
median_metrics.to_csv("rfKsatmedian_metrics.csv", index=False)


# Calculate the percentage difference of each method relative to SRS of the same level
def calculate_percentage_difference(group):
    """
    Express each method's r2 and rmsle as a percentage difference from SRS.

    Parameters:
    group: DataFrame with the columns "sampling_method", "r2" and "rmsle"
        (other columns are ignored); intended to hold one dataset level.

    Returns:
    DataFrame indexed by "sampling_method" with the columns "r2" and "rmsle",
    holding ``(value / SRS value - 1) * 100``. When the group has no "SRS"
    row, every entry is NaN.
    """
    metrics = group.set_index("sampling_method")[["r2", "rmsle"]]

    is_srs = group["sampling_method"] == "SRS"
    if not is_srs.any():
        # Only the numeric metric columns are blanked. Multiplying the whole
        # group by NaN would fail on non-numeric columns such as a string
        # dataset level.
        return metrics * float("nan")

    # Baseline: the first SRS row's metrics, aligned with the columns on division.
    srs_values = group.loc[is_srs, ["r2", "rmsle"]].iloc[0]
    return (metrics / srs_values - 1) * 100


# Per dataset level, express every method's medians relative to SRS, then
# turn the resulting index levels back into columns.
per_level = median_metrics.groupby("dataset_level")
percentage_diff = per_level.apply(calculate_percentage_difference).reset_index()

# Put medians and percentage differences side by side; the difference columns
# get the suffix "_percentage_diff".
combined_metrics = median_metrics.merge(
    percentage_diff,
    on=["sampling_method", "dataset_level"],
    suffixes=("", "_percentage_diff"),
)

# Show the combined table and store it.
print("Combined table results:")
print(combined_metrics)
combined_metrics.to_csv("rfKsatmedian_metrics.csv", index=False)