Note: the file paths in this code were changed from the absolute paths of the original version to relative paths, so some paths may be wrong. Check them against your working directory before running.

Please note that the sample-generation cells in sections 1-1 and 1-2 only need to be executed once.

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml



# Fetch the Forest CoverType dataset from OpenML as a pandas frame
forest_cover = fetch_openml(name="covertype", version=1, as_frame=True)
original_df = forest_cover.frame

# Keep only numeric columns, then discard the last four of them
numerical_columns = original_df.select_dtypes(include=["number"]).columns
df = original_df[numerical_columns].iloc[:, :-4]

# class_df = the covariates in df plus the "class" label column
class_df = df.copy()
class_df["class"] = forest_cover.frame["class"]

# Quick look at the processed covariates
print(df.head())

# Names of all covariates that survived the filtering above
variables = df.columns.tolist()

# Covariates grouped by physical meaning
variable_groups = [
    # Terrain features
    ["elevation", "aspect", "slope"],
    # Hydrology features
    ["horizontal_distance_to_hydrology", "Vertical_Distance_To_Hydrology"],
    # Road features
    ["Horizontal_Distance_To_Roadways"],
    # Hillshade features
    ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"],
    # Fire point features
    ["Horizontal_Distance_To_Fire_Points"],
]

# Report any grouped name that is missing from the covariate list
for group in variable_groups:
    for variable in group:
        if variable in variables:
            continue
        print(f"Warning: {variable} is not in the variables list")

Fuzzy c-means clustering sampling

Type 1: Number of clusters equals the number of class categories

In [None]:
from sklearn.preprocessing import StandardScaler
import skfuzzy as fuzz
import os
from joblib import Parallel, delayed

# Experiment grid: the sample sizes to draw and how many sets per size
num_samples_per_size = 20
sample_sizes = [1000, 5000, 10000, 20000]

# One cluster per distinct class label
n_clusters = class_df["class"].nunique()

# Standardize the covariates before clustering
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)


# Define the function to generate sample sets
def generate_samples(sample_size, random_state):
    """
    Build one fuzzy c-means sample set (type 1: one cluster per class label).

    The standardized data in ``df_scaled`` is clustered into ``n_clusters``
    fuzzy clusters. Each cluster gets a quota proportional to its summed
    membership and fills it with the points nearest to its centroid. When a
    cluster reaches a point already held by another cluster, the point goes
    to whichever of the two it has the higher membership in, and the cluster
    that lost the point draws a replacement.

    Parameters:
    - sample_size: Total number of rows requested across all clusters
    - random_state: Seed passed to ``fuzz.cluster.cmeans``

    Returns:
    A copy of the selected rows of ``df`` with the matching ``class`` column
    from ``class_df`` added.
    """
    # Perform fuzzy c-means clustering
    _, u, _, d, _, _, _ = fuzz.cluster.cmeans(
        df_scaled.T,
        n_clusters,
        2,
        error=0.005,
        maxiter=1000,
        init=None,
        seed=random_state,
    )

    # Get the fuzzy membership for each data point (rows: points, columns: clusters)
    fuzzy_membership = u.T

    # Summed membership per cluster, used as the cluster's "size"
    cluster_counts = np.sum(fuzzy_membership, axis=0)

    # Calculate the number of samples to extract from each cluster
    sample_size_per_cluster = (cluster_counts / cluster_counts.sum()) * sample_size
    sample_size_per_cluster = sample_size_per_cluster.astype(int)

    # astype(int) truncates, so hand the rounding remainder to a single cluster
    # to make the quotas add up to sample_size
    difference = sample_size - sample_size_per_cluster.sum()
    if difference > 0:
        sample_size_per_cluster[np.argmin(sample_size_per_cluster)] += difference
    elif difference < 0:
        sample_size_per_cluster[np.argmax(sample_size_per_cluster)] += difference

    # Initialize variables
    sampled_indices = set()
    sampled_cluster = np.full(df_scaled.shape[0], -1)  # -1 indicates not sampled
    sample_status = np.full(
        (df_scaled.shape[0], n_clusters), 0
    )  # 0: not sampled, 1: sampled, 2: reclassified after sampling
    remaining_samples_per_cluster = sample_size_per_cluster.copy()

    def resample_cluster(
        cluster_idx,
        sampled_indices,
        sampled_cluster,
        sample_status,
        remaining_samples_per_cluster,
    ):
        """
        Draw one replacement sample for a cluster that just lost a point.

        Candidates are the points whose status for this cluster is still 0,
        visited in descending order of membership in this cluster. The
        function returns after the first successful draw. If that draw takes
        a point from another cluster, that cluster is resampled recursively.

        Parameters:
        - cluster_idx: Index of the current cluster
        - sampled_indices: Set of sampled indices
        - sampled_cluster: Cluster assignment for each sample
        - sample_status: Sampling status for each sample in each cluster
        - remaining_samples_per_cluster: Remaining samples to extract from each cluster
        """
        # Select points this cluster has not sampled yet
        cluster_indices_j = np.where(sample_status[:, cluster_idx] == 0)[0]
        memberships_j = fuzzy_membership[cluster_indices_j, cluster_idx]
        # Sort indices by membership in descending order
        sorted_indices_j = cluster_indices_j[np.argsort(-memberships_j)]
        # Extract samples
        for idx_j in sorted_indices_j:
            if remaining_samples_per_cluster[cluster_idx] <= 0:
                break
            if sample_status[idx_j, cluster_idx] == 0:
                if sampled_cluster[idx_j] == -1:
                    # Sample not yet extracted, directly sample
                    sampled_indices.add(idx_j)
                    sampled_cluster[idx_j] = cluster_idx
                    sample_status[idx_j, cluster_idx] = 1  # Sampled
                    remaining_samples_per_cluster[cluster_idx] -= 1
                    return
                else:
                    # Sample already extracted by another cluster, handle membership conflict
                    original_cluster = sampled_cluster[idx_j]
                    if (
                        fuzzy_membership[idx_j, cluster_idx]
                        > fuzzy_membership[idx_j, original_cluster]
                    ):
                        sampled_cluster[idx_j] = cluster_idx
                        sample_status[idx_j, original_cluster] = (
                            2  # Reclassified after sampling
                        )
                        sample_status[idx_j, cluster_idx] = 1  # Sampled
                        remaining_samples_per_cluster[cluster_idx] -= 1
                        remaining_samples_per_cluster[original_cluster] += 1
                        # Resample the cluster from which the sample was removed
                        resample_cluster(
                            original_cluster,
                            sampled_indices,
                            sampled_cluster,
                            sample_status,
                            remaining_samples_per_cluster,
                        )
                        return

    # Extract samples based on relative distance to centroids and handle membership conflicts
    for i in range(n_clusters):
        cluster_indices = np.arange(df_scaled.shape[0])  # Select all points
        if len(cluster_indices) > 0:
            # Use distances from the d matrix
            distances = d[i, cluster_indices]
            # Sort indices by distance in ascending order
            sorted_indices = cluster_indices[np.argsort(distances)]
            # Extract samples
            for idx in sorted_indices:
                if remaining_samples_per_cluster[i] <= 0:
                    break
                if sampled_cluster[idx] == -1:
                    # Sample not yet extracted, directly sample
                    sampled_indices.add(idx)
                    sampled_cluster[idx] = i
                    # Mark only this cluster's column. Marking the whole row
                    # would hide the point from every other cluster's
                    # resample_cluster pass, so no cluster could reclaim it.
                    sample_status[idx, i] = 1  # Sampled
                    remaining_samples_per_cluster[i] -= 1
                else:
                    # Sample already extracted by another cluster, handle membership conflict
                    current_cluster = sampled_cluster[idx]
                    if (
                        fuzzy_membership[idx, i]
                        > fuzzy_membership[idx, current_cluster]
                    ):
                        sampled_cluster[idx] = i
                        sample_status[idx, current_cluster] = (
                            2  # Reclassified after sampling
                        )
                        sample_status[idx, i] = 1  # Sampled
                        remaining_samples_per_cluster[i] -= 1
                        remaining_samples_per_cluster[current_cluster] += 1
                        # Resample the cluster from which the sample was removed
                        resample_cluster(
                            current_cluster,
                            sampled_indices,
                            sampled_cluster,
                            sample_status,
                            remaining_samples_per_cluster,
                        )

    # Get the sampled data
    sampled_df = df.loc[list(sampled_indices)].copy()
    # Add the class column
    sampled_df["class"] = class_df.loc[sampled_df.index, "class"]
    return sampled_df


# Directory that receives every generated sample CSV; create it when missing
save_path = "sampleddata/combined_samples/"
os.makedirs(name=save_path, exist_ok=True)


# Define the function to generate sample sets in parallel
def parallel_generate_samples(sample_size, i):
    """
    Generate one FCM type 1 sample set and write it to a CSV file.

    Parameters:
    - sample_size: Number of rows requested for the sample set
    - i: Zero-based replicate index; the file name and log message use i + 1
    """
    # Draw a fresh seed for this replicate
    random_state = np.random.randint(0, 10000)
    csv_path = os.path.join(
        save_path, f"FCMtp1sampled_data_{sample_size}_set_{i+1}.csv"
    )
    generate_samples(sample_size, random_state).to_csv(csv_path)
    print(
        f"Generated sample set {i+1} for sample size {sample_size} with random state {random_state}"
    )


# For each sample size, run all replicates through a 10-worker joblib pool
for sample_size in sample_sizes:
    replicate_jobs = [
        delayed(parallel_generate_samples)(sample_size, i)
        for i in range(num_samples_per_size)
    ]
    Parallel(n_jobs=10, verbose=10)(replicate_jobs)

print("All sample sets generated and saved.")

Type 2: Number of clusters equals the sample size (one sample is drawn from each cluster)

In [None]:
# Experiment grid: the sample sizes to draw and how many sets per size
sample_sizes = [1000, 5000, 10000, 20000]
num_samples_per_size = 20

# Number of class categories (generate_samples below uses its own local cluster count)
n_clusters = class_df["class"].nunique()

# Standardize the covariates before clustering
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)


# Define the function to generate sample sets
def generate_samples(sample_size, random_state):
    """
    Build one fuzzy c-means sample set (type 2: one cluster per requested sample).

    The standardized data in ``df_scaled`` is clustered into ``sample_size``
    fuzzy clusters and every cluster contributes a single point: the one with
    the highest membership in it. If two clusters want the same point, the
    cluster in which the point has the higher membership keeps it and the
    other cluster picks a replacement.

    Parameters:
    - sample_size: Number of clusters, and therefore of requested samples
    - random_state: Seed passed to ``fuzz.cluster.cmeans``

    Returns:
    A copy of the selected rows of ``df`` with the matching ``class`` column
    from ``class_df`` added.
    """
    n_clusters = sample_size  # one cluster per requested sample
    n_points = df_scaled.shape[0]

    # Only the membership matrix u (clusters x points) is needed here
    _, u, _, _, _, _, _ = fuzz.cluster.cmeans(
        df_scaled.T,
        n_clusters,
        2,
        error=0.005,
        maxiter=1000,
        init=None,
        seed=random_state,
    )

    # Bookkeeping shared by the main loop and the refill helper
    chosen = set()  # indices of all points selected so far
    owner = np.full(n_points, -1)  # owning cluster per point, -1 = unowned
    # Per (point, cluster): 0 not sampled, 1 sampled, 2 reclassified after sampling
    status = np.zeros((n_points, n_clusters), dtype=int)
    quota = np.ones(n_clusters, dtype=int)  # each cluster extracts only 1 sample

    def refill(cluster):
        """Draw one replacement point for a cluster that just lost its sample."""
        # Candidates this cluster has not touched yet, highest membership first
        candidates = np.where(status[:, cluster] == 0)[0]
        ranked = candidates[np.argsort(-u[cluster, candidates])]
        for point in ranked:
            if quota[cluster] <= 0:
                break
            if status[point, cluster] != 0:
                continue
            holder = owner[point]
            if holder == -1:
                # Unowned point: take it and stop
                chosen.add(point)
                owner[point] = cluster
                status[point, cluster] = 1
                quota[cluster] -= 1
                return
            if u[cluster, point] > u[holder, point]:
                # Stronger claim than the current holder: take the point over
                owner[point] = cluster
                status[point, holder] = 2
                status[point, cluster] = 1
                quota[cluster] -= 1
                quota[holder] += 1
                # The previous holder now needs its own replacement
                refill(holder)
                return

    # Let each cluster walk all points from highest to lowest membership
    for cluster in range(n_clusters):
        for point in np.argsort(-u[cluster]):
            if quota[cluster] <= 0:
                break
            holder = owner[point]
            if holder == -1:
                chosen.add(point)
                owner[point] = cluster
                status[point, cluster] = 1
                quota[cluster] -= 1
            elif u[cluster, point] > u[holder, point]:
                # Membership conflict won by the current cluster
                owner[point] = cluster
                status[point, holder] = 2
                status[point, cluster] = 1
                quota[cluster] -= 1
                quota[holder] += 1
                refill(holder)

    # Collect the selected rows and add their class labels
    sampled_df = df.loc[list(chosen)].copy()
    sampled_df["class"] = class_df.loc[sampled_df.index, "class"]
    return sampled_df


# Directory that receives every generated sample CSV; create it when missing
save_path = "sampleddata/combined_samples/"
os.makedirs(name=save_path, exist_ok=True)

# Sequentially build every (sample size, replicate) sample set
for sample_size in sample_sizes:
    for i in range(num_samples_per_size):
        # Draw a fresh seed for this replicate
        random_state = np.random.randint(0, 10000)
        sampled_df = generate_samples(sample_size, random_state)
        csv_path = os.path.join(
            save_path, f"FCMtp2sampled_data_{sample_size}_set_{i+1}.csv"
        )
        sampled_df.to_csv(csv_path)
        print(
            f"Generated sample set {i+1} for sample size {sample_size} with random state {random_state}"
        )

print("All sample sets generated and saved.")

CLHS: conditioned Latin hypercube sampling

In [None]:
from clhs import clhs

# Experiment grid: the sample sizes to draw and how many sets per size
num_samples_per_size = 20
sample_sizes = [1000, 5000, 10000, 20000]

# Standardize the covariates before sampling
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Directory that receives every generated sample CSV; create it when missing
save_path = "sampleddata/combined_samples/"
os.makedirs(name=save_path, exist_ok=True)


# Define the parallel processing function
def parallel_clhs(sample_size, i, seed):
    """
    Draw one cLHS sample set from the standardized data and write it to CSV.

    Parameters:
    - sample_size: Number of rows to select
    - i: Zero-based replicate index; the file name and log message use i + 1
    - seed: Value used to seed NumPy's global random generator before sampling
    """
    np.random.seed(seed)
    result = clhs(df_scaled, sample_size)
    # Look up the selected rows (covariates plus class) by position
    selected_rows = class_df.iloc[result["sample_indices"]]
    selected_rows.to_csv(
        os.path.join(save_path, f"clhs_sampled_data_{sample_size}_set_{i+1}.csv")
    )
    print(f"Generated sample set {i+1} for sample size {sample_size}")


# Queue one job per (sample size, replicate), each with a seed drawn here,
# then run them on a 10-worker joblib pool
clhs_jobs = [
    delayed(parallel_clhs)(size, i, np.random.randint(0, 10000))
    for size in sample_sizes
    for i in range(num_samples_per_size)
]
Parallel(n_jobs=10, verbose=30)(clhs_jobs)

print("All sample sets generated and saved.")

k-means sampling, number of clusters equals the number of class categories

In [None]:
from sklearn.cluster import KMeans
import os
from joblib import Parallel, delayed

# Experiment grid: the sample sizes to draw and how many sets per size
num_samples_per_size = 20
sample_sizes = [1000, 5000, 10000, 20000]

# One cluster per distinct class label
n_clusters = class_df["class"].nunique()

# Standardize the covariates before clustering
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)


# Define the function to generate sample sets
def generate_kmeans_samples(sample_size, random_state):
    """
    Build one k-means sample set from the standardized data.

    ``df_scaled`` is partitioned into ``n_clusters`` clusters. Each cluster
    gets a quota proportional to its share of the points and contributes the
    members closest to its centroid, up to that quota.

    Parameters:
    - sample_size: Total number of rows requested across all clusters
    - random_state: Seed passed to ``KMeans``

    Returns:
    A copy of the selected rows of ``df`` with the matching ``class`` column
    from ``class_df`` added.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(df_scaled)
    labels = model.labels_
    centers = model.cluster_centers_

    # Proportional quota per cluster; astype(int) truncates
    sizes = np.bincount(labels, minlength=n_clusters)
    quota = ((sizes / sizes.sum()) * sample_size).astype(int)

    # Give the rounding remainder to one cluster so the quotas add up to sample_size
    shortfall = sample_size - quota.sum()
    if shortfall > 0:
        quota[np.argmin(quota)] += shortfall
    elif shortfall < 0:
        quota[np.argmax(quota)] += shortfall

    picked = []
    for cluster_id in range(n_clusters):
        members = np.flatnonzero(labels == cluster_id)
        if members.size == 0:
            continue
        # Euclidean distance of every member to its own centroid
        gaps = np.linalg.norm(df_scaled[members] - centers[cluster_id], axis=1)
        nearest_first = members[np.argsort(gaps)]
        picked.extend(nearest_first[: quota[cluster_id]])

    # Collect the selected rows and add their class labels
    sampled_df = df.loc[picked].copy()
    sampled_df["class"] = class_df.loc[sampled_df.index, "class"]
    return sampled_df


# Directory that receives every generated sample CSV; create it when missing
save_path = "sampleddata/combined_samples/"
os.makedirs(name=save_path, exist_ok=True)


# Define the parallel function to generate sample sets
def parallel_generate_kmeans_samples(sample_size, i):
    """
    Generate one k-means sample set and write it to CSV unless it already exists.

    Parameters:
    - sample_size: Number of rows requested for the sample set
    - i: Zero-based replicate index; the file name and log message use i + 1
    """
    file_path = os.path.join(
        save_path, f"kmeans_sampled_data_{sample_size}_set_{i+1}.csv"
    )

    # Check for an existing file before clustering, so a skipped replicate
    # does not pay for a full KMeans fit whose result is thrown away.
    if os.path.exists(file_path):
        print(f"File {file_path} already exists, skipping...")
        return

    random_state = np.random.randint(0, 10000)  # Generate random seed
    sampled_df = generate_kmeans_samples(sample_size, random_state)
    sampled_df.to_csv(file_path)
    print(
        f"Generated sample set {i+1} for sample size {sample_size} with random state {random_state}"
    )


# Queue one job per (sample size, replicate) and run them on a 10-worker joblib pool
kmeans_jobs = [
    delayed(parallel_generate_kmeans_samples)(size, i)
    for size in sample_sizes
    for i in range(num_samples_per_size)
]
Parallel(n_jobs=10, verbose=30)(kmeans_jobs)

print("All sample sets generated and saved.")

In [None]:
from sklearn.datasets import fetch_openml

# Fetch the Forest CoverType dataset from OpenML as a pandas frame
forest_cover = fetch_openml(name="covertype", version=1, as_frame=True)
original_df = forest_cover.frame

# Keep only numeric columns, then discard the last four of them
numerical_columns = original_df.select_dtypes(include=["number"]).columns
df = original_df[numerical_columns].iloc[:, :-4]

# class_df = the covariates in df plus the "class" label column
class_df = df.copy()
class_df["class"] = forest_cover.frame["class"]

# Quick look at the processed covariates
print(df.head())

# Names of all covariates that survived the filtering above
variables = df.columns.tolist()

# Covariates grouped by physical meaning
variable_groups = [
    # Terrain features
    ["elevation", "aspect", "slope"],
    # Hydrology features
    ["horizontal_distance_to_hydrology", "Vertical_Distance_To_Hydrology"],
    # Road features
    ["Horizontal_Distance_To_Roadways"],
    # Hillshade features
    ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"],
    # Fire point features
    ["Horizontal_Distance_To_Fire_Points"],
]

# Report any grouped name that is missing from the covariate list
for group in variable_groups:
    for variable in group:
        if variable in variables:
            continue
        print(f"Warning: {variable} is not in the variables list")

Fuzzy c-means clustering sampling

Type 1: Number of clusters equals the number of class categories

In [None]:
from sklearn.preprocessing import StandardScaler
import skfuzzy as fuzz
import os
from joblib import Parallel, delayed

# Experiment grid: the sample sizes to draw and how many sets per size
num_samples_per_size = 20
sample_sizes = [1000, 5000, 10000, 20000]

# One cluster per distinct class label
n_clusters = class_df["class"].nunique()

# Standardize the covariates before clustering
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)


# Define the function to generate sample sets
def generate_samples(sample_size, random_state):
    """
    Build one fuzzy c-means sample set (type 1: one cluster per class label).

    The standardized data in ``df_scaled`` is clustered into ``n_clusters``
    fuzzy clusters. Each cluster gets a quota proportional to its summed
    membership and fills it with the points nearest to its centroid. When a
    cluster reaches a point already held by another cluster, the point goes
    to whichever of the two it has the higher membership in, and the cluster
    that lost the point draws a replacement.

    Parameters:
    - sample_size: Total number of rows requested across all clusters
    - random_state: Seed passed to ``fuzz.cluster.cmeans``

    Returns:
    A copy of the selected rows of ``df`` with the matching ``class`` column
    from ``class_df`` added.
    """
    # Perform fuzzy c-means clustering
    _, u, _, d, _, _, _ = fuzz.cluster.cmeans(
        df_scaled.T,
        n_clusters,
        2,
        error=0.005,
        maxiter=1000,
        init=None,
        seed=random_state,
    )

    # Get the fuzzy membership for each data point (rows: points, columns: clusters)
    fuzzy_membership = u.T

    # Summed membership per cluster, used as the cluster's "size"
    cluster_counts = np.sum(fuzzy_membership, axis=0)

    # Calculate the number of samples to extract from each cluster
    sample_size_per_cluster = (cluster_counts / cluster_counts.sum()) * sample_size
    sample_size_per_cluster = sample_size_per_cluster.astype(int)

    # astype(int) truncates, so hand the rounding remainder to a single cluster
    # to make the quotas add up to sample_size
    difference = sample_size - sample_size_per_cluster.sum()
    if difference > 0:
        sample_size_per_cluster[np.argmin(sample_size_per_cluster)] += difference
    elif difference < 0:
        sample_size_per_cluster[np.argmax(sample_size_per_cluster)] += difference

    # Initialize variables
    sampled_indices = set()
    sampled_cluster = np.full(df_scaled.shape[0], -1)  # -1 indicates not sampled
    sample_status = np.full(
        (df_scaled.shape[0], n_clusters), 0
    )  # 0: not sampled, 1: sampled, 2: reclassified after sampling
    remaining_samples_per_cluster = sample_size_per_cluster.copy()

    def resample_cluster(
        cluster_idx,
        sampled_indices,
        sampled_cluster,
        sample_status,
        remaining_samples_per_cluster,
    ):
        """
        Draw one replacement sample for a cluster that just lost a point.

        Candidates are the points whose status for this cluster is still 0,
        visited in descending order of membership in this cluster. The
        function returns after the first successful draw. If that draw takes
        a point from another cluster, that cluster is resampled recursively.

        Parameters:
        - cluster_idx: Index of the current cluster
        - sampled_indices: Set of sampled indices
        - sampled_cluster: Cluster assignment for each sample
        - sample_status: Sampling status for each sample in each cluster
        - remaining_samples_per_cluster: Remaining samples to extract from each cluster
        """
        # Select points this cluster has not sampled yet
        cluster_indices_j = np.where(sample_status[:, cluster_idx] == 0)[0]
        memberships_j = fuzzy_membership[cluster_indices_j, cluster_idx]
        # Sort indices by membership in descending order
        sorted_indices_j = cluster_indices_j[np.argsort(-memberships_j)]
        # Extract samples
        for idx_j in sorted_indices_j:
            if remaining_samples_per_cluster[cluster_idx] <= 0:
                break
            if sample_status[idx_j, cluster_idx] == 0:
                if sampled_cluster[idx_j] == -1:
                    # Sample not yet extracted, directly sample
                    sampled_indices.add(idx_j)
                    sampled_cluster[idx_j] = cluster_idx
                    sample_status[idx_j, cluster_idx] = 1  # Sampled
                    remaining_samples_per_cluster[cluster_idx] -= 1
                    return
                else:
                    # Sample already extracted by another cluster, handle membership conflict
                    original_cluster = sampled_cluster[idx_j]
                    if (
                        fuzzy_membership[idx_j, cluster_idx]
                        > fuzzy_membership[idx_j, original_cluster]
                    ):
                        sampled_cluster[idx_j] = cluster_idx
                        sample_status[idx_j, original_cluster] = (
                            2  # Reclassified after sampling
                        )
                        sample_status[idx_j, cluster_idx] = 1  # Sampled
                        remaining_samples_per_cluster[cluster_idx] -= 1
                        remaining_samples_per_cluster[original_cluster] += 1
                        # Resample the cluster from which the sample was removed
                        resample_cluster(
                            original_cluster,
                            sampled_indices,
                            sampled_cluster,
                            sample_status,
                            remaining_samples_per_cluster,
                        )
                        return

    # Extract samples based on relative distance to centroids and handle membership conflicts
    for i in range(n_clusters):
        cluster_indices = np.arange(df_scaled.shape[0])  # Select all points
        if len(cluster_indices) > 0:
            # Use distances from the d matrix
            distances = d[i, cluster_indices]
            # Sort indices by distance in ascending order
            sorted_indices = cluster_indices[np.argsort(distances)]
            # Extract samples
            for idx in sorted_indices:
                if remaining_samples_per_cluster[i] <= 0:
                    break
                if sampled_cluster[idx] == -1:
                    # Sample not yet extracted, directly sample
                    sampled_indices.add(idx)
                    sampled_cluster[idx] = i
                    # Mark only this cluster's column. Marking the whole row
                    # would hide the point from every other cluster's
                    # resample_cluster pass, so no cluster could reclaim it.
                    sample_status[idx, i] = 1  # Sampled
                    remaining_samples_per_cluster[i] -= 1
                else:
                    # Sample already extracted by another cluster, handle membership conflict
                    current_cluster = sampled_cluster[idx]
                    if (
                        fuzzy_membership[idx, i]
                        > fuzzy_membership[idx, current_cluster]
                    ):
                        sampled_cluster[idx] = i
                        sample_status[idx, current_cluster] = (
                            2  # Reclassified after sampling
                        )
                        sample_status[idx, i] = 1  # Sampled
                        remaining_samples_per_cluster[i] -= 1
                        remaining_samples_per_cluster[current_cluster] += 1
                        # Resample the cluster from which the sample was removed
                        resample_cluster(
                            current_cluster,
                            sampled_indices,
                            sampled_cluster,
                            sample_status,
                            remaining_samples_per_cluster,
                        )

    # Get the sampled data
    sampled_df = df.loc[list(sampled_indices)].copy()
    # Add the class column
    sampled_df["class"] = class_df.loc[sampled_df.index, "class"]
    return sampled_df


# Directory that receives every generated sample CSV; create it when missing
save_path = "sampleddata/combined_samples/"
os.makedirs(name=save_path, exist_ok=True)


# Define the function to generate sample sets in parallel
def parallel_generate_samples(sample_size, i):
    """
    Generate one FCM type 1 sample set and write it to a CSV file.

    Parameters:
    - sample_size: Number of rows requested for the sample set
    - i: Zero-based replicate index; the file name and log message use i + 1
    """
    # Draw a fresh seed for this replicate
    random_state = np.random.randint(0, 10000)
    csv_path = os.path.join(
        save_path, f"FCMtp1sampled_data_{sample_size}_set_{i+1}.csv"
    )
    generate_samples(sample_size, random_state).to_csv(csv_path)
    print(
        f"Generated sample set {i+1} for sample size {sample_size} with random state {random_state}"
    )


# For each sample size, run all replicates through a 10-worker joblib pool
for sample_size in sample_sizes:
    replicate_jobs = [
        delayed(parallel_generate_samples)(sample_size, i)
        for i in range(num_samples_per_size)
    ]
    Parallel(n_jobs=10, verbose=10)(replicate_jobs)

print("All sample sets generated and saved.")

The cells from here on can be run independently of the cells above. This is the model construction section.

In [None]:
import pandas as pd
import os
from sklearn.utils import resample

# Read the original dataset (covariates plus the "class" column)
class_df = pd.read_csv("sampleddata/class_df.csv")

# Define sample levels and sampling methods
sample_levels = [1000, 5000, 10000, 20000]

# File-name prefixes of the pre-generated sample sets, one per sampling method
sampling_methods = [
    "BalancedSampling_sampled_data",
    "clhs_sampled_data",
    "FCMtp1sampled_data",
    "FCMtp2sampled_data",
    "FSCS_sampled_data",
    "kmeans_sampled_data",
]

# Define covariates and target variable
covariates = [
    "elevation",
    "aspect",
    "slope",
    "horizontal_distance_to_hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
target = "class"

# Maps "<method>_<level>_set_<t>" to (X_train, y_train, X_test, y_test)
train_test_data = {}

# Iterate through each sample level and sampling method
for level in sample_levels:
    for t in range(1, 21):
        for method in sampling_methods:
            sample_file = f"sampleddata/combined_samples/{method}_{level}_set_{t}.csv"

            # Missing sample sets are reported and skipped rather than treated as errors
            if not os.path.exists(sample_file):
                print(f"File {sample_file} does not exist. Skipping...")
                continue

            # The first CSV column is read back as the index and used below
            # as the row labels to exclude from the test set
            sample_data = pd.read_csv(sample_file, index_col=0)
            X_train = sample_data[covariates]
            y_train = sample_data[target]
            train_indices = sample_data.index

            # Generate the corresponding test set: every row not in the training sample
            test_data = class_df.drop(train_indices)
            X_test = test_data[covariates]
            y_test = test_data[target]

            # Store training and testing sets
            train_test_data[f"{method}_{level}_set_{t}"] = (
                X_train,
                y_train,
                X_test,
                y_test,
            )

# Generate SRS sample subsets
for level in sample_levels:
    for t in range(1, 21):
        # Perform simple random sampling WITHOUT replacement. resample()
        # defaults to replace=True (a bootstrap), which would put duplicate
        # rows in the training set and give fewer distinct rows than the
        # other sampling methods at the same level.
        srs_sample = resample(
            class_df, replace=False, n_samples=level, random_state=42 + t
        )
        X_train = srs_sample[covariates]
        y_train = srs_sample[target]
        train_indices = srs_sample.index

        # Generate the corresponding test set: every row not in the training sample
        test_data = class_df.drop(train_indices)
        X_test = test_data[covariates]
        y_test = test_data[target]

        # Store training and testing sets
        train_test_data[f"SRS_sampled_df_{level}_set_{t}"] = (
            X_train,
            y_train,
            X_test,
            y_test,
        )

print("All training and testing sets generated and saved.")

In [None]:
# Clear unnecessary variables to free up memory.
import gc

# Remove each temporary by name. pop() with a default tolerates names that
# were never bound (e.g. `sample_data` when every sample file was skipped),
# where a plain `del` would raise NameError and abort the cell.
for _name in (
    "class_df",
    "sample_data",
    "X_train",
    "y_train",
    "X_test",
    "y_test",
    "train_indices",
    "test_data",
    "srs_sample",
):
    globals().pop(_name, None)
del _name

gc.collect()

In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
from joblib import Parallel, delayed


# Optuna objective: score one random-forest configuration by cross-validation
def objective(trial, X_train, y_train):
    """
    Evaluate one Optuna trial of random-forest hyperparameters.

    Parameters:
    trial: Optuna Trial object used to sample the hyperparameters.
    X_train: Training feature data.
    y_train: Training label data.

    Returns:
    Mean accuracy over the 10 cross-validation folds.
    """

    # Search space; the suggest_* calls run in the order they are listed here
    params = {
        # Number of trees
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        # Maximum tree depth, sampled on a log scale
        "max_depth": trial.suggest_int("max_depth", 2, 32, log=True),
        # Minimum samples required to split an internal node
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 16),
        # Minimum samples required at a leaf node
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 16),
        # Rule for the number of features considered at each split
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
    }

    # Fixed random seed so the same hyperparameters always build the same forest
    forest = RandomForestClassifier(random_state=42, **params)

    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=10, scoring="accuracy"
    )
    return fold_scores.mean()


# Hyperparameter optimization for one dataset (called once per dataset, in parallel)
def optimize_and_evaluate(key, X_train, y_train):
    """
    Run the Optuna hyperparameter search for one dataset.

    Parameters:
    key: Name of the dataset.
    X_train: Training feature data.
    y_train: Training label data.

    Returns:
    dict with "dataset" (the key) and "best_params" (the best hyperparameters
    found by the study).
    """

    # The TPE sampler is created without a seed, so the trials (and therefore
    # the best parameters) can differ from run to run.
    study = optuna.create_study(direction="maximize", sampler=TPESampler())

    # 100 trials, up to 6 running concurrently inside this study. This function
    # is itself dispatched through joblib with n_jobs=6, so the two levels of
    # parallelism multiply.
    study.optimize(
        lambda trial: objective(trial, X_train, y_train), n_trials=100, n_jobs=6
    )

    # Only the parameters are reported. No final model is fitted here because
    # nothing was kept from it; the analysis step trains (or loads) the model
    # from best_params itself.
    return {
        "dataset": key,
        "best_params": study.best_params,
    }


# Use joblib to parallelize processing for each dataset.
# Each value of train_test_data is (X_train, y_train, X_test, y_test); only the
# training part is needed for tuning, so the test part is discarded here
# (unpacking the 4-tuple into two names would raise ValueError).
results = Parallel(n_jobs=6, verbose=30)(
    delayed(optimize_and_evaluate)(key, X_train, y_train)
    for key, (X_train, y_train, _X_test, _y_test) in train_test_data.items()
)

# Convert results to DataFrame (one row per dataset) and print
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Persist the tuned hyperparameters; the analysis cells reload this file.
results_df.to_csv("rfresults.csv", index=False)

The cells below can be run independently of the cells above. This is the comparative analysis section, which includes SHAP analysis and model performance evaluation.

In [None]:
import pandas as pd
import os
from sklearn.utils import resample

# Read the original dataset; the first CSV column is used as the row index
class_df = pd.read_csv("sampleddata/class_df.csv", index_col=0)

# Sample sizes and the file-name prefixes of the pre-generated sampling methods
sample_levels = [1000, 5000, 10000, 20000]

sampling_methods = [
    "BalancedSampling_sampled_data",
    "clhs_sampled_data",
    "FCMtp1sampled_data",
    "FCMtp2sampled_data",
    "FSCS_sampled_data",
    "kmeans_sampled_data",
]

# Covariates (model inputs) and target variable
covariates = [
    "elevation",
    "aspect",
    "slope",
    "horizontal_distance_to_hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
target = "class"

# Each value of train_test_data is the 4-tuple (X_train, y_train, X_test, y_test),
# keyed by the dataset name.
train_test_data = {}

# Pre-generated samples: one CSV per sampling method, sample level and repetition
for level in sample_levels:
    for t in range(1, 21):
        for method in sampling_methods:
            sample_file = f"sampleddata/combined_samples/{method}_{level}_set_{t}.csv"

            if not os.path.exists(sample_file):
                print(f"File {sample_file} does not exist. Skipping...")
                continue

            # The first CSV column becomes the index; those labels are the rows
            # removed from class_df below to form the test set.
            sample_data = pd.read_csv(sample_file, index_col=0)
            X_train = sample_data[covariates]
            y_train = sample_data[target]
            train_indices = sample_data.index

            # Test set = every row of class_df that is not in the training sample
            test_data = class_df.drop(train_indices)
            X_test = test_data[covariates]
            y_test = test_data[target]

            train_test_data[f"{method}_{level}_set_{t}"] = (
                X_train,
                y_train,
                X_test,
                y_test,
            )

# Simple random sampling (SRS) baseline, drawn here instead of read from disk
for level in sample_levels:
    for t in range(1, 21):
        # sklearn's resample() defaults to replace=True (a bootstrap draw with
        # duplicated rows). replace=False gives `level` distinct rows, which is
        # what a simple random sample compared against the other designs needs.
        # NOTE: SRS sets must be drawn with the same seed and the same `replace`
        # setting wherever they are regenerated; tuning results or cached models
        # produced from a different draw have to be recomputed.
        srs_sample = resample(
            class_df, replace=False, n_samples=level, random_state=42 + t
        )
        X_train = srs_sample[covariates]
        y_train = srs_sample[target]
        train_indices = srs_sample.index

        # Test set = every row of class_df that is not in the training sample
        test_data = class_df.drop(train_indices)
        X_test = test_data[covariates]
        y_test = test_data[target]

        train_test_data[f"SRS_sampled_df_{level}_set_{t}"] = (
            X_train,
            y_train,
            X_test,
            y_test,
        )

print("All training and testing sets generated and saved.")

In [None]:
# Delete unnecessary variables to free memory.
# Only objects that train_test_data does not reference are actually reclaimed;
# the stored train/test frames stay alive through that dictionary.
import gc

for _name in (
    "class_df",
    "sample_data",
    "X_train",
    "y_train",
    "X_test",
    "y_test",
    "train_indices",
    "test_data",
    "srs_sample",
):
    # pop() instead of `del`: a plain `del` raises NameError when this cell is
    # re-run, or when every sample file was skipped and sample_data was never
    # bound.
    globals().pop(_name, None)

gc.collect()

In [None]:
# Read the saved DataFrame file (one row per dataset: "dataset", "best_params").
# After the CSV round trip, "best_params" holds the dict as text, not as a dict.
results_df = pd.read_csv("rfresults.csv")

In [None]:
import joblib
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
import fasttreeshap

# Create a dictionary to store models
# NOTE(review): `models` is not referenced by the cells visible here — confirm
# whether it is still needed.
models = {}

# Directory (relative to the working directory) where fitted models and SHAP
# artifacts are written as .pkl files
save_path = "codepart/RFpkl/"
# Ensure the save path exists
os.makedirs(save_path, exist_ok=True)


def is_file_valid(filename):
    """
    Check whether a saved artifact can be opened and loaded with joblib.

    Parameters:
    filename: Path of the file to check.

    Returns:
    True if the file opened and loaded without error, otherwise False (the
    failure reason is printed).
    """
    try:
        with open(filename, "rb") as handle:
            # The loaded object is discarded; only success or failure matters
            joblib.load(handle)
    except Exception as e:
        print(f"File {filename} is invalid: {e}")
        return False
    else:
        return True


def _load_if_valid(filename):
    """
    Load a joblib artifact once and report whether it was usable.

    Returns:
    (True, obj) if the file exists and loads, otherwise (False, None). A file
    that exists but fails to load is reported on stdout.
    """
    if not os.path.exists(filename):
        return False, None
    try:
        return True, joblib.load(filename)
    except Exception as e:  # any load failure means the artifact gets rebuilt
        print(f"File {filename} is invalid: {e}")
        return False, None


def process_dataset(index, row):
    """
    Make sure the fitted model and the SHAP artifacts of one dataset exist on disk.

    Missing or unreadable artifacts are recomputed and written to save_path;
    artifacts that load correctly are left untouched.

    Parameters:
    index: Row label in results_df (not used).
    row: Row of results_df with "dataset" (key into train_test_data) and
         "best_params" (a dict, or its text form as read back from CSV).

    Returns:
    None. The results are the files written to save_path.
    """
    import ast

    dataset = row["dataset"]

    # best_params comes back from the CSV as text; literal_eval parses the dict
    # literal without executing arbitrary code (unlike eval).
    best_params = row["best_params"]
    if isinstance(best_params, str):
        best_params = ast.literal_eval(best_params)

    # Only the training part is needed: the model is fitted on it and the SHAP
    # values are computed on it.
    X_train, y_train, _, _ = train_test_data[dataset]

    # File paths
    model_filename = os.path.join(save_path, f"{dataset}_model.pkl")
    shap_values_filename_v2 = os.path.join(save_path, f"{dataset}_shap_values_v2.pkl")
    shap_interaction_values_filename_v1 = os.path.join(
        save_path, f"{dataset}_shap_interaction_values_v1.pkl"
    )
    shap_explainer_v2_filename = os.path.join(
        save_path, f"{dataset}_shap_explainer_v2.pkl"
    )
    shap_explainer_v1_filename = os.path.join(
        save_path, f"{dataset}_shap_explainer_v1.pkl"
    )

    # Each artifact is read at most once. The SHAP arrays are loaded only to
    # validate the files, so no reference to them is kept.
    model_ok, clf = _load_if_valid(model_filename)
    shap_ok = _load_if_valid(shap_values_filename_v2)[0]
    interactions_ok = _load_if_valid(shap_interaction_values_filename_v1)[0]

    if model_ok and shap_ok and interactions_ok:
        print(f"{dataset} has already been processed, loading files directly.")

    if not model_ok:
        # Train the model with the best hyperparameters
        clf = RandomForestClassifier(**best_params, random_state=42)
        clf.fit(X_train, y_train)
        # Save only a freshly trained model; a model that was just loaded from
        # this file does not need to be re-compressed and rewritten.
        joblib.dump(clf, model_filename, compress=9)

    if not shap_ok:
        # SHAP values on X_train (feature importance), fasttreeshap algorithm "v2"
        shap_explainer_v2 = fasttreeshap.TreeExplainer(clf, algorithm="v2", n_jobs=-1)
        shap_values_v2 = shap_explainer_v2(X_train).values
        # Save SHAP values
        joblib.dump(shap_values_v2, shap_values_filename_v2, compress=9)
        # Save SHAP explainer
        joblib.dump(shap_explainer_v2, shap_explainer_v2_filename, compress=9)

    if not interactions_ok:
        # SHAP interaction values on X_train, fasttreeshap algorithm "v1"
        shap_explainer_v1 = fasttreeshap.TreeExplainer(clf, algorithm="v1", n_jobs=-1)
        shap_interaction_values_v1 = shap_explainer_v1(
            X_train, interactions=True
        ).values
        joblib.dump(
            shap_interaction_values_v1,
            shap_interaction_values_filename_v1,
            compress=9,
        )
        joblib.dump(shap_explainer_v1, shap_explainer_v1_filename, compress=9)

    # Final print statement
    print(f"Finished {dataset}")


# Fan the datasets out over all available cores; verbose=50 makes joblib log
# progress messages while the tasks run.
dataset_tasks = (
    delayed(process_dataset)(row_label, record)
    for row_label, record in results_df.iterrows()
)
with Parallel(n_jobs=-1, verbose=50) as parallel:
    parallel(dataset_tasks)

print("All datasets have been processed.")

In [None]:
import joblib
import os
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
from sklearn.metrics import accuracy_score, f1_score

# Specify the save path: directory (relative to the working directory) from
# which the fitted model .pkl files are loaded
save_path = "codepart/RFpkl/"
# Ensure the save path exists
os.makedirs(save_path, exist_ok=True)


def calculate_roc(index, row):
    """
    Score the saved model of one dataset on its test set.

    Parameters:
    index: Row label in results_df; returned unchanged so the caller can write
           the metrics back to the right row.
    row: Row of results_df; only "dataset" (key into train_test_data) is read.

    Returns:
    (index, roc_auc, accuracy, f1). The three metrics are None when no model
    file exists for the dataset.
    """
    import numpy as np

    dataset = row["dataset"]

    # Only the test part of the stored split is needed here
    _, _, X_test, y_test = train_test_data[dataset]

    # File path
    model_filename = os.path.join(save_path, f"{dataset}_model.pkl")

    if not os.path.exists(model_filename):
        print(f"Model file for {dataset} does not exist, skipping.")
        return index, None, None, None

    print(f"{dataset} has already been processed, loading file directly.")
    clf = joblib.load(model_filename)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)

    # Calculate ROC AUC score
    if len(y_test.unique()) == 2:
        roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    else:
        # predict_proba has one column per class seen during training. A small
        # training sample can miss a rare class that does occur in the test
        # set; roc_auc_score then rejects the column-count mismatch. Give the
        # unseen classes a zero-probability column so the columns line up with
        # the sorted union of labels (rows still sum to 1).
        labels = np.union1d(clf.classes_, y_test.unique())
        if len(labels) != y_pred_proba.shape[1]:
            aligned = np.zeros((y_pred_proba.shape[0], len(labels)))
            aligned[:, np.searchsorted(labels, clf.classes_)] = y_pred_proba
            y_pred_proba = aligned
        roc_auc = roc_auc_score(
            y_test, y_pred_proba, multi_class="ovr", labels=labels
        )

    # Calculate accuracy and F1 score (F1 weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    return index, roc_auc, accuracy, f1


# Score every dataset in parallel; each task returns (index, roc_auc, accuracy, f1)
results = Parallel(n_jobs=-1, verbose=50)(
    delayed(calculate_roc)(row_label, record)
    for row_label, record in results_df.iterrows()
)

# Write the metrics back into results_df; datasets without a model file
# (metrics are None) are left untouched
for index, roc_auc, accuracy, f1 in results:
    if roc_auc is None:
        continue
    results_df.loc[index, "roc_auc"] = roc_auc
    results_df.loc[index, "test_accuracy"] = accuracy
    results_df.loc[index, "test_f1"] = f1

print("ROC AUC, accuracy, and F1 score calculation for all datasets is complete.")

In [None]:
# Save the hyperparameters together with the test metrics; the figure cell
# reloads this file.
results_df.to_csv("rfresults_icluROCAUC.csv", index=False)

The cells below can be run independently of the cells above. This section draws the figures.

In [None]:
import pandas as pd

# Read the saved DataFrame file
results_df = pd.read_csv("rfresults_icluROCAUC.csv")

import matplotlib.pyplot as plt
import seaborn as sns

# Dataset names look like "<method>_..._<level>_set_<t>": the first "_"-separated
# token identifies the sampling method and the third-from-last is the sample level.
results_with_methods = results_df.copy()
results_with_methods["sampling_method"] = results_with_methods["dataset"].apply(
    lambda x: x.split("_")[0]
)
results_with_methods["dataset_level"] = results_with_methods["dataset"].apply(
    lambda x: x.split("_")[-3]
)

# (category value in "sampling_method", label shown on the X-axis), left to right.
# The category order is passed to seaborn explicitly so each label is guaranteed
# to sit under its own boxes; relabelling the ticks by position alone mislabels
# the plot as soon as a method is missing or the rows are in a different order.
method_labels = [
    ("BalancedSampling", "Balance\nSampling"),
    ("clhs", "CLHS"),
    ("FCMtp1sampled", "FCM\nClu = class"),
    ("FCMtp2sampled", "FCM\nClu = level"),
    ("FSCS", "FSCS"),
    ("kmeans", "K-means\nClu = class"),
    ("SRS", "SRS"),
]
method_order = [key for key, _ in method_labels]
sampling_methods = [label for _, label in method_labels]
# Methods present in the data but not listed above are appended under their raw name
for key in results_with_methods["sampling_method"].unique():
    if key not in method_order:
        method_order.append(key)
        sampling_methods.append(key)

# (metric column, panel title, Y-axis label, extra boxplot arguments)
# NOTE(review): only the accuracy panel used width=0.6 originally; kept as is.
panels = [
    ("test_accuracy", "Accuracy Boxplot", "Test Accuracy", {"width": 0.6}),
    ("test_f1", "F1 Score Boxplot", "Test F1 Score", {}),
    ("roc_auc", "ROC-AUC Boxplot", "ROC-AUC", {}),
]

# Set font size
plt.rcParams.update({"font.size": 16})

# One panel per metric, stacked vertically
fig, axes = plt.subplots(len(panels), 1, figsize=(10, 25))

for ax, (metric, title, ylabel, box_kwargs) in zip(axes, panels):
    sns.boxplot(
        x="sampling_method",
        y=metric,
        hue="dataset_level",
        order=method_order,
        data=results_with_methods,
        ax=ax,
        **box_kwargs,
    )
    ax.set_title(title, fontsize=20)
    ax.set_xlabel("Sampling Method", fontsize=18, labelpad=20)
    ax.set_ylabel(ylabel, fontsize=18, labelpad=20)
    # Major grid every 0.05 and minor grid every 0.01 on the Y-axis
    ax.grid(True, which="major", axis="y", linestyle="-", linewidth=0.6)
    ax.grid(True, which="minor", axis="y", linestyle="--", linewidth=0.25)
    ax.yaxis.set_major_locator(plt.MultipleLocator(0.05))
    ax.yaxis.set_minor_locator(plt.MultipleLocator(0.01))
    ax.legend(title="Dataset Level")  # Add legend
    # Readable X-axis labels, in the same order as the categories
    ax.set_xticks(range(len(method_order)))
    ax.set_xticklabels(sampling_methods)

plt.tight_layout()
plt.savefig("rfresults_plot.jpg", format="jpg", dpi=800, bbox_inches="tight")
plt.show()

# Median test accuracy, F1 score and ROC-AUC over the repetitions, for each
# sampling method and dataset level
metric_columns = ["test_accuracy", "test_f1", "roc_auc"]
by_method_and_level = results_with_methods.groupby(
    ["sampling_method", "dataset_level"]
)
median_metrics = by_method_and_level[metric_columns].median().reset_index()

# Print median results
print(median_metrics)


# Calculate the percentage difference of each method relative to SRS of the same level
def calculate_percentage_difference(group):
    """
    Express each method's metrics as a percentage difference from the SRS row.

    Parameters:
    group: DataFrame with a "sampling_method" column and the metric columns
           "test_accuracy", "test_f1" and "roc_auc" (one dataset level).

    Returns:
    DataFrame indexed by sampling_method with the three metric columns holding
    (value / SRS value - 1) * 100. All values are NaN when the group has no
    "SRS" row.
    """
    metric_columns = ["test_accuracy", "test_f1", "roc_auc"]
    metrics = group.set_index("sampling_method")[metric_columns]

    srs_rows = group.loc[group["sampling_method"] == "SRS", metric_columns]
    if srs_rows.empty:
        # No baseline for this level. Only the numeric metric columns are
        # blanked: multiplying the whole group by NaN fails on text columns
        # such as dataset_level and would return different columns than the
        # normal branch.
        return metrics * float("nan")

    # Selecting the metric columns before taking the row keeps the baseline numeric
    srs_values = srs_rows.iloc[0]
    return (metrics / srs_values - 1) * 100


# For every dataset level, compare each method's medians with the SRS medians
medians_by_level = median_metrics.groupby("dataset_level")
percentage_diff = medians_by_level.apply(
    calculate_percentage_difference
).reset_index()

# Put the medians and their percentage differences side by side; the
# percentage columns get the "_percentage_diff" suffix
combined_metrics = median_metrics.merge(
    percentage_diff,
    on=["sampling_method", "dataset_level"],
    suffixes=("", "_percentage_diff"),
)

# Print the combined table and save it
print("Combined table results:")
print(combined_metrics)
combined_metrics.to_csv("rfmedian_metrics.csv", index=False)