Supplementary figures for distribution

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the sampled multiclass dataset; the first CSV column is used as the index.
class_df = pd.read_csv("sampleddata/class_df.csv", index_col=0)

# Name of the column that holds the class label.
target = "class"

# Covariate columns, in the order they are plotted.
covariates = [
    "elevation",
    "aspect",
    "slope",
    "horizontal_distance_to_hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Hand-written x-axis labels (with units), parallel to `covariates`.
feature_names = [
    "Elevation (meters)",
    "Aspect (degrees)",
    "Slope (degrees)",
    "Horizontal Distance to Hydrology (meters)",
    "Vertical Distance to Hydrology (meters)",
    "Horizontal Distance to Roadways (meters)",
    "Hillshade 9 am (0-255)",
    "Hillshade Noon (0-255)",
    "Hillshade 3 pm (0-255)",
    "Horizontal Distance to Fire Points (meters)",
]

# Class values used as tick positions on the categorical axis ...
class_feature = [
    "Aspen",
    "Cottonwood_Willow",
    "Douglas_fir",
    "Krummholz",
    "Lodgepole_Pine",
    "Ponderosa_Pine",
    "Spruce_Fir",
]

# ... and the display label shown for each of those ticks (same order).
class_names = [
    "Aspen",
    "Cottonwood Willow",
    "Douglas Fir",
    "Krummholz",
    "Lodgepole Pine",
    "Ponderosa Pine",
    "Spruce Fir",
]

# --- Figure 1: histogram of the target variable alone ---
plt.figure(figsize=(10, 6))
class_df[target].hist(bins=30, edgecolor="black")
plt.xlabel("Forest Cover Classes")
plt.ylabel("Frequency")
plt.xticks(ticks=class_feature, labels=class_names, rotation=45)
plt.grid(True)
plt.tight_layout()
plt.savefig("multiclass_target_hist.png", dpi=800)
plt.show()

# --- Figure 2: one histogram per covariate, followed by the target ---
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 10))
axes = axes.flatten()
for panel, column, axis_label in zip(axes, covariates, feature_names):
    panel.hist(class_df[column], bins=30, edgecolor="black")
    panel.set_xlabel(axis_label)
    panel.set_ylabel("Frequency")
    panel.grid(True)

# The target goes in the first panel after the covariates.
target_panel = axes[len(covariates)]
target_panel.hist(class_df[target], bins=30, edgecolor="black")
target_panel.set_xlabel("Forest Cover Classes")
target_panel.set_ylabel("Frequency")
target_panel.set_xticks(ticks=class_feature, labels=class_names, rotation=45)
target_panel.grid(True)

# Drop the grid cells that were not used.
for unused_panel in axes[len(covariates) + 1 :]:
    fig.delaxes(unused_panel)

plt.tight_layout()
plt.savefig("multiclass_covariates_hist.png", dpi=800)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the original dataset.
# NOTE: the path previously started with a stray space (" sampleddata/..."),
# which points at a different, non-existent directory.
class_df = pd.read_csv(
    "sampleddata/USKSAT_OpenRefined_cleaned.csv",
    index_col=0,  # Use the first column as the index
)

# Define covariates and target variable
covariates = [
    "Db",
    "Clay",
    "VFS",
    "MS",
    "OC",
    "Silt",
    "COS",
    "FS",
    "Depth.cm_Top",
    "VCOS",
]

target = "Ksat_cmhr"

# Natural log of the target, kept in its own Series so the raw column in
# `class_df` is never overwritten (re-running the cell stays idempotent and no
# log/exp round-off is introduced).
log_target = np.log(class_df[target])

# Manually specify x-axis labels and units (parallel to `covariates`)
feature_names = [
    "Db (g/cm$^3$)",
    "Clay (%)",
    "VFS (%)",
    "MS (%)",
    "OC (%)",
    "Silt (%)",
    "COS (%)",
    "FS (%)",
    "DT (cm)",
    "VCOS (%)",
]

# --- Figure 1: histogram of the log-transformed target ---
plt.figure(figsize=(10, 6))
log_target.hist(bins=30, edgecolor="black")
plt.xlabel("ln(Ks)")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.savefig("log_ks_hist.png", dpi=800)
plt.show()

# --- Figure 2: every covariate plus the (untransformed) target ---
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 10))
axes = axes.flatten()
for i, column in enumerate(covariates):
    axes[i].hist(class_df[column], bins=30, edgecolor="black")
    axes[i].set_xlabel(feature_names[i])
    axes[i].set_ylabel("Frequency")
    axes[i].grid(True)

# The target is drawn in the first panel after the covariates; index it
# explicitly instead of relying on the loop variable leaking out of the loop.
target_ax = axes[len(covariates)]
target_ax.hist(class_df[target], bins=30, edgecolor="black")
target_ax.set_xlabel("Ks (cm/hr)")
target_ax.set_ylabel("Frequency")
target_ax.grid(True)

# Hide extra subplots
for j in range(len(covariates) + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()
plt.savefig("continuous_covariates_hist.png", dpi=800)
plt.show()

In [None]:
from ucimlrepo import fetch_ucirepo

# Download the repository dataset with id 31 and keep the handle as `covertype`.
covertype = fetch_ucirepo(id=31)

# Feature and target tables (pandas DataFrames).
X, y = covertype.data.features, covertype.data.targets

# Print the dataset metadata first, then the per-variable information.
for info in (covertype.metadata, covertype.variables):
    print(info)

In [None]:
from sklearn.datasets import fetch_openml

# Load the Forest CoverType dataset from OpenML as a DataFrame.
forest_cover = fetch_openml(name="covertype", version=1, as_frame=True)
original_df = forest_cover.frame

# Keep only the numeric columns ...
numerical_columns = original_df.select_dtypes(include=["number"]).columns
df = original_df[numerical_columns]

# ... and drop the last four of them.
df = df.iloc[:, :-4]

# Copy of the numeric frame with the class label attached as a "class" column.
class_df = df.copy()
class_df["class"] = forest_cover.frame["class"]

# Display the first few rows of the processed dataset.
# (This preprocessing used to be pasted twice in this cell; the second copy
# rebuilt exactly the same objects and printed the same preview again.)
print(df.head())

# Column names available for building sample combinations.
variables = df.columns.tolist()

# Group variables according to their physical meaning
variable_groups = [
    ["elevation", "aspect", "slope"],  # Terrain features
    [
        "horizontal_distance_to_hydrology",
        "Vertical_Distance_To_Hydrology",
    ],  # Hydrological features
    ["Horizontal_Distance_To_Roadways"],  # Road features
    ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"],  # Hillshade features
    ["Horizontal_Distance_To_Fire_Points"],  # Fire point features
]

# Warn about any grouped variable that is missing from the processed frame.
for group in variable_groups:
    for variable in group:
        if variable not in variables:
            print(f"Warning: {variable} is not in the variables list")

In [None]:
import matplotlib.pyplot as plt
import numpy as np  # used below for histograms; previously only in scope via an earlier cell
import pandas as pd
from scipy.spatial.distance import jensenshannon

# Compare the distribution of every covariate and of the class label between
# the full dataset (`X`, `y`) and the subset (`df`, `class_df`), all of which
# are created by earlier cells. Each panel overlays both histograms and reports
# the Jensen-Shannon divergence between the two distributions.

# Remove extra spaces from column names and convert them to lowercase so both
# frames can be indexed with the same keys.
df.columns = df.columns.str.strip().str.lower()
X.columns = X.columns.str.strip().str.lower()

# Define covariates and feature names (parallel lists)
covariates = [
    "elevation",
    "aspect",
    "slope",
    "horizontal_distance_to_hydrology",
    "vertical_distance_to_hydrology",
    "horizontal_distance_to_roadways",
    "hillshade_9am",
    "hillshade_noon",
    "hillshade_3pm",
    "horizontal_distance_to_fire_points",
]
feature_names = [
    "Elevation (meters)",
    "Aspect (degrees)",
    "Slope (degrees)",
    "Horizontal Distance to Hydrology (meters)",
    "Vertical Distance to Hydrology (meters)",
    "Horizontal Distance to Roadways (meters)",
    "Hillshade 9 am (0-255)",
    "Hillshade Noon (0-255)",
    "Hillshade 3 pm (0-255)",
    "Horizontal Distance to Fire Points (meters)",
]

# Class labels of the full dataset and of the subset.
original = y["Cover_Type"]
sub = class_df["class"]

# Numeric class code (as used by `original`) -> text label (as used by `sub`).
mapping = {
    1: "Spruce_Fir",
    2: "Lodgepole_Pine",
    3: "Ponderosa_Pine",
    4: "Cottonwood_Willow",
    5: "Aspen",
    6: "Douglas_fir",
    7: "Krummholz",
}
sub_mapped = sub.map(
    {v: k for k, v in mapping.items()}
)  # Map text categories back to numbers

# Calculate distributions
original_dist = original.value_counts(normalize=True).sort_index()
sub_dist = sub_mapped.value_counts(normalize=True).sort_index()

# Align indices to ensure categories are consistent. A sorted list is used
# instead of a bare set so the resulting order is deterministic.
all_categories = sorted(set(original_dist.index).union(sub_dist.index))
original_dist = original_dist.reindex(all_categories, fill_value=0)
sub_dist = sub_dist.reindex(all_categories, fill_value=0)

# Sort by subset density from high to low
sorted_indices = sub_dist.sort_values(ascending=False).index
original_dist = original_dist.loc[sorted_indices]
sub_dist = sub_dist.loc[sorted_indices]

# Tick labels derived from the sorted categories, so each bar is always paired
# with its own class name regardless of how the subset densities rank
# (e.g. "Douglas_fir" -> "Douglas Fir").
labelnames = [
    mapping.get(code, str(code)).replace("_", " ").title()
    for code in sorted_indices
]

# scipy's `jensenshannon` returns the Jensen-Shannon *distance*, i.e. the
# square root of the divergence; square it to obtain the divergence that the
# labels and the results table report.
js_divergence = jensenshannon(original_dist, sub_dist) ** 2

# Create subplot layout
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 11))
axes = axes.flatten()

# Store JS divergence results as (feature, divergence) tuples
js_results = []

# Number of equal-width bins used when estimating the divergence.
js_bins = 100000

# Plot distribution comparison histograms for numerical features,
# starting from the first subplot.
for i, (col, feature_name) in enumerate(zip(covariates, feature_names)):
    data1 = X[col].dropna()  # full dataset, missing values removed
    data2 = df[col].dropna()  # subset, missing values removed

    # Histogram both samples on the same edges (taken from the full dataset).
    hist1, bin_edges = np.histogram(data1, bins=js_bins, density=True)
    hist2, _ = np.histogram(data2, bins=bin_edges, density=True)

    # Convert to probability distributions
    prob1 = hist1 / np.sum(hist1)
    prob2 = hist2 / np.sum(hist2)

    # JS divergence = (JS distance) ** 2
    js_div = jensenshannon(prob1, prob2) ** 2
    js_results.append((col, js_div))

    # Twin y-axes: original counts on the left, subset counts on the right.
    ax_left = axes[i]
    ax_right = ax_left.twinx()

    # Coarser, shared bin edges used only for drawing.
    plot_edges = np.histogram_bin_edges(np.concatenate([data1, data2]), bins=30)

    # Plot original dataset distribution on left y-axis
    ax_left.hist(
        data1,
        bins=plot_edges,
        label=("Original Dataset" if col == "elevation" else None),
        color="orange",
        edgecolor="black",
    )
    ax_left.set_ylabel("Original Frequency")

    # Plot subset distribution on right y-axis
    ax_right.hist(
        data2,
        bins=plot_edges,
        alpha=0.8,
        label="Subset Dataset" if col == "elevation" else None,
        color="#1f77b4",
        edgecolor="black",
    )
    ax_right.set_ylabel("Subset Frequency")

    # Show JS divergence in xlabel
    ax_left.set_xlabel(f"{feature_name}\nJSD: {js_div:.4f}")

    # A single combined legend is drawn on the elevation panel only.
    if col == "elevation":
        handles_left, labels_left = ax_left.get_legend_handles_labels()
        handles_right, labels_right = ax_right.get_legend_handles_labels()
        ax_left.legend(
            handles_left + handles_right,
            labels_left + labels_right,
            loc="upper left",
        )
    ax_left.set_axisbelow(False)
    ax_right.set_axisbelow(True)
    ax_left.grid(zorder=3)
    ax_right.grid(linestyle="--", alpha=0.5, zorder=0)

# Plot categorical variable density bar chart (in the 11th subplot)
bar_width = 0.2
x = np.arange(len(labelnames))
ax_left = axes[-2]
ax_right = ax_left.twinx()

# Plot original dataset distribution (frequency) on left y-axis
ax_left.bar(
    x - bar_width / 2,
    original_dist * len(original),
    width=bar_width,
    label="Original Dataset",
    color="orange",
    edgecolor="black",
)
ax_left.set_ylabel("Original Frequency")

# Plot subset distribution (frequency) on right y-axis
ax_right.bar(
    x + bar_width / 2,
    sub_dist * len(sub),
    width=bar_width,
    alpha=0.8,
    label="Subset Dataset",
    color="#1f77b4",
    edgecolor="black",
)
ax_right.set_ylabel("Subset Frequency")

# Set x-axis label and JS divergence
ax_left.set_xlabel(f"Forest Cover Classes\nJSD: {js_divergence:.4f}")
ax_left.set_xticks(ticks=x)
ax_left.set_xticklabels(labelnames, rotation=45)
ax_left.set_axisbelow(True)
ax_right.set_axisbelow(True)
ax_right.grid(linestyle="--", alpha=0.5, zorder=0)
ax_left.grid(zorder=1)

fig.delaxes(axes[-1])  # Remove the last subplot

plt.tight_layout()
plt.savefig("combined_feature_comparison.png", dpi=800)
plt.show()

# Convert JS divergence results to DataFrame for inspection
results_df = pd.DataFrame(js_results, columns=["Feature", "JS Divergence"])
print("\nJ-S Divergence Results:")
print(results_df)