In [1]:
from collections import defaultdict

import numpy as np
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

from sklearn.datasets import load_breast_cancer

# Fetch the breast-cancer dataset; only the feature matrix is needed here.
data = load_breast_cancer()
X = data.data

# Spearman rank correlation between every pair of feature columns.
corr = spearmanr(X).correlation

# Average the matrix with its transpose so it is exactly symmetric, then pin
# the diagonal to 1.
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# Strongly correlated features (of either sign) end up at a distance near 0.
distance_matrix = 1 - np.abs(corr)

# Ward linkage is computed from the condensed form of the distance matrix.
condensed_distances = squareform(distance_matrix)
dist_linkage = hierarchy.ward(condensed_distances)

# Cut the tree at distance 1 to obtain one flat cluster label per feature.
cluster_ids = hierarchy.fcluster(dist_linkage, t=1, criterion="distance")

# Group feature column indices by the cluster label they received.
cluster_id_to_feature_ids = defaultdict(list)
for feature_idx, label in enumerate(cluster_ids):
    cluster_id_to_feature_ids[label].append(feature_idx)

# Keep the first member of every cluster as its representative.
selected_features = [members[0] for members in cluster_id_to_feature_ids.values()]

# Look up and display the names of the representative features.
selected_feature_names = data.feature_names[selected_features]
print(selected_feature_names)

['mean radius' 'mean texture' 'mean smoothness' 'mean compactness'
 'texture error']


In [2]:
from collections import defaultdict

import numpy as np
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

from sklearn.datasets import load_breast_cancer

# Load the dataset, this time keeping the class labels alongside the features.
data = load_breast_cancer()
X, y = data.data, data.target

# Spearman rank correlation between every pair of feature columns.
corr = spearmanr(X).correlation

# Force exact symmetry by averaging with the transpose; the diagonal is set to 1.
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# |corr| close to 1 maps to a distance close to 0.
distance_matrix = 1 - np.abs(corr)

# Ward linkage on the condensed form of the distance matrix.
condensed_distances = squareform(distance_matrix)
dist_linkage = hierarchy.ward(condensed_distances)

# Flat cluster labels obtained by cutting the tree at distance 1.
cluster_ids = hierarchy.fcluster(dist_linkage, t=1, criterion="distance")

# Collect the feature indices that belong to each cluster label.
cluster_id_to_feature_ids = defaultdict(list)
for feature_idx, label in enumerate(cluster_ids):
    cluster_id_to_feature_ids[label].append(feature_idx)

# The first feature of each cluster acts as its representative.
selected_features = [members[0] for members in cluster_id_to_feature_ids.values()]

# For every representative, summarise its |correlation| with the rest of its
# own cluster versus with all features that sit in other clusters.
for cluster_id, feature_indices in cluster_id_to_feature_ids.items():
    selected_feature_idx = feature_indices[0]
    selected_feature_name = data.feature_names[selected_feature_idx]
    other_feature_indices_in_cluster = [
        member for member in feature_indices if member != selected_feature_idx
    ]

    # Intra-cluster statistics; NaN marks a cluster with no other members.
    if not other_feature_indices_in_cluster:
        avg_intra_corr = std_intra_corr = np.nan
    else:
        intra_corrs = corr[selected_feature_idx, other_feature_indices_in_cluster]
        abs_intra = np.abs(intra_corrs)
        avg_intra_corr = np.mean(abs_intra)
        std_intra_corr = np.std(abs_intra)

    # Inter-cluster statistics over every feature outside the current cluster.
    other_cluster_feature_indices = []
    for other_id, other_members in cluster_id_to_feature_ids.items():
        if other_id != cluster_id:
            other_cluster_feature_indices.extend(other_members)
    inter_corrs = corr[selected_feature_idx, other_cluster_feature_indices]
    abs_inter = np.abs(inter_corrs)
    avg_inter_corr = np.mean(abs_inter)
    std_inter_corr = np.std(abs_inter)

    print(f"Selected Feature: {selected_feature_name}")
    if np.isnan(avg_intra_corr):
        print("Only feature in its cluster.")
    else:
        print(f"Average intra-cluster correlation: {avg_intra_corr:.3f} ± {std_intra_corr:.3f}")
    print(f"Average inter-cluster correlation: {avg_inter_corr:.3f} ± {std_inter_corr:.3f}")
    print()

Selected Feature: mean radius
Average intra-cluster correlation: 0.848 ± 0.186
Average inter-cluster correlation: 0.338 ± 0.213

Selected Feature: mean texture
Average intra-cluster correlation: 0.909 ± 0.000
Average inter-cluster correlation: 0.254 ± 0.129

Selected Feature: mean smoothness
Average intra-cluster correlation: 0.541 ± 0.133
Average inter-cluster correlation: 0.308 ± 0.171

Selected Feature: mean compactness
Average intra-cluster correlation: 0.829 ± 0.053
Average inter-cluster correlation: 0.461 ± 0.183

Selected Feature: texture error
Average intra-cluster correlation: 0.416 ± 0.027
Average inter-cluster correlation: 0.169 ± 0.119



In [3]:
from scipy.stats import mannwhitneyu

# For each cluster, test whether its representative (first) feature is more
# strongly correlated with the other members of its own cluster than with the
# features in all other clusters. Relies on `cluster_id_to_feature_ids`,
# `corr` and `data` defined by the earlier cells.
# NOTE(review): one test is run per cluster and the p-values are reported
# without any multiple-comparison adjustment — confirm whether one is wanted.
for cluster_id, feature_indices in cluster_id_to_feature_ids.items():
    selected_feature_idx = feature_indices[0]
    selected_feature_name = data.feature_names[selected_feature_idx]
    other_feature_indices_in_cluster = [i for i in feature_indices if i != selected_feature_idx]

    # Absolute correlations with the other members of the same cluster
    # (an empty array when the cluster holds a single feature).
    if other_feature_indices_in_cluster:
        intra_corrs = corr[selected_feature_idx, other_feature_indices_in_cluster]
        intra_corrs_abs = np.abs(intra_corrs)
    else:
        intra_corrs_abs = np.array([])

    # Absolute correlations with every feature that belongs to another cluster.
    other_cluster_feature_indices = [
        i for cid, idxs in cluster_id_to_feature_ids.items() if cid != cluster_id for i in idxs
    ]
    inter_corrs = corr[selected_feature_idx, other_cluster_feature_indices]
    inter_corrs_abs = np.abs(inter_corrs)

    # Always name the feature so that no cluster silently disappears from the
    # report when the test cannot be run for it.
    print(f"Selected Feature: {selected_feature_name}")
    if len(intra_corrs_abs) > 0 and len(inter_corrs_abs) > 0:
        # One-sided test: the alternative is that intra-cluster |correlation|
        # values tend to be larger than inter-cluster ones.
        u_stat, p_value = mannwhitneyu(intra_corrs_abs, inter_corrs_abs, alternative='greater')
        print(f"p-value for higher intra-cluster correlation: {p_value:.3e}")
    else:
        # Either the cluster has a single feature or there is no other cluster,
        # so one of the two samples is empty and the test is not applicable.
        print("Mann-Whitney U test skipped: no intra- or inter-cluster correlations to compare.")
    print()

Selected Feature: mean radius
p-value for higher intra-cluster correlation: 2.237e-05

Selected Feature: mean texture
p-value for higher intra-cluster correlation: 3.448e-02

Selected Feature: mean smoothness
p-value for higher intra-cluster correlation: 3.185e-03

Selected Feature: mean compactness
p-value for higher intra-cluster correlation: 2.330e-07

Selected Feature: texture error
p-value for higher intra-cluster correlation: 2.217e-02



In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import ttest_rel

# Shared settings so that both models are built and scored identically.
forest_params = {"n_estimators": 100, "random_state": 42}
cv_params = {"cv": 5, "scoring": "accuracy"}

# Baseline: random forest trained on every feature.
clf_full = RandomForestClassifier(**forest_params)
scores_full = cross_val_score(clf_full, X, y, **cv_params)

# Reduced model: only the columns chosen as cluster representatives.
X_sel = X[:, selected_features]
clf_sel = RandomForestClassifier(**forest_params)
scores_sel = cross_val_score(clf_sel, X_sel, y, **cv_params)

# Paired t-test over the per-fold accuracies of the two models.
t_stat, p_value = ttest_rel(scores_full, scores_sel)

# Report mean ± standard deviation of the fold accuracies and the test p-value.
print(f"Accuracy with all features: {scores_full.mean():.3f} ± {scores_full.std():.3f}")
print(f"Accuracy with selected features: {scores_sel.mean():.3f} ± {scores_sel.std():.3f}")
print(f"p-value for difference in model accuracy: {p_value:.3f}")

Accuracy with all features: 0.956 ± 0.023
Accuracy with selected features: 0.923 ± 0.028
p-value for difference in model accuracy: 0.065


In [5]:
from scipy.cluster.hierarchy import cophenet

# Cophenetic correlation between the linkage tree and the original pairwise
# feature distances (passed in condensed form); the second return value,
# the cophenetic distances themselves, is not needed.
condensed_distances = squareform(distance_matrix)
coph_corr, _ = cophenet(dist_linkage, condensed_distances)
print(f"Cophenetic Correlation Coefficient: {coph_corr:.3f}")

Cophenetic Correlation Coefficient: 0.672
