<a href="https://colab.research.google.com/github/jamile-kellensouza/A-machine-learning-workflow-for-biomedical-tabular-data/blob/main/Agglomerative_HierarchicalClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import spearmanr, pearsonr, pointbiserialr
! pip install gower
import gower
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the Alzheimer's disease CSV from its fixed location under /content.
csv_path = '/content/alzheimers_disease_data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Remove the PatientID and DoctorInCharge columns, then print the transposed
# descriptive statistics (one row per described column).
dropped_cols = ['PatientID', 'DoctorInCharge']
data_fil = data.drop(dropped_cols, axis=1)
summary = data_fil.describe().transpose()
print(summary)

#  Hierarchical Grouping 1 - Euclidean + average

In [None]:
# Hierarchical clustering 1: Euclidean distance + average linkage.
# Produces df (features + cluster label), df_f (features + Diagnosis),
# euclid_dist_matrix, Z, max_d and n_clusters for the evaluation cell below.

# Potentially discriminative features
features = ['MMSE', 'FunctionalAssessment', 'ADL', 'MemoryComplaints', 'BehavioralProblems']
df = data_fil[features].copy()
df_f = data_fil[features + ['Diagnosis']].copy()  # features + label, used for the purity calculation

# Standardise only MMSE, FunctionalAssessment and ADL; the two remaining
# columns are passed to the distance computation unscaled.
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[['MMSE','FunctionalAssessment','ADL']] = scaler.fit_transform(df_scaled[['MMSE','FunctionalAssessment','ADL']])

# Euclidean distance: condensed vector for linkage(), square matrix for the
# intra/inter-cluster metrics.
euclid_dist = pdist(df_scaled, metric='euclidean')
euclid_dist_matrix = squareform(euclid_dist)

print(f"Matriz de distância Euclidiana calculada com formato: {euclid_dist_matrix.shape}")

# ==============================================================
# Hierarchical Grouping 1

# Average linkage on the condensed distance vector
Z = linkage(euclid_dist, method='average')

max_d = 2.7  # dendrogram height at which the tree is cut into flat clusters
df['Cluster_Hierarquico_Eucli'] = fcluster(Z, max_d, criterion='distance')

n_clusters = df['Cluster_Hierarquico_Eucli'].nunique()
print(f"Clusters formados: {n_clusters}")
display(df.head())

# Dendrogram (all links drawn in black; red line marks the cutoff)
plt.figure(figsize=(7, 5))
dendrogram(Z, labels=df.index, leaf_rotation=90, color_threshold=0,
           above_threshold_color='k')
plt.axhline(y=max_d, color='r', linestyle='solid', label=f'Corte = {max_d}')
plt.title("[A] Dendrogram - Euclidean distance")
plt.xlabel("Clusters")
plt.ylabel("Distance")
plt.legend()  # without this the cutoff label passed to axhline is never shown
plt.show()

In [None]:
# Evaluate the Euclidean / average-linkage clustering: mean intra-cluster
# distance, mean inter-cluster distance, and purity against 'Diagnosis'.

# Positional row numbers of each cluster's members. Positions (not index
# labels) are required because they address rows/columns of euclid_dist_matrix.
eucl_labels = df['Cluster_Hierarquico_Eucli'].to_numpy()
cluster_ids = range(1, n_clusters + 1)
members = {c: np.flatnonzero(eucl_labels == c) for c in cluster_ids}

# intra - cluster distance
intra_dist = []
for cluster in cluster_ids:
    indices = members[cluster]
    if len(indices) > 1:
        sub_dist = euclid_dist_matrix[np.ix_(indices, indices)]
        # Mean over all ordered pairs i != j (diagonal removed via the trace).
        mean_intra = (sub_dist.sum() - np.trace(sub_dist)) / (len(indices) * (len(indices) - 1))
        intra_dist.append(mean_intra)
    else:
        # A singleton has no pairwise distance. Record NaN (as the Hamming
        # section does) so it does not pull the average toward zero.
        intra_dist.append(np.nan)

print(f"Intra-cluster distances per cluster: {intra_dist}")
print(f"Average intra-cluster distances: {np.nanmean(intra_dist):.3f}")

# ==============================================================
# inter - cluster distance: mean pairwise distance for every pair of clusters

inter_dist = [
    euclid_dist_matrix[np.ix_(members[c1], members[c2])].mean()
    for c1 in cluster_ids
    for c2 in range(c1 + 1, n_clusters + 1)
]
# With a single cluster there are no pairs; report NaN instead of averaging an empty list.
mean_inter = np.mean(inter_dist) if inter_dist else np.nan

print(f"Inter-cluster distances between clusters: {inter_dist}")
print(f"Average inter-cluster distances: {mean_inter:.3f}")

# purity calculation
df_f['Cluster_Hierarquico_Eucli'] = df['Cluster_Hierarquico_Eucli']
if 'Diagnosis' in df_f.columns:
    total = len(df_f)
    purity_sum = 0
    cluster_purity = {}

    for cluster in cluster_ids:
        in_cluster = df_f['Cluster_Hierarquico_Eucli'] == cluster
        true_labels = df_f.loc[in_cluster, 'Diagnosis']

        if len(true_labels) == 0:
            continue

        # Size of the majority class inside this cluster.
        most_common = true_labels.value_counts().max()
        cluster_purity[cluster] = most_common / len(true_labels)

        purity_sum += most_common

    # Overall purity: share of samples that belong to their cluster's majority class.
    purity_total = purity_sum / total

    print("Individual purity per cluster:")
    for c, p in cluster_purity.items():
        print(f"  Cluster {c}: {p:.3f}")

    print(f"\nTotal purity: {purity_total:.3f}")

else:
    print("No true label ('Diagnosis') was found for purity calculation.")

#  Hierarchical Grouping 2 - Gower + complete

In [None]:
# Hierarchical clustering 2: Gower distance + complete linkage.
# Produces df1 (features + cluster label), df_f1 (features + Diagnosis),
# gower_dist (square matrix), Z, max_d and n_clusters for the evaluation cell.

# Potentially discriminative features
features = ['MMSE', 'FunctionalAssessment', 'ADL', 'MemoryComplaints', 'BehavioralProblems']
df1 = data_fil[features].copy()
df_f1 = data_fil[features + ['Diagnosis']].copy()  # features + label, used for the purity calculation

# Gower distance (square n x n matrix; kept for the intra/inter-cluster metrics)
gower_dist = gower.gower_matrix(df1)

print(f" Matriz de distância de Gower calculada com formato: {gower_dist.shape}")

# ==============================================================
# Hierarchical Grouping 2

# linkage() interprets a 2-D array as raw observation vectors and would
# cluster the *rows of the distance matrix* with Euclidean distance. Convert
# the square matrix to condensed form so the tree is built on the Gower
# distances themselves. checks=False skips the symmetry/zero-diagonal
# validation, which floating-point round-off in the matrix could trip.
gower_condensed = squareform(gower_dist, checks=False)

# complete linkage
Z = linkage(gower_condensed, method='complete')

# NOTE(review): the previous hard-coded cutoff (17.5) was on the scale of the
# mis-computed linkage and would now yield a single cluster. Until a cutoff is
# re-tuned from the corrected dendrogram, cut at 70% of the highest merge
# (the same rule dendrogram() uses for its default colour threshold); this
# always leaves at least two clusters.
max_d = 0.7 * float(Z[:, 2].max())
df1['Cluster_Hierarquico_gower'] = fcluster(Z, max_d, criterion='distance')

n_clusters = df1['Cluster_Hierarquico_gower'].nunique()
print(f"Clusters formados: {n_clusters}")
display(df1.head())


# Dendrogram (all links drawn in black; red line marks the cutoff)
plt.figure(figsize=(7, 5))
dendrogram(Z, labels=df1.index, leaf_rotation=90,color_threshold=0,
           above_threshold_color='k')
plt.axhline(y=max_d, color='r', linestyle='solid', label=f'Corte = {max_d:.3f}')
plt.title("[B] Dendrogram - Gower distance")
plt.xlabel("Clusters")
plt.ylabel("Distance")
plt.legend()  # without this the cutoff label passed to axhline is never shown
plt.show()

In [None]:
# Evaluate the Gower / complete-linkage clustering: mean intra-cluster
# distance, mean inter-cluster distance, and purity against 'Diagnosis'.

# Positional row numbers of each cluster's members. Positions (not index
# labels) are required because they address rows/columns of gower_dist.
gower_labels = df1['Cluster_Hierarquico_gower'].to_numpy()
cluster_ids = range(1, n_clusters + 1)
members = {c: np.flatnonzero(gower_labels == c) for c in cluster_ids}

# intra - cluster distance
intra_dist = []
for cluster in cluster_ids:
    indices = members[cluster]
    if len(indices) > 1:
        sub_dist = gower_dist[np.ix_(indices, indices)]
        # Mean over all ordered pairs i != j (diagonal removed via the trace).
        mean_intra = (sub_dist.sum() - np.trace(sub_dist)) / (len(indices)*(len(indices)-1))
        intra_dist.append(mean_intra)
    else:
        # A singleton has no pairwise distance. Record NaN (as the Hamming
        # section does) so it does not pull the average toward zero.
        intra_dist.append(np.nan)
print(f"Intra-cluster distance per cluster: {intra_dist}")
print(f"Average intra-cluster distance: {np.nanmean(intra_dist):.3f}")

# ==============================================================
# inter - cluster distance: mean pairwise distance for every pair of clusters

inter_dist = [
    gower_dist[np.ix_(members[c1], members[c2])].mean()
    for c1 in cluster_ids
    for c2 in range(c1 + 1, n_clusters + 1)
]
# With a single cluster there are no pairs; report NaN instead of averaging an empty list.
mean_inter = np.mean(inter_dist) if inter_dist else np.nan
print(f"Inter-cluster distances between clusters: {inter_dist}")
print(f"Average inter-cluster distance: {mean_inter:.3f}")

# purity calculation
df_f1['Cluster_Hierarquico_gower'] = df1['Cluster_Hierarquico_gower']
if 'Diagnosis' in df_f1.columns:
    total = len(df_f1)
    purity_sum = 0
    cluster_purity = {}

    for cluster in cluster_ids:
        in_cluster = df_f1['Cluster_Hierarquico_gower'] == cluster
        true_labels = df_f1.loc[in_cluster, 'Diagnosis']

        if len(true_labels) == 0:
            continue

        # Size of the majority class inside this cluster.
        most_common = true_labels.value_counts().max()
        cluster_purity[cluster] = most_common / len(true_labels)

        purity_sum += most_common

    # Overall purity: share of samples that belong to their cluster's majority class.
    purity_total = purity_sum / total


    print("Individual purity per cluster")
    for c, p in cluster_purity.items():
        print(f"  Cluster {c}: {p:.3f}")

    print(f"\nTotal purity: {purity_total:.3f}")

else:
    print("No true label ('Diagnosis') was found for purity calculation.")


#  Hierarchical Grouping 3 - Hamming + complete

In [None]:
# Hierarchical clustering 3: Hamming distance + complete linkage.
# Produces df2 (features + cluster label), df_f2 (features + Diagnosis),
# dist_square, Z, max_d and n_clusters for the evaluation cell below.

# Potentially discriminative features
features = ['MemoryComplaints', 'BehavioralProblems']
df2 = data_fil[features].copy()
df_f2 = data_fil[features + ['Diagnosis']].copy()  # features + label, used for the purity calculation

# Ensure that binary data is an integer
df2 = df2.astype(int)

# ==============================
# Hamming Distance: condensed vector for linkage(), square matrix for the
# intra/inter-cluster metrics.

dist_hamming = pdist(df2, metric='hamming')
dist_square = squareform(dist_hamming)

# ==============================================================
# Hierarchical Grouping 3

# Complete linkage

Z = linkage(dist_hamming, method='complete')

max_d = 0.8  # dendrogram height at which the tree is cut into flat clusters
df2['Cluster_Hierarquico_Hamm'] = fcluster(Z, max_d, criterion='distance')

n_clusters = df2['Cluster_Hierarquico_Hamm'].nunique()
print(f"Clusters formados: {n_clusters}")
display(df2.head())

# Dendrogram (truncated with lastp / p=1200; all links drawn in black,
# red line marks the cutoff)

plt.figure(figsize=(7,5))
dendrogram(Z, labels=df2.index, leaf_rotation=90,truncate_mode='lastp', p=1200,
           color_threshold=0,above_threshold_color='k')
plt.axhline(y=max_d, color='r', linestyle='solid', label=f'Corte = {max_d}')
plt.title("[C] Dendrogram - Hamming distance")
plt.xlabel("Clusters")
plt.ylabel("Distance")
plt.legend()  # without this the cutoff label passed to axhline is never shown
plt.show()


In [None]:
# Evaluate the Hamming / complete-linkage clustering: mean intra-cluster
# distance, mean inter-cluster distance, and purity against 'Diagnosis'.

# Row positions of every cluster's members, computed once and reused below.
hamm_labels = df2['Cluster_Hierarquico_Hamm'].to_numpy()
cluster_ids = range(1, n_clusters + 1)
members = {cid: np.flatnonzero(hamm_labels == cid) for cid in cluster_ids}

# Intra-cluster distance: mean of the upper triangle of each cluster's
# sub-matrix; clusters with fewer than two members contribute NaN.
intra_dist = []
for cid in cluster_ids:
    rows = members[cid]
    if len(rows) <= 1:
        intra_dist.append(np.nan)
        continue
    block = dist_square[np.ix_(rows, rows)]
    upper = block[np.triu_indices(len(rows), k=1)]
    intra_dist.append(np.mean(upper))

print(f"Intra-cluster distance per cluster: {intra_dist}")
print(f"Average intra-cluster distance: {np.nanmean(intra_dist):.3f}")

# Inter-cluster distance: mean of the cross sub-matrix for each cluster pair.
inter_dist = [
    dist_square[np.ix_(members[first], members[second])].mean()
    for first in cluster_ids
    for second in range(first + 1, n_clusters + 1)
]

print(f"Inter-cluster distances between clusters {inter_dist}")
print(f"Average inter-cluster distance: {np.mean(inter_dist):.3f}")

# Purity: attach the cluster labels to the frame that carries 'Diagnosis'.
df_f2['Cluster_Hierarquico_Hamm'] = df2['Cluster_Hierarquico_Hamm']

if 'Diagnosis' not in df_f2.columns:
    print("No true label ('Diagnosis') was found for purity calculation.")
else:
    total = len(df_f2)
    cluster_purity = {}
    purity_sum = 0

    for cid in cluster_ids:
        in_cluster = df_f2['Cluster_Hierarquico_Hamm'] == cid
        true_labels = df_f2.loc[in_cluster, 'Diagnosis']
        if len(true_labels) == 0:
            continue

        # Count of the majority class within this cluster.
        most_common = true_labels.value_counts().max()
        cluster_purity[cid] = most_common / len(true_labels)
        purity_sum += most_common

    purity_total = purity_sum / total

    print("Individual purity per cluster")
    for c, p in cluster_purity.items():
        print(f" Cluster {c}: {p:.3f}")

    print(f"\nTotal purity: {purity_total:.3f}")