In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
import itertools
import networkx as nx
from sklearn.manifold import SpectralEmbedding
from sklearn.cluster import KMeans

# Download the six UCI Machine Learning Repository datasets used in this notebook.
soybean = fetch_ucirepo(id=91)
zoo = fetch_ucirepo(id=111)
heart_disease = fetch_ucirepo(id=45)
dermatology = fetch_ucirepo(id=33)
breast_cancer = fetch_ucirepo(id=15)
mushroom = fetch_ucirepo(id=73)

# Build one DataFrame per dataset: the feature columns followed by a trailing
# 'target' column. Later cells split features from labels by taking the LAST
# column, so 'target' must stay at the end.
soybean_data = pd.DataFrame(data=soybean.data.features, columns=soybean.data.feature_names)
soybean_data['target'] = soybean.data.targets

zoo_data = pd.DataFrame(data=zoo.data.features, columns=zoo.data.feature_names)
zoo_data['target'] = zoo.data.targets

kokoro_data = pd.DataFrame(data=heart_disease.data.features, columns=heart_disease.data.feature_names)
kokoro_data['target'] = heart_disease.data.targets

dermatology_data = pd.DataFrame(data=dermatology.data.features, columns=dermatology.data.feature_names)
# Fixed: the target used to be assigned into the undefined name
# `dermatology_df`, which raised NameError and left this frame without labels.
dermatology_data['target'] = dermatology.data.targets

breasts_data = pd.DataFrame(data=breast_cancer.data.features, columns=breast_cancer.data.feature_names)
breasts_data['target'] = breast_cancer.data.targets

mushroom_data = pd.DataFrame(data=mushroom.data.features, columns=mushroom.data.feature_names)
mushroom_data['target'] = mushroom.data.targets

def jaccard_coefficient(set1, set2):
    """Return the Jaccard similarity of two sets: |A & B| / |A | B|.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in [0, 1]. Two empty sets have an empty union, so the ratio
        is undefined; 0.0 is returned in that case instead of raising
        ZeroDivisionError.
    """
    union = len(set1.union(set2))
    if union == 0:
        return 0.0
    return len(set1.intersection(set2)) / union

def ochiai_coefficient(set1, set2):
    """Return the Ochiai similarity of two sets: |A & B| / sqrt(|A| * |B|).

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in [0, 1]. If either set is empty the denominator is zero;
        0.0 is returned in that case rather than dividing by zero (which
        would yield nan and a NumPy RuntimeWarning).
    """
    if len(set1) == 0 or len(set2) == 0:
        return 0.0
    intersection = len(set1.intersection(set2))
    return intersection / np.sqrt(len(set1) * len(set2))

def overlap_coefficient(set1, set2):
    """Return the overlap coefficient of two sets: |A & B| / min(|A|, |B|).

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in [0, 1]. If either set is empty the denominator is zero;
        0.0 is returned in that case instead of raising ZeroDivisionError.
    """
    min_length = min(len(set1), len(set2))
    if min_length == 0:
        return 0.0
    return len(set1.intersection(set2)) / min_length

def dice_coefficient(set1, set2):
    """Return the Dice similarity of two sets: 2 * |A & B| / (|A| + |B|).

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in [0, 1]. When both sets are empty the denominator is zero;
        0.0 is returned in that case instead of raising ZeroDivisionError.
    """
    denominator = len(set1) + len(set2)
    if denominator == 0:
        return 0.0
    return 2 * len(set1.intersection(set2)) / denominator

def graph_based_representation(data, n_components=2):
    """Embed the features of `data` using a feature-similarity matrix.

    A symmetric (num_features x num_features) matrix is filled with the
    Jaccard coefficient between the sets of distinct values found in each
    pair of columns (the diagonal is left at 0), and that matrix is passed
    to scikit-learn's SpectralEmbedding.

    Args:
        data: 2-D NumPy array indexed as data[:, column].
        n_components: Dimension of the embedding. The original code read an
            undefined global `p` here; it is now an explicit parameter whose
            default matches SpectralEmbedding's own default.

    Returns:
        The array returned by SpectralEmbedding.fit_transform, with one row
        per feature and `n_components` columns.
    """
    _, num_features = data.shape
    # Build each column's value set once instead of once per pair.
    column_values = [set(data[:, index]) for index in range(num_features)]

    similarity_matrix = np.zeros((num_features, num_features))
    for i, j in itertools.combinations(range(num_features), 2):
        similarity = jaccard_coefficient(column_values[i], column_values[j])
        similarity_matrix[i, j] = similarity
        similarity_matrix[j, i] = similarity

    # NOTE(review): SpectralEmbedding is used with its default affinity, so
    # the rows of `similarity_matrix` are treated as feature vectors rather
    # than as a precomputed affinity matrix. Confirm whether
    # affinity='precomputed' was intended.
    embedding = SpectralEmbedding(n_components=n_components)
    return embedding.fit_transform(similarity_matrix)

def joint_operation(data, representation_matrix):
    """Project `data` onto the representation via a dot product.

    Returns np.dot(data, representation_matrix).
    """
    projected = np.dot(data, representation_matrix)
    return projected

def mean_operation(data, representation_matrix):
    """Project `data` onto the representation and average each row.

    Returns the mean over axis 1 of np.dot(data, representation_matrix).
    """
    projected = np.dot(data, representation_matrix)
    row_means = np.mean(projected, axis=1)
    return row_means

def perform_clustering(data, k):
    """Cluster the rows of `data` into `k` groups with K-Means.

    Args:
        data: Array-like of samples accepted by KMeans.fit_predict.
        k: Number of clusters.

    Returns:
        The cluster label assigned to each row of `data`.
    """
    # n_init is pinned explicitly, matching cluster_data in this notebook:
    # scikit-learn's default for it has changed between releases.
    kmeans = KMeans(n_clusters=k, n_init=10)
    return kmeans.fit_predict(data)


In [None]:
import itertools
import warnings
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.manifold import SpectralEmbedding
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score

def jaccard_coefficient(set1, set2):
    """Return the Jaccard similarity |A & B| / |A | B| of two sets.

    Yields 0 when the union is empty (both sets empty).
    """
    shared = set1 & set2
    combined = set1 | set2
    if len(combined) == 0:
        return 0
    return len(shared) / len(combined)

def create_graph_representation(data, n_components=None):
    """Embed the features of `data` using a feature-similarity matrix.

    A symmetric (num_features x num_features) matrix is filled with the
    Jaccard coefficient between the sets of distinct values found in each
    pair of columns (the diagonal is left at 0), and that matrix is passed
    to scikit-learn's SpectralEmbedding.

    Args:
        data: 2-D NumPy array indexed as data[:, column].
        n_components: Dimension of the embedding. When None, the
            module-level `num_components` is used, as before.

    Returns:
        The array returned by SpectralEmbedding.fit_transform, with one row
        per feature and `n_components` columns.
    """
    if n_components is None:
        # Keep the original behaviour of reading the notebook-level setting.
        n_components = num_components

    _, num_features = data.shape
    # Build each column's value set once instead of once per pair.
    column_values = [set(data[:, index]) for index in range(num_features)]

    similarity_matrix = np.zeros((num_features, num_features))
    for i, j in itertools.combinations(range(num_features), 2):
        similarity = jaccard_coefficient(column_values[i], column_values[j])
        similarity_matrix[i, j] = similarity
        similarity_matrix[j, i] = similarity

    # NOTE(review): SpectralEmbedding is used with its default affinity, so
    # the rows of `similarity_matrix` are treated as feature vectors rather
    # than as a precomputed affinity matrix. Confirm whether
    # affinity='precomputed' was intended.
    embedding = SpectralEmbedding(n_components=n_components)
    return embedding.fit_transform(similarity_matrix)

def apply_joint_operation(data, representation_matrix):
    """Combine `data` with the feature representation via a dot product.

    Returns np.dot(data, representation_matrix).
    """
    integrated = np.dot(data, representation_matrix)
    return integrated

def cluster_data(data, num_clusters):
    """Run K-Means on `data` and return the cluster label of each row.

    `n_init` is fixed at 10 rather than left to the library default.
    """
    model = KMeans(n_init=10, n_clusters=num_clusters)
    assignments = model.fit_predict(data)
    return assignments

# Embedding dimension used by create_graph_representation.
num_components = 10
# Number of clusters requested from K-Means for every dataset.
num_clusters = 3

performance_results = []

# Every value refers to a DataFrame built in the loading cell. Fixed: the
# mapping used to reference the undefined names `dermatology_df`,
# `breast_data` and `mushroom_df`. Each key now equals the variable name of
# its DataFrame, because the following cell looks datasets up by key.
datasets = {
    "soybean_data": soybean_data,
    "zoo_data": zoo_data,
    "kokoro_data": kokoro_data,
    "dermatology_data": dermatology_data,
    "breasts_data": breasts_data,
    "mushroom_data": mushroom_data,
}

for dataset_name, dataset in datasets.items():
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        try:
            # The last column holds the labels; everything before it is a feature.
            target_column = dataset.columns[-1]
            X = dataset.drop(columns=[target_column])
            y = dataset[target_column]

            encoder = OneHotEncoder()
            X_encoded = encoder.fit_transform(X)
            # Densify once and reuse the array for both steps below.
            X_dense = X_encoded.toarray()

            representation_matrix = create_graph_representation(X_dense)
            integrated_data = apply_joint_operation(X_dense, representation_matrix)
            labels = cluster_data(integrated_data, num_clusters)

            ARI = adjusted_rand_score(y, labels)
            NMI = normalized_mutual_info_score(y, labels)
            FMI = fowlkes_mallows_score(y, labels)

            performance_results.append([dataset_name, ARI, NMI, FMI])
        except UserWarning as e:
            # Only reached if a UserWarning is raised as an exception;
            # warnings issued through the warnings module are ignored above.
            print(f"Warning: {e}")

results_df = pd.DataFrame(performance_results, columns=["Dataset", "ARI", "NMI", "FMI"])
print(results_df)


ARI, NMI, and FMI are measures of clustering quality: each one compares the predicted cluster labels with the true class labels. The Adjusted Rand Index (ARI) corrects for agreement that would occur by random chance and works well when clusters are similar in size, but it can be complicated to compute and interpret. Normalized Mutual Information (NMI) measures how much information the two labelings share, so it copes well with clusters of different sizes, but it can be somewhat hard to interpret. The Fowlkes-Mallows Index (FMI) is simpler and faster: it balances how well clusters group together (precision) against how completely they do so (recall), but it is less reliable when cluster sizes differ greatly. In short, use ARI for balanced clusters, NMI for clusters of varied sizes, and FMI for quick, simple checks.

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score
from kmodes.kmodes import KModes
from sklearn.preprocessing import LabelEncoder

# Number of clusters requested from both clustering methods in the loop below.
k = 3
def preprocess_and_encode_categorical_data(data):
    """Label-encode every column of the DataFrame `data`.

    A single LabelEncoder is refit on each column in turn through
    DataFrame.apply, and the resulting encoded frame is returned.
    """
    column_encoder = LabelEncoder()
    return data.apply(column_encoder.fit_transform)

def perform_clustering_and_evaluate_performance(data, true_labels, num_clusters):
    """Cluster `data` with K-Modes and Ward agglomerative clustering and score both.

    Args:
        data: Encoded samples passed to both clustering models.
        true_labels: Ground-truth labels the predictions are compared with.
        num_clusters: Number of clusters requested from each model.

    Returns:
        Two rows, ["K-Modes", ARI, NMI, FMI] and ["Hierarchical", ARI, NMI, FMI].
    """
    def score_row(method_name, predicted):
        # One result row: the method label followed by its three scores.
        return [
            method_name,
            adjusted_rand_score(true_labels, predicted),
            normalized_mutual_info_score(true_labels, predicted),
            fowlkes_mallows_score(true_labels, predicted),
        ]

    kmodes_model = KModes(n_clusters=num_clusters, init='Huang', n_init=5, verbose=0)
    kmodes_row = score_row("K-Modes", kmodes_model.fit_predict(data))

    ward_model = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    ward_row = score_row("Hierarchical", ward_model.fit_predict(data))

    return [kmodes_row, ward_row]

results_categorical = []

# Fixed: datasets used to be fetched with globals()[dataset_name], which
# raises KeyError for any key of `datasets` that is not also a variable name.
# The DataFrames are already stored as the mapping's values, so use those.
for dataset_name, dataset in datasets.items():
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        try:
            # The last column holds the labels; everything before it is a feature.
            X_cat = dataset.iloc[:, :-1]
            true_labels_cat = dataset.iloc[:, -1]

            X_cat_encoded = preprocess_and_encode_categorical_data(X_cat)
            clustering_results = perform_clustering_and_evaluate_performance(X_cat_encoded, true_labels_cat, k)

            for method, ari, nmi, fmi in clustering_results:
                results_categorical.append([f"{dataset_name} ({method})", ari, nmi, fmi])
        except UserWarning as e:
            # Only reached if a UserWarning is raised as an exception;
            # warnings issued through the warnings module are ignored above.
            print(f"Warning: {e}")

results_categorical_df = pd.DataFrame(results_categorical, columns=["Dataset", "ARI", "NMI", "FMI"])
print(results_categorical_df)

The Fowlkes-Mallows Index (FMI) is often greater than the Adjusted Rand Index (ARI) and Normalized Mutual Information (NMI) because it focuses directly on the balance of precision and recall, without adjusting for chance or for information overlap; this makes it simpler and often yields higher values. The drawback of ARI is that it adjusts for random chance, which can lower its value, especially for complex or unbalanced clusterings. NMI, on the other hand, measures the information shared between the two clusterings, but it can be biased towards larger clusters and is harder to interpret, and it often gives lower values than FMI.