In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Fetch the six datasets from the UCI ML repository by their numeric ids.
soybean = fetch_ucirepo(id=91)
zoo = fetch_ucirepo(id=111)
heart_disease = fetch_ucirepo(id=45)
dermatology = fetch_ucirepo(id=33)
breast_cancer = fetch_ucirepo(id=15)
mushroom = fetch_ucirepo(id=73)

# For every dataset: join features and targets on the row index (targets end
# up as the right-most columns), then drop every row that has a missing value.
X, y = soybean.data.features, soybean.data.targets
soybean_df = X.merge(y, left_index=True, right_index=True).dropna()

X, y = zoo.data.features, zoo.data.targets
zoo_df = X.merge(y, left_index=True, right_index=True).dropna()

X, y = heart_disease.data.features, heart_disease.data.targets
heart_disease_df = X.merge(y, left_index=True, right_index=True).dropna()

X, y = dermatology.data.features, dermatology.data.targets
dermatology_df = X.merge(y, left_index=True, right_index=True).dropna()

X, y = breast_cancer.data.features, breast_cancer.data.targets
breast_cancer_df = X.merge(y, left_index=True, right_index=True).dropna()

X, y = mushroom.data.features, mushroom.data.targets
mushroom_df = X.merge(y, left_index=True, right_index=True).dropna()

import numpy as np
from scipy.special import logsumexp

class CustomBernoulliSEM:
    """Mixture of multivariate Bernoulli distributions fitted with EM.

    Each of the ``num_components`` components has a mixing weight ``pi[k]``
    and a vector of per-feature success probabilities ``mu[k, :]``.  The
    likelihood used here is ``x*log(mu) + (1-x)*log(1-mu)``, so features are
    expected to lie in ``[0, 1]`` (typically binary).

    Parameters
    ----------
    num_components : int
        Number of mixture components.
    max_iterations : int
        Upper bound on the number of EM iterations performed by ``fit``.
    batch_size : int, default 100
        Batch size used by ``iterate_batches``.  ``fit`` itself always
        works on the full data set.
    tolerance : float or None, default 1e-3
        ``fit`` stops as soon as the mean log-likelihood changes by less
        than this amount between two iterations.  ``None`` disables the
        check so that all ``max_iterations`` are run.

    Attributes set by ``fit``
    -------------------------
    pi, mu, responsibilities, Neff : model parameters and E-step statistics.
    log_likelihood, previous_log_likelihood : mean log-likelihood after the
        last and the second-to-last iteration.
    n_iter_ : number of EM iterations actually performed.
    converged_ : True when the tolerance criterion ended the loop.
    """

    def __init__(self, num_components, max_iterations, batch_size=100, tolerance=1e-3):
        self.num_components = num_components
        self.max_iterations = max_iterations
        self.batch_size = batch_size
        self.tolerance = tolerance

    def fit(self, data):
        """Estimate ``pi`` and ``mu`` from ``data`` (samples x features)."""
        self.data = data
        self.initialize_parameters()
        self.n_iter_ = 0
        self.converged_ = False
        log_bernoullis = self.calculate_log_bernoullis(self.data)
        # Defined up front so the attribute exists even if max_iterations == 0.
        self.log_likelihood = self.calculate_log_likelihood(log_bernoullis)
        self.previous_log_likelihood = self.log_likelihood
        for step in range(self.max_iterations):
            self.previous_log_likelihood = self.log_likelihood
            # E-step.
            self.responsibilities = self.calculate_responsibilities(log_bernoullis)
            # Snapshot taken before the M-step so a NaN result can be undone.
            self.save_previous_parameters()
            # M-step.
            self.update_Neff()
            self.update_mu(self.data)
            self.update_pi()
            log_bernoullis = self.calculate_log_bernoullis(self.data)
            self.log_likelihood = self.calculate_log_likelihood(log_bernoullis)
            self.n_iter_ = step + 1
            if np.isnan(self.log_likelihood):
                # Numerical breakdown: fall back to the last valid parameters.
                self.reset_parameters()
                print(self.log_likelihood)
                break
            # Convergence test: `tolerance` was previously accepted but never
            # consulted, so every fit ran for the full max_iterations.
            improvement = abs(self.log_likelihood - self.previous_log_likelihood)
            if self.tolerance is not None and improvement < self.tolerance:
                self.converged_ = True
                break

    def iterate_batches(self):
        """Yield the rows of ``self.data`` in shuffled batches of ``batch_size``.

        Uses the global NumPy random state for shuffling.  DataFrames are
        indexed positionally through ``.iloc``; anything else is converted
        to an array and indexed directly.
        """
        num_samples = len(self.data)
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        rows = self.data.iloc if hasattr(self.data, "iloc") else np.asarray(self.data)
        for start_idx in range(0, num_samples, self.batch_size):
            # Slicing past the end is safe, the last batch is simply shorter.
            yield rows[indices[start_idx:start_idx + self.batch_size]]

    def reset_parameters(self):
        """Restore the snapshot from ``save_previous_parameters`` and rescore."""
        self.mu = self.previous_mu.copy()
        self.pi = self.previous_pi.copy()
        self.responsibilities = self.previous_responsibilities.copy()
        self.update_Neff()
        log_bernoullis = self.calculate_log_bernoullis(self.data)
        self.log_likelihood = self.calculate_log_likelihood(log_bernoullis)

    def save_previous_parameters(self):
        """Keep copies of ``mu``, ``pi`` and ``responsibilities``."""
        self.previous_mu = self.mu.copy()
        self.previous_pi = self.pi.copy()
        self.previous_responsibilities = self.responsibilities.copy()

    def initialize_parameters(self):
        """Set uniform mixing weights and a fixed-seed random ``mu``."""
        self.num_samples, self.num_features = np.shape(self.data)[:2]
        self.pi = 1 / self.num_components * np.ones(self.num_components)
        # Seed 0 makes the starting point (and thus the fit) reproducible.
        self.mu = np.random.RandomState(seed=0).uniform(
            low=0.25, high=0.75, size=(self.num_components, self.num_features)
        )
        self.normalize_mu()
        self.previous_mu = None
        self.previous_pi = None
        self.previous_responsibilities = None

    def normalize_mu(self):
        """Rescale every row of ``mu`` in place so that it sums to one."""
        self.mu /= np.sum(self.mu, axis=1, keepdims=True)

    def calculate_responsibilities(self, log_bernoullis):
        """Return the (samples x components) posterior component probabilities."""
        weighted = np.log(self.pi[None, :]) + log_bernoullis
        # Normalising in log space avoids underflow for unlikely samples.
        return np.exp(weighted - logsumexp(weighted, axis=1, keepdims=True))

    def calculate_log_bernoullis(self, data):
        """Return ``log p(x_n | mu_k)`` as a (samples x components) array."""
        data = np.asarray(data, dtype=float)
        log_bernoullis = self.calculate_log_prob(data, self.mu)
        log_bernoullis += self.calculate_log_prob(1 - data, 1 - self.mu)
        return log_bernoullis

    def calculate_log_prob(self, data, mu):
        """Return ``data @ log(mu).T`` with ``mu`` clipped away from 0 and 1."""
        epsilon = 1e-15  # keeps log() finite for saturated probabilities
        mu_place = np.clip(mu, epsilon, 1 - epsilon)
        return np.tensordot(data, np.log(mu_place), (1, 1))

    def update_Neff(self):
        """Recompute the effective number of samples per component."""
        self.Neff = np.sum(self.responsibilities, axis=0)

    def update_mu(self, batch):
        """Set ``mu`` to the responsibility-weighted feature means of ``batch``."""
        batch = np.asarray(batch, dtype=float)
        self.mu = np.einsum('ik,id->kd', self.responsibilities, batch) / self.Neff[:, None]

    def update_pi(self):
        """Set the mixing weights to ``Neff / num_samples``."""
        self.pi = self.Neff / self.num_samples

    def predict(self, data):
        """Return the index of the most probable component for every sample."""
        log_bernoullis = self.calculate_log_bernoullis(data)
        gamma = self.calculate_responsibilities(log_bernoullis)
        return np.argmax(gamma, axis=1)

    def get_sample_log_likelihood(self, log_bernoullis):
        """Return the per-sample log-likelihood under the mixture."""
        return logsumexp(np.log(self.pi[None, :]) + log_bernoullis, axis=1)

    def calculate_log_likelihood(self, log_bernoullis):
        """Return the mean per-sample log-likelihood."""
        return np.mean(self.get_sample_log_likelihood(log_bernoullis))

    def score(self, data):
        """Return the mean log-likelihood of ``data`` under the fitted model."""
        log_bernoullis = self.calculate_log_bernoullis(data)
        return self.calculate_log_likelihood(log_bernoullis)

    def score_samples(self, data):
        """Return the log-likelihood of every sample in ``data``."""
        log_bernoullis = self.calculate_log_bernoullis(data)
        return self.get_sample_log_likelihood(log_bernoullis)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import fowlkes_mallows_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans

class CustomKMeans(KMeans):
    """Plain subclass of ``KMeans`` that adds and overrides nothing."""

def custom_encode_categorical(dataframe):
    """Return a copy of *dataframe* with text columns replaced by integer codes.

    Every column whose dtype is ``object`` or a pandas string dtype is
    passed through its own ``LabelEncoder``; all other columns are copied
    unchanged.  The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that may contain text columns.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the text columns label-encoded.
    """
    encoded_df = dataframe.copy()
    for col in dataframe.columns:
        column = dataframe[col]
        # Comparing the dtype to 'object' alone misses pandas' dedicated
        # string dtypes, which would leave such text columns unencoded.
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if is_text:
            # A fresh encoder per column: the fitted classes are not shared.
            encoded_df[col] = LabelEncoder().fit_transform(column)
    return encoded_df

# Names of the module-level DataFrames that take part in the benchmark.
datasets_list = [
    "soybean_df",
    "zoo_df",
    "heart_disease_df",
    "dermatology_df",
    "breast_cancer_df",
    "mushroom_df",
]

# Maps each dataset name to its label-encoded copy.
encoded_data = dict()
for dataset in datasets_list:
    # The frames are looked up by name in the module namespace.
    df_copy = globals()[dataset].copy()
    encoded_data[dataset] = custom_encode_categorical(df_copy)

def evaluate_clustering_algo(dataset_name, algorithm_name, **kwargs):
    """Cluster one encoded dataset and score the result against its labels.

    The last column of ``encoded_data[dataset_name]`` is used as the ground
    truth; all preceding columns are the features given to the model.

    Parameters
    ----------
    dataset_name : str
        Key into the module-level ``encoded_data`` mapping.
    algorithm_name : str
        Either ``'CustomKMeans'`` or ``'CustomBernoulliSEM'``.
    **kwargs
        Forwarded unchanged to the constructor of the chosen model.

    Returns
    -------
    tuple
        ``(fmi, ari, nmi)``: Fowlkes-Mallows, adjusted Rand and normalized
        mutual information scores.

    Raises
    ------
    ValueError
        If ``algorithm_name`` is not one of the supported names.
    """
    # NOTE(review): CustomBernoulliSEM evaluates 1 - x on the features, while
    # label-encoded columns can hold values above 1 - confirm this is intended.
    frame = encoded_data[dataset_name]
    features, true_labels = frame.iloc[:, :-1], frame.iloc[:, -1]

    if algorithm_name == 'CustomKMeans':
        labels = CustomKMeans(**kwargs).fit_predict(features)
    elif algorithm_name == 'CustomBernoulliSEM':
        model = CustomBernoulliSEM(**kwargs)
        model.fit(features)
        labels = model.predict(features)
    else:
        raise ValueError("Algorithm not supported.")

    # A column selected with .iloc is never None, so all three scores are
    # always computed.
    return (
        fowlkes_mallows_score(true_labels, labels),
        adjusted_rand_score(true_labels, labels),
        normalized_mutual_info_score(true_labels, labels),
    )

# results_dict[dataset][algorithm] -> {'FMI': ..., 'ARI': ..., 'NMI': ...}
results_dict = {}
algorithms_list = ['CustomKMeans', 'CustomBernoulliSEM']

# Constructor arguments per algorithm (n_init is set explicitly for KMeans).
algorithm_settings = {
    'CustomKMeans': {'n_clusters': 2, 'max_iter': 100, 'n_init': 10},
    'CustomBernoulliSEM': {'num_components': 2, 'max_iterations': 100},
}

for dataset_name in datasets_list:
    results_dict[dataset_name] = {}
    for algorithm in algorithms_list:
        kwargs = dict(algorithm_settings[algorithm])
        fmi_score, ari_score, nmi_score = evaluate_clustering_algo(
            dataset_name, algorithm, **kwargs
        )
        results_dict[dataset_name][algorithm] = dict(
            FMI=fmi_score, ARI=ari_score, NMI=nmi_score
        )

print(results_dict)



In [None]:
# Flatten the nested results into one row per (dataset, algorithm) pair.
results_data = [
    [dataset_name, algorithm, metrics['FMI'], metrics['ARI'], metrics['NMI']]
    for dataset_name, algorithms in results_dict.items()
    for algorithm, metrics in algorithms.items()
]

# Tabulate the rows; the bare name on the last line displays the table
# when this cell is run in a notebook.
results_dataframe = pd.DataFrame(
    results_data,
    columns=['Dataset', 'Algorithm', 'FMI', 'ARI', 'NMI'],
)
results_dataframe


The FMI is often greater than the ARI and NMI because it is the geometric mean of pairwise precision and recall and is not adjusted for chance or information overlap; this makes it simpler, and even a poor clustering can receive a clearly positive score. The ARI, in contrast, is corrected for chance agreement (its expected value is about 0 for random labelings), which lowers its value, especially in complex or unbalanced clusterings. The NMI measures the information shared between the two clusterings; it is not chance-adjusted, tends to be biased towards solutions with a larger number of clusters, and is harder to interpret, and it often comes out lower than the FMI.