In [1]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import numpy as np	

# milk

In [2]:
dataset = pd.read_csv('/Users/farrelmanazilin/Document/kuliah/data/milk.csv')
dataset = dataset.drop('Grade', axis=1)

scaler = MinMaxScaler()
dataset_new = scaler.fit_transform(dataset)

In [3]:
kmeans = KMeans(n_clusters=3, random_state=0)
cluster = kmeans.fit_predict(dataset_new)  
dataset['Cluster'] = cluster

k = 3

cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

variance_within_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    squared_distances = np.sum((cluster_data - cluster_center) ** 2)
    variance_within_clusters += squared_distances

grand_mean = np.mean(dataset_new, axis=0)
variance_between_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    n = len(cluster_data)
    squared_distances = n * np.sum((cluster_center - grand_mean) ** 2)
    variance_between_clusters += squared_distances

total_variance = np.sum((dataset_new - grand_mean) ** 2)

print("\nVariance Within Clusters:", variance_within_clusters)
print("Variance Between Clusters:", variance_between_clusters)
print("Total Variance (Dataset Variance):", total_variance)


Variance Within Clusters: 624.8924146831872
Variance Between Clusters: 566.4817193462391
Total Variance (Dataset Variance): 1191.3741340294262




In [4]:
single_linkage = linkage(dataset, method='single')

max_d = 2.0
clusters = fcluster(single_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 131352.01142587347

Total Variance Between Clusters:
 130074.84746669892

Total Variance (Dataset Variance):
 261426.8588925724


In [5]:
average_linkage = linkage(dataset, method='average') 

max_d = 2.0
clusters = fcluster(average_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 131352.01142587347

Total Variance Between Clusters:
 130966.9408714785

Total Variance (Dataset Variance):
 262318.95229735196


In [6]:
complete_linkage = linkage(dataset, method='complete')

max_d = 2.0  
clusters = fcluster(complete_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 131352.01142587344

Total Variance Between Clusters:
 131145.92959778834

Total Variance (Dataset Variance):
 262497.94102366176


In [7]:
centroid_linkage = linkage(dataset, method='centroid')

max_d = 2.0  
clusters = fcluster(centroid_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 131352.01142587344

Total Variance Between Clusters:
 130871.86709532511

Total Variance (Dataset Variance):
 262223.8785211985


# water potability

In [8]:
dataset = pd.read_csv('/Users/farrelmanazilin/Document/kuliah/data/water_potability.csv')
dataset = dataset.fillna(dataset.groupby("Potability").transform('mean'))
dataset = dataset.drop('Potability', axis=1)

scaler = MinMaxScaler()
dataset_new = scaler.fit_transform(dataset)

In [9]:
kmeans = KMeans(n_clusters=3, random_state=0)
cluster = kmeans.fit_predict(dataset_new)  
dataset['Cluster'] = cluster

k = 3

cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

variance_within_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    squared_distances = np.sum((cluster_data - cluster_center) ** 2)
    variance_within_clusters += squared_distances

grand_mean = np.mean(dataset_new, axis=0)
variance_between_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    n = len(cluster_data)
    squared_distances = n * np.sum((cluster_center - grand_mean) ** 2)
    variance_between_clusters += squared_distances

total_variance = np.sum((dataset_new - grand_mean) ** 2)

print("\nVariance Within Clusters:", variance_within_clusters)
print("Variance Between Clusters:", variance_between_clusters)
print("Total Variance (Dataset Variance):", total_variance)




Variance Within Clusters: 398.3975777040871
Variance Between Clusters: 79.92710728029117
Total Variance (Dataset Variance): 478.3257489867918


In [10]:
single_linkage = linkage(dataset, method='single')

max_d = 2.0  
clusters = fcluster(single_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 251837740490.18808

Total Variance Between Clusters:
 251837740490.18808

Total Variance (Dataset Variance):
 503675480980.37616


In [11]:
average_linkage = linkage(dataset, method='average') 
max_d = 2.0  
clusters = fcluster(average_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 251837740490.18845

Total Variance Between Clusters:
 251837740490.18845

Total Variance (Dataset Variance):
 503675480980.3769


In [12]:
complete_linkage = linkage(dataset, method='complete')

max_d = 2.0  
clusters = fcluster(complete_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 251837740490.18857

Total Variance Between Clusters:
 251837740490.18857

Total Variance (Dataset Variance):
 503675480980.37714


In [13]:
centroid_linkage = linkage(dataset, method='centroid')

max_d = 2.0  
clusters = fcluster(centroid_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 251837740490.18832

Total Variance Between Clusters:
 251837740490.18832

Total Variance (Dataset Variance):
 503675480980.37665


# ruspini

In [14]:
dataset = pd.read_csv('/Users/farrelmanazilin/Document/kuliah/data/ruspini.csv')
dataset = dataset.drop('CLASS', axis=1)

scaler = MinMaxScaler()
dataset_new = scaler.fit_transform(dataset)

In [15]:
kmeans = KMeans(n_clusters=3, random_state=0)
cluster = kmeans.fit_predict(dataset_new)  
dataset['Cluster'] = cluster

k = 3

cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

variance_within_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    squared_distances = np.sum((cluster_data - cluster_center) ** 2)
    variance_within_clusters += squared_distances

grand_mean = np.mean(dataset_new, axis=0)
variance_between_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    n = len(cluster_data)
    squared_distances = n * np.sum((cluster_center - grand_mean) ** 2)
    variance_between_clusters += squared_distances

total_variance = np.sum((dataset_new - grand_mean) ** 2)

print("\nVariance Within Clusters:", variance_within_clusters)
print("Variance Between Clusters:", variance_between_clusters)
print("Total Variance (Dataset Variance):", total_variance)


Variance Within Clusters: 4.618135408790582
Variance Between Clusters: 14.789865890362032
Total Variance (Dataset Variance): 19.408001299152616




In [16]:
single_linkage = linkage(dataset, method='single')

max_d = 2.0  
clusters = fcluster(single_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 279570.5333333334

Total Variance Between Clusters:
 279569.0333333334

Total Variance (Dataset Variance):
 559139.5666666668


In [17]:
average_linkage = linkage(dataset, method='average') 
max_d = 2.0  
clusters = fcluster(average_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 279570.5333333333

Total Variance Between Clusters:
 279569.0333333333

Total Variance (Dataset Variance):
 559139.5666666667


In [18]:
complete_linkage = linkage(dataset, method='complete')

max_d = 2.0  
clusters = fcluster(complete_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 279570.53333333327

Total Variance Between Clusters:
 279569.03333333327

Total Variance (Dataset Variance):
 559139.5666666665


In [19]:
centroid_linkage = linkage(dataset, method='centroid')

max_d = 2.0  
clusters = fcluster(centroid_linkage, max_d, criterion='distance')

grand_mean = np.mean(dataset, axis=0)

variance_within_clusters = {}
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    n = len(cluster_data)
    total_variance_within_cluster = np.sum(np.sum((cluster_data - np.mean(dataset, axis=0)) ** 2))
    variance_within_clusters[cluster_id] = total_variance_within_cluster

total_variance_within_clusters = sum(variance_within_clusters.values())

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = total_variance_within_clusters + variance_between_clusters

print("Total Variance Within Clusters:\n", total_variance_within_clusters)
print("\nTotal Variance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)

Total Variance Within Clusters:
 279570.53333333344

Total Variance Between Clusters:
 279569.03333333344

Total Variance (Dataset Variance):
 559139.5666666669
