In [1]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import numpy as np	

In [2]:
dataset = pd.read_csv('/Users/farrelmanazilin/Document/kuliah/data/milk.csv')
dataset = dataset.drop('Grade', axis=1)

scaler = MinMaxScaler()
dataset_new = scaler.fit_transform(dataset)

kmeans = KMeans(n_clusters=3, random_state=0)
cluster = kmeans.fit_predict(dataset_new)  
dataset['Cluster'] = cluster

k = 3

cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

variance_within_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    squared_distances = np.sum((cluster_data - cluster_center) ** 2)
    variance_within_clusters += squared_distances

grand_mean = np.mean(dataset_new, axis=0)
variance_between_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    n = len(cluster_data)
    squared_distances = n * np.sum((cluster_center - grand_mean) ** 2)
    variance_between_clusters += squared_distances

total_variance = np.sum((dataset_new - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("\nVariance Within Clusters:", variance_within_clusters)
print("Variance Between Clusters:", variance_between_clusters)
print("Total Variance (Dataset Variance):", total_variance)
print("Variance Ratio (Between / Within):", variance_ratio)


Variance Within Clusters: 624.8924146831872
Variance Between Clusters: 566.4817193462391
Total Variance (Dataset Variance): 1191.3741340294262
Variance Ratio (Between / Within): 0.9065267973102832




In [3]:
single_linkage = linkage(dataset, method='single')

max_d = 2.0  
clusters = fcluster(single_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)
variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 pH             71.796427
Temprature    538.454746
Taste          83.773074
Odor           98.310564
Fat           115.196848
Turbidity      87.001736
Colour          4.056689
Cluster       278.573876
dtype: float64

Variance Between Clusters:
 130074.84746669892

Total Variance (Dataset Variance):
 pH              2072.729084
Temprature    107891.609065
Taste            262.436261
Odor             259.922568
Fat              233.643059
Turbidity        264.664778
Colour         19630.030217
Cluster          736.976393
dtype: float64

Variance Ratio (Between / Within):
 pH             1811.717563
Temprature      241.570621
Taste          1552.704726
Odor           1323.101430
Fat            1129.152831
Turbidity      1495.083354
Colour        32064.290792
Cluster         466.931248
dtype: float64


In [4]:
average_linkage = linkage(dataset, method='average') 
max_d = 2.0  
clusters = fcluster(average_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 pH              2.578827
Temprature     48.788226
Taste          47.837167
Odor           42.246475
Fat            94.624441
Turbidity      34.031397
Colour          4.024286
Cluster       110.939736
dtype: float64

Variance Between Clusters:
 130966.9408714785

Total Variance (Dataset Variance):
 pH              2072.729084
Temprature    107891.609065
Taste            262.436261
Odor             259.922568
Fat              233.643059
Turbidity        264.664778
Colour         19630.030217
Cluster          736.976393
dtype: float64

Variance Ratio (Between / Within):
 pH            50785.477427
Temprature     2684.396476
Taste          2737.765382
Odor           3100.067906
Fat            1384.070966
Turbidity      3848.415030
Colour        32544.145761
Cluster        1180.523275
dtype: float64


In [5]:
complete_linkage = linkage(dataset, method='complete')

max_d = 2.0  
clusters = fcluster(complete_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 pH             2.440800
Temprature    47.796991
Taste         30.525993
Odor          28.605641
Fat           33.193113
Turbidity     27.256436
Colour         4.024286
Cluster       32.238568
dtype: float64

Variance Between Clusters:
 131145.92959778834

Total Variance (Dataset Variance):
 pH              2072.729084
Temprature    107891.609065
Taste            262.436261
Odor             259.922568
Fat              233.643059
Turbidity        264.664778
Colour         19630.030217
Cluster          736.976393
dtype: float64

Variance Ratio (Between / Within):
 pH            53730.709293
Temprature     2743.811424
Taste          4296.205197
Odor           4584.617793
Fat            3950.998151
Turbidity      4811.558216
Colour        32588.622903
Cluster        4067.982483
dtype: float64


In [6]:
dataset = pd.read_csv('/Users/farrelmanazilin/Document/kuliah/data/water_potability.csv')
dataset = dataset.fillna(dataset.groupby("Potability").transform('mean'))
dataset = dataset.drop('Potability', axis=1)

scaler = MinMaxScaler()
dataset_new = scaler.fit_transform(dataset)

kmeans = KMeans(n_clusters=3, random_state=0)
cluster = kmeans.fit_predict(dataset_new)  
dataset['Cluster'] = cluster

k = 3

cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

variance_within_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    squared_distances = np.sum((cluster_data - cluster_center) ** 2)
    variance_within_clusters += squared_distances

grand_mean = np.mean(dataset_new, axis=0)
variance_between_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    n = len(cluster_data)
    squared_distances = n * np.sum((cluster_center - grand_mean) ** 2)
    variance_between_clusters += squared_distances

total_variance = np.sum((dataset_new - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("\nVariance Within Clusters:", variance_within_clusters)
print("Variance Between Clusters:", variance_between_clusters)
print("Total Variance (Dataset Variance):", total_variance)
print("Variance Ratio (Between / Within):", variance_ratio)


Variance Within Clusters: 398.3975777040871
Variance Between Clusters: 79.92710728029117
Total Variance (Dataset Variance): 478.3257489867918
Variance Ratio (Between / Within): 0.20062146898809122




In [7]:
single_linkage = linkage(dataset, method='single')

max_d = 2.0  
clusters = fcluster(single_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)
variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 ph                 0.0
Hardness           0.0
Solids             0.0
Chloramines        0.0
Sulfate            0.0
Conductivity       0.0
Organic_carbon     0.0
Trihalomethanes    0.0
Turbidity          0.0
Cluster            0.0
dtype: float64

Variance Between Clusters:
 251837740490.18808

Total Variance (Dataset Variance):
 ph                 7.076539e+03
Hardness           3.540533e+06
Solids             2.518077e+11
Chloramines        8.207667e+03
Sulfate            4.278826e+06
Conductivity       2.139403e+07
Organic_carbon     3.584139e+04
Trihalomethanes    8.144590e+05
Turbidity          1.994464e+03
Cluster            1.982360e+03
dtype: float64

Variance Ratio (Between / Within):
 ph                 inf
Hardness           inf
Solids             inf
Chloramines        inf
Sulfate            inf
Conductivity       inf
Organic_carbon     inf
Trihalomethanes    inf
Turbidity          inf
Cluster            inf
dtype: float64


In [8]:
average_linkage = linkage(dataset, method='average') 
max_d = 2.0  
clusters = fcluster(average_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 ph                 0.0
Hardness           0.0
Solids             0.0
Chloramines        0.0
Sulfate            0.0
Conductivity       0.0
Organic_carbon     0.0
Trihalomethanes    0.0
Turbidity          0.0
Cluster            0.0
dtype: float64

Variance Between Clusters:
 251837740490.18845

Total Variance (Dataset Variance):
 ph                 7.076539e+03
Hardness           3.540533e+06
Solids             2.518077e+11
Chloramines        8.207667e+03
Sulfate            4.278826e+06
Conductivity       2.139403e+07
Organic_carbon     3.584139e+04
Trihalomethanes    8.144590e+05
Turbidity          1.994464e+03
Cluster            1.982360e+03
dtype: float64

Variance Ratio (Between / Within):
 ph                 inf
Hardness           inf
Solids             inf
Chloramines        inf
Sulfate            inf
Conductivity       inf
Organic_carbon     inf
Trihalomethanes    inf
Turbidity          inf
Cluster            inf
dtype: float64


In [9]:
complete_linkage = linkage(dataset, method='complete')

max_d = 2.0  
clusters = fcluster(complete_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 ph                 0.0
Hardness           0.0
Solids             0.0
Chloramines        0.0
Sulfate            0.0
Conductivity       0.0
Organic_carbon     0.0
Trihalomethanes    0.0
Turbidity          0.0
Cluster            0.0
dtype: float64

Variance Between Clusters:
 251837740490.18857

Total Variance (Dataset Variance):
 ph                 7.076539e+03
Hardness           3.540533e+06
Solids             2.518077e+11
Chloramines        8.207667e+03
Sulfate            4.278826e+06
Conductivity       2.139403e+07
Organic_carbon     3.584139e+04
Trihalomethanes    8.144590e+05
Turbidity          1.994464e+03
Cluster            1.982360e+03
dtype: float64

Variance Ratio (Between / Within):
 ph                 inf
Hardness           inf
Solids             inf
Chloramines        inf
Sulfate            inf
Conductivity       inf
Organic_carbon     inf
Trihalomethanes    inf
Turbidity          inf
Cluster            inf
dtype: float64


In [10]:
dataset = pd.read_csv('/Users/farrelmanazilin/Document/kuliah/data/ruspini.csv')
dataset = dataset.drop('CLASS', axis=1)

scaler = MinMaxScaler()
dataset_new = scaler.fit_transform(dataset)

kmeans = KMeans(n_clusters=3, random_state=0)
cluster = kmeans.fit_predict(dataset_new)  
dataset['Cluster'] = cluster

k = 3

cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

variance_within_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    squared_distances = np.sum((cluster_data - cluster_center) ** 2)
    variance_within_clusters += squared_distances

grand_mean = np.mean(dataset_new, axis=0)
variance_between_clusters = 0
for i in range(k):
    cluster_data = dataset_new[labels == i]
    cluster_center = cluster_centers[i]
    n = len(cluster_data)
    squared_distances = n * np.sum((cluster_center - grand_mean) ** 2)
    variance_between_clusters += squared_distances

total_variance = np.sum((dataset_new - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("\nVariance Within Clusters:", variance_within_clusters)
print("Variance Between Clusters:", variance_between_clusters)
print("Total Variance (Dataset Variance):", total_variance)
print("Variance Ratio (Between / Within):", variance_ratio)


Variance Within Clusters: 4.618135408790582
Variance Between Clusters: 14.789865890362032
Total Variance (Dataset Variance): 19.408001299152616
Variance Ratio (Between / Within): 3.202562199066239




In [11]:
single_linkage = linkage(dataset, method='single')

max_d = 2.0  
clusters = fcluster(single_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)
variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 #          0.5
X          0.5
Y          0.5
Cluster    0.0
dtype: float64

Variance Between Clusters:
 279569.0333333334

Total Variance (Dataset Variance):
 #           35150.000000
X           68849.920000
Y          175523.946667
Cluster        46.666667
dtype: float64

Variance Ratio (Between / Within):
 #          5.591381e+05
X          5.591381e+05
Y          5.591381e+05
Cluster             inf
dtype: float64


In [12]:
average_linkage = linkage(dataset, method='average') 
max_d = 2.0  
clusters = fcluster(average_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 #          0.5
X          0.5
Y          0.5
Cluster    0.0
dtype: float64

Variance Between Clusters:
 279569.0333333333

Total Variance (Dataset Variance):
 #           35150.000000
X           68849.920000
Y          175523.946667
Cluster        46.666667
dtype: float64

Variance Ratio (Between / Within):
 #          5.591381e+05
X          5.591381e+05
Y          5.591381e+05
Cluster             inf
dtype: float64


In [13]:
complete_linkage = linkage(dataset, method='complete')

max_d = 2.0  
clusters = fcluster(complete_linkage, max_d, criterion='distance')

variance_within_clusters = 0
grand_mean = np.mean(dataset, axis=0)

for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_within_clusters += np.sum((cluster_data - cluster_mean) ** 2)

variance_between_clusters = 0
for cluster_id in np.unique(clusters):
    cluster_data = dataset[clusters == cluster_id]
    cluster_mean = np.mean(cluster_data, axis=0)
    n = len(cluster_data)
    variance_between_clusters += n * np.sum((cluster_mean - grand_mean) ** 2)

total_variance = np.sum((dataset - grand_mean) ** 2)
variance_ratio = variance_between_clusters / variance_within_clusters

print("Variance Within Clusters:\n", variance_within_clusters)
print("\nVariance Between Clusters:\n", variance_between_clusters)
print("\nTotal Variance (Dataset Variance):\n", total_variance)
print("\nVariance Ratio (Between / Within):\n", variance_ratio)

Variance Within Clusters:
 #          0.5
X          0.5
Y          0.5
Cluster    0.0
dtype: float64

Variance Between Clusters:
 279569.03333333327

Total Variance (Dataset Variance):
 #           35150.000000
X           68849.920000
Y          175523.946667
Cluster        46.666667
dtype: float64

Variance Ratio (Between / Within):
 #          5.591381e+05
X          5.591381e+05
Y          5.591381e+05
Cluster             inf
dtype: float64
