In [1]:
"""
Datasets are available at https://zenodo.org/records/6355684 
"""

'\nDatasets are available at https://zenodo.org/records/6355684 \n'

In [2]:
import pandas as pd
from silhouette_upper_bound import upper_bound, upper_bound_samples, upper_bound_macro_silhouette
import numpy as np
import matplotlib.pyplot as plt 
import pickle
from sklearn.metrics import silhouette_score, adjusted_rand_score, silhouette_samples, adjusted_mutual_info_score
from collections import Counter
import kmedoids
from sklearn.preprocessing import StandardScaler, RobustScaler, normalize
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import squareform, pdist
import seaborn as sns
from matplotlib.ticker import MultipleLocator
from tqdm import tqdm
from pathlib import Path
from scipy.io import arff
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import time

In [3]:
# Distance metric: L1 (Manhattan), i.e. metric="cityblock" in the pdist calls of this notebook.

In [4]:
# Dataset selection; the CSV is read from data/aloi/<dataset_name>.csv.
N_FEATURES = 125  # the first N_FEATURES columns are used as the feature matrix
dataset_name = "aloi-hsb-5x5x5"
path = f"data/aloi/{dataset_name}.csv"

In [5]:
# Read the raw table: columns are separated by runs of whitespace and the file
# has no header row. A regex separator requires the pure-Python parser engine.
df = pd.read_csv(path, sep=r"\s+", header=None, engine="python")

print(df.shape)
df.head()

(110250, 127)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,117,118,119,120,121,122,123,124,125,126
0,0.006198,0.0,0.0,0.0,2e-06,0.09314,5.4e-05,5e-06,0.0,0.002606,...,5.7e-05,5.2e-05,0.000217,0.000294,0.00118,0.002249,0.010247,0.016556,"""img1""","""1/1_i110.png"""
1,0.0076,0.0,0.0,0.0,5e-06,0.099254,0.000104,1.1e-05,2e-06,0.006027,...,4.7e-05,7.2e-05,0.000271,0.000294,0.000875,0.002636,0.011791,0.012607,"""img1""","""1/1_i120.png"""
2,0.009555,2e-06,0.0,0.0,5e-05,0.113557,0.000212,7.2e-05,2.9e-05,0.013439,...,5.9e-05,7.5e-05,0.000328,0.000217,0.000882,0.002471,0.011339,0.011084,"""img1""","""1/1_i130.png"""
3,0.013756,1.6e-05,0.0,0.0,0.007103,0.140993,0.000382,0.000237,0.000179,0.022142,...,6.6e-05,6.1e-05,0.000545,0.000176,0.001205,0.001029,0.004071,0.022572,"""img1""","""1/1_i140.png"""
4,0.014931,1.6e-05,0.0,0.0,0.000301,0.13083,0.000723,0.000744,0.001067,0.020598,...,7e-05,0.00012,0.000373,0.000179,0.000626,0.003092,0.012356,0.005875,"""img1""","""1/1_i150.png"""


In [6]:
# Derive an integer "class" label from the first run of digits in the image-path
# column (e.g. "1/1_i110.png" -> 1).
#
# The path column is located as the last column that is not "class". Using
# df.iloc[:, -1] directly would pick up the derived integer "class" column when
# this cell is re-run, and .str would then fail.
path_col = [col for col in df.columns if col != "class"][-1]

# Build the result as one chain instead of assigning into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
df = (
    df.assign(**{"class": df[path_col].str.extract(r"(\d+)", expand=False)})
      .dropna(subset=["class"])   # drop rows with missing class
      .astype({"class": int})
)

In [7]:
# Stratified subsample: 100 rows from every class, each group sampled with the
# same seed (random_state=42).
#
# The columns (including "class") are selected explicitly after groupby so the
# grouping column stays in the result without relying on the deprecated
# behaviour of DataFrameGroupBy.apply operating on the grouping columns.
# The rows drawn per group are unchanged by this selection.
df_subset = (
    df.groupby("class", group_keys=False)[df.columns.tolist()]
      .apply(lambda g: g.sample(n=100, random_state=42))
      .reset_index(drop=True)
)

  .apply(lambda g: g.sample(n=100, random_state=42))


In [8]:
# Display the per-class subsample for a visual sanity check.
df_subset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,class
0,0.045401,0.001526,0.003160,0.006736,0.023887,0.172085,0.004343,0.011235,0.008873,0.003832,...,0.000411,0.000066,0.000335,0.002301,0.009011,0.006079,0.000045,"""img1""","""1/1_r280.png""",1
1,0.045225,0.000613,0.001031,0.002921,0.020752,0.173089,0.002421,0.004028,0.005229,0.005391,...,0.000271,0.000416,0.000201,0.000520,0.004795,0.011194,0.002145,"""img1""","""1/1_i230.png""",1
2,0.014931,0.000016,0.000000,0.000000,0.000301,0.130830,0.000723,0.000744,0.001067,0.020598,...,0.000120,0.000373,0.000179,0.000626,0.003092,0.012356,0.005875,"""img1""","""1/1_i150.png""",1
3,0.043287,0.001560,0.003798,0.005082,0.019194,0.170754,0.006427,0.012146,0.007935,0.003599,...,0.000330,0.000183,0.000183,0.001350,0.007388,0.009465,0.000136,"""img1""","""1/1_r305.png""",1
4,0.050345,0.001476,0.003187,0.005841,0.028332,0.177456,0.003007,0.005674,0.006933,0.004853,...,0.000543,0.000079,0.000104,0.002783,0.017675,0.003879,0.000070,"""img1""","""1/1_r210.png""",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.009074,0.000000,0.000000,0.000000,0.000086,0.114667,0.000027,0.000005,0.000002,0.000174,...,0.000197,0.000934,0.000081,0.003920,0.017287,0.014418,0.007765,"""img1000""","""1000/1000_i120.png""",1000
99996,0.032602,0.000023,0.000034,0.000206,0.000312,0.190423,0.000072,0.000050,0.000970,0.000574,...,0.000348,0.001404,0.000384,0.007381,0.007189,0.002708,0.003565,"""img1000""","""1000/1000_l4c1.png""",1000
99997,0.020684,0.000075,0.000136,0.000104,0.000086,0.152131,0.000999,0.002093,0.001621,0.000669,...,0.000836,0.001094,0.000084,0.007932,0.013358,0.003181,0.000843,"""img1000""","""1000/1000_r310.png""",1000
99998,0.022836,0.000102,0.000097,0.000059,0.000208,0.163262,0.000748,0.001718,0.001652,0.000416,...,0.000662,0.002093,0.000090,0.004297,0.008751,0.003673,0.004553,"""img1000""","""1000/1000_r45.png""",1000


In [9]:
# Split the subsample into class labels and a float32 feature matrix built from
# the first N_FEATURES columns.
y = df_subset["class"]
X = df_subset.iloc[:, :N_FEATURES].to_numpy(dtype='float32')
print(X.shape, X.dtype)

(100000, 125) float32


In [10]:
def eval(D, y, cluster_labels, ub):
    """Print internal and external quality metrics for one clustering.

    NOTE(review): this name shadows Python's built-in ``eval``; it is kept
    because other cells call the function under this name.

    Parameters
    ----------
    D : precomputed pairwise dissimilarity matrix (handed to
        ``silhouette_samples`` with ``metric='precomputed'``).
    y : reference class labels that ``cluster_labels`` is compared against.
    cluster_labels : cluster assignment, one label per sample.
    ub : silhouette upper bound the ASW is compared with.

    Prints, in order: the smallest cluster size, the number of clusters K,
    the average silhouette width (ASW), ``ub`` and the relative gap
    ``(ub - ASW) / ub``; the bound recomputed as
    ``upper_bound(D, min_cluster_size)`` with its relative gap; ARI and AMI
    against ``y``; and one ``&``-separated summary row (presumably meant for
    a LaTeX table). Returns nothing.
    """
    # Cluster-size summary.
    size_by_cluster = Counter(cluster_labels)
    smallest = min(size_by_cluster.values())
    k = len(size_by_cluster)
    print(f"Min cluster size = {smallest}")
    print(f"K = {k}")

    # Average silhouette width and its gap to the supplied bound.
    per_sample = silhouette_samples(X=D, labels=cluster_labels, metric='precomputed')
    asw = np.mean(per_sample)
    print(f"ASW = {asw}")
    print(f"ub = {ub}")
    gap = (ub - asw)/ub
    print(f"WCRE = {gap}")

    # Same comparison against the bound constrained by the smallest cluster size.
    uba = upper_bound(D, smallest)
    gap_constrained = (uba - asw)/uba
    print(f"uba = {uba}\nwcre = {gap_constrained}")

    # External validation against the reference labels.
    ari = adjusted_rand_score(cluster_labels, y)
    ami = adjusted_mutual_info_score(cluster_labels, y)
    print(f"Adjusted Rand Index vs. true labels: {ari:.3f}")
    print(f"Adjusted Mutual Info vs. true labels: {ami:.3f}")

    # One '&'-joined row with all figures rounded to three decimals.
    row = [
        f"{k}",
        f"{smallest}",
        f"{ari:.3f}",
        f"{ami:.3f}",
        f"{asw:.3f}",
        f"{ub:.3f}",
        f"{gap:.3f}",
        f"{uba:.3f}",
        f"{gap_constrained:.3f}",
    ]
    print("\n" + "&".join(row))
    
    

### n classes = 2

In [26]:
# Seed NumPy's global RNG, then draw n_classes distinct class labels at random.
np.random.seed(872)
n_classes = 2
classes = df_subset["class"].unique()
chosen = np.random.choice(classes, size=n_classes, replace=False)

# Keep only the rows that belong to the chosen classes.
df_subset_2 = df_subset.loc[df_subset["class"].isin(chosen)]

df_subset_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,class
15400,0.074300,0.003447,0.008617,0.016328,0.054172,0.210296,0.010435,0.020164,0.018543,0.013331,...,0.000000,0.000000,0.000018,0.000000,0.000000,0.0,0.0,"""img155""","""155/155_r280.png""",155
15401,0.094928,0.003348,0.006045,0.015627,0.072729,0.214281,0.010220,0.012153,0.016199,0.014633,...,0.000000,0.000000,0.000041,0.000002,0.000000,0.0,0.0,"""img155""","""155/155_i230.png""",155
15402,0.034772,0.000430,0.000264,0.000156,0.021170,0.203340,0.006698,0.008800,0.017675,0.065780,...,0.000000,0.000000,0.000014,0.000000,0.000000,0.0,0.0,"""img155""","""155/155_i150.png""",155
15403,0.074510,0.003493,0.008579,0.017877,0.049793,0.209434,0.011750,0.021550,0.017815,0.014504,...,0.000000,0.000000,0.000005,0.000002,0.000000,0.0,0.0,"""img155""","""155/155_r305.png""",155
15404,0.077076,0.003251,0.007751,0.018351,0.058393,0.211381,0.011974,0.018643,0.017013,0.012295,...,0.000000,0.000000,0.000018,0.000007,0.000000,0.0,0.0,"""img155""","""155/155_r210.png""",155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38895,0.010588,0.000002,0.000002,0.000000,0.000305,0.143925,0.000059,0.000043,0.000016,0.000181,...,0.000007,0.000005,0.000070,0.000041,0.000005,0.0,0.0,"""img389""","""389/389_i120.png""",389
38896,0.079074,0.000014,0.000005,0.000000,0.000167,0.224713,0.000472,0.000041,0.000018,0.000179,...,0.000002,0.000000,0.000029,0.000000,0.000000,0.0,0.0,"""img389""","""389/389_l4c1.png""",389
38897,0.059805,0.000090,0.000020,0.000007,0.000619,0.220667,0.000427,0.000111,0.000066,0.000199,...,0.000000,0.000000,0.000068,0.000000,0.000000,0.0,0.0,"""img389""","""389/389_r310.png""",389
38898,0.065866,0.000038,0.000025,0.000007,0.000533,0.223065,0.000644,0.000131,0.000059,0.000267,...,0.000000,0.000002,0.000025,0.000000,0.000000,0.0,0.0,"""img389""","""389/389_r45.png""",389


In [27]:
# Labels and feature matrix (first N_FEATURES columns) of the 2-class subset.
y2 = df_subset_2["class"]
X2 = df_subset_2.iloc[:, 0:N_FEATURES]
X2 = X2.to_numpy()

# np.save does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
Path("arrays").mkdir(parents=True, exist_ok=True)
np.save("arrays/aloi125-2classes.npy", X2)
print(X2.shape, X2.dtype)

# Pairwise L1 (cityblock) distances and the silhouette upper bound computed on them.
D2 = squareform(pdist(X2, metric="cityblock"))
ub = upper_bound(D2)

(200, 125) float64


In [None]:
# PAMSIL on the 2-class subset, with k equal to the number of distinct true classes.
# k and the bound are derived from y2 / D2 directly instead of the globals
# `n_classes` and `ub`: both are reassigned by another section of this notebook,
# so running cells out of order would silently use the wrong k or bound.
eval(
    D2,
    y2,
    kmedoids.pamsil(diss=D2, medoids=y2.nunique(), random_state=42).labels + 1,
    upper_bound(D2),
)

Min cluster size = 2
K = 5
ASW = 0.6867861830924988
ub = 0.8989414824507823
WCRE = 0.236005683907127
uba = 0.8989414824507823
wcre = 0.236005683907127
Adjusted Rand Index vs. true labels: 0.904
Adjusted Mutual Info vs. true labels: 0.864

5&2&0.904&0.864&0.687&0.899&0.236&0.899&0.236


In [14]:
# FastMSC on the 2-class subset, with k equal to the number of distinct true classes.
# k and the bound are derived from y2 / D2 directly instead of the globals
# `n_classes` and `ub`: both are reassigned by another section of this notebook,
# so running cells out of order would silently use the wrong k or bound.
eval(
    D2,
    y2,
    kmedoids.fastmsc(diss=D2, medoids=y2.nunique(), random_state=42).labels + 1,
    upper_bound(D2),
)

Min cluster size = 94
K = 2
ASW = 0.6686712504441235
ub = 0.8989414824507823
WCRE = 0.25615708753240934
uba = 0.7145427525388146
wcre = 0.06419700141343099
Adjusted Rand Index vs. true labels: 0.883
Adjusted Mutual Info vs. true labels: 0.834

2&94&0.883&0.834&0.669&0.899&0.256&0.715&0.064


In [15]:
# DynMSC on the 2-class subset with medoids=10; the K it reports may be smaller
# (presumably 10 acts as an upper limit; confirm against the kmedoids docs).
# The bound is computed from D2 directly instead of the global `ub`, which
# another section of this notebook reassigns for a different distance matrix.
eval(
    D2,
    y2,
    kmedoids.dynmsc(diss=D2, medoids=10, random_state=42).labels + 1,
    upper_bound(D2),
)

Min cluster size = 1
K = 8
ASW = 0.6467565067151209
ub = 0.8989414824507823
WCRE = 0.2805354749545322
uba = 0.8989414824507823
wcre = 0.2805354749545322
Adjusted Rand Index vs. true labels: 0.876
Adjusted Mutual Info vs. true labels: 0.820

8&1&0.876&0.820&0.647&0.899&0.281&0.899&0.281


### n classes = 5

In [29]:
# Seed NumPy's global RNG, then draw n_classes distinct class labels at random.
np.random.seed(25)
n_classes = 5
classes = df_subset["class"].unique()
chosen = np.random.choice(classes, size=n_classes, replace=False)

# Keep only the rows that belong to the chosen classes.
df_subset_5 = df_subset.loc[df_subset["class"].isin(chosen)]

df_subset_5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,class
4900,0.060918,0.000000,0.000000,0.000000,0.000000,0.205641,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000005,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r280.png""",50
4901,0.066673,0.000000,0.000000,0.000000,0.000000,0.214615,0.000007,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000002,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_i230.png""",50
4902,0.022400,0.000000,0.000000,0.000000,0.000000,0.164062,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_i150.png""",50
4903,0.062414,0.000000,0.000000,0.000000,0.000000,0.206516,0.000014,0.000002,0.000000,0.000000,...,0.000000,0.000000,0.000007,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r305.png""",50
4904,0.060714,0.000000,0.000000,0.000000,0.000000,0.203218,0.000014,0.000000,0.000000,0.000000,...,0.000023,0.000000,0.000034,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r210.png""",50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77595,0.009015,0.000023,0.000016,0.000011,0.007792,0.100355,0.000258,0.000151,0.000233,0.005179,...,0.000095,0.000294,0.000226,0.002109,0.007695,0.011016,0.018136,"""img776""","""776/776_i120.png""",776
77596,0.038190,0.000671,0.000882,0.001042,0.011203,0.170632,0.001160,0.000427,0.000165,0.000606,...,0.000258,0.001506,0.000242,0.005018,0.010512,0.006237,0.007112,"""img776""","""776/776_l4c1.png""",776
77597,0.034984,0.000466,0.000861,0.000936,0.007132,0.158339,0.001160,0.000834,0.000457,0.000637,...,0.000115,0.000310,0.000145,0.002882,0.009257,0.009967,0.002003,"""img776""","""776/776_r310.png""",776
77598,0.038237,0.000744,0.001476,0.002161,0.016968,0.163595,0.001673,0.001625,0.001069,0.001148,...,0.000142,0.000902,0.000543,0.006793,0.010102,0.005380,0.000556,"""img776""","""776/776_r45.png""",776


In [30]:
# Labels and feature matrix (first N_FEATURES columns) of the 5-class subset.
y5 = df_subset_5["class"]
X5 = df_subset_5.iloc[:, 0:N_FEATURES]
X5 = X5.to_numpy()

# np.save does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
Path("arrays").mkdir(parents=True, exist_ok=True)
np.save("arrays/aloi125-5classes.npy", X5)
print(X5.shape, X5.dtype)

# Pairwise L1 (cityblock) distances and the silhouette upper bound computed on them.
D5 = squareform(pdist(X5, metric="cityblock"))
ub = upper_bound(D5)

(500, 125) float64


In [None]:
# PAMSIL on the 5-class subset, with k equal to the number of distinct true classes.
# k and the bound are derived from y5 / D5 directly instead of the globals
# `n_classes` and `ub`: both are reassigned by another section of this notebook,
# so running cells out of order would silently use the wrong k or bound.
eval(
    D5,
    y5,
    kmedoids.pamsil(diss=D5, medoids=y5.nunique(), random_state=42).labels + 1,
    upper_bound(D5),
)

Min cluster size = 6
K = 2
ASW = 0.46139913604824323
ub = 0.8985129984767883
WCRE = 0.4864858529254068
uba = 0.8472327983019008
wcre = 0.45540453937451386
Adjusted Rand Index vs. true labels: -0.000
Adjusted Mutual Info vs. true labels: -0.005

2&6&-0.000&-0.005&0.461&0.899&0.486&0.847&0.455


In [24]:
# FastMSC on the 5-class subset, with k equal to the number of distinct true classes.
# k and the bound are derived from y5 / D5 directly instead of the globals
# `n_classes` and `ub`: both are reassigned by another section of this notebook,
# so running cells out of order would silently use the wrong k or bound.
eval(
    D5,
    y5,
    kmedoids.fastmsc(diss=D5, medoids=y5.nunique(), random_state=42).labels + 1,
    upper_bound(D5),
)

Min cluster size = 19
K = 5
ASW = 0.4732522931811366
ub = 0.8985129984767883
WCRE = 0.47329388224386126
uba = 0.7781139828984562
wcre = 0.3917956705799284
Adjusted Rand Index vs. true labels: 0.654
Adjusted Mutual Info vs. true labels: 0.723

5&19&0.654&0.723&0.473&0.899&0.473&0.778&0.392


In [25]:
# DynMSC on the 5-class subset with medoids=10; the K it reports may be smaller
# (presumably 10 acts as an upper limit; confirm against the kmedoids docs).
# The bound is computed from D5 directly instead of the global `ub`, which
# another section of this notebook assigns for a different distance matrix.
eval(
    D5,
    y5,
    kmedoids.dynmsc(diss=D5, medoids=10, random_state=42).labels + 1,
    upper_bound(D5),
)

Min cluster size = 28
K = 2
ASW = 0.41630960438602654
ub = 0.8985129984767883
WCRE = 0.5366682450985363
uba = 0.7518455007784354
wcre = 0.44628304092397486
Adjusted Rand Index vs. true labels: 0.001
Adjusted Mutual Info vs. true labels: 0.009

2&28&0.001&0.009&0.416&0.899&0.537&0.752&0.446
