In [2]:
"""
Datasets are available at https://zenodo.org/records/6355684 
"""

'\nDatasets are available at https://zenodo.org/records/6355684 \n'

In [3]:
import pandas as pd
from silhouette_upper_bound import upper_bound, upper_bound_samples, upper_bound_macro_silhouette
import numpy as np
import matplotlib.pyplot as plt 
import pickle
from sklearn.metrics import silhouette_score, adjusted_rand_score, silhouette_samples, adjusted_mutual_info_score
from collections import Counter
import kmedoids
from sklearn.preprocessing import StandardScaler, RobustScaler, normalize
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import squareform, pdist
import seaborn as sns
from matplotlib.ticker import MultipleLocator
from tqdm import tqdm
from pathlib import Path
from scipy.io import arff
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import time

In [4]:
# Distance metric = L1

In [5]:
dataset_name = "aloi-hsb-14x6x6"
path = "data/aloi/" + dataset_name + ".csv"
N_FEATURES = 504

In [6]:
# Load whitespace-separated data
df = pd.read_csv(path, 
                 sep=r"\s+",      # whitespace
                 header=None,     # no header row
                 engine="python") # needed because of regex separator

print(df.shape)
df.head()

(110250, 506)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,496,497,498,499,500,501,502,503,504,505
0,0.00231,0.0,0.0,0.0,0.0,0.0,0.024204,7e-06,0.0,0.0,...,0.000127,0.000448,5.4e-05,0.000979,0.000834,0.003366,0.011929,0.012741,"""img1""","""1/1_i110.png"""
1,0.002697,0.0,0.0,0.0,0.0,0.0,0.024419,9e-06,0.0,0.0,...,0.000145,0.000463,9.3e-05,0.00073,0.000956,0.003872,0.012691,0.009307,"""img1""","""1/1_i120.png"""
2,0.003059,0.0,0.0,0.0,0.0,0.0,0.028411,1.8e-05,0.0,0.0,...,0.000156,0.000513,1.8e-05,0.000714,0.000906,0.003658,0.012037,0.00812,"""img1""","""1/1_i130.png"""
3,0.003852,2e-06,0.0,0.0,0.0,2e-06,0.036011,4.3e-05,1.8e-05,1.4e-05,...,0.000111,0.000961,3.2e-05,0.00099,0.000445,0.001614,0.004731,0.020402,"""img1""","""1/1_i140.png"""
4,0.00423,0.0,0.0,0.0,0.0,0.0,0.02979,5.2e-05,3.6e-05,9e-06,...,0.000301,0.000434,4.3e-05,0.000488,0.001069,0.005098,0.010948,0.003913,"""img1""","""1/1_i150.png"""


In [7]:
df["class"] = df.iloc[:, -1].str.extract(r"(\d+)", expand=False)
df = df[df["class"].notna()]   # drop rows with missing class
df["class"] = df["class"].astype(int)

In [8]:
df_subset = (
    df.groupby("class", group_keys=False)
      .apply(lambda g: g.sample(n=100, random_state=42))
      .reset_index(drop=True)
)

  .apply(lambda g: g.sample(n=100, random_state=42))


In [9]:
df_subset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,class
0,0.010166,0.000084,0.000045,0.000079,0.000129,0.001203,0.034017,0.000197,0.000805,0.001309,...,0.000014,0.000061,0.001341,0.003411,0.009531,0.002423,0.000007,"""img1""","""1/1_r280.png""",1
1,0.010283,0.000054,0.000068,0.000059,0.000061,0.001610,0.032213,0.000167,0.000158,0.000174,...,0.000411,0.000050,0.000251,0.001487,0.007643,0.007541,0.000925,"""img1""","""1/1_i230.png""",1
2,0.004230,0.000000,0.000000,0.000000,0.000000,0.000000,0.029790,0.000052,0.000036,0.000009,...,0.000434,0.000043,0.000488,0.001069,0.005098,0.010948,0.003913,"""img1""","""1/1_i150.png""",1
3,0.010306,0.000093,0.000070,0.000079,0.000090,0.001072,0.032052,0.000278,0.000963,0.000726,...,0.000059,0.000086,0.000554,0.002062,0.009861,0.005012,0.000016,"""img1""","""1/1_r305.png""",1
4,0.011635,0.000131,0.000149,0.000156,0.000104,0.001216,0.036904,0.000416,0.000450,0.000877,...,0.000043,0.000084,0.001225,0.005269,0.016473,0.000328,0.000045,"""img1""","""1/1_r210.png""",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.002566,0.000000,0.000000,0.000000,0.000000,0.000011,0.020580,0.000002,0.000000,0.000000,...,0.001442,0.000072,0.001472,0.010410,0.014138,0.010704,0.005245,"""img1000""","""1000/1000_i120.png""",1000
99996,0.008843,0.000005,0.000011,0.000000,0.000000,0.000077,0.026611,0.000023,0.000009,0.000016,...,0.002159,0.000090,0.003651,0.007128,0.004356,0.001483,0.002279,"""img1000""","""1000/1000_l4c1.png""",1000
99997,0.005654,0.000018,0.000020,0.000005,0.000005,0.000009,0.020549,0.000059,0.000104,0.000050,...,0.001117,0.000041,0.002432,0.013283,0.005853,0.001465,0.000249,"""img1000""","""1000/1000_r310.png""",1000
99998,0.006212,0.000018,0.000002,0.000005,0.000000,0.000093,0.022393,0.000057,0.000041,0.000041,...,0.002817,0.000047,0.001569,0.006890,0.005050,0.002288,0.002907,"""img1000""","""1000/1000_r45.png""",1000


In [10]:
y = df_subset["class"]
X = df_subset.iloc[:, 0:N_FEATURES]
X = X.to_numpy(dtype='float32')
print(X.shape, X.dtype)

(100000, 504) float32


In [11]:
def eval(D, y, cluster_labels, ub):

    # summary
    cluster_sizes = list(Counter(cluster_labels).values())
    min_cluster_size = min(cluster_sizes)
    print(f"Min cluster size = {min_cluster_size}")
    print(f"K = {len(cluster_sizes)}")
    # silhouette samples 
    silh_samples = silhouette_samples(X=D, labels=cluster_labels, metric='precomputed')
    # ASW 
    asw = np.mean(silh_samples)
    print(f"ASW = {asw}")
    print(f"ub = {ub}")
    print(f"WCRE = {(ub - asw)/ub}")

    # constrained
    uba = upper_bound(D, min_cluster_size)
    print(f"uba = {uba}\nwcre = {(uba - asw)/uba}")

    # external validation
    # AMI and ARI 
    ari = adjusted_rand_score(cluster_labels, y)
    ami = adjusted_mutual_info_score(cluster_labels, y)

    print(f"Adjusted Rand Index vs. true labels: {ari:.3f}")
    print(f"Adjusted Mutual Info vs. true labels: {ami:.3f}")

    print(f"\n{len(cluster_sizes)}&{min_cluster_size}&{ari:.3f}&{ami:.3f}&{asw:.3f}&{ub:.3f}&{((ub - asw)/ub):.3f}&{uba:.3f}&{((uba - asw)/uba):.3f}")
    
    

### n classes = 2

In [11]:
np.random.seed(872)
classes = df_subset["class"].unique()
n_classes = 2
chosen = np.random.choice(classes, size=n_classes, replace=False)
# Filter dataframe to those two classes
df_subset_2 = df_subset[df_subset["class"].isin(chosen)]

df_subset_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,class
15400,0.018851,0.000484,0.000409,0.000439,0.000400,0.000353,0.026541,0.000961,0.001408,0.001361,...,0.000000,0.000018,0.000000,0.000000,0.0,0.0,0.0,"""img155""","""155/155_r280.png""",155
15401,0.022678,0.000552,0.000477,0.000436,0.000626,0.000705,0.028555,0.000983,0.001234,0.001444,...,0.000000,0.000038,0.000002,0.000000,0.0,0.0,0.0,"""img155""","""155/155_i230.png""",155
15402,0.009273,0.000072,0.000032,0.000023,0.000009,0.000075,0.039228,0.000764,0.000513,0.000477,...,0.000000,0.000014,0.000000,0.000000,0.0,0.0,0.0,"""img155""","""155/155_i150.png""",155
15403,0.018765,0.000486,0.000515,0.000475,0.000382,0.000249,0.024762,0.001001,0.001456,0.001259,...,0.000000,0.000005,0.000002,0.000000,0.0,0.0,0.0,"""img155""","""155/155_r305.png""",155
15404,0.020234,0.000520,0.000436,0.000545,0.000470,0.000418,0.026365,0.001198,0.001693,0.001293,...,0.000000,0.000018,0.000007,0.000000,0.0,0.0,0.0,"""img155""","""155/155_r210.png""",155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38895,0.002959,0.000000,0.000000,0.000002,0.000000,0.000079,0.038016,0.000005,0.000007,0.000011,...,0.000002,0.000068,0.000023,0.000016,0.0,0.0,0.0,"""img389""","""389/389_i120.png""",389
38896,0.018462,0.000009,0.000000,0.000000,0.000000,0.000088,0.038604,0.000097,0.000014,0.000002,...,0.000000,0.000027,0.000002,0.000000,0.0,0.0,0.0,"""img389""","""389/389_l4c1.png""",389
38897,0.013606,0.000038,0.000005,0.000002,0.000000,0.000414,0.038977,0.000066,0.000007,0.000014,...,0.000000,0.000068,0.000000,0.000000,0.0,0.0,0.0,"""img389""","""389/389_r310.png""",389
38898,0.014664,0.000020,0.000009,0.000000,0.000000,0.000260,0.040290,0.000086,0.000038,0.000018,...,0.000000,0.000025,0.000000,0.000000,0.0,0.0,0.0,"""img389""","""389/389_r45.png""",389


In [12]:
y2 = df_subset_2["class"]
X2 = df_subset_2.iloc[:, 0:N_FEATURES]
X2 = X2.to_numpy()
np.save("arrays/aloi504-2classes.npy", X2)
print(X2.shape, X2.dtype)
D2 = squareform(pdist(X2, metric="cityblock"))
ub = upper_bound(D2)

(200, 504) float64


In [13]:
eval(D2, y2, (kmedoids.pamsil(diss=D2, medoids=n_classes, random_state=42).labels + 1), ub)

Min cluster size = 94
K = 2
ASW = 0.6402911008561487
ub = 0.8788585652873633
WCRE = 0.2714514870242055
uba = 0.6945871926817037
wcre = 0.07817030373958575
Adjusted Rand Index vs. true labels: 0.883
Adjusted Mutual Info vs. true labels: 0.834

2&94&0.883&0.834&0.640&0.879&0.271&0.695&0.078


In [14]:
eval(D2, y2, (kmedoids.fastmsc(diss=D2, medoids=n_classes, random_state=42).labels + 1), ub)

Min cluster size = 92
K = 2
ASW = 0.6392995421986787
ub = 0.8788585652873633
WCRE = 0.27257972164196315
uba = 0.7021548197694213
wcre = 0.08951769011766306
Adjusted Rand Index vs. true labels: 0.846
Adjusted Mutual Info vs. true labels: 0.795

2&92&0.846&0.795&0.639&0.879&0.273&0.702&0.090


In [15]:
eval(D2, y2, (kmedoids.dynmsc(diss=D2, medoids=10, random_state=42).labels + 1), ub)

Min cluster size = 1
K = 9
ASW = 0.6439019391799796
ub = 0.8788585652873633
WCRE = 0.26734293251219454
uba = 0.8788585652873633
wcre = 0.26734293251219454
Adjusted Rand Index vs. true labels: 0.856
Adjusted Mutual Info vs. true labels: 0.776

9&1&0.856&0.776&0.644&0.879&0.267&0.879&0.267


### n classes = 5

In [12]:
np.random.seed(25)
classes = df_subset["class"].unique()
n_classes = 5
chosen = np.random.choice(classes, size=n_classes, replace=False)
# Filter dataframe to those two classes
df_subset_5 = df_subset[df_subset["class"].isin(chosen)]

df_subset_5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,class
4900,0.012526,0.000000,0.000000,0.000000,0.000000,0.000000,0.025232,0.000000,0.000000,0.000000,...,0.000000,0.000005,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r280.png""",50
4901,0.013319,0.000000,0.000000,0.000000,0.000000,0.000000,0.026980,0.000000,0.000000,0.000000,...,0.000000,0.000002,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_i230.png""",50
4902,0.005561,0.000000,0.000000,0.000000,0.000000,0.000000,0.028139,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_i150.png""",50
4903,0.013048,0.000000,0.000000,0.000000,0.000000,0.000000,0.026399,0.000005,0.000000,0.000000,...,0.000000,0.000007,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r305.png""",50
4904,0.012858,0.000000,0.000000,0.000000,0.000000,0.000000,0.024920,0.000005,0.000000,0.000000,...,0.000000,0.000034,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r210.png""",50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77595,0.002959,0.000011,0.000007,0.000007,0.000000,0.000002,0.018740,0.000084,0.000054,0.000057,...,0.000502,0.000047,0.000877,0.003863,0.007672,0.010254,0.015424,"""img776""","""776/776_i120.png""",776
77596,0.011120,0.000079,0.000029,0.000070,0.000066,0.004948,0.022974,0.000097,0.000057,0.000023,...,0.001239,0.000059,0.002100,0.006499,0.007987,0.004903,0.005654,"""img776""","""776/776_l4c1.png""",776
77597,0.009741,0.000068,0.000070,0.000020,0.000034,0.001700,0.021588,0.000176,0.000052,0.000036,...,0.000199,0.000050,0.000717,0.004551,0.009151,0.007152,0.000653,"""img776""","""776/776_r310.png""",776
77598,0.009992,0.000063,0.000072,0.000072,0.000111,0.005550,0.022811,0.000077,0.000350,0.000222,...,0.000766,0.000090,0.002866,0.007311,0.008638,0.002134,0.000307,"""img776""","""776/776_r45.png""",776


In [13]:
y5 = df_subset_5["class"]
X5 = df_subset_5.iloc[:, 0:N_FEATURES]
X5 = X5.to_numpy()
np.save("arrays/aloi504-5classes.npy", X5)
print(X5.shape, X5.dtype)
D5 = squareform(pdist(X5, metric="cityblock"))
ub = upper_bound(D5)

(500, 504) float64


In [None]:
eval(D5, y5, (kmedoids.pamsil(diss=D5, medoids=n_classes, random_state=42).labels + 1), ub)

Min cluster size = 122
K = 2
ASW = 0.45830683539262795
ub = 0.8880079850168976
WCRE = 0.4838933397835302
uba = 0.5807297945328764
wcre = 0.21080881382833516
Adjusted Rand Index vs. true labels: 0.196
Adjusted Mutual Info vs. true labels: 0.354

2&122&0.196&0.354&0.458&0.888&0.484&0.581&0.211


In [19]:
eval(D5, y5, (kmedoids.fastmsc(diss=D5, medoids=n_classes, random_state=42).labels + 1), ub)

Min cluster size = 23
K = 5
ASW = 0.47093971569729165
ub = 0.8880079850168976
WCRE = 0.4696672511471501
uba = 0.7550342275774208
wcre = 0.37626706380141983
Adjusted Rand Index vs. true labels: 0.676
Adjusted Mutual Info vs. true labels: 0.754

5&23&0.676&0.754&0.471&0.888&0.470&0.755&0.376


In [20]:
eval(D5, y5, (kmedoids.dynmsc(diss=D5, medoids=10, random_state=42).labels + 1), ub)

Min cluster size = 20
K = 6
ASW = 0.4721924482042574
ub = 0.8880079850168976
WCRE = 0.4682565290274139
uba = 0.7637853692700396
wcre = 0.3817733787496632
Adjusted Rand Index vs. true labels: 0.678
Adjusted Mutual Info vs. true labels: 0.746

6&20&0.678&0.746&0.472&0.888&0.468&0.764&0.382
