In [1]:
import pandas as pd
from silhouette_upper_bound import upper_bound, upper_bound_samples, upper_bound_macro_silhouette
import numpy as np
import matplotlib.pyplot as plt 
import pickle
from sklearn.metrics import silhouette_score, adjusted_rand_score, silhouette_samples, adjusted_mutual_info_score
from collections import Counter
import kmedoids
from sklearn.preprocessing import StandardScaler, RobustScaler, normalize
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import squareform, pdist
import seaborn as sns
from matplotlib.ticker import MultipleLocator
from tqdm import tqdm
from pathlib import Path
from scipy.io import arff
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Input file for this notebook; it is read by pd.read_csv in the following cell.
path = "data/aloi/aloi-hsb-14x6x6.csv"

In [3]:
# Read the raw table. Fields are separated by runs of whitespace and the file
# has no header row, so pandas assigns integer column labels. The Python
# parsing engine is selected explicitly together with the regex separator.
df = pd.read_csv(path, header=None, sep=r"\s+", engine="python")

# Report the dimensions, then preview the first rows.
print(df.shape)
df.head()

(110250, 506)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,496,497,498,499,500,501,502,503,504,505
0,0.00231,0.0,0.0,0.0,0.0,0.0,0.024204,7e-06,0.0,0.0,...,0.000127,0.000448,5.4e-05,0.000979,0.000834,0.003366,0.011929,0.012741,"""img1""","""1/1_i110.png"""
1,0.002697,0.0,0.0,0.0,0.0,0.0,0.024419,9e-06,0.0,0.0,...,0.000145,0.000463,9.3e-05,0.00073,0.000956,0.003872,0.012691,0.009307,"""img1""","""1/1_i120.png"""
2,0.003059,0.0,0.0,0.0,0.0,0.0,0.028411,1.8e-05,0.0,0.0,...,0.000156,0.000513,1.8e-05,0.000714,0.000906,0.003658,0.012037,0.00812,"""img1""","""1/1_i130.png"""
3,0.003852,2e-06,0.0,0.0,0.0,2e-06,0.036011,4.3e-05,1.8e-05,1.4e-05,...,0.000111,0.000961,3.2e-05,0.00099,0.000445,0.001614,0.004731,0.020402,"""img1""","""1/1_i140.png"""
4,0.00423,0.0,0.0,0.0,0.0,0.0,0.02979,5.2e-05,3.6e-05,9e-06,...,0.000301,0.000434,4.3e-05,0.000488,0.001069,0.005098,0.010948,0.003913,"""img1""","""1/1_i150.png"""


In [4]:
# Derive an integer class id for every row from the last raw column, which
# holds an image path such as "1/1_i110.png"; the first run of digits in that
# string is taken as the class id.
#
# The source column is picked among the non-"class" columns so this cell can be
# re-run safely: selecting df.iloc[:, -1] would, after one execution, pick the
# freshly added integer "class" column and make the `.str` accessor raise.
source_col = [col for col in df.columns if col != "class"][-1]
class_ids = df[source_col].str.extract(r"(\d+)", expand=False)

# Keep only the rows for which a class id could be extracted. Taking an
# explicit copy avoids assigning into a filtered slice of the original frame
# (which can trigger pandas' SettingWithCopyWarning).
has_class = class_ids.notna()
df = df.loc[has_class].copy()
df["class"] = class_ids[has_class].astype(int)

In [5]:
# Draw a fixed-size, reproducible sample of 30 rows from every class.
#
# Each class is sampled with its own generator seeded with 42 and the groups
# are concatenated in sorted class order with a fresh 0..n-1 index, which is
# what groupby(...).apply(lambda g: g.sample(...)).reset_index(drop=True)
# produced. Iterating over the groups directly avoids having
# DataFrameGroupBy.apply operate on the grouping column, a behaviour that
# recent pandas versions deprecate and warn about.
df_subset = pd.concat(
    [group.sample(n=30, random_state=42) for _, group in df.groupby("class")],
    ignore_index=True,
)

  .apply(lambda g: g.sample(n=30, random_state=42))


In [6]:
# Inspect the per-class sample (pandas renders large frames truncated).
df_subset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,class
0,0.010166,0.000084,0.000045,0.000079,0.000129,0.001203,0.034017,0.000197,0.000805,0.001309,...,0.000014,0.000061,0.001341,0.003411,0.009531,0.002423,0.000007,"""img1""","""1/1_r280.png""",1
1,0.010283,0.000054,0.000068,0.000059,0.000061,0.001610,0.032213,0.000167,0.000158,0.000174,...,0.000411,0.000050,0.000251,0.001487,0.007643,0.007541,0.000925,"""img1""","""1/1_i230.png""",1
2,0.004230,0.000000,0.000000,0.000000,0.000000,0.000000,0.029790,0.000052,0.000036,0.000009,...,0.000434,0.000043,0.000488,0.001069,0.005098,0.010948,0.003913,"""img1""","""1/1_i150.png""",1
3,0.010306,0.000093,0.000070,0.000079,0.000090,0.001072,0.032052,0.000278,0.000963,0.000726,...,0.000059,0.000086,0.000554,0.002062,0.009861,0.005012,0.000016,"""img1""","""1/1_r305.png""",1
4,0.011635,0.000131,0.000149,0.000156,0.000104,0.001216,0.036904,0.000416,0.000450,0.000877,...,0.000043,0.000084,0.001225,0.005269,0.016473,0.000328,0.000045,"""img1""","""1/1_r210.png""",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.005093,0.000007,0.000000,0.000000,0.000000,0.000041,0.016857,0.000041,0.000020,0.000007,...,0.000115,0.000113,0.001413,0.003422,0.002412,0.000624,0.000038,"""img1000""","""1000/1000_l6c3.png""",1000
29996,0.006208,0.000018,0.000000,0.000000,0.000000,0.000237,0.022927,0.000038,0.000032,0.000027,...,0.003820,0.000061,0.000757,0.005089,0.006730,0.004548,0.002667,"""img1000""","""1000/1000_r130.png""",1000
29997,0.006305,0.000027,0.000000,0.000002,0.000000,0.000007,0.022402,0.000077,0.000070,0.000057,...,0.001512,0.000027,0.001723,0.011391,0.006829,0.003088,0.000656,"""img1000""","""1000/1000_r260.png""",1000
29998,0.004569,0.000025,0.000014,0.000000,0.000000,0.001024,0.010650,0.000124,0.000057,0.000016,...,0.000737,0.000063,0.001824,0.005466,0.005199,0.004153,0.002884,"""img1000""","""1000/1000_l1c2.png""",1000


In [7]:
# Feature matrix: the first 125 columns of the sampled frame.
# NOTE(review): the loaded table has 506 columns and the file name reads
# "14x6x6" (= 504) — confirm that restricting to 125 columns is intended.
X = df_subset.iloc[:, :125]

# Reference labels: the class id of every sampled row.
y = df_subset["class"]

In [8]:
print(X.shape)

# Persist the feature matrix. The target directory is created first so the
# save does not fail with FileNotFoundError when "arrays/" does not exist yet.
features_file = Path("arrays/aloi.npy")
features_file.parent.mkdir(parents=True, exist_ok=True)
np.save(features_file, X)

# Dense pairwise Manhattan (city-block) distance matrix between all rows of X.
# Note: for 30000 rows in float64 the square matrix alone takes about 7.2 GB.
D = squareform(pdist(X, metric="cityblock"))

(30000, 125)


In [18]:
# Check the element type of the distance matrix.
D.dtype

dtype('float64')

In [19]:
# Single-precision copy of the distance matrix, used to compare the bound
# computed from float32 distances against the float64 result.
D_ = D.astype('float32')

In [21]:
# Confirm that the copy really is float32.
D_.dtype 

dtype('float32')

In [22]:
# Bound from the silhouette_upper_bound package, computed on the float32 copy
# (compare with the value obtained from the float64 matrix).
upper_bound(D_)

np.float64(0.9045913161783321)

In [9]:
# Bound from the silhouette_upper_bound package, computed on the float64
# distance matrix.
upper_bound(D)

np.float64(0.9045688336769377)

In [10]:
# Cluster the full distance matrix with kmedoids.dynmsc (medoids=10, seed 42).
dynmsc_result = kmedoids.dynmsc(diss=D, medoids=10, random_state=42)

# Shift the returned labels by one (presumably to make them 1-based — confirm).
cluster_labels = dynmsc_result.labels + 1

In [None]:
# Disabled alternative: kmedoids.fastmsc with medoids=1000 on the full distance
# matrix (presumably one medoid per class — confirm before re-enabling).
#cluster_labels = (kmedoids.fastmsc(diss=D, medoids=1000, random_state=42).labels + 1)

In [11]:
# Per-sample silhouette values of the current clustering, evaluated directly
# on the precomputed distance matrix.
silh_samples = silhouette_samples(D, cluster_labels, metric="precomputed")

# Average silhouette width (ASW): the mean of the per-sample values.
asw = silh_samples.mean()

In [12]:
# Display the average silhouette width of the clustering.
asw 

np.float64(0.47718692787687883)

In [13]:
# Count how many points carry each cluster label, then report the size of the
# smallest cluster.
label_counts = Counter(cluster_labels)
cluster_sizes = [size for size in label_counts.values()]
min_cluster_size = min(cluster_sizes)
min_cluster_size

32

In [14]:
# Bound on the full distance matrix, passing the smallest observed cluster
# size as the second argument. The variable computed in the previous cell is
# used instead of a literal copied from its output (32), so this cell stays
# correct if the clustering — and hence the smallest cluster — changes.
upper_bound(D, min_cluster_size)

np.float64(0.7702052404494275)

In [89]:
# Restrict the sample to the rows whose class id is at most 5.
df_subset2 = df_subset.loc[df_subset["class"].le(5)]

In [90]:
# Feature matrix of the reduced sample: its first 125 columns.
# NOTE(review): the loaded table has 506 columns and the file name reads
# "14x6x6" (= 504) — confirm that restricting to 125 columns is intended.
X2 = df_subset2.iloc[:, :125]

# Reference labels of the reduced sample.
y2 = df_subset2["class"]

In [91]:
# Pairwise Manhattan (city-block) distances between the rows of the reduced
# feature matrix.
D2 = squareform(pdist(X2, "cityblock"))

print(X2.shape)

# Bound from the silhouette_upper_bound package for the reduced data
# (displayed as the cell result).
upper_bound(D2)

(150, 125)


np.float64(0.8169975963818827)

In [92]:
# Number of distinct class ids in the reduced sample.
print(df_subset2["class"].nunique())

5


In [93]:
# Cluster the reduced distance matrix with kmedoids.fastmsc (medoids=5, seed 42).
# Note: this rebinds `cluster_labels`, replacing the labels of the full-data
# clustering computed earlier in the notebook.
fastmsc_result = kmedoids.fastmsc(diss=D2, medoids=5, random_state=42)

# Shift the returned labels by one (presumably to make them 1-based — confirm).
cluster_labels = fastmsc_result.labels + 1

In [94]:
# Average silhouette width (ASW) of the reduced-data clustering: mean of the
# per-sample silhouette values on the precomputed distances.
silh_samples = silhouette_samples(D2, cluster_labels, metric="precomputed")
asw = silh_samples.mean()
asw

np.float64(0.3414752478294641)

In [95]:
# Agreement between the cluster assignment and the class ids of the reduced
# sample, measured with the adjusted Rand index and adjusted mutual information.
ari, ami = (
    score(cluster_labels, y2)
    for score in (adjusted_rand_score, adjusted_mutual_info_score)
)

print(f"Adjusted Rand Index vs. true labels: {ari:.3f}")
print(f"Adjusted Mutual Info vs. true labels: {ami:.3f}")

Adjusted Rand Index vs. true labels: 0.712
Adjusted Mutual Info vs. true labels: 0.717


In [96]:
# Count how many points carry each cluster label and keep the size of the
# smallest cluster.
label_counts = Counter(cluster_labels)
cluster_sizes = [size for size in label_counts.values()]
min_cluster_size = min(cluster_sizes)

In [97]:
# Display the size of the smallest cluster in the reduced-data clustering.
min_cluster_size

27

In [98]:
# Bound on the reduced distance matrix, passing the smallest observed cluster
# size as the second argument. The variable computed above is used instead of
# a literal copied from its output (27), so this cell stays correct if the
# clustering — and hence the smallest cluster — changes.
upper_bound(D2, min_cluster_size)

np.float64(0.6245384856134615)