In [38]:
import pandas as pd
import os

Step 1
Data preparation. Joining datasets from the Amazon file

In [39]:
file_paths = [
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/Air Conditioners.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Appliances.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Books.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Car and Motorbike Products.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Electronics.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All English.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Exercise and Fitness.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Grocery and Gourmet Foods.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Hindi.csv',
    '/Users/crazyrinchik/Desktop/DSBA/Coursework/archive/All Home and Kitchen.csv'
]

df = []
for file_path in file_paths:
    data = pd.read_csv(file_path)
    data = data[['name', 'main_category', 'sub_category']]
    df.append(data)

all_data = pd.concat(df, ignore_index=True)
all_data

Unnamed: 0,name,main_category,sub_category
0,"Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1 Convertible, Copper, Anti-Viral + Pm 2.5 Filter, 2023 Model, White, Gls18I3...",appliances,Air Conditioners
1,"LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (Copper, Super Convertible 6-in-1 Cooling, HD Filter with Anti-Virus Protectio...",appliances,Air Conditioners
2,"LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Copper, Super Convertible 6-In-1 Cooling, Hd Filter With Anti Virus Protection,...",appliances,Air Conditioners
3,"LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (Copper, Super Convertible 6-in-1 Cooling, HD Filter with Anti-Virus Protectio...",appliances,Air Conditioners
4,"Carrier 1.5 Ton 3 Star Inverter Split AC (Copper,ESTER Dxi, 4-in-1 Flexicool Inverter, 2022 Model,R32,White)",appliances,Air Conditioners
...,...,...,...
24523,Faber-Castell Jumbo Wax Crayons - 24 Shades,home & kitchen,All Home & Kitchen
24524,"Aqtus Vacuum Bags Reusable Space Saver Quit Vacuum Plastic Storage Bag 8 Pack (2 x Jumbo, 2 x Large, 2 x Medium, 2 x Smal...",home & kitchen,All Home & Kitchen
24525,DITYA Enterprise Glass Tumbler with Lid and Straw Coffee Mug Tea Cup Travel Mug Smoothies Fruit Juice Bottle for Home and ...,home & kitchen,All Home & Kitchen
24526,PrettyKrafts Shirt Stacker Closet Organizer - Shirts and Clothing Organizer - Exile (Set of 2) (Big) - GreyBlack,home & kitchen,All Home & Kitchen


Step 2
Cleaning the data with regular expressions

In [40]:
import re
all_data.name = all_data.name.apply(lambda x: re.sub(r"\d+", "", x))\
                             .apply(lambda x: re.sub(r"[\/|.+\-']", "", x))\
                             .apply(lambda x: x.replace('  ', ' '))\
                             .apply(lambda x: x.split(' ')[:7])

In [41]:
all_data.name = all_data.name.apply(lambda x: ' '.join(x))

In [42]:
all_data.name = all_data.name.apply(lambda x: x.strip())

In [43]:
all_data.head()

Unnamed: 0,name,main_category,sub_category
0,Lloyd Ton Star Inverter Split Ac (,appliances,Air Conditioners
1,LG Ton Star AI DUAL Inverter Split,appliances,Air Conditioners
2,LG Ton Star Ai Dual Inverter Split,appliances,Air Conditioners
3,LG Ton Star AI DUAL Inverter Split,appliances,Air Conditioners
4,"Carrier Ton Star Inverter Split AC (Copper,ESTER",appliances,Air Conditioners


In [44]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()

all_data_copy = all_data.copy()

# all_data_copy["main_category"] = ord_enc.fit_transform(all_data_copy[["main_category"]])
all_data_copy["name"] = ord_enc.fit_transform(all_data_copy[["name"]])

all_data_copy.head()

Unnamed: 0,name,main_category,sub_category
0,9786.0,appliances,Air Conditioners
1,9293.0,appliances,Air Conditioners
2,9295.0,appliances,Air Conditioners
3,9293.0,appliances,Air Conditioners
4,3709.0,appliances,Air Conditioners


Step 3
Using BERT embeddings, convert the text data into numerical representation (vectors)

In [45]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(all_data['name'], show_progress_bar=True)



Batches:   0%|          | 0/767 [00:00<?, ?it/s]

In [46]:
import numpy as np
embeddings_np = np.array(embeddings)

In [47]:
embeddings_np.shape

(24528, 768)

In [48]:
# import umap
# 
# reducer = umap.UMAP(n_components=4)
# embedding = reducer.fit_transform(embeddings_np[:, :-1])

Step 4
Now, before clustering we have to reduce dimensionality, using UMAP algorithm; only after that we'll cluster products by main_category

In [49]:
%timeit
import umap
umap_embeddings = umap.UMAP(n_components=50, metric='cosine').fit_transform(embeddings_np)
umap_embeddings.shape

(24528, 50)

In [50]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(umap_embeddings)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
embeddings_scaled = scaler.fit_transform(embeddings_scaled)

Step 5: Clustering with USPEC

In [51]:
from sklearn.cluster import KMeans
import numpy as np
import math
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD

def U_SPEC(scaled_X,                      # dataset
           k = 10,
           p = 1000,                      # number of representatives
           p_mltplr = 10,                 # multiplier to define the number of candidates to be representatives
           z1 = 32,                       # number of clusters of representatives (sqrt(p))
           seed = 42,
           KMeans_MaxIter = 500,
           K_Neighbours_fin = 4,
           K_Neighbours_mltplr = 10,
           verbose=True):
  INF = 1e+6
  BATCH_SIZE = int(2e+5) # for pairwise euclidian distance calculation

  K_neighbours_pr = K_Neighbours_mltplr * K_Neighbours_fin

  # 1. Hybrid representative selection
  sample_idx = np.random.choice(scaled_X.shape[0], size = p*p_mltplr, replace=False)
  sample = scaled_X[sample_idx]
  kmeans = KMeans(n_clusters=p, random_state=seed, max_iter=KMeans_MaxIter, tol=1e-9).fit(sample)
  repr = kmeans.cluster_centers_

  # 2. Approximation of K-Nearest Representatives
  # 2.1. Pre-step 1. Clusterization of representatives
  kmeans = KMeans(n_clusters=z1, random_state=seed, max_iter=KMeans_MaxIter, tol=1e-9).fit(repr)
  # what we need from this clusterization?
  # 2.1.1. centers
  z_centers = kmeans.cluster_centers_
  # 2.1.2. list of representatives for each cluster
  repr_labels = kmeans.labels_

  # 2.2. Pre-step 2. KNN of representatives
  knn_model = NearestNeighbors(n_neighbors=K_neighbours_pr, algorithm='kd_tree').fit(repr)
  repr_knn_dist, repr_knn_idx = knn_model.kneighbors(repr)


  # 3. K nearest representatives approximation for full data
  obj_to_cl_pw_dist = pairwise_distances(scaled_X, z_centers, metric='euclidean')
  obj_to_cl_pairing = np.argmin(obj_to_cl_pw_dist, axis=1)

  row_id_lst, col_id_lst = [], []
  for i in range(z1):
    obj_to_cl_idx = np.where(obj_to_cl_pairing == i)[0]
    repr_idx = np.where(repr_labels == i)[0]

    obj_to_repr_pw_dist = pairwise_distances(scaled_X[obj_to_cl_idx],
                                            repr[repr_idx],
                                            metric='euclidean')
    obj_to_repr_pairing = np.argmin(obj_to_repr_pw_dist, axis=1)

    row_id_lst.append(repr_knn_idx[repr_idx[obj_to_repr_pairing]].flatten())
    col_id_lst.append(obj_to_cl_idx[np.arange(obj_to_cl_idx.shape[0] * K_neighbours_pr) // K_neighbours_pr])

  row_id, col_id = np.hstack(row_id_lst), np.hstack(col_id_lst)

  df_obj_to_knn = np.full((scaled_X.shape[0], repr.shape[0]), fill_value=INF, dtype=np.float64)
  for i in range(int((row_id.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE)):
    cols_id = col_id[i*BATCH_SIZE:(i + 1) * BATCH_SIZE]
    rows_id = row_id[i*BATCH_SIZE:(i + 1) * BATCH_SIZE]
    df_obj_to_knn[cols_id, rows_id] = np.sqrt(np.sum((repr[rows_id] - scaled_X[cols_id])**2, axis=1))

  B = df_obj_to_knn.copy()
  fin_knn_idx = np.sort(np.argpartition(B, K_Neighbours_fin, axis=1)[:,:K_Neighbours_fin], axis=1)
  B = B[np.arange(K_Neighbours_fin * B.shape[0]) // K_Neighbours_fin, fin_knn_idx.flatten()]

  sigma = np.mean(B)
  # Gaussian kernel
  B = np.exp(-B**2 / (2*sigma**2))

  row = np.arange(B.shape[0]) // K_Neighbours_fin
  col = fin_knn_idx.flatten()
  B_sparse = coo_matrix((B, (row, col)),
                                    shape=(scaled_X.shape[0], repr.shape[0]))
  B = B.reshape((B.shape[0] // K_Neighbours_fin, K_Neighbours_fin))

  # inverse degree matrix for G_X graph
  D_x_inv = coo_matrix((scaled_X.shape[0], scaled_X.shape[0]), dtype=np.float64)
  D_x_inv.setdiag(1 / B.sum(axis=1))

  # transition matrix
  T = D_x_inv.dot(B_sparse)
  E_r = B_sparse.T.dot(T)

  # degree matrix for G_R graph
  d = E_r.toarray().sum(axis=1).T
  D_r = coo_matrix((B_sparse.shape[1], B_sparse.shape[1]), dtype=np.float64)
  D_r.setdiag(d)

  D_r_inv = coo_matrix((B_sparse.shape[1], B_sparse.shape[1]), dtype=np.float64)
  D_r_inv.setdiag(1/d)

  # graph Laplacian
  L_r = D_r - E_r
  L_rw = D_r_inv.dot(L_r)

  tsvd = TruncatedSVD(n_components = B_sparse.shape[1], algorithm = "randomized",
                            n_iter = KMeans_MaxIter, tol = 1e-9, random_state=seed)
  tsvd.fit(L_rw.toarray())

  v = tsvd.components_
  sing_values = tsvd.singular_values_

  #########
  # clean zero-values
  v = v[sing_values != 0][-k:]
  sing_values = sing_values[sing_values != 0][-k:]
  #########

  ################################
  # clean from nans
  if verbose:
    if np.where(np.isnan(sing_values))[0].shape[0] > 0:
      print('{} nans out of {} singular values'.format(np.where(np.isnan(sing_values))[0].shape[0],
                                                        sing_values.shape[0]))
  v = v[np.logical_not(np.isnan(sing_values))]
  sing_values = sing_values[np.logical_not(np.isnan(sing_values))]

  ################################

  gamma = 1 - np.sqrt(1 - sing_values)

  h = (1 / (1 - gamma)) * T.dot(v.T)

  kmeans = KMeans(n_clusters=k, random_state=seed, max_iter=KMeans_MaxIter, tol=1e-9).fit(h)
  h_centers = kmeans.cluster_centers_
  X_labels = kmeans.labels_
  return X_labels

In [52]:
embeddings_scaled = embeddings_scaled.astype("float64")
embeddings_scaled

array([[0.35837778, 0.04839686, 0.95315099, ..., 0.56436032, 0.43130514,
        0.75823635],
       [0.35829434, 0.0534512 , 0.95200437, ..., 0.5980733 , 0.42452049,
        0.91393805],
       [0.35828984, 0.05343268, 0.95199019, ..., 0.59840471, 0.42437553,
        0.91447574],
       ...,
       [0.3587487 , 0.0285106 , 0.95533842, ..., 0.55406165, 0.4312185 ,
        0.54642016],
       [0.3594934 , 0.0217715 , 0.95650381, ..., 0.69367439, 0.42877421,
        0.3094964 ],
       [0.35982516, 0.0271014 , 0.95601255, ..., 0.43304431, 0.40651423,
        0.48102835]])

GRID SEARCH FOR U_SPEC

In [79]:
from sklearn.metrics import normalized_mutual_info_score as NMI
from umap import UMAP

# Диапазоны параметров
p_values = range(300, 1501, 30)
k_values = range(80, 101, 5)

# Перебор параметров по сетке
best_score = 0
best_params = {}
iteration = 1


for k in k_values:
    for p in p_values:
        z1 = math.floor(math.sqrt(p))
        
        # Кластеризуем преобразованные данные с помощью U-SPEC
        print(np.isnan(embeddings_scaled).any())
        labels_pred = U_SPEC(embeddings_scaled, k, p)
        
        # Вычисляем NMI-score
        nmi_score = NMI(labels_pred, all_data_copy["name"])

        # Вывод текущего результата
        print(f'iter {iteration}, p: {p}, k: {k}, NMI-score: {nmi_score}')

        # Обновляем лучший результат
        if nmi_score > best_score:
            best_score = nmi_score
            best_params = {'p': p, 'k': k}
        iteration += 1

print("Лучший NMI-score:", best_score)
print("Лучшие параметры:", best_params)

False
iter 1, p: 300, k: 80, NMI-score: 0.5785181058350577
False
iter 2, p: 330, k: 80, NMI-score: 0.5701867548494036
False
iter 3, p: 360, k: 80, NMI-score: 0.5679038623331767
False
iter 4, p: 390, k: 80, NMI-score: 0.5728781701706878
False
iter 5, p: 420, k: 80, NMI-score: 0.5655596022344174
False
iter 6, p: 450, k: 80, NMI-score: 0.5615770315762542
False
iter 7, p: 480, k: 80, NMI-score: 0.5675883547868479
False
iter 8, p: 510, k: 80, NMI-score: 0.5616804058504725
False
iter 9, p: 540, k: 80, NMI-score: 0.5591112868346821
False
iter 10, p: 570, k: 80, NMI-score: 0.5578488933845516
False
iter 11, p: 600, k: 80, NMI-score: 0.5569607065550671
False
iter 12, p: 630, k: 80, NMI-score: 0.5530661020950093
False
iter 13, p: 660, k: 80, NMI-score: 0.5505065596439039
False
iter 14, p: 690, k: 80, NMI-score: 0.5493659483232841
False
iter 15, p: 720, k: 80, NMI-score: 0.5513243841645022
False
iter 16, p: 750, k: 80, NMI-score: 0.5487069226317312
False
iter 17, p: 780, k: 80, NMI-score: 0.545057

ADDITIONAL GRID SEARCH FOR U_SPEC

In [129]:
# Диапазоны параметров
p_values = range(300, 401, 50)
k_values = range(200, 301, 50)

# Перебор параметров по сетке
best_score = 0
best_params = {}
iteration = 1


for k in k_values:
    for p in p_values:
        z1 = math.floor(math.sqrt(p))

        # Кластеризуем преобразованные данные с помощью U-SPEC
        labels_pred = U_SPEC(embeddings_scaled, k, p)

        # Вычисляем NMI-score
        nmi_score = NMI(labels_pred, all_data_copy["name"])

        # Вывод текущего результата
        print(f'iter {iteration}, p: {p}, k: {k}, NMI-score: {nmi_score}')

        # Обновляем лучший результат
        if nmi_score > best_score:
            best_score = nmi_score
            best_params = {'p': p, 'k': k}
        iteration += 1

print("Лучший NMI-score:", best_score)
print("Лучшие параметры:", best_params)

iter 1, p: 300, k: 200, NMI-score: 0.6801047599753205
iter 2, p: 350, k: 200, NMI-score: 0.6780880100821666
iter 3, p: 400, k: 200, NMI-score: 0.6801426524144835
iter 4, p: 300, k: 250, NMI-score: 0.6981457081348386
iter 5, p: 350, k: 250, NMI-score: 0.7005470916101484
iter 6, p: 400, k: 250, NMI-score: 0.7022211875656775


  gamma = 1 - np.sqrt(1 - sing_values)


ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [130]:
labels_pred_best_uspec = U_SPEC(embeddings_scaled, k=250, p=400)

In [131]:
NMI(labels_pred_best_uspec, all_data_copy["name"]).round(2)

0.7

In [132]:
silhouette_score(embeddings_scaled, labels_pred_best_uspec).round(2)

0.36

In [133]:
np.unique(labels_pred_best_uspec)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [135]:
for id in np.unique(labels_pred_best_uspec):
  print(id, labels_pred_best_uspec[labels_pred_best_uspec == id].shape[0])

0 146
1 431
2 145
3 45
4 51
5 188
6 166
7 197
8 71
9 197
10 302
11 171
12 434
13 860
14 439
15 141
16 216
17 133
18 56
19 265
20 230
21 33
22 293
23 69
24 154
25 454
26 220
27 293
28 246
29 182
30 37
31 235
32 95
33 209
34 178
35 216
36 184
37 213
38 171
39 127
40 147
41 64
42 117
43 155
44 120
45 86
46 114
47 174
48 67
49 127
50 168
51 102
52 91
53 113
54 93
55 115
56 104
57 194
58 75
59 65
60 119
61 113
62 96
63 123
64 84
65 113
66 58
67 110
68 134
69 84
70 165
71 136
72 213
73 77
74 84
75 157
76 190
77 125
78 106
79 89
80 62
81 82
82 29
83 65
84 99
85 107
86 64
87 131
88 120
89 142
90 126
91 197
92 79
93 99
94 34
95 79
96 76
97 95
98 44
99 68
100 116
101 135
102 61
103 67
104 66
105 171
106 97
107 74
108 141
109 79
110 16
111 120
112 63
113 108
114 35
115 34
116 84
117 100
118 96
119 42
120 57
121 116
122 99
123 110
124 79
125 70
126 106
127 43
128 40
129 77
130 64
131 82
132 112
133 57
134 15
135 56
136 67
137 150
138 53
139 43
140 122
141 49
142 102
143 19
144 88
145 91
146 69
147

In [None]:
# NMI_story = []
# ARI_story = []
# 
# for i in range(20):
#   print('iter: [{}]'.format(i))
#   NMI_story.append(NMI(all_data_copy["main_category"], U_SPEC(embeddings_scaled, k=6)))
#   ARI_story.append(ARI(all_data_copy["main_category"], U_SPEC(embeddings_scaled, k=6)))
#   print(NMI_story, '\n', ARI_story)

Reducing dimensionality of the data

In [143]:
import umap.umap_ as umap

umap_embeddings = umap.UMAP(n_neighbors=15, n_components=70, min_dist=0.0, metric='cosine').fit_transform(embeddings)
umap_embeddings.shape

zsh:1: command not found: pip


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


(24528, 70)

Step 6: Clustering with HDBSCAN

In [145]:
from sklearn.cluster import HDBSCAN

cluster_hdbscan = HDBSCAN(min_cluster_size=2,
                          metric='euclidean',
                          cluster_selection_method='eom').fit(umap_embeddings)

In [144]:
from sklearn.metrics import normalized_mutual_info_score as NMI
from umap import UMAP

# Диапазон парметров для HDBSCAN
min_cluster_sizes = range(2, 51, 5)
cluster_selection_epsilons = [0., 0.3, 0.5]

best_score = 0
best_params = {}
iter = 1

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
embeddings_scaled = scaler.fit_transform(umap_embeddings)

for min_cluster_size in min_cluster_sizes:
    for cluster_selection_epsilon in cluster_selection_epsilons:
        # Кластеризируем при помощи HDBSCAN
        cluster_hdbscan = hdbscan.HDBSCAN(min_cluster_size=int(min_cluster_size),
                                            metric='euclidean',
                                            cluster_selection_epsilon=cluster_selection_epsilon).fit(umap_embeddings)
        labels_pred = cluster_hdbscan.labels_
        # NMI-score
        nmi_score = NMI(labels_pred, all_data["name"])

        print(f'iter {iter}, min_cluster_size: {min_cluster_size}, cluster_selection_epsilon: {cluster_selection_epsilon}, NMI: {nmi_score}')

        # Обновляем лучшие параметры
        if nmi_score > best_score:
            best_score = nmi_score
            best_params = {
                'min_cluster_size': min_cluster_size,
                'cluster_selection_epsilon': cluster_selection_epsilon}
        iter += 1


print("Best NMI:", best_score)
print("Best Parameters:", best_params)

In [146]:
labels_pred_best_hdbscan = cluster_hdbscan.labels_

In [149]:
NMI(labels_pred_best_hdbscan, all_data["name"])

0.8384782837232634

In [151]:
silhouette_score(umap_embeddings, labels_pred_best_hdbscan).round(2)

0.22

In [None]:
#GRID SEACH

Step 7: Clustering with OPTICS

In [152]:
from sklearn.cluster import OPTICS

cluster_optics = OPTICS(min_samples=5, 
                        metric='cosine',
                        max_eps=2.1, 
                        xi=0.01, 
                        cluster_method='xi').fit(umap_embeddings)

In [None]:
from sklearn.metrics import normalized_mutual_info_score as NMI
from umap import UMAP

# Диапазон парметров для HDBSCAN
min_samples = range(5, 51, 5)
max_eps_range = [1.0, 2.1, 0.1]
xi_range = [0.05, 0.16, 0.01]

best_score = 0
best_params = {}
iter = 1

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
embeddings_scaled = scaler.fit_transform(umap_embeddings)

for min_sample in min_samples:
    for max_eps in max_eps_range:
        for xi in xi_range:
            # Кластеризируем при помощи HDBSCAN
            cluster_optics = OPTICS(min_samples=min_sample,
                                      max_eps=max_eps,
                                      xi=xi).fit(umap_embeddings)
            labels_pred_optics = cluster_optics.labels_
            # NMI-score
            nmi_score = NMI(labels_pred_optics, all_data["name"])

            print(f'iter {iter}, min_samples: {min_samples}, max_eps: {max_eps}, xi: {xi}, NMI: {nmi_score}')

            # Обновляем лучшие параметры
            if nmi_score > best_score:
                best_score = nmi_score
                best_params = {
                    'min_samples': min_samples,
                    'max_eps': max_eps,
                    'xi': xi}
            iter += 1


print("Best NMI:", best_score)
print("Best Parameters:", best_params)

In [153]:
labels_pred_best_optics = cluster_optics.labels_

In [154]:
NMI(labels_pred_best_optics, all_data["name"])

0.7182368150284131

In [159]:
silhouette_score(umap_embeddings, labels_pred_best_optics).round(2)

0.11

Step 8: Clustering with K-MEANS

In [162]:
from sklearn.cluster import KMeans

cluster_labels_kmeans = KMeans(n_clusters=300, 
                               random_state=52).fit(umap_embeddings)

In [108]:
# Диапазоны параметров
k_vals = range(200, 301, 20)
algorithms = ["lloyd", "elkan"]


# Перебор параметров по сетке
best_score = 0
best_params = {}
iteration = 1

for k in k_vals:
    for alg in algorithms:
        # Кластеризуем преобразованные данные с помощью KMeans
        labels_pred = KMeans(n_clusters=k, init="k-means++", algorithm=alg, random_state=52).fit(umap_embeddings).labels_

        # Вычисляем NMI-score
        nmi_score = NMI(labels_pred, all_data_copy["name"])
        ari_score = ARI(labels_pred, all_data_copy["name"])

        # Вывод текущего результата
        print(f'iter {iteration}, algorithm: {alg}, k: {k}, NMI-score: {nmi_score}, ARI-score: {ari_score}')

        # Обновляем лучший результат
        if nmi_score > best_score:
            best_score = nmi_score
            best_params = {'p': p, 'k': k}
        iteration += 1

print("Лучший NMI-score:", best_score)
print("Лучшие параметры:", best_params)

iter 1, algorithm: lloyd, k: 200, NMI-score: 0.6741487266275054, ARI-score: 0.010824518577093854
iter 2, algorithm: elkan, k: 200, NMI-score: 0.6741487266275054, ARI-score: 0.010824518577093854
iter 3, algorithm: lloyd, k: 220, NMI-score: 0.686217086281151, ARI-score: 0.012841453470872738
iter 4, algorithm: elkan, k: 220, NMI-score: 0.686217086281151, ARI-score: 0.012841453470872738
iter 5, algorithm: lloyd, k: 240, NMI-score: 0.6944566629796521, ARI-score: 0.014002710594939707
iter 6, algorithm: elkan, k: 240, NMI-score: 0.6944566629796521, ARI-score: 0.014002710594939707
iter 7, algorithm: lloyd, k: 260, NMI-score: 0.7030125016134539, ARI-score: 0.01549717212700995
iter 8, algorithm: elkan, k: 260, NMI-score: 0.7030125016134539, ARI-score: 0.01549717212700995
iter 9, algorithm: lloyd, k: 280, NMI-score: 0.710183621938111, ARI-score: 0.016772476058535418
iter 10, algorithm: elkan, k: 280, NMI-score: 0.710183621938111, ARI-score: 0.016772476058535418
iter 11, algorithm: lloyd, k: 300, 

In [163]:
labels_pred_best_kmeans = KMeans(n_clusters=300,  random_state=52).fit_predict(umap_embeddings)

In [164]:
NMI(labels_pred_best_kmeans, all_data["name"])

0.7133416561339821

In [165]:
from sklearn.metrics import silhouette_score
silhouette_score(umap_embeddings, labels_pred_best_kmeans)

0.4844255