## DBSCAN

### Identify optimal eps and min_samples values

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Grid search over DBSCAN hyper-parameters, keeping the (eps, min_samples)
# pair with the highest silhouette score. Noise points (label -1) are left
# out of the score so that "noise" is not evaluated as if it were a cluster.
eps_values = [0.5, 1.0, 1.5]  # Example values for eps
min_samples_values = [5, 10, 15]  # Example values for min_samples
best_score = -1  # lower bound of the silhouette coefficient
best_eps = None
best_min_samples = None
progress_bar = tqdm(total=len(eps_values)*len(min_samples_values), desc='DBSCAN progress')

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(matrix.T)

        # Boolean mask of the samples that were assigned to a real cluster.
        clustered = labels != -1
        n_clusters = len(np.unique(labels[clustered]))
        n_clustered = int(clustered.sum())

        # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
        # Configurations outside that range (everything is noise, or a single
        # cluster) are skipped instead of reusing a stale or undefined score.
        if 2 <= n_clusters < n_clustered:
            score = silhouette_score(matrix.T[clustered], labels[clustered])
            if score > best_score:
                best_score = score
                best_eps = eps
                best_min_samples = min_samples
        progress_bar.update(1)

progress_bar.close()

# Both values stay None when no configuration produced at least two clusters.
print("Best eps:", best_eps)
print("Best min_samples:", best_min_samples)

### Run DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Fit DBSCAN once with fixed hyper-parameters; the rows of `matrix.T` are the
# samples being clustered.
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(matrix.T)

# Score only the samples that belong to a cluster: DBSCAN marks noise with the
# label -1, and passing it to silhouette_score would treat noise as a cluster.
clustered = labels != -1
n_clusters = len(set(labels[clustered].tolist()))

# silhouette_score requires 2 <= n_clusters <= n_samples - 1 and raises
# otherwise, so report an undefined score instead of crashing.
if 2 <= n_clusters < int(clustered.sum()):
    score = silhouette_score(matrix.T[clustered], labels[clustered])
else:
    score = float('nan')
    print('silhouette value is undefined: fewer than two clusters were found')
print(f'silhouette value: {score}')

In [None]:
# Wrap the DBSCAN labels in a named Series so they can sit next to the titles.
df_dblabel = pd.Series(data=labels, name='product cluster label')

# Put titles and labels side by side, then give both columns explicit headers.
# NOTE(review): concat aligns rows on the index, and df_dblabel gets a default
# 0..n-1 index; this assumes unique_rows_series is indexed the same way - confirm.
df_dbscan = pd.concat(
    objs=[unique_rows_series, df_dblabel],
    axis='columns',
)
df_dbscan.columns = ['product title', 'product cluster label']

In [None]:
# Draw one word cloud per DBSCAN cluster, built from the product titles that
# were assigned to that cluster (label -1 is DBSCAN's noise group).
cluster_labels = df_dbscan['product cluster label'].unique()

# DBSCAN chooses the number of clusters itself, so the grid is sized from the
# data: a fixed 2x3 grid overflows with more than six labels.
n_cols = 3
n_rows = max(1, -(-len(cluster_labels) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 4 * n_rows), squeeze=False)
axes = axes.flatten()  # squeeze=False keeps a 2-D array even for a single row

for ax, cluster_label in zip(axes, cluster_labels):
    # Titles must come from the DBSCAN table, not from the K-means one.
    in_cluster = df_dbscan['product cluster label'] == cluster_label
    df_text = df_dbscan.loc[in_cluster, 'product title']
    combined_text = ' '.join(df_text)
    word_frequencies = Counter(combined_text.split())
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(f'Cluster {cluster_label}')
    ax.axis('off')

# Remove every panel that did not receive a word cloud, not just the first one.
for unused_ax in axes[len(cluster_labels):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
# Saved under its own name so the K-means figure is not overwritten.
plt.savefig('dbscan-wordcloud.png')
plt.show()