In [None]:
!pip install hdbscan

In [None]:
!pip install umap-learn

In [None]:
!pip install plotly

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import hdbscan
import ast
from umap.umap_ import UMAP
import plotly.graph_objects as go

In [None]:
# Input CSV locations: embeddings of the original reviews and of the
# anonymized reviews.
# NOTE(review): these are absolute paths on one machine — adjust before running elsewhere.

input_csv_orig, input_csv_anon = (
    '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/review_embeddings.csv',
    '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings.csv',
)

In [None]:
# Load and process data (orig)

# Keep only the first 4500 rows of the CSV.
df_orig = pd.read_csv(input_csv_orig).head(4500)

# Each 'Embedding' cell is stored as the text of a Python literal; parse it
# and turn it into a NumPy array.
df_orig['Embedding'] = df_orig['Embedding'].apply(
    lambda raw: np.array(ast.literal_eval(raw))
)

# Stack the per-row arrays into a single array with one entry per review.
matrix_orig = np.array(df_orig['Embedding'].tolist())

In [None]:
# Wrap text (orig)

def wrap_text(text, max_length=30):
    """Insert ``<br>`` line breaks so long text wraps in a hover label.

    Words are packed greedily: a word joins the current line as long as the
    line, one separating space and the word together do not exceed
    ``max_length`` characters. A single word longer than ``max_length`` is
    kept whole on its own line rather than being split.

    Args:
        text: Text to wrap. ``None`` and float NaN (missing values) are
            treated as empty text; any other non-string value is converted
            with ``str``.
        max_length: Maximum number of characters per line.

    Returns:
        The words of ``text`` separated by single spaces within a line and by
        ``"<br>"`` between lines, with no leading or trailing whitespace;
        ``""`` when there are no words.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lines = []
    current_line = ""
    for word in str(text).split():
        if not current_line:
            # First word of a line: no separating space, and never emit an
            # empty line just because the word itself is over-long.
            current_line = word
        elif len(current_line) + 1 + len(word) > max_length:  # +1 for the space
            lines.append(current_line)
            current_line = word
        else:
            current_line += " " + word
    if current_line:
        lines.append(current_line)
    return "<br>".join(lines)

# Pre-compute the wrapped version of every review for use as plot text.
df_orig['Wrapped_Review_Text'] = df_orig['Review_Text'].map(wrap_text)

In [None]:
# Reduce dimensions with UMAP (orig)

# UMAP is stochastic; a fixed random_state makes the 3-D embedding (and the
# clusters derived from it) repeatable across kernel restarts.
# NOTE(review): umap-learn may run single-threaded when random_state is set — confirm the runtime is acceptable.
um_orig = UMAP(n_components=3, n_neighbors=50, random_state=42)
matrix_reduced_orig = um_orig.fit_transform(matrix_orig)

In [None]:
# Visualize in 3D without colouring by cluster (orig)

# One marker per review, positioned by its three reduced coordinates; the
# wrapped review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_orig[:, 0],
        y=matrix_reduced_orig[:, 1],
        z=matrix_reduced_orig[:, 2],
        mode='markers',
        marker={'size': 2, 'opacity': 0.8},
        text=df_orig['Wrapped_Review_Text'],
    )
)
fig.show()

In [None]:
# Cluster data using HDBSCAN (orig)

# Cluster the reduced coordinates of the original reviews.
hdb = hdbscan.HDBSCAN(min_cluster_size=40, min_samples=4)
hdb_orig = hdb.fit(matrix_reduced_orig)

# The labels describe the original reviews, so they are stored on df_orig.
# (df_anon is only created by the anonymized-data loading cell further down,
# so it must not be referenced here.)
df_orig['Cluster'] = hdb_orig.labels_

# Optional export, disabled:
# df_orig.sort_values(by='Cluster', inplace=True)
# output_csv_orig = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/reviews_qc1_4500_embeddings_minclusters40_minsamples4_.csv'
# df_orig.to_csv(output_csv_orig, index=False)

In [None]:
# Load and process data (anon)

# Keep only the first 4500 rows of the CSV.
df_anon = pd.read_csv(input_csv_anon).head(4500)

# Each 'Anonymous_Embedding' cell is stored as the text of a Python literal;
# parse it and turn it into a NumPy array.
df_anon['Anonymous_Embedding'] = df_anon['Anonymous_Embedding'].apply(
    lambda raw: np.array(ast.literal_eval(raw))
)

# Stack the per-row arrays into a single array with one entry per review.
matrix_anon = np.array(df_anon['Anonymous_Embedding'].tolist())

In [None]:
# Wrap text (anon)

# Pre-compute the wrapped version of every anonymized review for use as plot text.
df_anon['Wrapped_Anonymized_Review_Text'] = df_anon['Anonymized_Review_Text'].map(wrap_text)

In [None]:
# Reduce dimensions with UMAP (anon)

# UMAP is stochastic; a fixed random_state makes the 3-D embedding (and the
# clusters derived from it) repeatable across kernel restarts.
# NOTE(review): umap-learn may run single-threaded when random_state is set — confirm the runtime is acceptable.
um_anon = UMAP(n_components=3, n_neighbors=50, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)

In [None]:
# Visualize in 3D without colouring by cluster (anon)

# One marker per anonymized review, positioned by its three reduced
# coordinates; the wrapped review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={'size': 2, 'opacity': 0.8},
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# Cluster data using HDBSCAN (anon)

# Cluster the reduced coordinates of the anonymized reviews and store each
# row's label in the 'Cluster' column.
hdb = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=40)
hdb_anon = hdb.fit(matrix_reduced_anon)
df_anon['Cluster'] = hdb_anon.labels_

# Optional export, disabled:
# df_anon.sort_values(by='Cluster', inplace=True)
# output_csv_anon = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings_minclusters40_minsamples4_.csv'
# df.to_csv(output_csv_anon, index=False)

In [None]:
# Testing area for exploring different parameters for UMAP and HDBSCAN

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=25.
# A fixed random_state keeps the embedding repeatable, so differences between
# runs reflect the parameter change rather than random initialisation.
um_anon = UMAP(n_components=3, n_neighbors=25, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)


In [None]:
# Uncoloured 3-D scatter of the current anonymized embedding; the wrapped
# review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={'size': 2, 'opacity': 0.8},
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=10.
# A fixed random_state keeps the embedding repeatable, so differences between
# runs reflect the parameter change rather than random initialisation.
um_anon = UMAP(n_components=3, n_neighbors=10, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)


In [None]:
# Uncoloured 3-D scatter of the current anonymized embedding; the wrapped
# review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={'size': 2, 'opacity': 0.8},
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=18.
# A fixed random_state keeps the embedding repeatable, so differences between
# runs reflect the parameter change rather than random initialisation.
um_anon = UMAP(n_components=3, n_neighbors=18, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)

In [None]:
# Uncoloured 3-D scatter of the current anonymized embedding; the wrapped
# review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={'size': 2, 'opacity': 0.8},
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=5.
# A fixed random_state keeps the embedding repeatable, so differences between
# runs reflect the parameter change rather than random initialisation.
um_anon = UMAP(n_components=3, n_neighbors=5, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)

In [None]:
# Uncoloured 3-D scatter of the current anonymized embedding; the wrapped
# review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={'size': 2, 'opacity': 0.8},
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# 2-D UMAP of the anonymized embeddings with n_neighbors=5.
# A fixed random_state keeps the embedding repeatable, so differences between
# runs reflect the parameter change rather than random initialisation.
um_anon = UMAP(n_components=2, n_neighbors=5, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)


In [None]:

# Cluster the anonymized reviews, store each row's label in 'Cluster', and
# write the labelled frame to CSV.
# NOTE(review): the file name says min_cluster_50 but min_cluster_size is 25 — confirm which is intended.
output_csv_anon = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings_min_cluster_50.csv'

hdb = hdbscan.HDBSCAN(min_samples=5, min_cluster_size=25)

# NOTE(review): this fits on matrix_anon (the unreduced embeddings), whereas
# the other clustering cells fit on matrix_reduced_anon — confirm this is intentional.
hdb_output = hdb.fit(matrix_anon)
df_anon['Cluster'] = hdb_output.labels_

df_anon.to_csv(output_csv_anon, index=False)

In [None]:
# 2-D scatter of the current anonymized embedding, coloured by the 'Cluster'
# label on the Viridis scale; hovering shows only the wrapped review text.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        mode='markers',
        marker={
            'size': 5,
            'color': df_anon['Cluster'].values,
            'colorscale': 'Viridis',
            'opacity': 0.8,
        },
        hovertext=df_anon['Wrapped_Anonymized_Review_Text'],
        hoverinfo='text',
    )
)
fig.show()

In [None]:
# 2-D UMAP of the anonymized embeddings with n_neighbors=3.
# A fixed random_state keeps the embedding repeatable, so differences between
# runs reflect the parameter change rather than random initialisation.
um_anon = UMAP(n_components=2, n_neighbors=3, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)


In [None]:
# Uncoloured 2-D scatter of the current anonymized embedding; hovering shows
# only the wrapped review text.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        mode='markers',
        marker={'size': 5, 'opacity': 0.8},
        hovertext=df_anon['Wrapped_Anonymized_Review_Text'],
        hoverinfo='text',
    )
)
fig.show()

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=5.
# A fixed random_state keeps the embedding repeatable, so the clusters
# computed from it do not change between runs.
um_anon = UMAP(n_components=3, n_neighbors=5, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)


In [None]:

# Cluster the current UMAP embedding of the anonymized reviews and store each
# row's label in the 'Cluster' column.
output_csv_anon = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings_min_cluster_50.csv'

hdb = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=40)
hdb_output = hdb.fit(matrix_reduced_anon)
df_anon['Cluster'] = hdb_output.labels_

# Export is disabled here; output_csv_anon is only assigned.
# NOTE(review): the file name says min_cluster_50 but min_cluster_size is 40 — confirm before enabling.
# df_anon.sort_values(by='Cluster', inplace=True)
# df_anon.to_csv(output_csv_anon, index=False)

In [None]:
# fig = go.Figure(data=go.Scatter(
#     x=matrix_reduced_anon[:, 0], 
#     y=matrix_reduced_anon[:, 1], 
#     mode='markers',
#     marker=dict(
#         size=5,
#         color=df_anon['Cluster'].values,
# #         colorscale='Viridis',
#         opacity=0.8
#     ),
#     hovertext=df_anon['Wrapped_Anonymized_Review_Text'],
#     hoverinfo='text'
# ))

# fig.show()

# 3-D scatter of the current anonymized embedding, coloured by the 'Cluster'
# label; the wrapped review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={
            'size': 2,
            'color': df_anon['Cluster'].values,
            'opacity': 0.8,
        },
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# Write a cluster-sorted copy of the labelled frame to CSV; df_anon itself
# keeps its row order so it stays aligned with matrix_reduced_anon.
# NOTE(review): the file name mentions min_cluster_30 / min_samples_15, which
# matches none of the HDBSCAN settings visible in this notebook — confirm.
output_csv_anon = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings_min_cluster_30_min_samples_15.csv'

df_anon_save = df_anon.sort_values(by='Cluster')
df_anon_save.to_csv(output_csv_anon, index=False)

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=4.
# A fixed random_state keeps the embedding repeatable, so the clusters
# computed from it do not change between runs.
um_anon = UMAP(n_components=3, n_neighbors=4, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)

In [None]:
# Cluster the current UMAP embedding of the anonymized reviews and store each
# row's label in the 'Cluster' column.
output_csv_anon = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings_min_cluster_50.csv'

hdb = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=40)
hdb_output = hdb.fit(matrix_reduced_anon)
df_anon['Cluster'] = hdb_output.labels_

# Export is disabled here; output_csv_anon is only assigned.
# NOTE(review): the file name says min_cluster_50 but min_cluster_size is 40 — confirm before enabling.
# df_anon.sort_values(by='Cluster', inplace=True)
# df_anon.to_csv(output_csv_anon, index=False)

In [None]:
# 3-D scatter of the current anonymized embedding, coloured by the 'Cluster'
# label; the wrapped review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={
            'size': 2,
            'color': df_anon['Cluster'].values,
            'opacity': 0.8,
        },
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=3.
# A fixed random_state keeps the embedding repeatable, so the clusters
# computed from it do not change between runs.
um_anon = UMAP(n_components=3, n_neighbors=3, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)


In [None]:
# Cluster the current UMAP embedding of the anonymized reviews and store each
# row's label in the 'Cluster' column.
# output_csv_anon is assigned but nothing is written in this cell.
# NOTE(review): the file name says min_cluster_50 but min_cluster_size is 40 — confirm.
output_csv_anon = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings_min_cluster_50.csv'

hdb = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=40)
hdb_output = hdb.fit(matrix_reduced_anon)
df_anon['Cluster'] = hdb_output.labels_
# df_anon.sort_values(by='Cluster', inplace=True)


In [None]:
# 3-D scatter of the current anonymized embedding, coloured by the 'Cluster'
# label; the wrapped review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={
            'size': 3,
            'color': df_anon['Cluster'].values,
            'opacity': 0.8,
        },
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()

In [None]:
# 3-D UMAP of the anonymized embeddings with n_neighbors=4.
# A fixed random_state keeps the embedding repeatable, so the clusters
# computed from it do not change between runs.
um_anon = UMAP(n_components=3, n_neighbors=4, random_state=42)
matrix_reduced_anon = um_anon.fit_transform(matrix_anon)


In [None]:
# Cluster the current UMAP embedding of the anonymized reviews with a smaller
# minimum cluster size and store each row's label in the 'Cluster' column.
# output_csv_anon is assigned but nothing is written in this cell.
# NOTE(review): the file name says min_cluster_50 but min_cluster_size is 10 — confirm.
output_csv_anon = '/Users/ianspence/Desktop/review-analysis/output/embedding_analysis/csv/anonymized_reviews_qc1_4500_embeddings_min_cluster_50.csv'

hdb = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=10)
hdb_output = hdb.fit(matrix_reduced_anon)
df_anon['Cluster'] = hdb_output.labels_
# df_anon.sort_values(by='Cluster', inplace=True)


In [None]:
# 3-D scatter of the current anonymized embedding, coloured by the 'Cluster'
# label; the wrapped review text is attached as each point's text.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=matrix_reduced_anon[:, 0],
        y=matrix_reduced_anon[:, 1],
        z=matrix_reduced_anon[:, 2],
        mode='markers',
        marker={
            'size': 2,
            'color': df_anon['Cluster'].values,
            'opacity': 0.8,
        },
        text=df_anon['Wrapped_Anonymized_Review_Text'],
    )
)
fig.show()