In [1]:
import pandas as pd

In [2]:
# Load the per-listing cluster table and rename its "id" column to
# "listing_id", the key name used when joining against the review results.
cluster_df = (
    pd.read_csv('data/cluster_temp.csv')
    .rename(columns={"id": "listing_id"})
)

# Quick sanity checks: table dimensions, available columns, and the number of
# distinct listings (equal to the row count when every listing appears once).
print(cluster_df.shape)
print(cluster_df.columns)
print(cluster_df["listing_id"].nunique())

(13225, 5)
Index(['listing_id', 'host_id', 'name', 'neighbourhood_cleansed',
       'cluster_kmeans'],
      dtype='object')
13225


In [3]:
# Reload the previously saved results table. It was written with:
#   results_df.to_csv(f"results_df_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", index=False)
# so the timestamp suffix in the file name identifies the run being loaded.
results_df = pd.read_csv(filepath_or_buffer='results_df_20251118_125309.csv')


In [4]:
# Keep only the result rows whose listing appears in the cluster table, then
# attach that listing's cluster columns to every remaining row.
# NOTE(review): listing_id is displayed as a float (e.g. 7.374532e+17) in the
# preview; values that large are beyond float64's exact-integer range (2**53),
# so confirm the key dtypes on both frames before trusting the matches.
merged_df = pd.merge(
    results_df.loc[
        lambda reviews: reviews["listing_id"].isin(cluster_df["listing_id"])
    ],
    cluster_df,
    how="left",
    on="listing_id",
)
merged_df.head()

Unnamed: 0,id,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,listing_id,date,reviewer_id,reviewer_name,Review,host_id,name,neighbourhood_cleansed,cluster_kmeans
0,1,0.0,0.424,0.576,0.9118,0.004214,0.090394,0.905392,7.374532e+17,2023-07-13,168460048.0,Shianne,Elizabeth was super responsive and was kind en...,64023897,Lejlighed centralt i Valby,Valby,0
1,2,0.0,0.748,0.252,0.4019,0.004356,0.229301,0.766343,7.374532e+17,2023-07-13,168460048.0,Shianne,Place matched the description and photos and w...,64023897,Lejlighed centralt i Valby,Valby,0
2,3,0.0,1.0,0.0,0.0,0.008691,0.473369,0.51794,7.374532e+17,2023-07-13,168460048.0,Shianne,"Everything is within walking distance, especia...",64023897,Lejlighed centralt i Valby,Valby,0
3,4,0.0,0.455,0.545,0.7184,0.001651,0.021786,0.976563,7.374532e+17,2023-07-13,168460048.0,Shianne,"Enjoyed my stay, would definitely book again",64023897,Lejlighed centralt i Valby,Valby,0
4,5,0.0,0.541,0.459,0.7783,0.001498,0.010797,0.987705,2907347.0,2024-08-18,11387465.0,Yannis,Thanks for having us - we loved staying in the...,14853464,Nice getaway at Vesterbro,Vesterbro-Kongens Enghave,3


In [6]:
import time
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Smoke-test the embedding + BERTopic pipeline on a small sample first.
# Missing reviews are dropped before the string cast: astype(str) alone would
# turn NaN into the literal document "nan", which would then be embedded and
# clustered as if it were real review text.
N_SAMPLE_DOCS = 50
docs = merged_df["Review"].head(N_SAMPLE_DOCS).dropna().astype(str).tolist()

# CPU first (stable)
device = torch.device("cpu")
print("Device:", device)

# Load the sentence-embedding model. perf_counter is monotonic, so the elapsed
# times below cannot be skewed by wall-clock adjustments.
t0 = time.perf_counter()
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
print("Loaded model in", time.perf_counter() - t0)

# Pre-compute the document embeddings once so BERTopic can reuse them.
t1 = time.perf_counter()
embeddings = embedding_model.encode(
    docs,
    batch_size=16,
    show_progress_bar=True
)
print("Embeddings in", time.perf_counter() - t1)

# UMAP settings for the dimensionality-reduction step; random_state and
# transform_seed are fixed so repeated runs give the same projection.
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'UMAP' object has no attribute '_sparse_data'" right after
# UMAP printed its configuration. Nothing in this cell sets that attribute, so
# the cause is presumably in the installed umap-learn / bertopic / scikit-learn
# combination -- TODO confirm package versions before relying on this config.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    metric="cosine",
    init="random",
    min_dist=0.0,
    low_memory=True,
    random_state=42,
    transform_seed=42,
    verbose=True
)

# embedding_model=None: the embeddings computed above are passed straight to
# fit_transform, so BERTopic is not asked to embed the documents itself.
topic_model = BERTopic(
    embedding_model=None,
    umap_model=umap_model,
    verbose=True
)

t2 = time.perf_counter()
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
print("BERTopic took", time.perf_counter() - t2, "seconds")

# Last expression of the cell: shown with the notebook's rich table display.
topic_model.get_topic_info().head()


Device: cpu
Loaded model in 2.059757947921753


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-11-19 18:07:08,997 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Embeddings in 0.11794114112854004
UMAP(init='random', metric='cosine', min_dist=0.0, n_components=5, random_state=42, verbose=True)


AttributeError: 'UMAP' object has no attribute '_sparse_data'

In [None]:
import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Reshape to long form: one row per (review, RoBERTa score column), keeping
# the review's k-means cluster label alongside the score.
df_long = pd.melt(
    merged_df,
    id_vars="cluster_kmeans",
    value_vars=["roberta_neg", "roberta_neu", "roberta_pos"],
    var_name="sentiment",
    value_name="score",
)

# Run a separate Tukey HSD comparison of clusters for each score column.
for sentiment_col in ("roberta_neg", "roberta_neu", "roberta_pos"):
    print("\nTukey for:", sentiment_col)
    # Select this sentiment's rows once and reuse them for both arguments.
    sentiment_rows = df_long[df_long["sentiment"] == sentiment_col]
    tukey = pairwise_tukeyhsd(
        endog=sentiment_rows["score"],
        groups=sentiment_rows["cluster_kmeans"],
        alpha=0.05,
    )
    print(tukey)
