In [1]:
import nltk
import gensim
import gensim.corpora as corpora
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from sklearn.datasets import fetch_20newsgroups

In [2]:
dataset = pd.read_csv('Data/Preprocessed_TS_new.csv')

In [3]:
# Step 2.1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2.2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=5, n_components=2, metric='cosine')

# Step 2.3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 2.4 - Tokenize topics
vectorizer_model = CountVectorizer()

# Step 2.5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

In [4]:
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  min_topic_size=10                        
)

In [5]:
topics, probabilities = topic_model.fit_transform(dataset['full_text'])

In [6]:
topic_model.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [10]:
topic_model.get_topic_info().head(7).set_index('Topic')[['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,127,-1_bis_jalan_brt_koridor,"[bis, jalan, brt, koridor, gt, halo, bus, simp..."
0,32,0_transit_dimana_transitnya_kagok,"[transit, dimana, transitnya, kagok, naiknya, ..."
1,22,1_feeder_ugalan_banyumanik_hendrarprihadi,"[feeder, ugalan, banyumanik, hendrarprihadi, k..."
2,16,2_acnya_sore_tawang_kak,"[acnya, sore, tawang, kak, jam, stasiun, ungar..."
3,14,3_halte_balaikota_info_imam,"[halte, balaikota, info, imam, tanah, sma, put..."
4,14,4_kota_semarang_gading_memfasilitasi,"[kota, semarang, gading, memfasilitasi, teman,..."
5,14,5_bus_iv_cangkiran_trans,"[bus, iv, cangkiran, trans, layanan, out, day,..."


In [11]:
topic_model.get_topic_info()['Representation'][:10]

0    [bis, jalan, brt, koridor, gt, halo, bus, simp...
1    [transit, dimana, transitnya, kagok, naiknya, ...
2    [feeder, ugalan, banyumanik, hendrarprihadi, k...
3    [acnya, sore, tawang, kak, jam, stasiun, ungar...
4    [halte, balaikota, info, imam, tanah, sma, put...
5    [kota, semarang, gading, memfasilitasi, teman,...
6    [bus, iv, cangkiran, trans, layanan, out, day,...
7    [khusus, ngerasain, ts, berlaku, karna, cm, no...
8    [alun, max, dimana, jurusan, terdekat, turun, ...
9    [tanah, putih, halte, gimana, polines, undip, ...
Name: Representation, dtype: object

In [12]:
for i, (topic, prob) in enumerate(zip(topics, probabilities)):
    print(f"Document {i} is assigned to Topic {topic} with probability {prob}")

Document 0 is assigned to Topic 38 with probability 1.0
Document 1 is assigned to Topic 1 with probability 0.8379637575041963
Document 2 is assigned to Topic 11 with probability 0.9650978905273307
Document 3 is assigned to Topic 22 with probability 0.8559407953382454
Document 4 is assigned to Topic 16 with probability 0.7894281219261419
Document 5 is assigned to Topic -1 with probability 0.0
Document 6 is assigned to Topic 24 with probability 1.0
Document 7 is assigned to Topic 8 with probability 1.0
Document 8 is assigned to Topic 17 with probability 0.7477426499580687
Document 9 is assigned to Topic 5 with probability 0.6961547281840768
Document 10 is assigned to Topic 2 with probability 0.9405509491565707
Document 11 is assigned to Topic -1 with probability 0.0
Document 12 is assigned to Topic 2 with probability 0.9121632180160653
Document 13 is assigned to Topic 31 with probability 1.0
Document 14 is assigned to Topic 4 with probability 0.8978263038193373
Document 15 is assigned to

In [14]:
docs = [(topic, i) for i, topic in enumerate(topics) if topic>0 and topic<11]
print(docs)

[(1, 1), (8, 7), (5, 9), (2, 10), (2, 12), (4, 14), (6, 15), (1, 24), (4, 34), (3, 35), (1, 38), (1, 43), (1, 47), (5, 48), (1, 49), (9, 50), (2, 55), (2, 56), (1, 65), (9, 73), (2, 86), (3, 88), (6, 91), (1, 100), (4, 108), (9, 111), (3, 112), (2, 115), (1, 117), (9, 120), (5, 121), (7, 125), (6, 132), (9, 140), (4, 150), (3, 153), (7, 157), (9, 159), (7, 164), (7, 166), (3, 172), (3, 185), (3, 186), (6, 187), (8, 195), (8, 197), (7, 200), (7, 202), (2, 211), (8, 212), (8, 213), (4, 216), (6, 217), (2, 218), (3, 221), (4, 225), (2, 235), (2, 237), (5, 241), (5, 242), (6, 245), (5, 248), (3, 255), (1, 256), (5, 259), (1, 275), (10, 276), (2, 284), (9, 286), (1, 290), (3, 294), (2, 295), (7, 296), (6, 297), (1, 298), (10, 302), (1, 306), (8, 308), (8, 309), (5, 315), (3, 318), (8, 323), (8, 330), (10, 335), (3, 336), (6, 345), (7, 348), (5, 351), (10, 355), (3, 358), (3, 361), (7, 370), (2, 375), (8, 377), (6, 380), (4, 386), (9, 387), (2, 389), (8, 392), (9, 396), (4, 397), (1, 399), (

In [15]:
from collections import defaultdict

docs = defaultdict(list)

for i, topic in enumerate(topics):
    if topic>0 and topic<11:
        docs[topic].append(i)

print(docs[1])

[1, 24, 38, 43, 47, 49, 65, 100, 117, 256, 275, 290, 298, 306, 399, 445, 464, 468, 480, 481, 503, 504]


In [16]:
ts_sentiment = pd.read_csv('Output/TS_withSentiment.csv')

In [17]:
sent_vals = []

for sent in ts_sentiment['sentiment']:
    if sent == 'Positive':
        sent_vals.append(1)
    elif sent == 'Negative':
        sent_vals.append(-1)
    else:
        sent_vals.append(0)

In [18]:
ts_sentiment['sentiment_values'] = sent_vals

In [19]:
total_sent_val = []

for topic in docs:
    sum = 0
    for index in docs[topic]:
        sum += ts_sentiment['sentiment_values'][index]
    total_sent_val.append(sum)

In [20]:
topics_ts = pd.DataFrame()

topics_ts['topic_id'] = [topic for topic in docs]
topics_ts['sentiment_value'] = total_sent_val

In [21]:
topic_names = [', '.join([word for word, prob in topic_model.get_topic(i)[:5]]) for i in range (1,11)]
topics_ts['topic_name'] = topic_names
print(topic_names)

['feeder, ugalan, banyumanik, hendrarprihadi, kuota', 'acnya, sore, tawang, kak, jam', 'halte, balaikota, info, imam, tanah', 'kota, semarang, gading, memfasilitasi, teman', 'bus, iv, cangkiran, trans, layanan', 'khusus, ngerasain, ts, berlaku, karna', 'alun, max, dimana, jurusan, terdekat', 'tanah, putih, halte, gimana, polines', 'armada, godong, banget, ditambah, armadanya', 'cash, tap, tarif, tunai, cashless']


In [25]:
topics_ts.head(10)

Unnamed: 0,topic_id,sentiment_value,topic_name,sentiment
0,1,-1,"feeder, ugalan, banyumanik, hendrarprihadi, kuota",Negative
1,8,-2,"acnya, sore, tawang, kak, jam",Negative
2,5,-3,"halte, balaikota, info, imam, tanah",Negative
3,2,0,"kota, semarang, gading, memfasilitasi, teman",Neutral
4,4,7,"bus, iv, cangkiran, trans, layanan",Positive
5,6,1,"khusus, ngerasain, ts, berlaku, karna",Positive
6,3,-1,"alun, max, dimana, jurusan, terdekat",Negative
7,9,-4,"tanah, putih, halte, gimana, polines",Negative
8,7,-1,"armada, godong, banget, ditambah, armadanya",Negative
9,10,-1,"cash, tap, tarif, tunai, cashless",Negative


In [23]:
sent_names = []
for sent in topics_ts['sentiment_value']:
    if sent<0:
        sent_names.append('Negative')
    elif sent==0:
        sent_names.append('Neutral')
    else:
        sent_names.append('Positive')

In [24]:
topics_ts['sentiment'] = sent_names

In [67]:
topics_ts.to_csv('Output/TS_topics.csv')

In [13]:
documents = pd.DataFrame({"Document": dataset['full_text'],
                          "ID": range(len(dataset['full_text'])),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
# words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
print(coherence)

0.7031292720784231


In [15]:

def tune_bertopic(embedding_model_name, n_neighbors, min_cluster_size, n_components):
    embedding_model = SentenceTransformer(embedding_model_name)
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, metric='cosine')
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom')
    topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model)

    topics, probs = topic_model.fit_transform(dataset['full_text'])
    topic_words = topic_model.get_topics()
    topics = [[word for word, _ in topic_words[topic_id]] for topic_id in topic_words]
    
    # Create the coherence model
    coherence_model = CoherenceModel(
        topics=topics, 
        texts=tokens, 
        dictionary=dictionary, 
        coherence='c_v'
    )
    
    # Get the coherence score
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score: {coherence_score}")
    return topic_model, coherence_score

# Define hyperparameters to tune
embedding_models = ['all-MiniLM-L6-v2', 'paraphrase-MiniLM-L6-v2']
n_neighbors_values = [10, 15, 30]
min_cluster_size_values = [5, 10, 20]
n_components_values = [2, 5, 8]

# Perform hyperparameter tuning
best_model = None
best_score = -1
for embedding_model in embedding_models:
    for n_neighbors in n_neighbors_values:
        for min_cluster_size in min_cluster_size_values:
            for n_components in n_components_values:
                print(f"Evaluating: embedding_model={embedding_model}, n_neighbors={n_neighbors}, min_cluster_size={min_cluster_size}, n_components={n_components}")
                model, score = tune_bertopic(embedding_model, n_neighbors, min_cluster_size, n_components)
                if score > best_score:
                    best_score = score
                    best_model = model

# Display the best score and model
print(f"Best Coherence Score: {best_score}")

Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=5, n_components=2
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=5, n_components=5
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=5, n_components=8
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=10, n_components=2
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=10, n_components=5
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=10, n_components=8
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=20, n_components=2
Coherence Score: 0.3414742928287255
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min

In [4]:
import pandas as pd

In [38]:
# docs = [(1, 1), (8, 7), (5, 9), (2, 10), (2, 12), (4, 14), (6, 15), (1, 24), (4, 34), (3, 35), (1, 38), (1, 43), (1, 47), (5, 48), (1, 49), (9, 50), (2, 55), (2, 56), (1, 65), (9, 73), (2, 86), (3, 88), (6, 91), (1, 100), (4, 108), (9, 111), (3, 112), (2, 115), (1, 117), (9, 120), (5, 121), (7, 125), (6, 132), (9, 140), (4, 150), (3, 153), (7, 157), (9, 159), (7, 164), (7, 166), (3, 172), (3, 185), (3, 186), (6, 187), (8, 195), (8, 197), (7, 200), (7, 202), (2, 211), (8, 212), (8, 213), (4, 216), (6, 217), (2, 218), (3, 221), (4, 225), (2, 235), (2, 237), (5, 241), (5, 242), (6, 245), (5, 248), (3, 255), (1, 256), (5, 259), (1, 275), (10, 276), (2, 284), (9, 286), (1, 290), (3, 294), (2, 295), (7, 296), (6, 297), (1, 298), (10, 302), (1, 306), (8, 308), (8, 309), (5, 315), (3, 318), (8, 323), (8, 330), (10, 335), (3, 336), (6, 345), (7, 348), (5, 351), (10, 355), (3, 358), (3, 361), (7, 370), (2, 375), (8, 377), (6, 380), (4, 386), (9, 387), (2, 389), (8, 392), (9, 396), (4, 397), (1, 399), (6, 404), (2, 405), (10, 406), (7, 413), (2, 414), (5, 417), (4, 424), (4, 425), (6, 430), (7, 433), (9, 434), (1, 445), (6, 447), (10, 448), (10, 450), (4, 457), (8, 458), (5, 461), (1, 464), (1, 468), (5, 470), (5, 471), (6, 475), (7, 477), (1, 480), (1, 481), (10, 482), (10, 486), (10, 487), (4, 490), (4, 491), (10, 494), (4, 497), (9, 499), (5, 501), (1, 503), (1, 504)]
docs = [(4, 1), (2, 4), (3, 8), (8, 10), (3, 16), (7, 22), (1, 26), (7, 27), (3, 33), (5, 37), (7, 40), (7, 41), (2, 42), (7, 43), (7, 46), (7, 47), (7, 48), (7, 49), (1, 51), (7, 52), (10, 53), (5, 54), (7, 58), (1, 59), (4, 63), (1, 64), (1, 65), (6, 68), (9, 69), (10, 70), (10, 73), (1, 74), (4, 76), (2, 80), (4, 83), (2, 85), (3, 86), (4, 87), (5, 91), (2, 97), (3, 99), (2, 103), (3, 106), (2, 111), (4, 116), (2, 117), (9, 118), (3, 120), (1, 125), (1, 128), (2, 131), (3, 132), (1, 133), (9, 134), (6, 143), (4, 144), (10, 145), (4, 146), (10, 150), (9, 152), (6, 153), (9, 155), (2, 159), (2, 162), (1, 170), (1, 181), (4, 182), (4, 188), (2, 189), (5, 191), (6, 194), (2, 196), (1, 198), (2, 199), (5, 201), (8, 206), (8, 208), (6, 219), (6, 221), (2, 222), (6, 224), (6, 228), (8, 229), (5, 245), (1, 246), (1, 247), (2, 251), (1, 258), (3, 259), (3, 260), (2, 262), (6, 265), (8, 266), (2, 267), (8, 270), (4, 283), (1, 284), (1, 285), (3, 286), (1, 289), (1, 290), (2, 291), (1, 297), (2, 300), (2, 302), (2, 306), (2, 308), (2, 315), (3, 318), (4, 322), (1, 330), (3, 333), (1, 334), (3, 336), (5, 338), (2, 343), (3, 345), (5, 346), (1, 349), (1, 351), (7, 352), (4, 354), (9, 356), (3, 361), (5, 362), (3, 375), (1, 376), (3, 387), (1, 388), (9, 390), (10, 394), (2, 399), (8, 400), (2, 402), (2, 406), (4, 407), (1, 408), (5, 413), (5, 414), (9, 415), (3, 417), (10, 419), (8, 420), (9, 421), (10, 422), (10, 427), (10, 433), (9, 437), (7, 442), (3, 452), (3, 453), (8, 455), (2, 458), (3, 461), (1, 462), (4, 463), (1, 465), (3, 471), (5, 475), (8, 477), (1, 483), (1, 484), (3, 487), (6, 488), (5, 491), (9, 492), (6, 493), (1, 495), (2, 500), (5, 502), (2, 503), (4, 505), (2, 507), (4, 508), (3, 510), (2, 511), (5, 512), (2, 513), (9, 516), (9, 517), (1, 518), (2, 519), (1, 522), (2, 523), (3, 527), (10, 530), (8, 531), (1, 535), (8, 536), (3, 537), (4, 538), (6, 540), (1, 541), (6, 542), (2, 548), (10, 551), (5, 552), (7, 555), (3, 556), (1, 557), (2, 558), (1, 559), (1, 560), (8, 566), (6, 573)]

In [39]:
topics_ts = pd.read_csv('Output/TJ_topics.csv')
ts_sentiment = pd.read_csv('Output/TJ_withSentiment.csv')

In [40]:
from collections import defaultdict

docs_list = defaultdict(list)

for topic, doc in docs:
    if topic>0 and topic<11:
        docs_list[topic].append(doc)

print(docs_list)

defaultdict(<class 'list'>, {4: [1, 63, 76, 83, 87, 116, 144, 146, 182, 188, 283, 322, 354, 407, 463, 505, 508, 538], 2: [4, 42, 80, 85, 97, 103, 111, 117, 131, 159, 162, 189, 196, 199, 222, 251, 262, 267, 291, 300, 302, 306, 308, 315, 343, 399, 402, 406, 458, 500, 503, 507, 511, 513, 519, 523, 548, 558], 3: [8, 16, 33, 86, 99, 106, 120, 132, 259, 260, 286, 318, 333, 336, 345, 361, 375, 387, 417, 452, 453, 461, 471, 487, 510, 527, 537, 556], 8: [10, 206, 208, 229, 266, 270, 400, 420, 455, 477, 531, 536, 566], 7: [22, 27, 40, 41, 43, 46, 47, 48, 49, 52, 58, 352, 442, 555], 1: [26, 51, 59, 64, 65, 74, 125, 128, 133, 170, 181, 198, 246, 247, 258, 284, 285, 289, 290, 297, 330, 334, 349, 351, 376, 388, 408, 462, 465, 483, 484, 495, 518, 522, 535, 541, 557, 559, 560], 5: [37, 54, 91, 191, 201, 245, 338, 346, 362, 413, 414, 475, 491, 502, 512, 552], 10: [53, 70, 73, 145, 150, 394, 419, 422, 427, 433, 530, 551], 6: [68, 143, 153, 194, 219, 221, 224, 228, 265, 488, 493, 540, 542, 573], 9: [69, 

In [41]:
neg_counts = []
for i in range(1,11):
    neg_count = 0
    for doc in docs_list[i]:
        if ts_sentiment['sentiment'][doc] == 'Negative':
            neg_count += 1
    neg_counts.append(neg_count)


In [42]:
pos_counts = []
for i in range(1,11):
    pos_count = 0
    for doc in docs_list[i]:
        if ts_sentiment['sentiment'][doc] == 'Positive':
            pos_count += 1
    pos_counts.append(pos_count)

In [43]:
neu_counts = []
for i in range(1,11):
    neu_count = 0
    for doc in docs_list[i]:
        if ts_sentiment['sentiment'][doc] == 'Neutral':
            neu_count += 1
    neu_counts.append(neu_count)

In [44]:
topics_ts['positive_count'] = pos_counts
topics_ts['neutral_count'] = neu_counts
topics_ts['negative_count'] = neg_counts

In [45]:
topics_ts.head()

Unnamed: 0.1,Unnamed: 0,topic_id,sentiment_value,topic_name,sentiment,positive_count,neutral_count,negative_count
0,0,4,-8,"gimana, mall, rutenya, tj, central",Negative,1,29,9
1,1,2,0,"kah, tj, koridor, pake, tolong",Neutral,5,28,5
2,2,3,-6,"halte, gbk, arah, senayan, kuningan",Negative,0,22,6
3,3,8,-1,"tap, out, in, kepotong, saldo",Negative,1,8,9
4,4,7,0,"halo, rute, layanan, slipi, padam",Neutral,0,15,1


In [37]:
topics_ts.to_csv('Output/TJ_topics_rev.csv')