In [2]:
import nltk
import gensim
import gensim.corpora as corpora
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from sklearn.datasets import fetch_20newsgroups

In [3]:
dataset = pd.read_csv('Data/Preprocessed_TJ_new.csv')

In [4]:
# Step 2.1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2.2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=5, n_components=2, metric='cosine')

# Step 2.3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 2.4 - Tokenize topics
vectorizer_model = CountVectorizer()

# Step 2.5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

In [5]:
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  min_topic_size=10                        
)

In [6]:
topics, probabilities = topic_model.fit_transform(dataset['full_text'])

In [7]:
topic_model.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [8]:
topic_model.get_topic_info().head(7).set_index('Topic')[['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,142,-1_pintu_jam_transjakarta_halte,"[pintu, jam, transjakarta, halte, rute, cawang..."
0,82,0_bus_kembalikan_stop_menit,"[bus, kembalikan, stop, menit, busway, nya, nu..."
1,39,1_gimana_mall_rutenya_tj,"[gimana, mall, rutenya, tj, central, halte, lo..."
2,38,2_kah_tj_koridor_pake,"[kah, tj, koridor, pake, tolong, jak, jalur, l..."
3,28,3_halte_gbk_arah_senayan,"[halte, gbk, arah, senayan, kuningan, numpuk, ..."
4,18,4_tap_out_in_kepotong,"[tap, out, in, kepotong, saldo, potong, pas, k..."
5,16,5_halo_rute_layanan_slipi,"[halo, rute, layanan, slipi, padam, makasii, t..."


In [14]:
topic_model.get_topic_info()['Representation'][:10]

0    [pintu, jam, transjakarta, halte, rute, cawang...
1    [bus, kembalikan, stop, menit, busway, nya, nu...
2    [gimana, mall, rutenya, tj, central, halte, lo...
3    [kah, tj, koridor, pake, tolong, jak, jalur, l...
4    [halte, gbk, arah, senayan, kuningan, numpuk, ...
5    [tap, out, in, kepotong, saldo, potong, pas, k...
6    [halo, rute, layanan, slipi, padam, makasii, t...
7    [jam, sampe, kak, rute, jurusan, besok, brp, t...
8    [tracking, maps, google, live, fitur, gmaps, a...
9    [jam, brp, operasional, weekday, weekend, halo...
Name: Representation, dtype: object

In [9]:
for i, (topic, prob) in enumerate(zip(topics, probabilities)):
    print(f"Document {i} is assigned to Topic {topic} with probability {prob}")

Document 0 is assigned to Topic 13 with probability 1.0
Document 1 is assigned to Topic 4 with probability 0.9417519453758805
Document 2 is assigned to Topic -1 with probability 0.0
Document 3 is assigned to Topic 21 with probability 0.7592889608718711
Document 4 is assigned to Topic 2 with probability 1.0
Document 5 is assigned to Topic -1 with probability 0.0
Document 6 is assigned to Topic 0 with probability 1.0
Document 7 is assigned to Topic -1 with probability 0.0
Document 8 is assigned to Topic 3 with probability 0.9404130448046087
Document 9 is assigned to Topic -1 with probability 0.0
Document 10 is assigned to Topic 8 with probability 0.9234339650445295
Document 11 is assigned to Topic -1 with probability 0.0
Document 12 is assigned to Topic 26 with probability 1.0
Document 13 is assigned to Topic 13 with probability 0.6871604961537413
Document 14 is assigned to Topic 24 with probability 0.8630214883223194
Document 15 is assigned to Topic 0 with probability 1.0
Document 16 is

In [18]:
docs = [(topic, i) for i, topic in enumerate(topics) if topic>0 and topic<11]
print(docs)

[(4, 1), (2, 4), (3, 8), (8, 10), (3, 16), (7, 22), (1, 26), (7, 27), (3, 33), (5, 37), (7, 40), (7, 41), (2, 42), (7, 43), (7, 46), (7, 47), (7, 48), (7, 49), (1, 51), (7, 52), (10, 53), (5, 54), (7, 58), (1, 59), (4, 63), (1, 64), (1, 65), (6, 68), (9, 69), (10, 70), (10, 73), (1, 74), (4, 76), (2, 80), (4, 83), (2, 85), (3, 86), (4, 87), (5, 91), (2, 97), (3, 99), (2, 103), (3, 106), (2, 111), (4, 116), (2, 117), (9, 118), (3, 120), (1, 125), (1, 128), (2, 131), (3, 132), (1, 133), (9, 134), (6, 143), (4, 144), (10, 145), (4, 146), (10, 150), (9, 152), (6, 153), (9, 155), (2, 159), (2, 162), (1, 170), (1, 181), (4, 182), (4, 188), (2, 189), (5, 191), (6, 194), (2, 196), (1, 198), (2, 199), (5, 201), (8, 206), (8, 208), (6, 219), (6, 221), (2, 222), (6, 224), (6, 228), (8, 229), (5, 245), (1, 246), (1, 247), (2, 251), (1, 258), (3, 259), (3, 260), (2, 262), (6, 265), (8, 266), (2, 267), (8, 270), (4, 283), (1, 284), (1, 285), (3, 286), (1, 289), (1, 290), (2, 291), (1, 297), (2, 300)

In [34]:
from collections import defaultdict

docs = defaultdict(list)

for i, topic in enumerate(topics):
    if topic>0 and topic<11:
        docs[topic].append(i)

print(docs[1])

[26, 51, 59, 64, 65, 74, 125, 128, 133, 170, 181, 198, 246, 247, 258, 284, 285, 289, 290, 297, 330, 334, 349, 351, 376, 388, 408, 462, 465, 483, 484, 495, 518, 522, 535, 541, 557, 559, 560]


In [35]:
tj_sentiment = pd.read_csv('Output/TJ_withSentiment.csv')

In [36]:
sent_vals = []

for sent in tj_sentiment['sentiment']:
    if sent == 'Positive':
        sent_vals.append(1)
    elif sent == 'Negative':
        sent_vals.append(-1)
    else:
        sent_vals.append(0)

In [37]:
tj_sentiment['sentiment_values'] = sent_vals

In [47]:
total_sent_val = []

for topic in docs:
    sum = 0
    for index in docs[topic]:
        sum += tj_sentiment['sentiment_values'][index]
    total_sent_val.append(sum)

In [48]:
topics_tj = pd.DataFrame()

topics_tj['topic_id'] = [topic for topic in docs]
topics_tj['sentiment_value'] = total_sent_val

In [62]:
topic_names = [', '.join([word for word, prob in topic_model.get_topic(i)[:5]]) for i in range (1,11)]
topics_tj['topic_name'] = topic_names
print(topic_names)

['gimana, mall, rutenya, tj, central', 'kah, tj, koridor, pake, tolong', 'halte, gbk, arah, senayan, kuningan', 'tap, out, in, kepotong, saldo', 'halo, rute, layanan, slipi, padam', 'jam, sampe, kak, rute, jurusan', 'tracking, maps, google, live, fitur', 'jam, brp, operasional, weekday, weekend', 'kemayoran, halte, jiexpo, gmna, rutenya', 'penumpang, minggu, supir, kendaraan, ngetem']


In [66]:
topics_tj.head(10)

Unnamed: 0,topic_id,sentiment_value,topic_name,sentiment
0,4,-8,"gimana, mall, rutenya, tj, central",Negative
1,2,0,"kah, tj, koridor, pake, tolong",Neutral
2,3,-6,"halte, gbk, arah, senayan, kuningan",Negative
3,8,-1,"tap, out, in, kepotong, saldo",Negative
4,7,0,"halo, rute, layanan, slipi, padam",Neutral
5,1,-8,"jam, sampe, kak, rute, jurusan",Negative
6,5,-1,"tracking, maps, google, live, fitur",Negative
7,10,-5,"jam, brp, operasional, weekday, weekend",Negative
8,6,0,"kemayoran, halte, jiexpo, gmna, rutenya",Neutral
9,9,0,"penumpang, minggu, supir, kendaraan, ngetem",Neutral


In [64]:
sent_names = []
for sent in topics_tj['sentiment_value']:
    if sent<0:
        sent_names.append('Negative')
    elif sent==0:
        sent_names.append('Neutral')
    else:
        sent_names.append('Positive')

In [65]:
topics_tj['sentiment'] = sent_names

In [67]:
topics_tj.to_csv('Output/TJ_topics.csv')

In [10]:
documents = pd.DataFrame({"Document": dataset['full_text'],
                          "ID": range(len(dataset['full_text'])),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
# words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
print(coherence)

0.7132123597326215


In [15]:

def tune_bertopic(embedding_model_name, n_neighbors, min_cluster_size, n_components):
    embedding_model = SentenceTransformer(embedding_model_name)
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, metric='cosine')
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom')
    topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model)

    topics, probs = topic_model.fit_transform(dataset['full_text'])
    topic_words = topic_model.get_topics()
    topics = [[word for word, _ in topic_words[topic_id]] for topic_id in topic_words]
    
    # Create the coherence model
    coherence_model = CoherenceModel(
        topics=topics, 
        texts=tokens, 
        dictionary=dictionary, 
        coherence='c_v'
    )
    
    # Get the coherence score
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score: {coherence_score}")
    return topic_model, coherence_score

# Define hyperparameters to tune
embedding_models = ['all-MiniLM-L6-v2', 'paraphrase-MiniLM-L6-v2']
n_neighbors_values = [10, 15, 30]
min_cluster_size_values = [5, 10, 20]
n_components_values = [2, 5, 8]

# Perform hyperparameter tuning
best_model = None
best_score = -1
for embedding_model in embedding_models:
    for n_neighbors in n_neighbors_values:
        for min_cluster_size in min_cluster_size_values:
            for n_components in n_components_values:
                print(f"Evaluating: embedding_model={embedding_model}, n_neighbors={n_neighbors}, min_cluster_size={min_cluster_size}, n_components={n_components}")
                model, score = tune_bertopic(embedding_model, n_neighbors, min_cluster_size, n_components)
                if score > best_score:
                    best_score = score
                    best_model = model

# Display the best score and model
print(f"Best Coherence Score: {best_score}")

Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=5, n_components=2
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=5, n_components=5
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=5, n_components=8
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=10, n_components=2
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=10, n_components=5
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=10, n_components=8
Coherence Score: 0.5584738112190346
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min_cluster_size=20, n_components=2
Coherence Score: 0.3414742928287255
Evaluating: embedding_model=all-MiniLM-L6-v2, n_neighbors=10, min