In [22]:
import pandas as pd
from bertopic import BERTopic

In [23]:
tj = pd.read_csv('Output/TJ_withSentiment.csv')

In [154]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('indonesian')).union({
    'yg', 'dgn', 'kalo', 'bgt', 'aja', 'dr', 'sbg', 'utk', 'tbh', 'tsb', 'tp', 'ya', 'kl', 'klw', 'oh', 'duh', 'waduh', 'min', 'b', 'si', 'nya', 'sih', 'loh', 'ye', 'banget', 'bang'
})

abbreviation_dict = {
    'gk': 'tidak',
    'ga': 'tidak',
    'gak': 'tidak',
    'aja': 'saja',
    'dr': 'dari',
    'tj': 'transjakarta',
    'tije': 'transjakarta',
    'jl': 'jalan',
    'jln': 'jalan'
}

for i, tweet in enumerate(tj['full_text']):
    #to lowercase
    tweet = tweet.lower()
    #remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    #remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    #remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    #remove punctuation
    tweet = re.sub(r'\W', ' ', tweet)
    #ganti singkatan
    words = tweet.split()
    normalized_words = [abbreviation_dict[word] if word in abbreviation_dict else word for word in words]
    tweet = ' '.join(normalized_words)
    #filter stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])

    tj.loc[i, 'full_text'] = tweet

In [155]:
positive_tj = tj[tj['sentiment']=='Positive']
negative_tj = tj[tj['sentiment']=='Negative']

In [127]:
positive_tj.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,created_at,favorite_count,full_text,id_str,lang,location,quote_count,reply_count,retweet_count,username,keyword,created_day,sentiment
1,1,1,1,Fri Jun 21 11:43:35 +0000 2024,0,hallo sebernya penarikan dana tap in tap out s...,1804117959805395101,in,"Jakarta, Indonesia",0,1,0,haloakubecca,Transjakarta,Fri,Positive
38,38,38,38,Fri Jun 21 02:32:47 +0000 2024,720,bangga semalem bus p jarang mudah ditrack live...,1803979346971140493,in,"Jakarta, ID | Breda, NL",59,18,275,adriansyahyasin,Transjakarta,Fri,Positive
180,180,180,180,Mon Jun 17 16:08:53 +0000 2024,2,semenjak c berubah cawang sentral nunggu bus c...,1802735173102080112,in,"Bogor, Indonesia",0,1,0,dnprxx,Transjakarta,Mon,Positive
189,189,189,189,Mon Jun 17 11:39:28 +0000 2024,0,unj transjakarta enaknya turun stasiun transja...,1802667373389848997,in,,0,2,0,lv_ellly,Transjakarta,Mon,Positive
192,192,192,192,Mon Jun 17 10:15:16 +0000 2024,0,saran rute h rute extend terminal pasar minggu...,1802646182688387531,in,south korea,0,1,0,akupacarjenjen,Transjakarta,Mon,Positive


In [128]:
model = BERTopic(language="multilingual")
topics, probs = model.fit_transform(negative_tj['full_text'])
model.get_topic_info().head(7).set_index('Topic')[['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,14,-1_emang_transjakarta_gw_vendor,"[emang, transjakarta, gw, vendor, plz, kebersi..."
0,77,0_bus_halte_penumpang_transjakarta,"[bus, halte, penumpang, transjakarta, jam, pag..."
1,68,1_tap_transjakarta_udah_saldo,"[tap, transjakarta, udah, saldo, menit, kepoto..."


In [61]:
model = BERTopic(language="multilingual")
topics, probs = model.fit_transform(positive_tj['full_text'])
model.get_topic_info().head(7).set_index('Topic')[['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,24,-1_transjakarta_koridor_halte_bus,"[transjakarta, koridor, halte, bus, nunggu, ba..."


In [15]:
import nltk
import gensim
import gensim.corpora as corpora
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from sklearn.datasets import fetch_20newsgroups

In [191]:
# Step 2.1 - Extract embeddings
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Step 2.2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=4, n_components=2, metric='cosine')

# Step 2.3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 2.4 - Tokenize topics
vectorizer_model = CountVectorizer()

# Step 2.5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

In [192]:
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  nr_topics=10                        
)

In [196]:
topics, probabilities = topic_model.fit_transform(positive_tj['full_text'])
topic_model.get_topic_info().head(7).set_index('Topic')[['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,3,-1_transjakarta_arah_chandra_unj,"[transjakarta, arah, chandra, unj, turun, plui..."
0,6,0_bus_pintu_sentral_pulang,"[bus, pintu, sentral, pulang, penumpang, kadan..."
1,15,1_halte_you_jak_menit,"[halte, you, jak, menit, rute, jalan, gimana, ..."


In [197]:
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

tokens = [analyzer(doc) for doc in positive_tj['full_text']]
dictionary = corpora.Dictionary(tokens)
topics, probs = topic_model.fit_transform(positive_tj['full_text'])
topic_words = topic_model.get_topics()
topics = [[word for word, _ in topic_words[topic_id]] for topic_id in topic_words]

# Create the coherence model
coherence_model = CoherenceModel(
    topics=topics, 
    texts=tokens, 
    dictionary=dictionary, 
    coherence='c_v'
)

# Get the coherence score
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.3245258199212662


In [200]:
topics, probabilities = topic_model.fit_transform(negative_tj['full_text'])
topic_model.get_topic_info().head(7).set_index('Topic')[['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,21,-1_rute_halo_event_sejam,"[rute, halo, event, sejam, duduk, gimana, arah..."
0,46,0_pintu_transjakarta_udah_halte,"[pintu, transjakarta, udah, halte, gimana, ara..."
1,41,1_bus_driver_jalan_stop,"[bus, driver, jalan, stop, jam, pagi, halte, j..."
2,15,2_halte_gbk_jam_stuck,"[halte, gbk, jam, stuck, buka, area, numpuk, a..."
3,11,3_petugas_emang_lari_transjakarta,"[petugas, emang, lari, transjakarta, sekelas, ..."
4,8,4_marah_menit_nunggu_gila,"[marah, menit, nunggu, gila, dah, orang, kenap..."
5,5,5_top_up_gagal_solusinya,"[top, up, gagal, solusinya, qris, pake, bnn, a..."


In [184]:
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

tokens = [analyzer(doc) for doc in negative_tj['full_text']]
dictionary = corpora.Dictionary(tokens)
topics, probs = topic_model.fit_transform(negative_tj['full_text'])
topic_words = topic_model.get_topics()
topics = [[word for word, _ in topic_words[topic_id]] for topic_id in topic_words]

# Create the coherence model
coherence_model = CoherenceModel(
    topics=topics, 
    texts=tokens, 
    dictionary=dictionary, 
    coherence='c_v'
)

# Get the coherence score
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.5527278029524528


In [141]:
neutral_tj =  tj[tj['sentiment']=='Neutral']
topics, probabilities = topic_model.fit_transform(neutral_tj['full_text'])
topic_model.get_topic_info().head(7).set_index('Topic')[['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,80,-1_halte_arah_halo_busway,"[halte, arah, halo, busway, rute, jak, transja..."
0,156,0_halte_rute_transjakarta_kembalikan,"[halte, rute, transjakarta, kembalikan, halo, ..."
1,50,1_bus_menit_busway_berhenti,"[bus, menit, busway, berhenti, mohon, info, ko..."
2,45,2_jam_beroperasi_brp_besok,"[jam, beroperasi, brp, besok, rute, sampe, ope..."
3,14,3_admin_malam_selamat_terima,"[admin, malam, selamat, terima, irti, gimana, ..."
4,12,4_koridor_bis_beroperasi_kyk,"[koridor, bis, beroperasi, kyk, bmp, jam, pagi..."
5,11,5_tracking_google_live_maps,"[tracking, google, live, maps, fitur, gmaps, a..."
