# **Topic Modeling and Search with Top2Vec :** 
# **Sentosa Beaches and Boardwalk Waterfront**

# **1. Import and Setup**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import sys
sys.path.append('/content/gdrive/My Drive/nlp')

## **a) To install Top2Vec library and pre-trained BERT sentence transformer options:**

In [None]:
# !pip install top2vec[sentence_transformers]
!pip install pynndescent


## **b) Import Libraries**

In [None]:
from top2vec import Top2Vec 

import pandas as pd
import numpy as np

# **2. Import Dataset**

In [None]:
df = pd.read_excel("/content/gdrive/MyDrive/data/clean/formatted_reviews.xlsx")
df.sample(4)

Unnamed: 0,date,source,attraction,reviews,rating
3460,2019.0,traveloka,sea_aquarium,sangat memuaskan sekali memanjakan mata dengan...,10.0
4552,2018.0,tripadvisor,skyline_luge,kalau naik hanya sekali pasti kurang kami memb...,8.0
170,2017.0,google_reviews,uss,tempatnya menabjukan penggabungan antara seni ...,10.0
1806,2020.0,klook,uss,datang ke sini pas saya sakit tapi tidak mengu...,10.0


In [None]:
df['date'] = df['date'].astype(int)

In [None]:
beach_df = df[df['attraction'] == 'beach_waterfront']
beach_df = beach_df.sort_values(by=['date','rating'],ascending=[False,False])
# resetting index
beach_df = beach_df.reset_index(drop=True)
beach_df.tail(4)

Unnamed: 0,date,source,attraction,reviews,rating
80,2017,tripadvisor,beach_waterfront,untuk ukuran pantai buatan pantai ini indah se...,8.0
81,2017,tripadvisor,beach_waterfront,alternatif lain menuju pulau sentosa berjalan ...,8.0
82,2017,tripadvisor,beach_waterfront,pantai di pulau sentosa pantai ini katanya ada...,6.0
83,2017,tripadvisor,beach_waterfront,lagi direnov saya kesana akhir oktober kalau k...,6.0


In [None]:
beach_docs = beach_df.loc[:, "reviews"].astype(str).values.tolist()
beach_docs[:7]

['one of beautiful beach in singapore selain pantainya yang bersih dan juga cantik disini selalu digelar pagelaran yang bernama wings of time suatu show yang benarbenar memanjakan mata dengan menggunakan teknik lampu yang super canggih',
 'pantainya bagus ada mcdonald juga jadi gampang kalau mau makan',
 'kami jalan kaki saja biar sehat dari dermaga ke sentosa island asik biar lihat pemandangan',
 'pantainya lumayan bagus dan bersih tapi jelas masih kalah dibandingkan dengan pantai2 asli yang ada di indonesia',
 'anchorid of sentosa',
 'belum pernah dateng ke sini',
 'pantainya sangat sangat bersih sekali dan tidak ada samapah pasirnya juga alus dan bersih cocok untuk menikmati sunset di sore hari dan arus airnya juga tenang tidak ada gelombang']

# **3. Create and Train N-Gram model (Bi-Gram and Tri-Gram)**
* Top2Vec uses Gensim **simple_preprocess** to do tokenization. 
* To create Trigrams and Bigrams, pass a customized tokenizer into Top2Vec tokenizer parameter. 
* Use Gensim to train this N-gram model. Documents passed to the Phrases function to inspect for N-grams text corpus.

In [None]:
import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

# Build the bigram and trigram models 
sentence_stream = [doc.split(" ") for doc in beach_docs]
bigram = Phrases(sentence_stream, min_count=5, threshold=5, delimiter=b' ')
trigram = Phrases(bigram[sentence_stream], threshold=5, delimiter=b' ')

bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)



In [None]:
# Create Bi-Grams
def bi_gram(doc):
    sentence_stream = simple_preprocess(strip_tags(str(doc)), deacc=True)
    return bigram_phraser[sentence_stream] 


# Create Bi-Grams and Tri-Grams
def tri_gram(doc):
    sentence_stream = simple_preprocess(strip_tags(str(doc)), deacc=True)
    gen_bigram = bigram_phraser[sentence_stream]
    gen_trigram = trigram_phraser[bigram_phraser[sentence_stream]]
    return gen_trigram

# **4. Train Top2Vec Model**

### **Parameters**:
* **Documents**: Input corpus, should be a list of strings.
* **Min_count**: (Optional, default 50) Ignores all words with total frequency lower than this. For smaller corpora a smaller min_count will be necessary.
* **Embedding_model** (string or callable) – The valid string options are: doc2vec , universal-sentence-encoder , universal-sentence-encoder-multilingual,distiluse-base-multilingual-cased , all-MiniLM-L6-v2 , paraphrase-multilingual-MiniLM-L12-v2 
* **tokenizer** (callable (Optional, default None)) – Override the default tokenization method. If None then gensim.utils.simple_preprocess will be used. ***Tokenizer must take a document and return a list of tokens***.


In [None]:
# FORM UNI-GRAMS, BI-GRAMS WITH BI-GRAM MODEL 

topicMODEL_bi = Top2Vec(documents= beach_docs, min_count=5, embedding_model = 'paraphrase-multilingual-MiniLM-L12-v2', tokenizer= bi_gram)  

2022-06-23 23:34:10,626 - top2vec - INFO - Pre-processing documents for training
2022-06-23 23:34:10,654 - top2vec - INFO - Downloading paraphrase-multilingual-MiniLM-L12-v2 model


Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

2022-06-23 23:34:44,224 - top2vec - INFO - Creating joint document/word embedding
2022-06-23 23:34:55,960 - top2vec - INFO - Creating lower dimension embedding of documents
2022-06-23 23:35:04,674 - top2vec - INFO - Finding dense areas of documents
2022-06-23 23:35:04,685 - top2vec - INFO - Finding topics


In [None]:
# FORM UNI-GRAMS, BI-GRAMS, TRI-GRAMS WITH TRI-GRAM MODEL 
 
topicMODEL_tri = Top2Vec(documents= beach_docs, min_count=5, embedding_model = 'distiluse-base-multilingual-cased', tokenizer = tri_gram)  

2022-06-23 23:35:35,526 - top2vec - INFO - Pre-processing documents for training
2022-06-23 23:35:35,559 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/528 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

2022-06-23 23:35:51,231 - top2vec - INFO - Creating joint document/word embedding
2022-06-23 23:35:51,650 - top2vec - INFO - Creating lower dimension embedding of documents
2022-06-23 23:35:54,850 - top2vec - INFO - Finding dense areas of documents
2022-06-23 23:35:54,860 - top2vec - INFO - Finding topics


In [None]:
# SAVE MODELS FOR FUTURE USE

topicMODEL_bi.save("/content/gdrive/MyDrive/data/models/bigram_beach_june24")

topicMODEL_tri.save("/content/gdrive/MyDrive/data/models/trigram_beach_june24")

In [None]:
# LOAD MODEL FROM SAVED MODELS

topicMODEL_bi = Top2Vec.load("/content/gdrive/MyDrive/data/models/bigram_beach_june24")

topicMODEL_tri = Top2Vec.load("/content/gdrive/MyDrive/data/models/trigram_beach_june24")

# **5. Perform Topic Reduction** 

* Reduce the number of topics discovered by Top2Vec.
* The most representative topics of the corpus will be found, by iteratively merging each smallest topic to the most similar topic until num_topics is reached. 
* Get the hierarchy of reduced topics. The mapping of each original topic to the reduced topics is returned.

In [None]:
# BI-GRAM MODEL TOPIC REDUCTION
topicMODEL_bi.hierarchical_topic_reduction(num_topics=10) 

topicMODEL_bi.get_topic_hierarchy()

In [None]:
# TRI-GRAM MODEL TOPIC REDUCTION 
topicMODEL_tri.hierarchical_topic_reduction(num_topics=10) 

topicMODEL_tri.get_topic_hierarchy()

# **6. Explore Discovered Topics**

## **6a) Get Number of Topics**
This will return the number of topics that Top2Vec has found in the data.

In [None]:
# Show the Total Number of Topics 

print(f"Total Number of Topics generated (Topic Reduction Applied): \n")
print(f"Bi-gram Model : {topicMODEL_bi.get_num_topics(reduced=False)}")
print(f"Tri-gram Model : {topicMODEL_tri.get_num_topics(reduced=False)}")

Total Number of Topics generated (Topic Reduction Applied): 

Bi-gram Model : 2
Tri-gram Model : 2


## **6b) Get Topic Sizes**
This will return the number of documents most similar to each topic. Topics are in decreasing order of size.

Returns:
* topic_sizes: The number of documents most similar to each topic.
* topic_nums: The unique index of every topic will be returned.

### **6b.1. Bi-Gram and Tri-Gram Model Topic Sizes**

In [None]:
print("BI-GRAM MODEL where N = 1 and 2")
print("================================\n")
topic_sizes_bi, topic_nums_bi = topicMODEL_bi.get_topic_sizes(reduced=False) 

print(f"Unique index numbers of every topic: {topic_nums_bi} ")
print()
print(f"Number of documents for each unique topic: {topic_sizes_bi} \n")

print("TRI-GRAM MODEL where N = 1, 2 and 3")
print("====================================\n")
topic_sizes_tri, topic_nums_tri = topicMODEL_tri.get_topic_sizes(reduced=False) 
print(f"Unique index numbers of every topic: {topic_nums_tri} ")
print()
print(f"Number of documents for each unique topic: {topic_sizes_tri} ") 

BI-GRAM MODEL where N = 1 and 2

Unique index numbers of every topic: [0 1] 

Number of documents for each unique topic: [46 38] 

TRI-GRAM MODEL where N = 1, 2 and 3

Unique index numbers of every topic: [0 1] 

Number of documents for each unique topic: [55 29] 


## **6c) Get Topics**
This will return the topics in decreasing size.

Returns:

* topic_words: For each topic the top 50 words are returned, in decreasing order of semantic similarity to topic.

* word_scores: For each topic the cosine similarity scores (in decreasing order) of the top 50 words to the topic are returned.

* topic_nums: The unique index of every topic will be returned.

### **c.1.  BI-GRAM Model Topics**

In [None]:
print(f"TOPICS FOR BI-GRAM MODEL: \n")
topic_words_bi, word_scores_bi, topic_nums_bi = topicMODEL_bi.get_topics(topicMODEL_bi.get_num_topics(reduced=False))

for count,(words_bi, scores_bi, num_bi) in enumerate(zip(topic_words_bi, word_scores_bi, topic_nums_bi)):
    print(f"{count+1}. Topic {num_bi+1}") 
    print(f"Topic Words : \n{words_bi} \n")
    # print(f"Cosine Similarity :\n{scores_bi}\n")

TOPICS FOR BI-GRAM MODEL: 

1. Topic 1
Topic Words : 
['pantai nya' 'pantai ini' 'pantai buatan' 'pantainya' 'pantai'
 'siloso beach' 'beach' 'pulau sentosa' 'sentosa boardwalk'
 'sentosa island' 'pasir' 'pulau' 'island' 'pemandangan' 'nyaman' 'indah'
 'tempat' 'sentosa' 'ke sentosa' 'panas' 'cocok untuk' 'bersih' 'jauh'
 'lumayan' 'sampai' 'jalan' 'menuju' 'capek' 'banget' 'pulang' 'bagus'
 'kalau' 'mau' 'disini' 'akan' 'sangat' 'ingin' 'banyak' 'tenang'
 'alternatif' 'naik' 'buat' 'saja' 'bisa' 'untuk' 'biasa' 'berjalan' 'ga'
 'anda' 'jalan kaki'] 

2. Topic 2
Topic Words : 
['sentosa boardwalk' 'pulau sentosa' 'sentosa island' 'jalan kaki'
 'pemandangan' 'jalan' 'berjalan' 'pulau' 'pantai ini' 'pantai buatan'
 'island' 'pantai nya' 'naik monorail' 'broadwalk' 'pantainya' 'jauh'
 'tempat' 'siloso beach' 'pantai' 'pulang' 'beach' 'menuju' 'jembatan'
 'nyaman' 'alternatif' 'naik' 'sampai' 'indah' 'disini' 'cocok untuk'
 'sentosa' 'banyak' 'ke sentosa' 'city' 'turun' 'gratis' 'ingin' 'p

### **c.2. Tri-Gram Model Topics**

In [None]:
print(f"TOPICS FOR TRI-GRAM MODEL : \n")
topic_words_tri, word_scores_tri, topic_nums_tri = topicMODEL_tri.get_topics(topicMODEL_tri.get_num_topics(reduced=False))

for count, (words_tri, scores_tri, num_tri) in enumerate(zip(topic_words_tri, word_scores_tri, topic_nums_tri)):
    print(f"{count+1}. Topic {num_tri+1}") 
    print(f"Topic Words : \n{words_tri}\n")
    # print(f"Cosine Similarity :\n{scores_tri}\n")

TOPICS FOR TRI-GRAM MODEL : 

1. Topic 1
Topic Words : 
['siloso beach' 'pantai buatan' 'pantai ini' 'pantainya' 'pantai nya'
 'beach' 'pantai' 'ke sentosa island' 'sentosa island' 'pulau sentosa'
 'pulau' 'bersih' 'sentosa boardwalk' 'indah' 'tidak terlalu' 'ini'
 'jembatan' 'lumayan' 'disini' 'sangat' 'bagus' 'broadwalk' 'bus' 'cukup'
 'sampai' 'panas' 'cocok untuk' 'karena' 'adalah' 'tempat' 'putih'
 'gratis' 'untuk' 'bisa' 'ada' 'nyaman' 'jadi' 'yang' 'alternatif' 'dan'
 'saja' 'tapi' 'di' 'juga' 'dengan' 'kalau' 'tidak' 'banyak' 'ke' 'akan']

2. Topic 2
Topic Words : 
['ke sentosa island' 'sentosa island' 'pulau sentosa' 'pulau'
 'sentosa boardwalk' 'broadwalk' 'jalan kaki' 'berjalan' 'naik monorail'
 'jembatan' 'jalan' 'pantai buatan' 'pantai ini' 'siloso beach'
 'pantai nya' 'gratis' 'pantainya' 'alternatif' 'pantai' 'beach' 'jauh'
 'bus' 'menuju' 'indah' 'dari vivo city' 'kaki' 'tidak terlalu' 'sangat'
 'untuk' 'lumayan' 'karena' 'jadi' 'dari' 'pemandangan' 'ini' 'sampai'
 'bag

## **6d) Search Documents by Topic**
We are going to search by topic; for example Topic 3. 

Returns:

* documents:  The documents in a list, the most similar are first.
* doc_scores:  Semantic similarity of document to topic. The cosine similarity of the document and topic vector.
* doc_ids:  Unique ids of documents. If ids were not given, the index of document in the original corpus.

For each of the returned documents we are going to print its content, score and document number.

### **d.1. Search Documents By Topic Number Using Bi-Gram Model**

In [None]:
# SEARCH DOCUMENTS BY TOPIC NUMBER USING BI-GRAM MODEL 
topic_num_bi = 0
documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= 5, reduced=False) 

print(f"TOPIC NUMBER: {topic_num_bi}\n")
for count, (doc_bi, docscore_bi, doc_id_bi) in enumerate(zip(documents_bi, document_scores_bi, document_ids_bi)):
    print(f"{count}. Document: {doc_id_bi},  Score: {docscore_bi}")
    print("-----------")
    print(doc_bi)
    print("-----------\n")

TOPIC NUMBER: 0

0. Document: 10,  Score: 4.495950698852539
-----------
pantai yang bersih tenang dengan pemandangan yang bagus
-----------

1. Document: 42,  Score: 3.909059524536133
-----------
pantai yang bersih dengan jembatan panjang
-----------

2. Document: 33,  Score: 3.883267879486084
-----------
pantai yang tenang bed sun available 22 di pinggir pantai pantai iya kelihatan dari ketinggian naik skyline
-----------

3. Document: 32,  Score: 3.8165242671966553
-----------
pantai di pesisir sentosa ini berpasir putih dan indah
-----------

4. Document: 25,  Score: 3.468386173248291
-----------
pantai nya nyaman ombak kecil buat berenang banyak wahana di sekitar pantai sunset cukup bagus pantai nya bersih sangat cocok untuk liburan keluarga dan family sunset nya lumayan
-----------



### **d.2. Search Documents By Topic Number Using Tri-Gram Model**

In [None]:
# SEARCH DOCUMENTS BY TOPIC NUMBER USING TRI-GRAM MODEL 
topic_num_tri = 1
documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri, num_docs= 5, reduced=False) 
print(f"TOPIC NUMBER: {topic_num_bi}\n")
for count, (doc_tri, docscore_tri, doc_id_tri) in enumerate(zip(documents_tri, document_scores_tri, document_ids_tri)):
    print(f"{count}. Document: {doc_id_tri},  Score: {docscore_tri}")
    print("-----------")
    print(doc_tri)
    print("-----------\n")

TOPIC NUMBER: 0

0. Document: 74,  Score: 0.6807977557182312
-----------
berjalan kaki tidak terasa capek untuk menyeberang ke sentosa island saya lebih juga berjalan kaki dari vivo city mall melalui sentosa boardwalk ketika saya berkunjung pada mei 2017 travelator sedang ada perbaikan jika dihitung jaraknya cukup jauh namun berlajaln kaki tidak terlalu capek karena banyak spot indah dalam perjalanan dari sentosa island ke vivo city arah balik baru kemudian naik monorail gratis hehehe
-----------

1. Document: 58,  Score: 0.6802978515625
-----------
cara gratis ke sentosa island jalan kaki lewat sentosa boardwalk ke sentosa island sangat menghemat pengeluaran wisata jangan kawatir capek karena sepanjang perjalanan banyak eskalator yang disediakan dan selama perjalanan akan disuguhi pemandangan laut yang indah waktu saya berkunjung loket tiket masuk sentosa island ditutup jadi langsung bisa masuk gratis tanpa membayar sepeserpun
-----------

2. Document: 81,  Score: 0.6611872315406799
-

# **7) Constructing Dataframe to Collate Information of Key Topics**

## **7a) BI-GRAM MODEL**

In [None]:
# CONSTRUCT DATAFRAME OF BI-GRAM MODEL WITH KEY INFORMATION FOR EXPORT
# initialize data of lists.
bigram_table = {
              'topic number': [num_bi for num_bi in topic_nums_bi],
              'topic words': [words_bi for words_bi in topic_words_bi],
              'cosine similarity metrics': [scores_bi for scores_bi in word_scores_bi],
              'similar documents': [size_bi for size_bi in topic_sizes_bi],
               }
 
# Create DataFrame
bigram_df = pd.DataFrame(bigram_table)
bigram_df['topic number'] = bigram_df['topic number'].apply(lambda x: x + 1)

# print(bigram_df.head(2))
bigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents
0,1,"[pantai nya, pantai ini, pantai buatan, pantai...","[0.83623326, 0.8348913, 0.8335162, 0.8319118, ...",46
1,2,"[sentosa boardwalk, pulau sentosa, sentosa isl...","[0.6508756, 0.5251312, 0.5131853, 0.4809478, 0...",38


### **7a.1. Build Functions to Expand Column Features in Bi-Gram Dataframe**

In [None]:
def get_document_index(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc, reduced=False)
    document_index = [doc_id_bi for doc_id_bi in document_ids_bi]
    return document_index 

def get_document_scores(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc, reduced=False)
    document_scores = [docscore_bi for docscore_bi in document_scores_bi]
    return document_scores


def get_documents(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc, reduced=False)
    actual_document = [doc_bi for doc_bi in documents_bi]
    return actual_document


In [None]:
bigram_df['document index'] = bigram_df['topic number'].apply(lambda x: get_document_index(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df['document scores'] = bigram_df['topic number'].apply(lambda x: get_document_scores(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df['actual reviews'] = bigram_df['topic number'].apply(lambda x: get_documents(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews
0,1,"[pantai nya, pantai ini, pantai buatan, pantai...","[0.83623326, 0.8348913, 0.8335162, 0.8319118, ...",46,"[10, 42, 33, 32, 25, 6, 65, 14, 82, 7, 43, 3, ...","[4.4959507, 3.9090595, 3.8832679, 3.8165243, 3...",[pantai yang bersih tenang dengan pemandangan ...
1,2,"[sentosa boardwalk, pulau sentosa, sentosa isl...","[0.6508756, 0.5251312, 0.5131853, 0.4809478, 0...",38,"[54, 77, 81, 40, 71, 38, 59, 51, 23, 47, 46, 3...","[3.4052937, 3.168723, 3.138019, 3.0155137, 2.9...",[backpacking dengan adanya travelator sangat m...


In [None]:
# for count, (index, review) in enumerate(zip(bigram_df['document index'][28], bigram_df['actual reviews'][28])):
#   print(f"{count}: {index} = {review})")

bigram_df.to_excel(f'/content/gdrive/MyDrive/data/export/bigram_df_beach.xlsx', index=False)

## **7b) TRI-GRAM MODEL**

In [None]:
# CONSTRUCT DATAFRAME CONTAINING KEY INFORMATION FOR EXPORT
# initialize data of lists.
trigram_table = {
                'topic number': [num_tri for num_tri in topic_nums_tri],
                'topic words': [words_tri for words_tri in topic_words_tri],
                'cosine similarity metrics': [scores_tri for scores_tri in word_scores_tri],
                'similar documents': [size_tri for size_tri in topic_sizes_tri],
                 }

# Create DataFrame
trigram_df = pd.DataFrame(trigram_table)

trigram_df['topic number'] = trigram_df['topic number'].apply(lambda x: x + 1)
trigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents
0,1,"[siloso beach, pantai buatan, pantai ini, pant...","[0.56666857, 0.5575116, 0.5488733, 0.50886035,...",55
1,2,"[ke sentosa island, sentosa island, pulau sent...","[0.37848967, 0.32700586, 0.3155557, 0.2767048,...",29


### **7b.1. Build Functions to Expand Column Features in Tri-Gram Dataframe**

In [None]:
def get_document_index_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs, reduced=False)
    document_index = [doc_id_tri for doc_id_tri in document_ids_tri]
    return document_index 

def get_document_scores_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs, reduced=False)
    document_scores = [docscore_tri for docscore_tri in document_scores_tri]
    return document_scores


def get_documents_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs, reduced=False)
    actual_document = [doc_tri for doc_tri in documents_tri]
    return actual_document


In [None]:
trigram_df['document index'] = trigram_df['topic number'].apply(lambda x: get_document_index_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df['document scores'] = trigram_df['topic number'].apply(lambda x: get_document_scores_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df['actual reviews'] = trigram_df['topic number'].apply(lambda x: get_documents_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews
0,1,"[siloso beach, pantai buatan, pantai ini, pant...","[0.56666857, 0.5575116, 0.5488733, 0.50886035,...",55,"[41, 55, 82, 25, 65, 53, 0, 3, 6, 52, 80, 66, ...","[0.6978262, 0.6625662, 0.6465565, 0.64647555, ...",[bersih dan indah pantai siloso beach merupaka...
1,2,"[ke sentosa island, sentosa island, pulau sent...","[0.37848967, 0.32700586, 0.3155557, 0.2767048,...",29,"[74, 58, 81, 78, 40, 71, 77, 35, 54, 73, 76, 4...","[0.68079776, 0.68029785, 0.66118723, 0.6491896...",[berjalan kaki tidak terasa capek untuk menyeb...


In [None]:
# for count, (index, review) in enumerate(zip(trigram_df['document index'][0], trigram_df['actual reviews'][0])):
#   print(f"{count}: {index} = {review})")

trigram_df.to_excel(f'/content/gdrive/MyDrive/data/export/trigram_df_beach.xlsx', index=False)

# **8. Defining Topic Labels on Reviews** 
## **8a) User-Define Topic Labels** 
### **8a.1:  BI-GRAM Model** 

In [None]:
# Create the dictionary containing the data of the new column
bigram_col_dict = { 'Topic 1': 'visitors experiences',                 'Topic 2': 'features visitors like',
                        
                  }
                   
# Assign the values of the dictionary as the values of the new column
bigram_df['topic label'] = bigram_col_dict.values()
bigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
0,1,"[pantai nya, pantai ini, pantai buatan, pantai...","[0.83623326, 0.8348913, 0.8335162, 0.8319118, ...",46,"[10, 42, 33, 32, 25, 6, 65, 14, 82, 7, 43, 3, ...","[4.4959507, 3.9090595, 3.8832679, 3.8165243, 3...",[pantai yang bersih tenang dengan pemandangan ...,experiences
1,2,"[sentosa boardwalk, pulau sentosa, sentosa isl...","[0.6508756, 0.5251312, 0.5131853, 0.4809478, 0...",38,"[54, 77, 81, 40, 71, 38, 59, 51, 23, 47, 46, 3...","[3.4052937, 3.168723, 3.138019, 3.0155137, 2.9...",[backpacking dengan adanya travelator sangat m...,activities


In [None]:
# CREATE COPY OF ORIGINAL DATAFRAME
beach_DF_BI = beach_df.copy()
beach_DF_BI.tail(3)

Unnamed: 0,date,source,attraction,reviews,rating
81,2017,tripadvisor,beach_waterfront,alternatif lain menuju pulau sentosa berjalan ...,8.0
82,2017,tripadvisor,beach_waterfront,pantai di pulau sentosa pantai ini katanya ada...,6.0
83,2017,tripadvisor,beach_waterfront,lagi direnov saya kesana akhir oktober kalau k...,6.0


In [None]:
print([i for i in range(topicMODEL_bi.get_num_topics(reduced=False))])

[0, 1]


In [None]:
def get_indexLIST(i):
      topic = [doc_ind for doc_ind in bigram_df['document index'][i]]
      return topic

In [None]:
for i in range(topicMODEL_bi.get_num_topics(reduced=False)):
    beach_DF_BI.loc[get_indexLIST(i), 'topic_label'] = bigram_df['topic label'][i]
    

In [None]:
beach_DF_BI.reset_index(drop=True)
beach_DF_BI

Unnamed: 0,date,source,attraction,reviews,rating,topic_label
0,2022,google_reviews,beach_waterfront,one of beautiful beach in singapore selain pan...,10.0,experiences
1,2021,google_reviews,beach_waterfront,pantainya bagus ada mcdonald juga jadi gampang...,10.0,experiences
2,2021,google_reviews,beach_waterfront,kami jalan kaki saja biar sehat dari dermaga k...,10.0,activities
3,2021,google_reviews,beach_waterfront,pantainya lumayan bagus dan bersih tapi jelas ...,8.0,experiences
4,2021,google_reviews,beach_waterfront,anchorid of sentosa,6.0,experiences
...,...,...,...,...,...,...
79,2017,tripadvisor,beach_waterfront,menyebrangi pulau lewat jembatan pantainya ber...,8.0,activities
80,2017,tripadvisor,beach_waterfront,untuk ukuran pantai buatan pantai ini indah se...,8.0,experiences
81,2017,tripadvisor,beach_waterfront,alternatif lain menuju pulau sentosa berjalan ...,8.0,activities
82,2017,tripadvisor,beach_waterfront,pantai di pulau sentosa pantai ini katanya ada...,6.0,experiences


In [None]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 1 - GENERATE RANDOM SAMPLE TABLE FROM NEW DATAFRAME 
bigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
0,1,"[pantai nya, pantai ini, pantai buatan, pantai...","[0.83623326, 0.8348913, 0.8335162, 0.8319118, ...",46,"[10, 42, 33, 32, 25, 6, 65, 14, 82, 7, 43, 3, ...","[4.4959507, 3.9090595, 3.8832679, 3.8165243, 3...",[pantai yang bersih tenang dengan pemandangan ...,experiences
1,2,"[sentosa boardwalk, pulau sentosa, sentosa isl...","[0.6508756, 0.5251312, 0.5131853, 0.4809478, 0...",38,"[54, 77, 81, 40, 71, 38, 59, 51, 23, 47, 46, 3...","[3.4052937, 3.168723, 3.138019, 3.0155137, 2.9...",[backpacking dengan adanya travelator sangat m...,activities


In [None]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 2 - SELECT INDEXES FROM ORIGINAL DATAFRAME TO COMPARE

beach_DF_BI.iloc[[32, 71]] 

Unnamed: 0,date,source,attraction,reviews,rating,topic_label
32,2019,google_reviews,beach_waterfront,pantai di pesisir sentosa ini berpasir putih d...,10.0,experiences
71,2017,tripadvisor,beach_waterfront,cara gratis menuju sentosa island buat travell...,10.0,activities


In [None]:
beach_DF_BI.to_excel(f'/content/gdrive/MyDrive/data/export/beach_DF_BI.xlsx', index=False)

### **8a.2  TRI-GRAM Model** 

In [None]:
# Create the dictionary containing the data of the new column
trigram_col_dict = { 'Topic 1': 'visitors experiences',                  'Topic 2': 'features visitors like',        
                    }
   

# Assign the values of the dictionary as the values of the new column
trigram_df['topic label'] = trigram_col_dict.values()
trigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
0,1,"[siloso beach, pantai buatan, pantai ini, pant...","[0.56666857, 0.5575116, 0.5488733, 0.50886035,...",55,"[41, 55, 82, 25, 65, 53, 0, 3, 6, 52, 80, 66, ...","[0.6978262, 0.6625662, 0.6465565, 0.64647555, ...",[bersih dan indah pantai siloso beach merupaka...,experiences
1,2,"[ke sentosa island, sentosa island, pulau sent...","[0.37848967, 0.32700586, 0.3155557, 0.2767048,...",29,"[74, 58, 81, 78, 40, 71, 77, 35, 54, 73, 76, 4...","[0.68079776, 0.68029785, 0.66118723, 0.6491896...",[berjalan kaki tidak terasa capek untuk menyeb...,activities


In [None]:
# CREATE COPY OF ORIGINAL DATAFRAME
beach_DF_TRI = beach_df.copy()
beach_DF_TRI.tail(3)

Unnamed: 0,date,source,attraction,reviews,rating
81,2017,tripadvisor,beach_waterfront,alternatif lain menuju pulau sentosa berjalan ...,8.0
82,2017,tripadvisor,beach_waterfront,pantai di pulau sentosa pantai ini katanya ada...,6.0
83,2017,tripadvisor,beach_waterfront,lagi direnov saya kesana akhir oktober kalau k...,6.0


In [None]:
print([i for i in range(topicMODEL_tri.get_num_topics(reduced=False))])

[0, 1]


In [None]:
def get_indexLIST(i):
      topic = [doc_ind for doc_ind in trigram_df['document index'][i]]
      return topic

In [None]:
for i in range(topicMODEL_tri.get_num_topics(reduced=False)):
    beach_DF_TRI.loc[get_indexLIST(i), 'topic_label'] = trigram_df['topic label'][i]

In [None]:
beach_DF_TRI.reset_index(drop=True)
beach_DF_TRI

Unnamed: 0,date,source,attraction,reviews,rating,topic_label
0,2022,google_reviews,beach_waterfront,one of beautiful beach in singapore selain pan...,10.0,experiences
1,2021,google_reviews,beach_waterfront,pantainya bagus ada mcdonald juga jadi gampang...,10.0,experiences
2,2021,google_reviews,beach_waterfront,kami jalan kaki saja biar sehat dari dermaga k...,10.0,activities
3,2021,google_reviews,beach_waterfront,pantainya lumayan bagus dan bersih tapi jelas ...,8.0,experiences
4,2021,google_reviews,beach_waterfront,anchorid of sentosa,6.0,experiences
...,...,...,...,...,...,...
79,2017,tripadvisor,beach_waterfront,menyebrangi pulau lewat jembatan pantainya ber...,8.0,activities
80,2017,tripadvisor,beach_waterfront,untuk ukuran pantai buatan pantai ini indah se...,8.0,experiences
81,2017,tripadvisor,beach_waterfront,alternatif lain menuju pulau sentosa berjalan ...,8.0,activities
82,2017,tripadvisor,beach_waterfront,pantai di pulau sentosa pantai ini katanya ada...,6.0,experiences


In [None]:
beach_DF_TRI.isnull().values.any()

False

In [None]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 1 - GENERATE RANDOM SAMPLE TABLE FROM NEW DATAFRAME 
trigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
0,1,"[siloso beach, pantai buatan, pantai ini, pant...","[0.56666857, 0.5575116, 0.5488733, 0.50886035,...",55,"[41, 55, 82, 25, 65, 53, 0, 3, 6, 52, 80, 66, ...","[0.6978262, 0.6625662, 0.6465565, 0.64647555, ...",[bersih dan indah pantai siloso beach merupaka...,experiences
1,2,"[ke sentosa island, sentosa island, pulau sent...","[0.37848967, 0.32700586, 0.3155557, 0.2767048,...",29,"[74, 58, 81, 78, 40, 71, 77, 35, 54, 73, 76, 4...","[0.68079776, 0.68029785, 0.66118723, 0.6491896...",[berjalan kaki tidak terasa capek untuk menyeb...,activities


In [None]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 2 - SELECT INDEXES FROM ORIGINAL DATAFRAME TO COMPARE

beach_DF_TRI.iloc[[80,  35]] 

Unnamed: 0,date,source,attraction,reviews,rating,topic_label
80,2017,tripadvisor,beach_waterfront,untuk ukuran pantai buatan pantai ini indah se...,8.0,experiences
35,2019,google_reviews,beach_waterfront,alternatif menuju pulau sentosa disamping naik...,10.0,activities


In [None]:
beach_DF_TRI.to_excel(f'/content/gdrive/MyDrive/data/export/beach_DF_TRI.xlsx', index=False)

# **9.  Visualization**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
uss_topred = pd.read_excel("/content/gdrive/MyDrive/data/export/uss_DF_BIreduced.xlsx")
uss_topred.isnull().values.any()
uss_topred = uss_topred.loc[uss_topred['date'] != 2017]
uss_topred

In [None]:
sns.color_palette("gist_rainbow")

In [None]:
sns.set_palette("gist_rainbow")
sns.set_theme(style="ticks", font_scale=1.4)

In [None]:
plt.figure(figsize=(18,10))

hue_order = [2022, 2021, 2020, 2019, 2018]
sns.histplot(data=uss_topred, y="topic_label", hue="date", hue_order=hue_order, multiple="stack", bins= 50, palette="Set1") 


In [None]:
sns.color_palette('Set3')
plt.figure(figsize=(25,10))

hue_order = [2018, 2019, 2020,2021, 2022]
sns.histplot(data=uss_topred, x="topic_label", hue="date", hue_order=hue_order, multiple="dodge", bins=100, palette="Set1") 


In [None]:
plt.figure(figsize=(16,11))

sns.countplot(data=uss_topred, y="topic_label", hue="date", dodge=False)