# **Topic Modeling and Search with Top2Vec :** 
# **Universal Studios Singapore**

# **1. Import and Setup**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import sys
sys.path.append('/content/gdrive/My Drive/nlp')

## **a) To install Top2Vec library and pre-trained BERT sentence transformer options:**

In [None]:
# !pip install top2vec[sentence_transformers]
!pip install pynndescent


## **b) Import Libraries**

In [4]:
from top2vec import Top2Vec 

import pandas as pd
import numpy as np

# **2. Import Dataset**

In [5]:
df = pd.read_excel("/content/gdrive/MyDrive/data/clean/formatted_reviews.xlsx")
df.tail(4)

Unnamed: 0,date,source,attraction,reviews,rating
4595,2018.0,klook,adventure_cove,sangat menyenangkan berbeda dengan waterpark l...,10.0
4596,2018.0,klook,adventure_cove,tempatnya sangat bagusdan permainannya sangat ...,10.0
4597,2018.0,klook,adventure_cove,bagus permainan airnya anak saya suka sekali k...,10.0
4598,2017.0,klook,adventure_cove,pengalaman menyenangkan beli tiket melalui klo...,10.0


In [6]:
df['date'] = df['date'].astype(int)

uss_df = df[df['attraction'] == 'uss']
uss_df = uss_df.sort_values(by=['date','rating'],ascending=[False,False])
# resetting index
uss_df = uss_df.reset_index(drop=True)
uss_df.tail(4)

Unnamed: 0,date,source,attraction,reviews,rating
3013,2017,traveloka,uss,biasa saja untuk permainan yang menantang sang...,6.0
3014,2017,traveloka,uss,biasa saja banyak permainan yang sama di setia...,6.0
3015,2017,traveloka,uss,kebetulan pada saat kunjungan hujan penggunaan...,6.0
3016,2017,tripadvisor,uss,cocok untuk remaja visit ke atraksi di uss seb...,6.0


In [7]:
uss_docs = uss_df.loc[:, "reviews"].astype(str).values.tolist()
uss_docs[:7]

['luas bangetbanyak spot fotonyawahananya keren',
 'tempat yang wajib dikunjungi ketika ke singapura',
 'good banyak spot fotonyatempat shalatnya sebelah mana iya',
 'seruuu gaeeessss meegilan',
 'tempat yang sangat bagus untuk berlibur saya sangat suka dan puas',
 'wahananya sport jantung semua',
 'lokasinya sangat strategis untuk berwisata berbagai macam wahana yang disediakan dan sangat menghibur mulai dari photo booth bersama beberapa karakter film dari universal studio sampai berbagai macam atraksi yang harus di kunjungi seperti the mummy transformers dan galactica']

# **3. Create and Train N-Gram model (Bi-Gram and Tri-Gram)**
* Top2Vec uses Gensim **simple_preprocess** to do tokenization. 
* To create Trigrams and Bigrams, pass a customized tokenizer into Top2Vec tokenizer parameter. 
* Use Gensim to train this N-gram model. Documents passed to the Phrases function for it to inspect text corpus for N-grams.

In [8]:
import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

# Build the bigram and trigram models 
sentence_stream = [doc.split(" ") for doc in uss_docs]
bigram = Phrases(sentence_stream, min_count=5, threshold=5, delimiter=b' ')
trigram = Phrases(bigram[sentence_stream], threshold=5, delimiter=b' ')

bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)



In [9]:
# Create Bi-Grams
def bi_gram(doc):
    sentence_stream = simple_preprocess(strip_tags(str(doc)), deacc=True)
    return bigram_phraser[sentence_stream] 


# Create Bi-Grams and Tri-Grams
def tri_gram(doc):
    sentence_stream = simple_preprocess(strip_tags(str(doc)), deacc=True)
    gen_bigram = bigram_phraser[sentence_stream]
    gen_trigram = trigram_phraser[bigram_phraser[sentence_stream]]
    return gen_trigram

# **4. Train Top2Vec Model**

### **Parameters**:
* **Documents**: Input corpus, should be a list of strings.
* **Min_count**: (Optional, default 50) Ignores all words with total frequency lower than this. For smaller corpora a smaller min_count will be necessary.
* **Embedding_model** (string or callable) – The valid string options are: doc2vec , universal-sentence-encoder , universal-sentence-encoder-multilingual,distiluse-base-multilingual-cased , all-MiniLM-L6-v2 , paraphrase-multilingual-MiniLM-L12-v2 
* **tokenizer** (callable (Optional, default None)) – Override the default tokenization method. If None then gensim.utils.simple_preprocess will be used. ***Tokenizer must take a document and return a list of tokens***.


In [None]:
# FORM UNI-GRAMS, BI-GRAMS WITH BI-GRAM MODEL 

topicMODEL_bi = Top2Vec(documents= uss_docs, min_count=5, embedding_model = 'distiluse-base-multilingual-cased', tokenizer= bi_gram)  

2022-06-25 16:34:27,073 - top2vec - INFO - Pre-processing documents for training
2022-06-25 16:34:27,435 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
2022-06-25 16:34:34,440 - top2vec - INFO - Creating joint document/word embedding
2022-06-25 16:34:38,488 - top2vec - INFO - Creating lower dimension embedding of documents
2022-06-25 16:34:56,396 - top2vec - INFO - Finding dense areas of documents
2022-06-25 16:34:56,472 - top2vec - INFO - Finding topics


In [None]:
# FORM UNI-GRAMS, BI-GRAMS, TRI-GRAMS WITH TRI-GRAM MODEL 
 
topicMODEL_tri = Top2Vec(documents= uss_docs, min_count=5, embedding_model = 'distiluse-base-multilingual-cased', tokenizer = tri_gram)  

2022-06-25 16:35:19,126 - top2vec - INFO - Pre-processing documents for training
2022-06-25 16:35:19,710 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
2022-06-25 16:35:26,682 - top2vec - INFO - Creating joint document/word embedding
2022-06-25 16:35:30,842 - top2vec - INFO - Creating lower dimension embedding of documents
2022-06-25 16:35:47,528 - top2vec - INFO - Finding dense areas of documents
2022-06-25 16:35:47,595 - top2vec - INFO - Finding topics


In [None]:
# SAVE MODELS FOR FUTURE USE

topicMODEL_bi.save("/content/gdrive/MyDrive/data/models/bigram_uss_june26")
topicMODEL_tri.save("/content/gdrive/MyDrive/data/models/trigram_uss_june26")

In [10]:
# LOAD MODEL FROM SAVED MODELS

topicMODEL_bi = Top2Vec.load("/content/gdrive/MyDrive/data/models/bigram_uss_june26")
topicMODEL_tri = Top2Vec.load("/content/gdrive/MyDrive/data/models/trigram_uss_june26")

# **5. Explore Discovered Topics**

## **a) Get Number of Topics**
This will return the number of topics that Top2Vec has found in the data.

In [11]:
# Show the Total Number of Topics (No Topic Reduction Applied)

print(f"Total Number of Topics generated (No Topic Reduction Applied): \n")
print(f"Bi-gram Model : {topicMODEL_bi.get_num_topics(reduced=False)}")
print(f"Tri-gram Model : {topicMODEL_tri.get_num_topics(reduced=False)}")

Total Number of Topics generated (No Topic Reduction Applied): 

Bi-gram Model : 23
Tri-gram Model : 24


## **b) Get Topic Sizes**
This will return the number of documents most similar to each topic. Topics are in decreasing order of size.

Returns:
* topic_sizes: The number of documents most similar to each topic.
* topic_nums: The unique index of every topic will be returned.

### **b.1. Bi-Gram and Tri-Gram Model Topic Sizes**

In [12]:
print("BI-GRAM MODEL where N = 1 and 2")
print("================================\n")
topic_sizes_bi, topic_nums_bi = topicMODEL_bi.get_topic_sizes(reduced=False) 

print(f"Unique index numbers of every topic: {topic_nums_bi} ")
print()
print(f"Number of documents for each unique topic: {topic_sizes_bi} \n")

print("TRI-GRAM MODEL where N = 1, 2 and 3")
print("====================================\n")
topic_sizes_tri, topic_nums_tri = topicMODEL_tri.get_topic_sizes(reduced=False) 
print(f"Unique index numbers of every topic: {topic_nums_tri} ")
print()
print(f"Number of documents for each unique topic: {topic_sizes_tri} ") 

BI-GRAM MODEL where N = 1 and 2

Unique index numbers of every topic: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] 

Number of documents for each unique topic: [539 414 210 161 141 134 134 130 121 120 117 104  98  93  80  67  65  58
  57  49  45  45  35] 

TRI-GRAM MODEL where N = 1, 2 and 3

Unique index numbers of every topic: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] 

Number of documents for each unique topic: [503 425 194 147 146 144 138 122 121 115 106 101  95  93  82  76  65  63
  55  55  54  43  41  33] 


## **c) Get Topics**
This will return the topics in decreasing size.

Returns:

* topic_words: For each topic the top 50 words are returned, in decreasing order of semantic similarity to topic.

* word_scores: For each topic the cosine similarity scores (in decreasing order) of the top 50 words to the topic are returned.

* topic_nums: The unique index of every topic will be returned.

### **c.1.  BI-GRAM Model Topics**

In [13]:
print(f"TOPICS FOR BI-GRAM MODEL: \n")
topic_words_bi, word_scores_bi, topic_nums_bi = topicMODEL_bi.get_topics(topicMODEL_bi.get_num_topics(reduced=False))

for count,(words_bi, scores_bi, num_bi) in enumerate(zip(topic_words_bi, word_scores_bi, topic_nums_bi)):
    print(f"{count+1}. Topic {num_bi+1}") 
    print(f"Topic Words : \n{words_bi} \n")
    # print(f"Cosine Similarity :\n{scores_bi}\n")

TOPICS FOR BI-GRAM MODEL: 

1. Topic 1
Topic Words : 
['print tiket' 'tiket masuknya' 'pembelian tiket' 'tiket express'
 'beli tiketnya' 'membeli tiket' 'tiket masuk' 'ticket' 'ticketing'
 'tiketnya' 'harga tiketnya' 'harga tiket' 'penukaran tiket' 'pesan tiket'
 'scan barcode' 'beli tiket' 'langsung scan' 'tiket' 'barcode langsung'
 'para wisatawan' 'barcode saja' 'express pass' 'print out' 'print'
 'daripada beli' 'tempat wisata' 'scan' 'tunjukkan barcode' 'tinggal scan'
 'train' 'wisatawan' 'destinasi wisata' 'melalui traveloka' 'trip'
 'untuk dikunjungi' 'tukar tiket' 'wisata yang' 'museum' 'beli online'
 'di traveloka' 'beli express' 'agak mahal' 'sangat memudahkan' 'travel'
 'wisata' 'barcode' 'via traveloka' 'lebih murah' 'mengunjungi universal'
 'tujuan wisata'] 

2. Topic 2
Topic Words : 
['studio singapore' 'studio singapura' 'ke singapore' 'singapore'
 'ke singapura' 'singapur' 'di singapura' 'singapura' 'universal studios'
 'universal studio' 'mengunjungi universal' 'di ind

### **c.2. Tri-Gram Model Topics**

In [26]:
print(f"TOPICS FOR TRI-GRAM MODEL : \n")
topic_words_tri, word_scores_tri, topic_nums_tri = topicMODEL_tri.get_topics(topicMODEL_tri.get_num_topics(reduced=False))

for count, (words_tri, scores_tri, num_tri) in enumerate(zip(topic_words_tri, word_scores_tri, topic_nums_tri)):
    print(f"{count+1}. Topic {num_tri+1}") 
    print(f"Topic Words : \n{words_tri}\n")
    # print(f"Cosine Similarity :\n{scores_tri}\n")

TOPICS FOR TRI-GRAM MODEL : 

1. Topic 1
Topic Words : 
['untuk membeli tiket' 'print tiket' 'tiket masuknya' 'dapat tiket'
 'pembelian tiket' 'tiket express' 'beli tiketnya' 'membeli tiket'
 'harga tiket masuk' 'tiket masuk' 'ticket' 'tiket lagi' 'ticketing'
 'beli tiket di traveloka' 'tiketnya' 'beli tiket express'
 'harga tiketnya' 'harga tiket' 'penukaran tiket' 'beli tiket online'
 'langsung scan barcode' 'tinggal scan barcode saja' 'pesan tiket'
 'antri beli tiket' 'beli express pass' 'tinggal scan barcode'
 'scan barcode' 'beli tiket' 'langsung scan' 'beli di traveloka' 'tiket'
 'traveloka sangat membantu' 'barcode langsung' 'para wisatawan'
 'tidak perlu diprint' 'express pass' 'tinggal scan saja'
 'tidak perlu antri beli' 'tunjukkan barcode' 'lebih murah daripada beli'
 'print out' 'beli tiket uss' 'print' 'scan' 'tinggal scan'
 'tempat wisata' 'train' 'wisatawan' 'tempat wisata yang'
 'destinasi wisata']

2. Topic 2
Topic Words : 
['universal studios singapore' 'universal stu

## **d) Search Documents by Topic**
We are going to search by topic; for example Topic 3. 

Returns:

* documents:  The documents in a list, the most similar are first.
* doc_scores:  Semantic similarity of document to topic. The cosine similarity of the document and topic vector.
* doc_ids:  Unique ids of documents. If ids were not given, the index of document in the original corpus.

For each of the returned documents we are going to print its content, score and document number.

### **d.1. Search Documents By Topic Number Using Bi-Gram Model**

In [27]:
# SEARCH DOCUMENTS BY TOPIC NUMBER USING BI-GRAM MODEL 
topic_num_bi = 0
documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= 5, reduced=False) 

print(f"TOPIC NUMBER: {topic_num_bi}\n")
for count, (doc_bi, docscore_bi, doc_id_bi) in enumerate(zip(documents_bi, document_scores_bi, document_ids_bi)):
    print(f"{count}. Document: {doc_id_bi},  Score: {docscore_bi}")
    print("-----------")
    print(doc_bi)
    print("-----------\n")

TOPIC NUMBER: 0

0. Document: 1766,  Score: 0.7136781215667725
-----------
beli tiket melalui traveloka sangat mudah langsung masuk dengan scan barcode saja gak perlu mengantri dapat discount banyak lagi beli 3 gratis 1 tiket masuk terima kasih traveloka sangat membantu sekali
-----------

1. Document: 1621,  Score: 0.7053670883178711
-----------
it was so fun so easy to redeem the ticket only scan it and i belum enter the best so far
-----------

2. Document: 1884,  Score: 0.6885074377059937
-----------
the ticket is cheaper easy to redeem only show e ticket when you entrance thank you traveloka
-----------

3. Document: 1430,  Score: 0.6769388914108276
-----------
we really had a good time spent in uss in addition we also got the ticket from traveloka with a nice discount so its definitely cheaper and super easyone trick eventhough in the ticket said that it is needed to print the ticket you definitely shouldnt because they belum easily scan the qrbarcode thru ur phone time saver and

In [15]:
uss_df.iloc[[781]]

Unnamed: 0,date,source,attraction,reviews,rating
781,2020,traveloka,uss,pembelian tiket melalui traveloka sangat mudah...,8.0


### **d.2. Search Documents By Topic Number Using Tri-Gram Model**

In [28]:
# SEARCH DOCUMENTS BY TOPIC NUMBER USING TRI-GRAM MODEL 
topic_num_tri = 0
documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri, num_docs= 5, reduced=False) 
print(f"TOPIC NUMBER: {topic_num_bi}\n")
for count, (doc_tri, docscore_tri, doc_id_tri) in enumerate(zip(documents_tri, document_scores_tri, document_ids_tri)):
    print(f"{count}. Document: {doc_id_tri},  Score: {docscore_tri}")
    print("-----------")
    print(doc_tri)
    print("-----------\n")

TOPIC NUMBER: 0

0. Document: 1766,  Score: 0.7151544690132141
-----------
beli tiket melalui traveloka sangat mudah langsung masuk dengan scan barcode saja gak perlu mengantri dapat discount banyak lagi beli 3 gratis 1 tiket masuk terima kasih traveloka sangat membantu sekali
-----------

1. Document: 1621,  Score: 0.7036617398262024
-----------
it was so fun so easy to redeem the ticket only scan it and i belum enter the best so far
-----------

2. Document: 1884,  Score: 0.6849204301834106
-----------
the ticket is cheaper easy to redeem only show e ticket when you entrance thank you traveloka
-----------

3. Document: 1430,  Score: 0.6770241260528564
-----------
we really had a good time spent in uss in addition we also got the ticket from traveloka with a nice discount so its definitely cheaper and super easyone trick eventhough in the ticket said that it is needed to print the ticket you definitely shouldnt because they belum easily scan the qrbarcode thru ur phone time saver and

In [17]:
uss_df.iloc[[2317]]

Unnamed: 0,date,source,attraction,reviews,rating
2317,2019,traveloka,uss,redeem tiketnya mudah tinggal scan barcode tan...,8.0


# **6) Constructing Dataframe to Collate Information of Key Topics**

## **6a) BI-GRAM MODEL**

In [29]:
# CONSTRUCT DATAFRAME OF BI-GRAM MODEL WITH KEY INFORMATION FOR EXPORT
# initialize data of lists.
bigram_table = {
              'topic number': [num_bi for num_bi in topic_nums_bi],
              'topic words': [words_bi for words_bi in topic_words_bi],
              'cosine similarity metrics': [scores_bi for scores_bi in word_scores_bi],
              'similar documents': [size_bi for size_bi in topic_sizes_bi],
               }
 
# Create DataFrame
bigram_df = pd.DataFrame(bigram_table)
bigram_df['topic number'] = bigram_df['topic number'].apply(lambda x: x + 1)

# print(bigram_df.head(2))
bigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents
20,21,"[tempatnya bersih, bersih dan, kebersihan, ber...","[0.78127277, 0.7225082, 0.6755369, 0.6518676, ...",45
21,22,"[great experience, pengalaman yang, pengalaman...","[0.84718263, 0.68488556, 0.6509856, 0.63761175...",45
22,23,"[panas banget, panas, cuaca, tempatnya bagus, ...","[0.6878185, 0.6220603, 0.38984385, 0.36566377,...",35


### **6a.1. Build Functions to Expand Column Features in Bi-Gram Dataframe**

In [30]:
def get_document_index(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc)
    document_index = [doc_id_bi for doc_id_bi in document_ids_bi]
    return document_index 

def get_document_scores(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc)
    document_scores = [docscore_bi for docscore_bi in document_scores_bi]
    return document_scores


def get_documents(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc)
    actual_document = [doc_bi for doc_bi in documents_bi]
    return actual_document


In [31]:
bigram_df['document index'] = bigram_df['topic number'].apply(lambda x: get_document_index(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df['document scores'] = bigram_df['topic number'].apply(lambda x: get_document_scores(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df['actual reviews'] = bigram_df['topic number'].apply(lambda x: get_documents(x-1, bigram_df['similar documents'].values[x-1]))

# bigram_df = bigram_df.drop(['topic label'], axis = 1)
bigram_df.head(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews
0,1,"[print tiket, tiket masuknya, pembelian tiket,...","[0.48053187, 0.4744914, 0.46251032, 0.46125066...",539,"[1766, 1621, 1884, 1430, 781, 685, 1516, 2317,...","[0.7136781, 0.7053671, 0.68850744, 0.6769389, ...",[beli tiket melalui traveloka sangat mudah lan...
1,2,"[studio singapore, studio singapura, ke singap...","[0.56663513, 0.54078025, 0.46721637, 0.4585612...",414,"[448, 2695, 1616, 462, 2498, 943, 2834, 2887, ...","[0.7398058, 0.72662103, 0.7210042, 0.71862245,...",[universal studio menjadi salah satu destinasi...
2,3,"[bawa botol, untuk liburan, agak mahal, sangat...","[0.16208713, 0.14814812, 0.14463222, 0.1359866...",210,"[558, 1759, 1345, 3010, 1866, 2891, 2213, 766,...","[0.5157841, 0.509369, 0.5049091, 0.50187576, 0...",[kami perginya hari biasa berkesan sekali kare...


In [33]:
# for count, (index, review) in enumerate(zip(bigram_df['document index'][28], bigram_df['actual reviews'][28])):
#   print(f"{count}: {index} = {review})")

bigram_df.to_excel(f'/content/gdrive/MyDrive/data/export/bigram_df.xlsx', index=False)

## **6b) TRI-GRAM MODEL**

In [32]:
# CONSTRUCT DATAFRAME CONTAINING KEY INFORMATION FOR EXPORT
# initialize data of lists.
trigram_table = {
                'topic number': [num_tri for num_tri in topic_nums_tri],
                'topic words': [words_tri for words_tri in topic_words_tri],
                'cosine similarity metrics': [scores_tri for scores_tri in word_scores_tri],
                'similar documents': [size_tri for size_tri in topic_sizes_tri],
                 }

# Create DataFrame
trigram_df = pd.DataFrame(trigram_table)

trigram_df['topic number'] = trigram_df['topic number'].apply(lambda x: x + 1)
trigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents
21,22,"[pengalaman yang sangat menyenangkan, pengalam...","[0.8980728, 0.86042047, 0.85842794, 0.85593617...",43
22,23,"[tempatnya bersih, bersih dan, kebersihan, ber...","[0.7736695, 0.72027516, 0.6680286, 0.64303523,...",41
23,24,"[panas banget, panas, cuaca, tempat yang sanga...","[0.6955763, 0.6286695, 0.39588356, 0.37437475,...",33


### **6b.1. Build Functions to Expand Column Features in Tri-Gram Dataframe**

In [34]:
def get_document_index_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs)
    document_index = [doc_id_tri for doc_id_tri in document_ids_tri]
    return document_index 

def get_document_scores_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs)
    document_scores = [docscore_tri for docscore_tri in document_scores_tri]
    return document_scores


def get_documents_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs)
    actual_document = [doc_tri for doc_tri in documents_tri]
    return actual_document


In [35]:
trigram_df['document index'] = trigram_df['topic number'].apply(lambda x: get_document_index_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df['document scores'] = trigram_df['topic number'].apply(lambda x: get_document_scores_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df['actual reviews'] = trigram_df['topic number'].apply(lambda x: get_documents_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews
21,22,"[pengalaman yang sangat menyenangkan, pengalam...","[0.8980728, 0.86042047, 0.85842794, 0.85593617...",43,"[1566, 2874, 467, 545, 1555, 1750, 1011, 540, ...","[0.85168314, 0.7343966, 0.6842847, 0.6584915, ...","[great best experience i ever had, one of the ..."
22,23,"[tempatnya bersih, bersih dan, kebersihan, ber...","[0.7736695, 0.72027516, 0.6680286, 0.64303523,...",41,"[378, 348, 1807, 1460, 341, 1279, 403, 1079, 1...","[0.60679775, 0.5852969, 0.5476792, 0.54500735,...","[tempat yang meriah dan bersih benas polusi, i..."
23,24,"[panas banget, panas, cuaca, tempat yang sanga...","[0.6955763, 0.6286695, 0.39588356, 0.37437475,...",33,"[947, 2728, 1154, 2448, 1199, 658, 2408, 953, ...","[0.5307628, 0.5059658, 0.502086, 0.490764, 0.4...","[panas banget kalau siang tapi bagus, bagus ta..."


In [38]:
# for count, (index, review) in enumerate(zip(trigram_df['document index'][0], trigram_df['actual reviews'][0])):
#   print(f"{count}: {index} = {review})")

trigram_df.to_excel(f'/content/gdrive/MyDrive/data/export/trigram_df.xlsx', index=False)

# **7. Defining Topic Labels on Reviews** 

## **7a) User-Define Topic Labels** 

### **7a.1:  BI-GRAM Model** 

In [60]:
# Create the dictionary containing the data of the new column
bigram_col_dict = { 'Topic 1': 'ticketing',                     'Topic 2': 'must visit place',
                    'Topic 3': 'visitors experiences',          'Topic 4': 'waiting times',
                    'Topic 5': 'amusement rides',               'Topic 6': 'amusement rides',
                    'Topic 7': 'visitors experiences',          'Topic 8': 'visitors sentiments',
                    'Topic 9': 'must visit place',              'Topic 10': "great choice for families",
                    'Topic 11': 'vacation choice',              'Topic 12': 'likely to return',
                    'Topic 13': 'great place for photos',       'Topic 14': 'great choice for families',
                    'Topic 15': 'great choice for families',    'Topic 16': 'visitor sentiments',
                    'Topic 17': 'vacation choice',              'Topic 18': 'vacation choice',
                    'Topic 19': 'visitors experiences',         'Topic 20': 'great choice for families',
                    'Topic 21': 'features visitors like',       'Topic 22': 'likely to return',
                    'Topic 23': 'visitors experiences',                
                    }

# Assign the values of the dictionary as the values of the new column
bigram_df['topic label'] = bigram_col_dict.values()
bigram_df.tail()

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
18,19,"[hujan jadi, jas hujan, hujan, cuaca, water wo...","[0.6622517, 0.6202771, 0.59615654, 0.31625953,...",57,"[2059, 3000, 1643, 2127, 2101, 1629, 848, 778,...","[0.62204134, 0.610506, 0.5848205, 0.5805275, 0...",[sempat hujan walau sebentar untungnya setelah...,visitors experiences
19,20,"[untuk anak, anakanak senang, buat anakanak, b...","[0.5115683, 0.5008489, 0.47810382, 0.45241556,...",49,"[2971, 2221, 744, 1140, 635, 2557, 413, 2211, ...","[0.73169804, 0.6672112, 0.62780046, 0.5950476,...","[good destination for kids we all enjoy it, me...",great choice for families
20,21,"[tempatnya bersih, bersih dan, kebersihan, ber...","[0.78127277, 0.7225082, 0.6755369, 0.6518676, ...",45,"[378, 348, 1460, 1807, 341, 1279, 403, 1079, 1...","[0.61077255, 0.5825862, 0.54547447, 0.5447869,...","[tempat yang meriah dan bersih benas polusi, i...",features visitors like
21,22,"[great experience, pengalaman yang, pengalaman...","[0.84718263, 0.68488556, 0.6509856, 0.63761175...",45,"[1566, 2874, 467, 545, 1555, 1750, 1011, 662, ...","[0.8426788, 0.7263293, 0.6884376, 0.6562073, 0...","[great best experience i ever had, one of the ...",likely to return
22,23,"[panas banget, panas, cuaca, tempatnya bagus, ...","[0.6878185, 0.6220603, 0.38984385, 0.36566377,...",35,"[947, 1154, 2728, 2448, 658, 1199, 2408, 953, ...","[0.5249679, 0.5028735, 0.5001302, 0.4937374, 0...","[panas banget kalau siang tapi bagus, tempat y...",visitors experiences


In [61]:
# CREATE COPY OF ORIGINAL DATAFRAME
uss_DF_BI = uss_df.copy()
uss_DF_BI.tail(3)

Unnamed: 0,date,source,attraction,reviews,rating
3014,2017,traveloka,uss,biasa saja banyak permainan yang sama di setia...,6.0
3015,2017,traveloka,uss,kebetulan pada saat kunjungan hujan penggunaan...,6.0
3016,2017,tripadvisor,uss,cocok untuk remaja visit ke atraksi di uss seb...,6.0


In [62]:
print([i for i in range(topicMODEL_bi.get_num_topics(reduced=False))])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [63]:
# FUNCTION TO CONNECT TOPIC LABELS TO INDEX LIST
def get_indexLIST(i):
      topic = [doc_ind for doc_ind in bigram_df['document index'][i]]
      return topic

In [64]:
for i in range(topicMODEL_bi.get_num_topics(reduced=False)):
    uss_DF_BI.loc[get_indexLIST(i), 'topic label'] = bigram_df['topic label'][i]
    

In [65]:
# FUNCTION TO CONNECT DOCUMENT COSINE SCORE TO INDEX LIST
def get_scores(i):
    score = [doc_score for doc_score in bigram_df['document scores'][i]]
    return score

In [66]:
for i in range(topicMODEL_bi.get_num_topics(reduced=False)):
    uss_DF_BI.loc[get_indexLIST(i), 'cosine score review/topic'] = get_scores(i) 

In [67]:
uss_DF_BI.reset_index(drop=True)
uss_DF_BI

Unnamed: 0,date,source,attraction,reviews,rating,topic label,cosine score review/topic
0,2022,google_reviews,uss,luas bangetbanyak spot fotonyawahananya keren,10.0,must visit place,0.368599
1,2022,google_reviews,uss,tempat yang wajib dikunjungi ketika ke singapura,10.0,vacation choice,0.359690
2,2022,google_reviews,uss,good banyak spot fotonyatempat shalatnya sebel...,10.0,great place for photos,0.465302
3,2022,google_reviews,uss,seruuu gaeeessss meegilan,10.0,visitors experiences,0.378129
4,2022,google_reviews,uss,tempat yang sangat bagus untuk berlibur saya s...,10.0,must visit place,0.537937
...,...,...,...,...,...,...,...
3012,2017,tripadvisor,uss,amazing gak pernah nyesel masuk ketempat ini w...,8.0,amusement rides,0.494483
3013,2017,traveloka,uss,biasa saja untuk permainan yang menantang sang...,6.0,amusement rides,0.292150
3014,2017,traveloka,uss,biasa saja banyak permainan yang sama di setia...,6.0,amusement rides,0.331226
3015,2017,traveloka,uss,kebetulan pada saat kunjungan hujan penggunaan...,6.0,visitors experiences,0.425192


In [68]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 1 - GENERATE RANDOM SAMPLE TABLE FROM NEW DATAFRAME 
bigram_df.sample(4)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
12,13,"[spot foto, foto foto, fotofoto, bisa berfoto,...","[0.4962579, 0.49174786, 0.48143056, 0.44849306...",98,"[2880, 171, 296, 2075, 1682, 1180, 1741, 803, ...","[0.58654296, 0.58290195, 0.57051057, 0.5160266...",[unforgettable place tempat ini sangat bagus b...,great place for photos
22,23,"[panas banget, panas, cuaca, tempatnya bagus, ...","[0.6878185, 0.6220603, 0.38984385, 0.36566377,...",35,"[947, 1154, 2728, 2448, 658, 1199, 2408, 953, ...","[0.5249679, 0.5028735, 0.5001302, 0.4937374, 0...","[panas banget kalau siang tapi bagus, tempat y...",visitors experiences
7,8,"[sangat menyenangkan, menyenangkan sekali, san...","[0.8143233, 0.7388338, 0.71513903, 0.71454847,...",130,"[1586, 1798, 393, 1936, 1166, 671, 1069, 392, ...","[0.7915581, 0.74954116, 0.71967494, 0.67786956...","[amazing very fun love it, really nice fun and...",visitors sentiments
20,21,"[tempatnya bersih, bersih dan, kebersihan, ber...","[0.78127277, 0.7225082, 0.6755369, 0.6518676, ...",45,"[378, 348, 1460, 1807, 341, 1279, 403, 1079, 1...","[0.61077255, 0.5825862, 0.54547447, 0.5447869,...","[tempat yang meriah dan bersih benas polusi, i...",features visitors like


In [70]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 2 - SELECT INDEXES FROM ORIGINAL DATAFRAME TO COMPARE

uss_DF_BI.iloc[[2880, 947, 1586, 378]] 

Unnamed: 0,date,source,attraction,reviews,rating,topic label,cosine score review/topic
2880,2017,tripadvisor,uss,unforgettable place tempat ini sangat bagus ba...,10.0,great place for photos,0.586543
947,2019,google_reviews,uss,panas banget kalau siang tapi bagus,10.0,visitors experiences,0.524968
1586,2019,traveloka,uss,amazing very fun love it,10.0,visitors sentiments,0.791558
378,2020,google_reviews,uss,tempat yang meriah dan bersih benas polusi,10.0,features visitors like,0.610773


In [71]:
uss_DF_BI.to_excel(f'/content/gdrive/MyDrive/data/export/uss_DF_BI.xlsx', index=False)

In [72]:
# CREATE COPY OF ORIGINAL DATAFRAME
uss_DF = uss_df.copy()
uss_DF['topic_label'] = 'Topic'

In [73]:
# TO CROSS-CHECK ABOVE LABELING. ASSIGN TOPIC LABELS TO ORIGINAL DATAFRAME FOR BI-GRAM MODEL

uss_DF.loc[[a for a in bigram_df['document index'][0]], 'topic_label'] = bigram_df['topic label'][0]
uss_DF.loc[[a for a in bigram_df['document index'][1]], 'topic_label'] = bigram_df['topic label'][1]
uss_DF.loc[[a for a in bigram_df['document index'][2]], 'topic_label'] = bigram_df['topic label'][2]
uss_DF.loc[[a for a in bigram_df['document index'][3]], 'topic_label'] = bigram_df['topic label'][3]
uss_DF.loc[[a for a in bigram_df['document index'][4]], 'topic_label'] = bigram_df['topic label'][4]
uss_DF.loc[[a for a in bigram_df['document index'][5]], 'topic_label'] = bigram_df['topic label'][5]
uss_DF.loc[[a for a in bigram_df['document index'][6]], 'topic_label'] = bigram_df['topic label'][6]
uss_DF.loc[[a for a in bigram_df['document index'][7]], 'topic_label'] = bigram_df['topic label'][7]
uss_DF.loc[[a for a in bigram_df['document index'][8]], 'topic_label'] = bigram_df['topic label'][8]
uss_DF.loc[[a for a in bigram_df['document index'][9]], 'topic_label'] = bigram_df['topic label'][9]
uss_DF.loc[[a for a in bigram_df['document index'][10]], 'topic_label'] = bigram_df['topic label'][10]
uss_DF.loc[[a for a in bigram_df['document index'][11]], 'topic_label'] = bigram_df['topic label'][11]
uss_DF.loc[[a for a in bigram_df['document index'][12]], 'topic_label'] = bigram_df['topic label'][12]
uss_DF.loc[[a for a in bigram_df['document index'][13]], 'topic_label'] = bigram_df['topic label'][13]
uss_DF.loc[[a for a in bigram_df['document index'][14]], 'topic_label'] = bigram_df['topic label'][14]
uss_DF.loc[[a for a in bigram_df['document index'][15]], 'topic_label'] = bigram_df['topic label'][15]
uss_DF.loc[[a for a in bigram_df['document index'][16]], 'topic_label'] = bigram_df['topic label'][16]
uss_DF.loc[[a for a in bigram_df['document index'][17]], 'topic_label'] = bigram_df['topic label'][17]
uss_DF.loc[[a for a in bigram_df['document index'][18]], 'topic_label'] = bigram_df['topic label'][18]
uss_DF.loc[[a for a in bigram_df['document index'][19]], 'topic_label'] = bigram_df['topic label'][19]
uss_DF.loc[[a for a in bigram_df['document index'][20]], 'topic_label'] = bigram_df['topic label'][20] 
uss_DF.loc[[a for a in bigram_df['document index'][21]], 'topic_label'] = bigram_df['topic label'][21]
uss_DF.loc[[a for a in bigram_df['document index'][22]], 'topic_label'] = bigram_df['topic label'][22]


In [74]:
uss_DF.reset_index(drop=True)

Unnamed: 0,date,source,attraction,reviews,rating,topic_label
0,2022,google_reviews,uss,luas bangetbanyak spot fotonyawahananya keren,10.0,must visit place
1,2022,google_reviews,uss,tempat yang wajib dikunjungi ketika ke singapura,10.0,vacation choice
2,2022,google_reviews,uss,good banyak spot fotonyatempat shalatnya sebel...,10.0,great place for photos
3,2022,google_reviews,uss,seruuu gaeeessss meegilan,10.0,visitors experiences
4,2022,google_reviews,uss,tempat yang sangat bagus untuk berlibur saya s...,10.0,must visit place
...,...,...,...,...,...,...
3012,2017,tripadvisor,uss,amazing gak pernah nyesel masuk ketempat ini w...,8.0,amusement rides
3013,2017,traveloka,uss,biasa saja untuk permainan yang menantang sang...,6.0,amusement rides
3014,2017,traveloka,uss,biasa saja banyak permainan yang sama di setia...,6.0,amusement rides
3015,2017,traveloka,uss,kebetulan pada saat kunjungan hujan penggunaan...,6.0,visitors experiences


### **7a.2  TRI-GRAM Model** 

In [77]:
# Create the dictionary containing the data of the new column
trigram_col_dict = { 'Topic 1': 'ticketing',                    'Topic 2': 'must visit place',
                    'Topic 3': 'visitors experiences',          'Topic 4': 'amusement rides',
                    'Topic 5': 'waiting times',                 'Topic 6': 'amusement rides',
                    'Topic 7': 'features visitors like',        'Topic 8': 'visitors sentiments',
                    'Topic 9': 'must visit place',              'Topic 10': "vacation choice",
                    'Topic 11': 'great choice for families',    'Topic 12': 'likely to return',
                    'Topic 13': 'great choice for families',    'Topic 14': 'great place for photos',
                    'Topic 15': 'great choice for families',    'Topic 16': 'features visitors like',
                    'Topic 17': 'vacation choice',              'Topic 18': 'features visitors like',
                    'Topic 19': 'visitors experiences',         'Topic 20': 'great choice for families',
                    'Topic 21': 'vacation choice',              'Topic 22': 'likely to return',
                    'Topic 23': 'features visitors like',       'Topic 24': 'visitors experiences',        
                    }
                   
# Assign the values of the dictionary as the values of the new column
trigram_df['topic label'] = trigram_col_dict.values()
trigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
21,22,"[pengalaman yang sangat menyenangkan, pengalam...","[0.8980728, 0.86042047, 0.85842794, 0.85593617...",43,"[1566, 2874, 467, 545, 1555, 1750, 1011, 540, ...","[0.85168314, 0.7343966, 0.6842847, 0.6584915, ...","[great best experience i ever had, one of the ...",likely to return
22,23,"[tempatnya bersih, bersih dan, kebersihan, ber...","[0.7736695, 0.72027516, 0.6680286, 0.64303523,...",41,"[378, 348, 1807, 1460, 341, 1279, 403, 1079, 1...","[0.60679775, 0.5852969, 0.5476792, 0.54500735,...","[tempat yang meriah dan bersih benas polusi, i...",features visitors like
23,24,"[panas banget, panas, cuaca, tempat yang sanga...","[0.6955763, 0.6286695, 0.39588356, 0.37437475,...",33,"[947, 2728, 1154, 2448, 1199, 658, 2408, 953, ...","[0.5307628, 0.5059658, 0.502086, 0.490764, 0.4...","[panas banget kalau siang tapi bagus, bagus ta...",visitors experiences


In [78]:
# CREATE COPY OF ORIGINAL DATAFRAME
uss_DF_TRI = uss_df.copy()
uss_DF_TRI.tail(3)

Unnamed: 0,date,source,attraction,reviews,rating
3014,2017,traveloka,uss,biasa saja banyak permainan yang sama di setia...,6.0
3015,2017,traveloka,uss,kebetulan pada saat kunjungan hujan penggunaan...,6.0
3016,2017,tripadvisor,uss,cocok untuk remaja visit ke atraksi di uss seb...,6.0


In [79]:
print([i for i in range(topicMODEL_tri.get_num_topics(reduced=False))])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [80]:
# FUNCTION TO CONNECT TOPIC LABELS TO INDEX LIST
def get_indexLIST(i):
      topic = [doc_ind for doc_ind in trigram_df['document index'][i]]
      return topic

In [81]:
for i in range(topicMODEL_tri.get_num_topics(reduced=False)):
    uss_DF_TRI.loc[get_indexLIST(i), 'topic label'] = trigram_df['topic label'][i]
    

In [82]:
# FUNCTION TO CONNECT DOCUMENT COSINE SCORE TO INDEX LIST
def get_scores(i):
    score = [doc_score for doc_score in trigram_df['document scores'][i]]
    return score

In [83]:
for i in range(topicMODEL_tri.get_num_topics(reduced=False)):
    uss_DF_TRI.loc[get_indexLIST(i), 'cosine score review/topic'] = get_scores(i) 

In [84]:
uss_DF_TRI.reset_index(drop=True)
uss_DF_TRI

Unnamed: 0,date,source,attraction,reviews,rating,topic label,cosine score review/topic
0,2022,google_reviews,uss,luas bangetbanyak spot fotonyawahananya keren,10.0,amusement rides,0.369613
1,2022,google_reviews,uss,tempat yang wajib dikunjungi ketika ke singapura,10.0,vacation choice,0.329048
2,2022,google_reviews,uss,good banyak spot fotonyatempat shalatnya sebel...,10.0,great place for photos,0.476026
3,2022,google_reviews,uss,seruuu gaeeessss meegilan,10.0,amusement rides,0.348943
4,2022,google_reviews,uss,tempat yang sangat bagus untuk berlibur saya s...,10.0,vacation choice,0.541449
...,...,...,...,...,...,...,...
3012,2017,tripadvisor,uss,amazing gak pernah nyesel masuk ketempat ini w...,8.0,amusement rides,0.495568
3013,2017,traveloka,uss,biasa saja untuk permainan yang menantang sang...,6.0,features visitors like,0.303512
3014,2017,traveloka,uss,biasa saja banyak permainan yang sama di setia...,6.0,features visitors like,0.337319
3015,2017,traveloka,uss,kebetulan pada saat kunjungan hujan penggunaan...,6.0,visitors experiences,0.427088


In [85]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 1 - GENERATE RANDOM SAMPLE TABLE FROM NEW DATAFRAME 
trigram_df.sample(4)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
2,3,"[air minum gratis, bawa botol, air minum, boto...","[0.21186632, 0.19986334, 0.18350515, 0.1665753...",194,"[558, 3010, 2891, 1345, 2213, 766, 1866, 3002,...","[0.51386213, 0.51201916, 0.50724167, 0.504512,...",[kami perginya hari biasa berkesan sekali kare...,visitors experiences
16,17,"[para wisatawan, tempat yang wajib dikunjungi,...","[0.65556145, 0.64901084, 0.62958825, 0.6106112...",65,"[2603, 2291, 1173, 1232, 2992, 2946, 252, 979,...","[0.65686965, 0.61337245, 0.6081433, 0.5887345,...",[nice place tempat yang wajib dikunjungi oleh ...,vacation choice
5,6,"[wahana transformer, wahana transformers, adal...","[0.6466844, 0.6341004, 0.60494953, 0.6013165, ...",144,"[555, 1546, 1347, 2967, 2454, 3004, 1418, 1605...","[0.6763953, 0.6240482, 0.6180368, 0.5923507, 0...",[very good amazing so nice transformer are the...,amusement rides
9,10,"[tempat yang sangat bagus, tempat yang menyena...","[0.89119315, 0.86382794, 0.8293486, 0.8041558,...",115,"[1545, 1244, 1764, 1255, 2052, 2961, 1722, 193...","[0.73558575, 0.6934129, 0.6352795, 0.63052154,...","[good place for fun there we was enjoy it, goo...",vacation choice


In [86]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 2 - SELECT INDEXES FROM ORIGINAL DATAFRAME TO COMPARE

uss_DF_TRI.iloc[[558, 2601, 555, 1545]] 

Unnamed: 0,date,source,attraction,reviews,rating,topic label,cosine score review/topic
558,2020,traveloka,uss,kami perginya hari biasa berkesan sekali karen...,10.0,visitors experiences,0.513862
2601,2018,google_reviews,uss,amazing place senang anak berlibur dsana,10.0,vacation choice,0.559567
555,2020,traveloka,uss,very good amazing so nice transformer are the ...,10.0,amusement rides,0.676395
1545,2019,traveloka,uss,good place for fun there we was enjoy it,10.0,vacation choice,0.735586


In [88]:
uss_DF_TRI.to_excel(f'/content/gdrive/MyDrive/data/export/uss_DF_TRI.xlsx', index=False)

# **8. Fine Tuning Topic Labels or Summarization Tool**


## **8a) Perform Keywords Search for User Defined Topic Labels Verification**
We are going to search for topics most similar to the word "enjoyable", "experience", "return back",  in Indonesian language; which is "menyenangkan" , "pengalaman", "kembali". 

Returns:

* topic_words:  For each topic the top 50 words are returned, in order of semantic similarity to topic.

* word_scores:  For each topic the cosine similarity scores of the top 50 words to the topic are returned.

* topic_scores:  For each topic the cosine similarity to the search keywords will be returned.

* topic_nums:  The unique index of every topic will be returned.

### **8a.1.  BI-GRAM Model**

In [None]:
# GENERATE TOPIC WORDS FOR BI-GRAM MODEL WITH SEARCH KEYWORDS

topic_keywords_bi, keyword_scores_bi, topic_scores_bi, keytopic_nums_bi = topicMODEL_bi.search_topics(keywords=["kembali", "pengalaman", "menyenangkan"], num_topics=5, keywords_neg=None, reduced=False) 

print(f"Unique index of every topic returned: {keytopic_nums_bi} \n") 
print(f"Cosine similarity score of each Topic to the search keywords : {topic_scores_bi} \n")   

for count, (topicword_bi, wordscore_bi, topicscore_bi, topicnum_bi) in enumerate(zip(topic_keywords_bi, keyword_scores_bi, topic_scores_bi, keytopic_nums_bi)):
    print(f"{count+1}  Topic: {topicnum_bi}   ")
    print(f"Cosine Similarity Score of Topic relative to Search Words: {topicscore_bi} \n") 
    print(f"Topic Words : \n{topicword_bi} \n")
    # print(f"Cosine Similarity Score of Words relative to Topic :\n{wordscore_bi}\n")


### **8a.2.   TRI-GRAM Model**

In [None]:
# GENERATE TOPIC WORDS FOR TRI-GRAM MODEL WITH SEARCH KEYWORDS

topic_keywords_tri, keyword_scores_tri, topic_scores_tri, keytopic_nums_tri = topicMODEL_tri.search_topics(keywords=["kembali", "pengalaman", "menyenangkan"], num_topics=5, keywords_neg=None, reduced=False) 

print(f"Unique index of every topic returned: {keytopic_nums_tri} \n") 
print(f"Cosine similarity score of each Topic to the search keywords : {topic_scores_tri} \n")  

for count, (topicword_tri, wordscore_tri, topicscore_tri, topicnum_tri) in enumerate(zip(topic_keywords_tri, keyword_scores_tri, topic_scores_tri, keytopic_nums_tri)):
    print(f"{count}  Topic: {topicnum_tri} ")
    print(f"Cosine Similarity Score of Topic relative to Search Words: {topicscore_tri} \n") 
    print(f"Topic Words : \n{topicword_tri} \n")
    # print(f"Cosine Similarity Score of Words relative to Topic :\n{wordscore_tri}\n")


## **8b) Semantic Search Documents by Keywords**
Search documents for content semantically similar.

In [None]:
# Search Documents by Key Words with Bi-Gram Model
documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_keywords(keywords=["kembali", "pengalaman", "menyenangkan"], num_docs=5)

for count, (doc_bi, score_bi, doc_id_bi) in enumerate(zip(documents_bi, document_scores_bi, document_ids_bi)):
    print(f"{count}. Document: {doc_id_bi}, Score: {score_bi}")
    print("-----------")
    print(doc_bi)
    print("-----------")
    print()

In [None]:
# Search Documents by Key Words with Tri-Gram Model
documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_keywords(keywords=["kembali", "pengalaman", "menyenangkan"], num_docs=5)

for count, (doc_tri, score_tri, doc_id_tri) in enumerate(zip(documents_tri, document_scores_tri, document_ids_tri)):
    print(f"{count}. Document: {doc_id_tri}, Score: {score_tri}")
    print("-----------")
    print(doc_tri)
    print("-----------")
    print()