# **Topic Modeling and Search with Top2Vec :** 
# **Skyline Luge**

# **1. Import and Setup**

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import sys
sys.path.append('/content/gdrive/My Drive/nlp')

## **a) To install Top2Vec library and pre-trained BERT sentence transformer options:**

In [5]:
# !pip install top2vec[sentence_transformers]
!pip install pynndescent

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pynndescent
  Downloading pynndescent-0.5.7.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.9 MB/s 
Building wheels for collected packages: pynndescent
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for pynndescent: filename=pynndescent-0.5.7-py3-none-any.whl size=54286 sha256=c4ba96c924df4a78a7c1675a8251921783dfca7e50c3b53a8160abb71d8d0406
  Stored in directory: /root/.cache/pip/wheels/7f/2a/f8/7bd5dcec71bd5c669f6f574db3113513696b98f3f9b51f496c
Successfully built pynndescent
Installing collected packages: pynndescent
Successfully installed pynndescent-0.5.7



## **b) Import Libraries**

In [6]:
from top2vec import Top2Vec 

import pandas as pd
import numpy as np

# **2. Import Dataset**

In [7]:
df = pd.read_excel("/content/gdrive/MyDrive/data/clean/formatted_reviews.xlsx")
df.sample(4)

Unnamed: 0,date,source,attraction,reviews,rating
3322,2018.0,tripadvisor,uss,destinasi yang wajib anda kunjungi di singapor...,10.0
2893,2019.0,traveloka,uss,voucher traveloka langsung discan barcodenya d...,10.0
3016,2019.0,traveloka,uss,wahana dan atraksi sangat menarik dan sesuai t...,9.0
3967,2019.0,klook,cable_car,voucher mudah untuk digunakan dan merupakan pe...,10.0


In [8]:
df['date'] = df['date'].astype(int)

In [9]:
luge_df = df[df['attraction'] == 'skyline_luge']
luge_df = luge_df.sort_values(by=['date','rating'],ascending=[False,False])
# resetting index
luge_df = luge_df.reset_index(drop=True)
luge_df.tail(4)

Unnamed: 0,date,source,attraction,reviews,rating
258,2017,tripadvisor,skyline_luge,because once is never enough jika teman2 ada y...,10.0
259,2017,tripadvisor,skyline_luge,awesone luge inilah permainan yang paling meny...,10.0
260,2017,tripadvisor,skyline_luge,fun amazing view kami mencoba skyline seperti ...,8.0
261,2017,tripadvisor,skyline_luge,boleh di coba ini buat yang seneng ngebut ini ...,8.0


In [10]:
luge_docs = luge_df.loc[:, "reviews"].astype(str).values.tolist()
luge_docs[:7]

['it was a great memories travelling right before covid in 2020 senang deh beli tiket awal harga pun lebih murah klau ada voucher code or anything',
 'main sekali akan ketagihan kita beli paket dari traveloka yang bisa main 2 kali sama skyride ketika main ini kalian akan merasa waktu singkat banget yang bikin mau lagi',
 'saya kesini bersama keluarga awalnya anak saya takut untuk naik wahananya tapi setelah mencoba 1x jadi pengen lagi kalau mau lihat hasil foto bisa scan di barcode yang ada di helm dan kalau suka bisa print untuk dibawa pulang',
 'permainan simple yang disukai dari family hingga anakanak permainan ini save buat anak2 untuk skyride nya untuk anak2 atau dewasa yang takut ketinggian lebih baik naek monorail saja pas mau main luge nya lagi he',
 'assiiikk dan sangat seru pengalaman yang menyenangkan',
 'seru bangeeetttapalagi lugenya bisa ngebut tanpa takut di bukti pelanggaran hehetp yang punya panic attack hati2 naik skyline karena minim safetynya',
 'tempat parkir yang 

# **3. Create and Train N-Gram model (Bi-Gram and Tri-Gram)**
* Top2Vec uses Gensim **simple_preprocess** to do tokenization. 
* To create Trigrams and Bigrams, pass a customized tokenizer into Top2Vec tokenizer parameter. 
* Use Gensim to train this N-gram model. Documents passed to the Phrases function to inspect for N-grams text corpus.

In [11]:
import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

# Build the bigram and trigram models 
sentence_stream = [doc.split(" ") for doc in luge_docs]
bigram = Phrases(sentence_stream, min_count=5, threshold=5, delimiter=b' ')
trigram = Phrases(bigram[sentence_stream], threshold=5, delimiter=b' ')

bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)



In [12]:
# Create Bi-Grams
def bi_gram(doc):
    sentence_stream = simple_preprocess(strip_tags(str(doc)), deacc=True)
    return bigram_phraser[sentence_stream] 


# Create Bi-Grams and Tri-Grams
def tri_gram(doc):
    sentence_stream = simple_preprocess(strip_tags(str(doc)), deacc=True)
    gen_bigram = bigram_phraser[sentence_stream]
    gen_trigram = trigram_phraser[bigram_phraser[sentence_stream]]
    return gen_trigram

# **4. Train Top2Vec Model**

### **Parameters**:
* **Documents**: Input corpus, should be a list of strings.
* **Min_count**: (Optional, default 50) Ignores all words with total frequency lower than this. For smaller corpora a smaller min_count will be necessary.
* **Embedding_model** (string or callable) – The valid string options are: doc2vec , universal-sentence-encoder , universal-sentence-encoder-multilingual,distiluse-base-multilingual-cased , all-MiniLM-L6-v2 , paraphrase-multilingual-MiniLM-L12-v2 
* **tokenizer** (callable (Optional, default None)) – Override the default tokenization method. If None then gensim.utils.simple_preprocess will be used. ***Tokenizer must take a document and return a list of tokens***.

In [13]:
# FORM UNI-GRAMS, BI-GRAMS WITH BI-GRAM MODEL 

topicMODEL_bi = Top2Vec(documents= luge_docs, min_count=5, embedding_model = 'distiluse-base-multilingual-cased', tokenizer= bi_gram)  

2022-06-26 05:09:56,176 - top2vec - INFO - Pre-processing documents for training
2022-06-26 05:09:56,248 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/528 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

2022-06-26 05:10:36,212 - top2vec - INFO - Creating joint document/word embedding
2022-06-26 05:10:58,453 - top2vec - INFO - Creating lower dimension embedding of documents
2022-06-26 05:11:09,754 - top2vec - INFO - Finding dense areas of documents
2022-06-26 05:11:09,773 - top2vec - INFO - Finding topics


In [14]:
# FORM UNI-GRAMS, BI-GRAMS, TRI-GRAMS WITH TRI-GRAM MODEL 
 
topicMODEL_tri = Top2Vec(documents= luge_docs, min_count=5, embedding_model = 'distiluse-base-multilingual-cased', tokenizer = tri_gram)  

2022-06-26 05:11:09,800 - top2vec - INFO - Pre-processing documents for training
2022-06-26 05:11:09,936 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
2022-06-26 05:11:13,609 - top2vec - INFO - Creating joint document/word embedding
2022-06-26 05:11:34,293 - top2vec - INFO - Creating lower dimension embedding of documents
2022-06-26 05:11:38,457 - top2vec - INFO - Finding dense areas of documents
2022-06-26 05:11:38,469 - top2vec - INFO - Finding topics


In [15]:
# SAVE MODELS FOR FUTURE USE

topicMODEL_bi.save("/content/gdrive/MyDrive/data/models/bigram_luge_june26")

topicMODEL_tri.save("/content/gdrive/MyDrive/data/models/trigram_luge_june26")

In [None]:
# LOAD MODEL FROM SAVED MODELS

topicMODEL_bi = Top2Vec.load("/content/gdrive/MyDrive/data/models/bigram_luge_june26")

topicMODEL_tri = Top2Vec.load("/content/gdrive/MyDrive/data/models/trigram_luge_june26")

# **5. Perform Topic Reduction** 

* Reduce the number of topics discovered by Top2Vec.
* The most representative topics of the corpus will be found, by iteratively merging each smallest topic to the most similar topic until num_topics is reached. 
* Get the hierarchy of reduced topics. The mapping of each original topic to the reduced topics is returned.

In [None]:
# BI-GRAM MODEL TOPIC REDUCTION
topicMODEL_bi.hierarchical_topic_reduction(num_topics=10) 

topicMODEL_bi.get_topic_hierarchy()

In [None]:
# TRI-GRAM MODEL TOPIC REDUCTION 
topicMODEL_tri.hierarchical_topic_reduction(num_topics=10) 

topicMODEL_tri.get_topic_hierarchy()

# **6. Explore Discovered Topics**

## **6a) Get Number of Topics**
This will return the number of topics that Top2Vec has found in the data.

In [17]:
# Show the Total Number of Topics 

print(f"Total Number of Topics generated (Topic Reduction Applied): \n")
print(f"Bi-gram Model : {topicMODEL_bi.get_num_topics(reduced=False)}")
print(f"Tri-gram Model : {topicMODEL_tri.get_num_topics(reduced=False)}")

Total Number of Topics generated (Topic Reduction Applied): 

Bi-gram Model : 2
Tri-gram Model : 2


## **6b) Get Topic Sizes**
This will return the number of documents most similar to each topic. Topics are in decreasing order of size.

Returns:
* topic_sizes: The number of documents most similar to each topic.
* topic_nums: The unique index of every topic will be returned.

### **6b.1. Bi-Gram and Tri-Gram Model Topic Sizes**

In [18]:
print("BI-GRAM MODEL where N = 1 and 2")
print("================================\n")
topic_sizes_bi, topic_nums_bi = topicMODEL_bi.get_topic_sizes(reduced=False) 

print(f"Unique index numbers of every topic: {topic_nums_bi} ")
print()
print(f"Number of documents for each unique topic: {topic_sizes_bi} \n")

print("TRI-GRAM MODEL where N = 1, 2 and 3")
print("====================================\n")
topic_sizes_tri, topic_nums_tri = topicMODEL_tri.get_topic_sizes(reduced=False) 
print(f"Unique index numbers of every topic: {topic_nums_tri} ")
print()
print(f"Number of documents for each unique topic: {topic_sizes_tri} ") 

BI-GRAM MODEL where N = 1 and 2

Unique index numbers of every topic: [0 1] 

Number of documents for each unique topic: [151 111] 

TRI-GRAM MODEL where N = 1, 2 and 3

Unique index numbers of every topic: [0 1] 

Number of documents for each unique topic: [150 112] 


## **6c) Get Topics**
This will return the topics in decreasing size.

Returns:

* topic_words: For each topic the top 50 words are returned, in decreasing order of semantic similarity to topic.

* word_scores: For each topic the cosine similarity scores (in decreasing order) of the top 50 words to the topic are returned.

* topic_nums: The unique index of every topic will be returned.

### **c.1.  BI-GRAM Model Topics**

In [19]:
print(f"TOPICS FOR BI-GRAM MODEL: \n")
topic_words_bi, word_scores_bi, topic_nums_bi = topicMODEL_bi.get_topics(topicMODEL_bi.get_num_topics(reduced=False))

for count,(words_bi, scores_bi, num_bi) in enumerate(zip(topic_words_bi, word_scores_bi, topic_nums_bi)):
    print(f"{count+1}. Topic {num_bi+1}") 
    print(f"Topic Words : \n{words_bi} \n")
    # print(f"Cosine Similarity :\n{scores_bi}\n")

TOPICS FOR BI-GRAM MODEL: 

1. Topic 1
Topic Words : 
['beli tiket' 'sangat menyenangkan' 'tiket' 'fun' 'menyenangkan'
 'permainan ini' 'naik skyline' 'ride' 'lebih murah' 'kereta gantung'
 'permainan' 'main ini' 'traveloka' 'skyline' 'adrenalin' 'station'
 'skyline luge' 'bagus' 'sangat' 'murah' 'jalur' 'mudah' 'bermain' 'keren'
 'terlalu' 'lumayan' 'singapore' 'tidak cukup' 'ketinggian' 'ini'
 'ga cukup' 'track' 'pengalaman yang' 'mesin' 'itu' 'senang' 'skyride'
 'panjang' 'karena' 'pengalaman' 'kecil' 'kursi' 'suka' 'cukup' 'voucher'
 'tetapi' 'sih' 'recommended' 'enak' 'paket'] 

2. Topic 2
Topic Words : 
['sangat menyenangkan' 'fun' 'menyenangkan' 'never enough' 'once is'
 'keren' 'ga cukup' 'bermain' 'lama' 'permainan' 'tidak cukup' 'lagi'
 'permainan ini' 'bahagia' 'panjang' 'bagus' 'banget main' 'terlalu'
 'senang' 'gampang banget' 'cukup' 'adrenalin' 'mencoba' 'deh' 'main ini'
 'lumayan' 'lebih baik' 'sore' 'seru banget' 'skyline luge' 'sekali'
 'ride' 'kecil' 'sangat' 'putara

### **c.2. Tri-Gram Model Topics**

In [20]:
print(f"TOPICS FOR TRI-GRAM MODEL : \n")
topic_words_tri, word_scores_tri, topic_nums_tri = topicMODEL_tri.get_topics(topicMODEL_tri.get_num_topics(reduced=False))

for count, (words_tri, scores_tri, num_tri) in enumerate(zip(topic_words_tri, word_scores_tri, topic_nums_tri)):
    print(f"{count+1}. Topic {num_tri+1}") 
    print(f"Topic Words : \n{words_tri}\n")
    # print(f"Cosine Similarity :\n{scores_tri}\n")

TOPICS FOR TRI-GRAM MODEL : 

1. Topic 1
Topic Words : 
['sangat menyenangkan' 'beli tiket' 'tiket' 'fun' 'permainan ini'
 'menyenangkan' 'permainan yang' 'permainan' 'ride' 'main ini'
 'lebih murah' 'naik skyline' 'kereta gantung' 'traveloka' 'skyline'
 'adrenalin' 'station' 'bagus' 'bermain' 'skyline luge' 'singapore'
 'sangat' 'murah' 'keren' 'jalur' 'mudah' 'terlalu' 'tidak cukup' 'ini'
 'lumayan' 'ketinggian' 'ga cukup' 'once is never enough'
 'pengalaman yang' 'track' 'senang' 'itu' 'mesin' 'kecil' 'panjang'
 'pengalaman' 'karena' 'recommended' 'paket' 'suka' 'skyride' 'voucher'
 'cukup' 'terima kasih' 'bahagia']

2. Topic 2
Topic Words : 
['sangat menyenangkan' 'once is never enough' 'fun' 'menyenangkan' 'keren'
 'bahagia' 'ga cukup' 'bermain' 'permainan yang' 'lama' 'permainan'
 'senang' 'permainan ini' 'lagi' 'bagus' 'banget main' 'panjang'
 'tidak cukup' 'terlalu' 'deh' 'adrenalin' 'gampang banget'
 'pengalaman yang' 'sore' 'seru banget' 'cukup' 'lumayan' 'skyline luge'
 'men

## **6d) Search Documents by Topic**
We are going to search by topic; for example Topic 3. 

Returns:

* documents:  The documents in a list, the most similar are first.
* doc_scores:  Semantic similarity of document to topic. The cosine similarity of the document and topic vector.
* doc_ids:  Unique ids of documents. If ids were not given, the index of document in the original corpus.

For each of the returned documents we are going to print its content, score and document number.

### **d.1. Search Documents By Topic Number Using Bi-Gram Model**

In [21]:
# SEARCH DOCUMENTS BY TOPIC NUMBER USING BI-GRAM MODEL 
topic_num_bi = 0
documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= 5, reduced=False) 

print(f"TOPIC NUMBER: {topic_num_bi}\n")
for count, (doc_bi, docscore_bi, doc_id_bi) in enumerate(zip(documents_bi, document_scores_bi, document_ids_bi)):
    print(f"{count}. Document: {doc_id_bi},  Score: {docscore_bi}")
    print("-----------")
    print(doc_bi)
    print("-----------\n")

TOPIC NUMBER: 0

0. Document: 250,  Score: 0.5566443204879761
-----------
fun and the view is amazing skyline luge ini terletak di dekat pantai siloso rekomen beli yang combo jadi untuk atraksi nya tiket combo bisa naik kereta gantung lalu turun menggunaka atraksi yang luge nya lalu naik lagi menggunakan kereta gantungnya seru dan iya seperti yang dibilang yang lain satu ga cukup ngomong-ngomong untuk print foto saat kita bermain luamayan mahal sekita 2039 usd sg hhehe
-----------

1. Document: 197,  Score: 0.5355770587921143
-----------
tiket harus dicetak dulu di mesin dekat dengan loket tiket kemudian baru ke area sky line dan luge untuk mendapatkan struk setelah itu tinggal antri saja jika datangnya belum terlalu siang antrian belum panjang permainannya menyenangkan tidak puas kalau hanya sekali
-----------

2. Document: 50,  Score: 0.5328277349472046
-----------
lumayan lebih murah sedikit dibandingkan beli di counter tetapi di counter ada tiket combo untuk family namun saran saya

### **d.2. Search Documents By Topic Number Using Tri-Gram Model**

In [22]:
# SEARCH DOCUMENTS BY TOPIC NUMBER USING TRI-GRAM MODEL 
topic_num_tri = 1
documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri, num_docs= 5, reduced=False) 
print(f"TOPIC NUMBER: {topic_num_bi}\n")
for count, (doc_tri, docscore_tri, doc_id_tri) in enumerate(zip(documents_tri, document_scores_tri, document_ids_tri)):
    print(f"{count}. Document: {doc_id_tri},  Score: {docscore_tri}")
    print("-----------")
    print(doc_tri)
    print("-----------\n")

TOPIC NUMBER: 0

0. Document: 192,  Score: 0.6404720544815063
-----------
its greattt i love playing there once is not enough
-----------

1. Document: 257,  Score: 0.5168337821960449
-----------
seru amazing once is never enough
-----------

2. Document: 174,  Score: 0.44408470392227173
-----------
permainan yang sangat menyenangkan tidak cukup hanya sekali bermain pasti menginginkan berulangulang
-----------

3. Document: 170,  Score: 0.4436345100402832
-----------
satu kali tidak cukup sangat menyenangkan sekali puas dan terbaik
-----------

4. Document: 82,  Score: 0.4400729835033417
-----------
awesome nice good bahagia fun
-----------



# **7) Constructing Dataframe to Collate Information of Key Topics**

## **7a) BI-GRAM MODEL**

In [23]:
# CONSTRUCT DATAFRAME OF BI-GRAM MODEL WITH KEY INFORMATION FOR EXPORT
# initialize data of lists.
bigram_table = {
              'topic number': [num_bi for num_bi in topic_nums_bi],
              'topic words': [words_bi for words_bi in topic_words_bi],
              'cosine similarity metrics': [scores_bi for scores_bi in word_scores_bi],
              'similar documents': [size_bi for size_bi in topic_sizes_bi],
               }
 
# Create DataFrame
bigram_df = pd.DataFrame(bigram_table)
bigram_df['topic number'] = bigram_df['topic number'].apply(lambda x: x + 1)

# print(bigram_df.head(2))
bigram_df.tail()

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents
0,1,"[beli tiket, sangat menyenangkan, tiket, fun, ...","[0.19643107, 0.18893042, 0.1861104, 0.1591903,...",151
1,2,"[sangat menyenangkan, fun, menyenangkan, never...","[0.41782498, 0.39005685, 0.33098108, 0.2861668...",111


### **7a.1. Build Functions to Expand Column Features in Bi-Gram Dataframe**

In [24]:
def get_document_index(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc, reduced=False)
    document_index = [doc_id_bi for doc_id_bi in document_ids_bi]
    return document_index 

def get_document_scores(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc, reduced=False)
    document_scores = [docscore_bi for docscore_bi in document_scores_bi]
    return document_scores


def get_documents(topic_num_bi, num_doc):
    documents_bi, document_scores_bi, document_ids_bi = topicMODEL_bi.search_documents_by_topic(topic_num= topic_num_bi , num_docs= num_doc, reduced=False)
    actual_document = [doc_bi for doc_bi in documents_bi]
    return actual_document


In [25]:
bigram_df['document index'] = bigram_df['topic number'].apply(lambda x: get_document_index(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df['document scores'] = bigram_df['topic number'].apply(lambda x: get_document_scores(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df['actual reviews'] = bigram_df['topic number'].apply(lambda x: get_documents(x-1, bigram_df['similar documents'].values[x-1]))

bigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews
0,1,"[beli tiket, sangat menyenangkan, tiket, fun, ...","[0.19643107, 0.18893042, 0.1861104, 0.1591903,...",151,"[250, 197, 50, 259, 133, 178, 86, 247, 157, 20...","[0.5566443, 0.53557706, 0.53282773, 0.5283109,...",[fun and the view is amazing skyline luge ini ...
1,2,"[sangat menyenangkan, fun, menyenangkan, never...","[0.41782498, 0.39005685, 0.33098108, 0.2861668...",111,"[192, 257, 174, 170, 236, 82, 61, 71, 190, 179...","[0.65113056, 0.52872, 0.45630592, 0.44390193, ...",[its greattt i love playing there once is not ...


In [27]:
# for count, (index, review) in enumerate(zip(bigram_df['document index'][28], bigram_df['actual reviews'][28])):
#   print(f"{count}: {index} = {review})")

bigram_df.to_excel(f'/content/gdrive/MyDrive/data/export/bigram_df_luge.xlsx', index=False)

## **7b) TRI-GRAM MODEL**

In [28]:
# CONSTRUCT DATAFRAME CONTAINING KEY INFORMATION FOR EXPORT
# initialize data of lists.
trigram_table = {
                'topic number': [num_tri for num_tri in topic_nums_tri],
                'topic words': [words_tri for words_tri in topic_words_tri],
                'cosine similarity metrics': [scores_tri for scores_tri in word_scores_tri],
                'similar documents': [size_tri for size_tri in topic_sizes_tri],
                 }

# Create DataFrame
trigram_df = pd.DataFrame(trigram_table)

trigram_df['topic number'] = trigram_df['topic number'].apply(lambda x: x + 1)
trigram_df.tail()

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents
0,1,"[sangat menyenangkan, beli tiket, tiket, fun, ...","[0.19460928, 0.19261208, 0.18238278, 0.1684075...",150
1,2,"[sangat menyenangkan, once is never enough, fu...","[0.4636202, 0.42285305, 0.42201543, 0.37326694...",112


### **7b.1. Build Functions to Expand Column Features in Tri-Gram Dataframe**

In [29]:
def get_document_index_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs, reduced=False)
    document_index = [doc_id_tri for doc_id_tri in document_ids_tri]
    return document_index 

def get_document_scores_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs, reduced=False)
    document_scores = [docscore_tri for docscore_tri in document_scores_tri]
    return document_scores


def get_documents_tri(topic_num_tri, num_docs):
    documents_tri, document_scores_tri, document_ids_tri = topicMODEL_tri.search_documents_by_topic(topic_num= topic_num_tri , num_docs= num_docs, reduced=False)
    actual_document = [doc_tri for doc_tri in documents_tri]
    return actual_document


In [30]:
trigram_df['document index'] = trigram_df['topic number'].apply(lambda x: get_document_index_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df['document scores'] = trigram_df['topic number'].apply(lambda x: get_document_scores_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df['actual reviews'] = trigram_df['topic number'].apply(lambda x: get_documents_tri(x-1, trigram_df['similar documents'].values[x-1]))

trigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews
0,1,"[sangat menyenangkan, beli tiket, tiket, fun, ...","[0.19460928, 0.19261208, 0.18238278, 0.1684075...",150,"[250, 197, 50, 133, 259, 178, 86, 247, 157, 20...","[0.5542214, 0.533089, 0.53064436, 0.5218531, 0...",[fun and the view is amazing skyline luge ini ...
1,2,"[sangat menyenangkan, once is never enough, fu...","[0.4636202, 0.42285305, 0.42201543, 0.37326694...",112,"[192, 257, 174, 170, 82, 236, 61, 179, 71, 162...","[0.64047205, 0.5168338, 0.4440847, 0.4436345, ...",[its greattt i love playing there once is not ...


In [31]:
# for count, (index, review) in enumerate(zip(trigram_df['document index'][0], trigram_df['actual reviews'][0])):
#   print(f"{count}: {index} = {review})")

trigram_df.to_excel(f'/content/gdrive/MyDrive/data/export/trigram_df_luge.xlsx', index=False)

# **8. Defining Topic Labels on Reviews** 
## **8a) User-Define Topic Labels** 
### **8a.1:  BI-GRAM Model** 

In [33]:
# Create the dictionary containing the data of the new column
bigram_col_dict = { 'Topic 1': 'features visitors like',                 'Topic 2': 'visitors experiences',
                  
                  }
                   
# Assign the values of the dictionary as the values of the new column
bigram_df['topic label'] = bigram_col_dict.values()
bigram_df

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
0,1,"[beli tiket, sangat menyenangkan, tiket, fun, ...","[0.19643107, 0.18893042, 0.1861104, 0.1591903,...",151,"[250, 197, 50, 259, 133, 178, 86, 247, 157, 20...","[0.5566443, 0.53557706, 0.53282773, 0.5283109,...",[fun and the view is amazing skyline luge ini ...,features visitors like
1,2,"[sangat menyenangkan, fun, menyenangkan, never...","[0.41782498, 0.39005685, 0.33098108, 0.2861668...",111,"[192, 257, 174, 170, 236, 82, 61, 71, 190, 179...","[0.65113056, 0.52872, 0.45630592, 0.44390193, ...",[its greattt i love playing there once is not ...,visitors experiences


In [34]:
# CREATE COPY OF ORIGINAL DATAFRAME
luge_DF_BI = luge_df.copy()
luge_DF_BI.tail(3)

Unnamed: 0,date,source,attraction,reviews,rating
259,2017,tripadvisor,skyline_luge,awesone luge inilah permainan yang paling meny...,10.0
260,2017,tripadvisor,skyline_luge,fun amazing view kami mencoba skyline seperti ...,8.0
261,2017,tripadvisor,skyline_luge,boleh di coba ini buat yang seneng ngebut ini ...,8.0


In [35]:
print([i for i in range(topicMODEL_bi.get_num_topics(reduced=False))])

[0, 1]


In [36]:
# FUNCTION TO CONNECT TOPIC LABELS TO INDEX LIST
def get_indexLIST(i):
      topic = [doc_ind for doc_ind in bigram_df['document index'][i]]
      return topic

In [37]:
for i in range(topicMODEL_bi.get_num_topics(reduced=False)):
    luge_DF_BI.loc[get_indexLIST(i), 'topic label'] = bigram_df['topic label'][i]
    

In [38]:
# FUNCTION TO CONNECT DOCUMENT COSINE SCORE TO INDEX LIST
def get_scores(i):
    score = [doc_score for doc_score in bigram_df['document scores'][i]]
    return score

In [39]:
for i in range(topicMODEL_bi.get_num_topics(reduced=False)):
    luge_DF_BI.loc[get_indexLIST(i), 'cosine score review/topic'] = get_scores(i) 

In [40]:
luge_DF_BI.reset_index(drop=True)
luge_DF_BI

Unnamed: 0,date,source,attraction,reviews,rating,topic label,cosine score review/topic
0,2021,klook,skyline_luge,it was a great memories travelling right befor...,10.0,features visitors like,0.463260
1,2020,google_reviews,skyline_luge,main sekali akan ketagihan kita beli paket dar...,10.0,features visitors like,0.391792
2,2020,google_reviews,skyline_luge,saya kesini bersama keluarga awalnya anak saya...,10.0,features visitors like,0.309620
3,2020,google_reviews,skyline_luge,permainan simple yang disukai dari family hing...,10.0,features visitors like,0.431256
4,2020,google_reviews,skyline_luge,assiiikk dan sangat seru pengalaman yang menye...,10.0,visitors experiences,0.278652
...,...,...,...,...,...,...,...
257,2017,traveloka,skyline_luge,seru amazing once is never enough,10.0,visitors experiences,0.528720
258,2017,tripadvisor,skyline_luge,because once is never enough jika teman2 ada y...,10.0,features visitors like,0.392003
259,2017,tripadvisor,skyline_luge,awesone luge inilah permainan yang paling meny...,10.0,features visitors like,0.528311
260,2017,tripadvisor,skyline_luge,fun amazing view kami mencoba skyline seperti ...,8.0,features visitors like,0.483437


In [41]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 1 - GENERATE RANDOM SAMPLE TABLE FROM NEW DATAFRAME 
bigram_df.sample(2)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
1,2,"[sangat menyenangkan, fun, menyenangkan, never...","[0.41782498, 0.39005685, 0.33098108, 0.2861668...",111,"[192, 257, 174, 170, 236, 82, 61, 71, 190, 179...","[0.65113056, 0.52872, 0.45630592, 0.44390193, ...",[its greattt i love playing there once is not ...,visitors experiences
0,1,"[beli tiket, sangat menyenangkan, tiket, fun, ...","[0.19643107, 0.18893042, 0.1861104, 0.1591903,...",151,"[250, 197, 50, 259, 133, 178, 86, 247, 157, 20...","[0.5566443, 0.53557706, 0.53282773, 0.5283109,...",[fun and the view is amazing skyline luge ini ...,features visitors like


In [42]:
# TO VERIFY TOPIC LABELING IS CORRECT: 
# STEP 2 - SELECT INDEXES FROM ORIGINAL DATAFRAME TO COMPARE

luge_DF_BI.iloc[[192, 250]] 

Unnamed: 0,date,source,attraction,reviews,rating,topic label,cosine score review/topic
192,2019,traveloka,skyline_luge,its greattt i love playing there once is not e...,9.0,visitors experiences,0.651131
250,2018,tripadvisor,skyline_luge,fun and the view is amazing skyline luge ini t...,10.0,features visitors like,0.556644


In [43]:
luge_DF_BI.to_excel(f'/content/gdrive/MyDrive/data/export/luge_DF_BI.xlsx', index=False)

### **8a.2  TRI-GRAM Model** 

In [44]:
# Create the dictionary containing the data of the new column
trigram_col_dict = { 'Topic 1': 'features visitors like',                 'Topic 2': 'visitors experience',
                }
   
# Assign the values of the dictionary as the values of the new column
trigram_df['topic label'] = trigram_col_dict.values()
trigram_df.tail(3)

Unnamed: 0,topic number,topic words,cosine similarity metrics,similar documents,document index,document scores,actual reviews,topic label
0,1,"[sangat menyenangkan, beli tiket, tiket, fun, ...","[0.19460928, 0.19261208, 0.18238278, 0.1684075...",150,"[250, 197, 50, 133, 259, 178, 86, 247, 157, 20...","[0.5542214, 0.533089, 0.53064436, 0.5218531, 0...",[fun and the view is amazing skyline luge ini ...,features visitors like
1,2,"[sangat menyenangkan, once is never enough, fu...","[0.4636202, 0.42285305, 0.42201543, 0.37326694...",112,"[192, 257, 174, 170, 82, 236, 61, 179, 71, 162...","[0.64047205, 0.5168338, 0.4440847, 0.4436345, ...",[its greattt i love playing there once is not ...,visitors experience


In [45]:
# CREATE COPY OF ORIGINAL DATAFRAME
luge_DF_TRI = luge_df.copy()
luge_DF_TRI.tail(3)

Unnamed: 0,date,source,attraction,reviews,rating
259,2017,tripadvisor,skyline_luge,awesone luge inilah permainan yang paling meny...,10.0
260,2017,tripadvisor,skyline_luge,fun amazing view kami mencoba skyline seperti ...,8.0
261,2017,tripadvisor,skyline_luge,boleh di coba ini buat yang seneng ngebut ini ...,8.0


In [46]:
print([i for i in range(topicMODEL_tri.get_num_topics(reduced=False))])

[0, 1]


In [47]:
# FUNCTION TO CONNECT TOPIC LABELS TO INDEX LIST
def get_indexLIST(i):
      topic = [doc_ind for doc_ind in trigram_df['document index'][i]]
      return topic

In [48]:
for i in range(topicMODEL_tri.get_num_topics(reduced=False)):
    luge_DF_TRI.loc[get_indexLIST(i), 'topic_label'] = trigram_df['topic label'][i]

In [49]:
# FUNCTION TO CONNECT DOCUMENT COSINE SCORE TO INDEX LIST
def get_scores(i):
    score = [doc_score for doc_score in trigram_df['document scores'][i]]
    return score

In [50]:
for i in range(topicMODEL_tri.get_num_topics(reduced=False)):
    luge_DF_TRI.loc[get_indexLIST(i), 'cosine score review/topic'] = get_scores(i) 

In [51]:
luge_DF_TRI.reset_index(drop=True)
luge_DF_TRI

Unnamed: 0,date,source,attraction,reviews,rating,topic_label,cosine score review/topic
0,2021,klook,skyline_luge,it was a great memories travelling right befor...,10.0,features visitors like,0.463745
1,2020,google_reviews,skyline_luge,main sekali akan ketagihan kita beli paket dar...,10.0,features visitors like,0.402059
2,2020,google_reviews,skyline_luge,saya kesini bersama keluarga awalnya anak saya...,10.0,features visitors like,0.309880
3,2020,google_reviews,skyline_luge,permainan simple yang disukai dari family hing...,10.0,features visitors like,0.436903
4,2020,google_reviews,skyline_luge,assiiikk dan sangat seru pengalaman yang menye...,10.0,visitors experience,0.307983
...,...,...,...,...,...,...,...
257,2017,traveloka,skyline_luge,seru amazing once is never enough,10.0,visitors experience,0.516834
258,2017,tripadvisor,skyline_luge,because once is never enough jika teman2 ada y...,10.0,features visitors like,0.397714
259,2017,tripadvisor,skyline_luge,awesone luge inilah permainan yang paling meny...,10.0,features visitors like,0.521218
260,2017,tripadvisor,skyline_luge,fun amazing view kami mencoba skyline seperti ...,8.0,features visitors like,0.479489


In [52]:
luge_DF_TRI.isnull().values.any()

False

In [53]:
luge_DF_TRI.to_excel(f'/content/gdrive/MyDrive/data/export/luge_DF_TRI.xlsx', index=False)

# **9.  Visualization**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
uss_topred = pd.read_excel("/content/gdrive/MyDrive/data/export/uss_DF_BIreduced.xlsx")
uss_topred.isnull().values.any()
uss_topred = uss_topred.loc[uss_topred['date'] != 2017]
uss_topred

In [None]:
sns.color_palette("gist_rainbow")

In [None]:
sns.set_palette("gist_rainbow")
sns.set_theme(style="ticks", font_scale=1.4)

In [None]:
plt.figure(figsize=(18,10))

hue_order = [2022, 2021, 2020, 2019, 2018]
sns.histplot(data=uss_topred, y="topic_label", hue="date", hue_order=hue_order, multiple="stack", bins= 50, palette="Set1") 


In [None]:
sns.color_palette('Set3')
plt.figure(figsize=(25,10))

hue_order = [2018, 2019, 2020,2021, 2022]
sns.histplot(data=uss_topred, x="topic_label", hue="date", hue_order=hue_order, multiple="dodge", bins=100, palette="Set1") 


In [None]:
plt.figure(figsize=(16,11))

sns.countplot(data=uss_topred, y="topic_label", hue="date", dodge=False)