In [3]:
import pandas as pd
import ast
import string
import re

from transformers import pipeline
import sentencepiece

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'nltk'", and no other code visible in
# this notebook references nltk. Either install nltk in the kernel environment
# or confirm that these two lines are still needed.
import nltk
# Fetch the 'punkt' resource through nltk's downloader.
nltk.download('punkt')

ModuleNotFoundError: No module named 'nltk'

In [4]:
# Model identifiers; each one is used for both the weights and the tokenizer.
SENTIMENT_MODEL_ID = "w11wo/indonesian-roberta-base-sentiment-classifier"
NER_MODEL_ID = "cahya/xlm-roberta-large-indonesian-NER"

# Sentiment classifier used by get_sentiment().
sentiment_model = pipeline(
    "sentiment-analysis",
    model=SENTIMENT_MODEL_ID,
    tokenizer=SENTIMENT_MODEL_ID,
)

# Named-entity recogniser used by get_entities(); the "simple" aggregation
# strategy is requested so predictions come back grouped per entity.
ner_model = pipeline(
    "ner",
    model=NER_MODEL_ID,
    tokenizer=NER_MODEL_ID,
    aggregation_strategy="simple",
)

Downloading: 100%|██████████| 476M/476M [00:04<00:00, 103MB/s]  
Downloading: 100%|██████████| 2.09G/2.09G [01:36<00:00, 23.2MB/s]  
Downloading: 100%|██████████| 49.0/49.0 [00:00<00:00, 52.3kB/s]
Downloading: 100%|██████████| 4.83M/4.83M [00:03<00:00, 1.57MB/s]
Downloading: 100%|██████████| 150/150 [00:00<00:00, 160kB/s]


In [5]:
def get_sentiment(text):
    """Classify ``text`` with ``sentiment_model`` and map the label to a score.

    Only the first prediction returned by the pipeline is inspected.

    Returns:
        1.0 when the label is "positive", 0.0 when it is "neutral",
        and -1.0 for any other label.
    """
    label = sentiment_model(text)[0]["label"]
    if label == "positive":
        return 1.0
    return 0.0 if label == "neutral" else -1.0

def get_entities(text):
    """Run ``ner_model`` over ``text`` and return the normalised entity names.

    Each prediction's "word" is converted to ``str``, stripped of surrounding
    whitespace and lower-cased. Names that are one character long or empty
    after normalisation are discarded. Order and duplicates are kept exactly
    as the model returned them.
    """
    normalised = (str(hit["word"]).strip().lower() for hit in ner_model(text))
    return [name for name in normalised if len(name) > 1]

In [6]:
# Load the review spreadsheet; later cells read its "review" column.
REVIEWS_XLSX = "../summarization/summary_output/uss_summarized.xlsx"
df = pd.read_excel(REVIEWS_XLSX)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,user,date,rating,review,page_url,date_scraped,source,attraction
0,mpocasino,2020,10,Universal Studio Singapore yang berada di Pula...,https://www.tripadvisor.co.id/Attraction_Revie...,4/26/2022,tripadvisor,uss
1,Rudy K,2019,10,cukup beruntung hanya membeli tiket biasa dapa...,https://www.tripadvisor.co.id/Attraction_Revie...,4/26/2022,tripadvisor,uss
2,Alma Jaya,2019,10,Tempat liburan keluarga yang sangat menarik un...,https://www.tripadvisor.co.id/Attraction_Revie...,4/26/2022,tripadvisor,uss
3,Sri Pratiwi,2020,10,Beli tiketnya via OTA dan kebetulan sudah prin...,https://www.tripadvisor.co.id/Attraction_Revie...,4/26/2022,tripadvisor,uss
4,Didik K,2020,10,"Jalan jalan bareng si kecil, emang menyenangka...",https://www.tripadvisor.co.id/Attraction_Revie...,4/26/2022,tripadvisor,uss


In [9]:
# Maps each entity name to the list of sentiment scores of the reviews that
# mention it: {entity: [score, score, ...]}.
sentiment_summary = {}

for text in df["review"]:
    # One sentiment score per review; it is attributed to every entity
    # that the NER model finds in that review.
    sentiment = get_sentiment(text)
    entities = get_entities(text)
    for ent in entities:
        # setdefault creates the list the first time an entity is seen.
        # Unlike a bare `except:`, it cannot swallow unrelated errors (or a
        # KeyboardInterrupt) and overwrite scores that were already collected.
        sentiment_summary.setdefault(ent, []).append(sentiment)

In [11]:
# Read the stop-word file line by line, removing trailing whitespace
# (including the newline) from every entry.
with open('../resources/combined_stop_words.txt') as f:
    stopwords = list(map(str.rstrip, f))

In [12]:
# Aggregate statistics: one row per entity with the number of collected
# sentiment scores and their mean (get_sentiment yields 1.0, 0.0 or -1.0).

entity_names = list(sentiment_summary)
entity_counts = [len(scores) for scores in sentiment_summary.values()]
# Every entity is stored together with at least one score, so the division
# below never sees an empty list.
entity_sentiments = [
    sum(scores) / len(scores) for scores in sentiment_summary.values()
]

df_summary = pd.DataFrame(
    {
        "entity": entity_names,
        "count": entity_counts,
        "sentiment": entity_sentiments,
    }
)

# Remove stop-word entities with a single vectorised membership test instead
# of one scan-and-drop pass per stop word. The original index labels are kept.
# .copy() makes the result an independent frame so that later cells can keep
# sorting it in place.
df_summary = df_summary[~df_summary["entity"].isin(stopwords)].copy()

In [18]:
# Top occurring entities: order by mention count (highest first) and show
# the first 20 rows.
df_summary = df_summary.sort_values(by="count", ascending=False)
df_summary.head(20)

Unnamed: 0,entity,count,sentiment
99,traveloka,341,0.935484
11,uss,270,0.87037
46,tiket,168,0.791667
0,universal studio,119,0.87395
1,singapore,116,0.896552
3,singapura,85,0.858824
32,sing,66,0.969697
35,us,59,0.881356
208,oka,59,0.898305
130,transformer,51,0.784314


In [17]:
# Most positive entities: order by mean sentiment (highest first) and show
# the first 20 rows.
# NOTE(review): the sort key is sentiment only, so rows with equal sentiment
# are not ranked by count; entities mentioned once can fill this table.
df_summary = df_summary.sort_values(by="sentiment", ascending=False)
df_summary.head(20)

Unnamed: 0,entity,count,sentiment
598,stamp,1,1.0
708,disneyland hk,1,1.0
198,second,1,1.0
202,email,1,1.0
923,6+ tahun,1,1.0
952,theater 4d shrek,1,1.0
921,globe universal studio,1,1.0
670,tiket online,1,1.0
1061,kamis,1,1.0
795,niagara,1,1.0


In [19]:
# Most negative entities: order by mean sentiment (lowest first) and show
# the first 20 rows.
# NOTE(review): the sort key is sentiment only, so rows with equal sentiment
# are not ranked by count; entities mentioned once can fill this table.
df_summary = df_summary.sort_values(by="sentiment", ascending=True)
df_summary.head(20)

Unnamed: 0,entity,count,sentiment
607,30-45menit,1,-1.0
1091,em,1,-1.0
391,30mnt,1,-1.0
688,40rb,1,-1.0
416,juli,1,-1.0
418,even express,1,-1.0
420,50-100 menit,1,-1.0
1068,bola dunia universal studio singapore,1,-1.0
695,gelang,1,-1.0
1092,ean,1,-1.0
