In [None]:
# Mount Google Drive at /content/gdrive so later cells can read and write
# files under /content/gdrive/MyDrive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# **1. Normalize Reviews**


In [None]:
def normalize_reviews(filepath, reference_date=None, min_year=2017):
    '''
    Load the scraped reviews workbook and normalize its dates and ratings.

    1. Drop duplicated rows
    2. Normalize dates to only the year
    3. Normalize ratings to 10
    4. Drop the 'user', 'page_url' and 'date_scraped' columns
    5. Drop rows whose year is older than `min_year`

    Parameters
    ----------
    filepath : str or path-like
        Excel workbook passed to `pd.read_excel`. It must provide the
        columns 'source', 'date', 'rating', 'user', 'page_url' and
        'date_scraped'.
    reference_date : date-like, optional
        Anchor for relative Google Review dates such as '2 bulan lalu'.
        Defaults to today, which is what the function always used before;
        pass the scrape date to get results that do not drift when the
        notebook is re-run later.
    min_year : int, optional
        Rows with a year lower than this are dropped (default 2017).

    Returns
    -------
    pandas.DataFrame
        Rows are ordered Google reviews first, then the other sources with
        a real date value, then the other sources with a text date. The
        index comes from that concatenation and keeps gaps where rows were
        dropped by the year filter.
    '''

    df = pd.read_excel(filepath).drop_duplicates(keep='first')

    # 1. Normalize dates to only year

    # For Google Review dates
    # Relative phrases ("N hari/minggu/bulan/tahun lalu") are resolved by
    # subtracting the offset from a single anchor date.
    if reference_date is None:
        anchor = pd.Timestamp.now()
    else:
        anchor = pd.Timestamp(reference_date)
    anchor = anchor.normalize()

    def date_ago(**offset):
        # Anchor date minus a calendar offset, e.g. date_ago(months=2).
        return anchor - pd.DateOffset(**offset)

    # (unit word, DateOffset keyword, phrase prefix for a count of one,
    #  largest count covered). Only up to 10 years.
    relative_units = (
        ('hari', 'days', 'sehari', 6),
        ('minggu', 'weeks', 'seminggu', 4),
        ('bulan', 'months', 'sebulan', 11),
        ('tahun', 'years', 'setahun', 10),
    )
    replace_date = {}
    for unit, keyword, single, largest in relative_units:
        replace_date[f'{single} lalu'] = date_ago(**{keyword: 1})
        for count in range(2, largest + 1):
            replace_date[f'{count} {unit} lalu'] = date_ago(**{keyword: count})

    # Google review rows: swap known phrases for dates, keep the year.
    # Values that are not in the table are handed to DatetimeIndex as-is,
    # so an unknown phrase still fails loudly instead of being guessed.
    is_google = df['source'] == 'google_reviews'
    dfg = df[is_google].copy()
    google_dates = dfg['date'].map(lambda value: replace_date.get(value, value))
    dfg['date'] = pd.DatetimeIndex(google_dates).year

    # Other websites
    df_ng = df[~is_google]

    # astype(bool) keeps the mask usable when there are no non-Google rows
    # (apply() on an empty Series does not produce a boolean dtype).
    has_real_date = (
        df_ng['date']
        .apply(lambda x: isinstance(x, datetime.date))
        .astype(bool)
    )

    # Rows holding a real date value: extract the year
    df_ng_prop = df_ng[has_real_date].copy()
    df_ng_prop['date'] = pd.DatetimeIndex(df_ng_prop['date']).year

    # Rows holding anything else: the last 4 characters of the text are
    # taken as the year
    df_ng_improp = df_ng[~has_real_date].copy()
    df_ng_improp['date'] = df_ng_improp['date'].astype(str).str[-4:]

    # Rejoin all dataframes
    df = pd.concat([dfg, df_ng_prop, df_ng_improp], ignore_index=True)
    df['date'] = df['date'].astype(int)

    # 2. Normalize ratings to 10

    # The column ends up holding integers even where it started as text,
    # so make sure it can store both while it is being rewritten.
    df['rating'] = df['rating'].astype(object)

    # Normalize klook ratings (only the rating cells of klook rows change)
    replace_rating = {4: 8,
                      'Baik': 8,
                      'Sangat Direkomendasikan': 10}
    klook_mask = df['source'] == 'klook'
    if klook_mask.any():
        df.loc[klook_mask, 'rating'] = df.loc[klook_mask, 'rating'].map(
            lambda rating: replace_rating.get(rating, rating)
        )

    # Normalize TripAdvisor ratings
    # Character 0 of the rating text is read as the score and doubled.
    ta_mask = df['source'] == 'tripadvisor'
    if ta_mask.any():
        df.loc[ta_mask, 'rating'] = (
            df.loc[ta_mask, 'rating'].astype(str).str[0].astype(int) * 2
        )

    # Normalize Google Review ratings
    # Character 14 of the rating text is read as the score and doubled.
    # NOTE(review): presumably the text looks like
    # 'Diberi rating 5,0 dari 5,0' - confirm against the raw data.
    g_mask = df['source'] == 'google_reviews'
    if g_mask.any():
        df.loc[g_mask, 'rating'] = (
            df.loc[g_mask, 'rating'].astype(str).str[14].astype(int) * 2
        )

    # Drop irrelevant columns
    df = df.drop(columns=['user', 'page_url', 'date_scraped'])

    # Drop rows with a year older than min_year (eg. 2016, 2015, etc for
    # the default of 2017)
    df = df.drop(index=df.index[df['date'] < min_year])

    return df

In [None]:
# Load and normalize the raw reviews workbook.
# Edit the path below to point at the input file.
df = normalize_reviews("/content/gdrive/MyDrive/data/raw/attractions_master.xlsx")
df

Unnamed: 0,date,rating,review,source,attraction
0,2021,10,"Belum Pernah Masuk kedalam, hanya diluar saja....",google_reviews,uss
1,2021,10,Sumpahh kalo ke Singapore wajib banget ke sini...,google_reviews,uss
2,2021,10,"Seru bisa jajan coklat enak dan murah, kalo ma...",google_reviews,uss
3,2022,10,"Luas banget,banyak spot fotonya,wahananya keren²",google_reviews,uss
4,2021,10,Wahana Sangat Lengkap ₩Pokoknya Tempatnya Luar...,google_reviews,uss
...,...,...,...,...,...
4861,2018,10,Harga murah dan cocok untuk berkunjung bersama...,klook,adventure_cove
4862,2018,10,sangat menyenangkan. berbeda dengan waterpark ...,klook,adventure_cove
4863,2018,10,"Tempatnya sangat bagus,dan permainannya sangat...",klook,adventure_cove
4864,2018,10,"Bagus permainan airnya, anak saya suka sekali,...",klook,adventure_cove


# **2. Preprocessing for Topic Modeling**


In [None]:
import pandas as pd
import ast
import string
import re
import numpy as np

import nltk
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab'
# resource instead of the pickled 'punkt' models, so fetch both.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def preprocess_reviews(df, slang_path="/content/gdrive/MyDrive/data/bahasa_indonesia_slangwords.txt"):
    '''
    Add a cleaned 'reviews' column to `df`, modifying `df` in place.

    Preprocessing
    - Remove website URLs
    - Remove emojis/emoticons, punctuation and symbols
    - Lowercasing reviews
    - Fix slang words
    - Remove empty rows

    Every character other than an ASCII letter, digit or whitespace is
    replaced by a space (not deleted), so words that were separated only by
    punctuation or a line break stay separate words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'review' column. Missing reviews are treated as
        empty text and therefore removed.
    slang_path : str, optional
        Text file holding a Python dict literal that maps a slang word to
        its replacement.

    Returns
    -------
    None
        The 'reviews' column is written to `df` itself and rows whose
        cleaned text is empty are dropped from `df`.
    '''

    # `with` closes the file handle once the dictionary has been parsed.
    with open(slang_path, "r", encoding="utf-8") as file:
        slangwords = ast.literal_eval(file.read())

    def basic_cleaning(text):
        # Strip URLs first, while their punctuation is still intact.
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
        # Keep ASCII letters, digits and whitespace only. This also covers
        # emojis and the stars people use in place of swear words.
        text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)
        return text

    def fix_slang(text):
        # Lowercase, tokenize, swap known slang words, and rejoin with
        # single spaces (which also collapses repeated whitespace).
        tokens = nltk.word_tokenize(text.lower())
        return " ".join(slangwords.get(word, word) for word in tokens)

    raw_text = df['review'].map(lambda value: '' if pd.isna(value) else str(value))
    df['reviews'] = raw_text.map(basic_cleaning).map(fix_slang)

    # Remove empty rows
    # Assign the result back instead of calling replace(..., inplace=True)
    # on the selected column: that chained form acts on a temporary object
    # and leaves df unchanged when pandas Copy-on-Write is active.
    df['reviews'] = df['reviews'].replace("", np.nan)
    df.dropna(subset=['reviews'], inplace=True)

In [None]:
# Adds the cleaned 'reviews' column to df in place; nothing is returned.
preprocess_reviews(df)

In [None]:
# Rearrange dataframe: keep these six columns, in this order
# ('review' is the raw text, 'reviews' the cleaned text).
df = df[['date', 'source', 'attraction', 'review', 'reviews', 'rating']]
df

Unnamed: 0,date,source,attraction,review,reviews,rating
0,2021,google_reviews,uss,"Belum Pernah Masuk kedalam, hanya diluar saja....",belum pernah masuk kedalam hanya diluar saja p...,10
1,2021,google_reviews,uss,Sumpahh kalo ke Singapore wajib banget ke sini...,sumpahh kalau ke singapore wajib banget ke sin...,10
2,2021,google_reviews,uss,"Seru bisa jajan coklat enak dan murah, kalo ma...",seru bisa jajan coklat enak dan murah kalau ma...,10
3,2022,google_reviews,uss,"Luas banget,banyak spot fotonya,wahananya keren²",luas bangetbanyak spot fotonyawahananya keren,10
4,2021,google_reviews,uss,Wahana Sangat Lengkap ₩Pokoknya Tempatnya Luar...,wahana sangat lengkap pokoknya tempatnya luar ...,10
...,...,...,...,...,...,...
4861,2018,klook,adventure_cove,Harga murah dan cocok untuk berkunjung bersama...,harga murah dan cocok untuk berkunjung bersama...,10
4862,2018,klook,adventure_cove,sangat menyenangkan. berbeda dengan waterpark ...,sangat menyenangkan berbeda dengan waterpark l...,10
4863,2018,klook,adventure_cove,"Tempatnya sangat bagus,dan permainannya sangat...",tempatnya sangat bagusdan permainannya sangat ...,10
4864,2018,klook,adventure_cove,"Bagus permainan airnya, anak saya suka sekali,...",bagus permainan airnya anak saya suka sekali k...,10


In [None]:
# Discard rows that still have a missing value, renumber the index from 0,
# and record how many whitespace-separated tokens each cleaned review has.
df1 = (
    df.dropna()
    .reset_index(drop=True)
    .assign(token_counts=lambda frame: frame["reviews"].str.split().map(len))
)
df1

Unnamed: 0,date,source,attraction,review,reviews,rating,token_counts
0,2021,google_reviews,uss,"Belum Pernah Masuk kedalam, hanya diluar saja....",belum pernah masuk kedalam hanya diluar saja p...,10,28
1,2021,google_reviews,uss,Sumpahh kalo ke Singapore wajib banget ke sini...,sumpahh kalau ke singapore wajib banget ke sin...,10,43
2,2021,google_reviews,uss,"Seru bisa jajan coklat enak dan murah, kalo ma...",seru bisa jajan coklat enak dan murah kalau ma...,10,22
3,2022,google_reviews,uss,"Luas banget,banyak spot fotonya,wahananya keren²",luas bangetbanyak spot fotonyawahananya keren,10,5
4,2021,google_reviews,uss,Wahana Sangat Lengkap ₩Pokoknya Tempatnya Luar...,wahana sangat lengkap pokoknya tempatnya luar ...,10,13
...,...,...,...,...,...,...,...
4808,2018,klook,adventure_cove,Harga murah dan cocok untuk berkunjung bersama...,harga murah dan cocok untuk berkunjung bersama...,10,20
4809,2018,klook,adventure_cove,sangat menyenangkan. berbeda dengan waterpark ...,sangat menyenangkan berbeda dengan waterpark l...,10,28
4810,2018,klook,adventure_cove,"Tempatnya sangat bagus,dan permainannya sangat...",tempatnya sangat bagusdan permainannya sangat ...,10,8
4811,2018,klook,adventure_cove,"Bagus permainan airnya, anak saya suka sekali,...",bagus permainan airnya anak saya suka sekali k...,10,12


In [None]:
# Filter and Remove Reviews with less than 3 word tokens
# Built as a new frame instead of dropping from df1 in place, so this cell
# can be re-run without failing on the already-removed columns.
MIN_TOKENS = 3
sentosa_df = (
    df1[df1['token_counts'] >= MIN_TOKENS]
    .drop(columns=['review', 'token_counts'])
    .reset_index(drop=True)
)
sentosa_df

Unnamed: 0,date,source,attraction,reviews,rating
0,2021,google_reviews,uss,belum pernah masuk kedalam hanya diluar saja p...,10
1,2021,google_reviews,uss,sumpahh kalau ke singapore wajib banget ke sin...,10
2,2021,google_reviews,uss,seru bisa jajan coklat enak dan murah kalau ma...,10
3,2022,google_reviews,uss,luas bangetbanyak spot fotonyawahananya keren,10
4,2021,google_reviews,uss,wahana sangat lengkap pokoknya tempatnya luar ...,10
...,...,...,...,...,...
4594,2018,klook,adventure_cove,harga murah dan cocok untuk berkunjung bersama...,10
4595,2018,klook,adventure_cove,sangat menyenangkan berbeda dengan waterpark l...,10
4596,2018,klook,adventure_cove,tempatnya sangat bagusdan permainannya sangat ...,10
4597,2018,klook,adventure_cove,bagus permainan airnya anak saya suka sekali k...,10


In [None]:
# Write the cleaned reviews to Drive; index=False leaves out the row index.
sentosa_df.to_excel('/content/gdrive/MyDrive/data/clean/formatted_reviews.xlsx', index=False)