In [2]:
import pandas as pd
import numpy as np
import os

from googletrans import Translator

# Combine Data

In [36]:
# Path to the folder containing the CSV files
folder_path = './data'

# List to hold individual dataframes
dataframes = []

# Iterate over all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Extract the place name from the filename (removing the '.csv' extension)
        place_name = filename.replace('.csv', '').replace('_', ' ')
        
        # Add the place column to the DataFrame
        df['Place name'] = place_name
        
        # Append the DataFrame to the list
        dataframes.append(df)

# Combine all DataFrames into one
data_review = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
data_review.to_csv('combined_scrap_result.csv', index=False)

# Display the first few rows of the combined DataFrame
display(data_review.head())

data_review.describe()

Unnamed: 0,nama,bintang,komentar,Waktu,Place name
0,Satria Sihombing,5.0,view nya menyala 🤩🔥🔥,an hour,Bobocabin Baturraden Purwokerto
1,silas nainggolan,5.0,Penginapan yang paling menyatu dengan alam. Ca...,an hour,Bobocabin Baturraden Purwokerto
2,Ari Setiawan,5.0,"Bagus, hotel bernuasna baru yang berteknologi ...",2 hours,Bobocabin Baturraden Purwokerto
3,mutiara saragih,5.0,good,2 hours,Bobocabin Baturraden Purwokerto
4,hendro sebayang,5.0,So an exciting stay experience,2 hours,Bobocabin Baturraden Purwokerto


Unnamed: 0,bintang
count,11639.0
mean,4.570393
std,0.918343
min,1.0
25%,4.6
50%,5.0
75%,5.0
max,5.0


In [37]:
print(data_review.size)
data_review.dropna(subset="komentar", inplace=True)
print(data_review.size)
data_review.describe()

58195
33885


Unnamed: 0,bintang
count,6777.0
mean,4.46404
std,1.040103
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [38]:
data_review.reset_index(drop=True, inplace=True)
data_review

Unnamed: 0,nama,bintang,komentar,Waktu,Place name
0,Satria Sihombing,5.0,view nya menyala 🤩🔥🔥,an hour,Bobocabin Baturraden Purwokerto
1,silas nainggolan,5.0,Penginapan yang paling menyatu dengan alam. Ca...,an hour,Bobocabin Baturraden Purwokerto
2,Ari Setiawan,5.0,"Bagus, hotel bernuasna baru yang berteknologi ...",2 hours,Bobocabin Baturraden Purwokerto
3,mutiara saragih,5.0,good,2 hours,Bobocabin Baturraden Purwokerto
4,hendro sebayang,5.0,So an exciting stay experience,2 hours,Bobocabin Baturraden Purwokerto
...,...,...,...,...,...
6772,pohon bumi,4.0,"All digital technologies, capsule/pod hotel 🏨,...",2 months,Bobopod Thamrin Jakarta
6773,Trip.com Member,2.7,"Access only by stair, can not keep lugage, if ...",3 months,Bobopod Thamrin Jakarta
6774,Sandi Prabowo,5.0,2 hari nginep disini ngga mengecewakan. Keluar...,3 months,Bobopod Thamrin Jakarta
6775,suka review,5.0,Good places to stay deket St. Sudirman didepan...,4 months,Bobopod Thamrin Jakarta


# Preprocess 1 (indo)

In [39]:
slang_df = pd.read_csv('colloquial-indonesian-lexicon.csv')
slang_df = slang_df.iloc[:, :2]

slang_dict = pd.Series(slang_df.formal.values, index=slang_df.slang).to_dict()

In [40]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import warnings, logging, regex as re

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)

slang_df = pd.read_csv('colloquial-indonesian-lexicon.csv')
slang_df = slang_df.iloc[:, :2]

slang_dict = pd.Series(slang_df.formal.values, index=slang_df.slang).to_dict()
slang_dict

def clean_text(data: pd.DataFrame) -> pd.DataFrame:
    
    def remove_trailing_char(st):
        s_arr = st.split(" ")
        hasil = []
        for s in s_arr:
            unique_char = set(s)
            for char in unique_char:
                if s.count(char) > 2:
                    s = s.replace(char*s.count(char), char)
            hasil.append(s)

        return " ".join(hasil)

    def normalize_text(text):
        # Create regex pattern to match the slang words
        pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in slang_dict.keys()) + r')\b')
        
        # Replace slang words with formal words using the dictionary
        normalized_text = pattern.sub(lambda x: slang_dict[x.group()], text)
        
        return normalized_text

    stemmer = StemmerFactory().create_stemmer()
    stopword = StopWordRemoverFactory().create_stop_word_remover()

    logging.info("removing null..")
    data = data.dropna()
    logging.info("lowercasing..")
    data["komentar"] = data["komentar"].str.lower()
    logging.info("removing google translate..")
    data["komentar"] = data["komentar"].apply(
        lambda x: x.replace("(diterjemahkan oleh google)", ""))
    logging.info("removing newline..")
    data["komentar"] = data["komentar"].apply(lambda x: x.replace("\n", " "))
    # logging.info("removing punctuation..")
    # data["komentar"] = data["komentar"].str.replace('[^\w\s]', ' ')
    # data["komentar"] = data["komentar"].str.replace('²', ' ')
    logging.info("removing traling char..")
    data["komentar"] = data["komentar"].apply(
        lambda x: remove_trailing_char(x))
    logging.info("Normalize Slang..")
    data["komentar"] = data["komentar"].apply(
        lambda x: normalize_text(x))
    # logging.info("stripping..")
    # data["komentar"] = data["komentar"].apply(lambda x: x.strip())
    # logging.info("stemming..")
    # data["komentar"] = data["komentar"].apply(stemmer.stem)
    # logging.info("remove stopword..")
    # data["komentar"] = data["komentar"].apply(stopword.remove)

    return data

In [41]:
import time 

start_time = time.process_time() 

data_review_clean = clean_text(data_review)
data_review["clean_text"] = data_review_clean["komentar"]

end_time = time.process_time() 

print(f"Waktu CPU yang digunakan: {end_time - start_time} detik")

data_review.to_csv('cleaned_data.csv', index=False)

data_review

INFO:root:removing null..
INFO:root:lowercasing..
INFO:root:removing google translate..
INFO:root:removing newline..
INFO:root:removing traling char..
INFO:root:Normalize Slang..


Waktu CPU yang digunakan: 63.65625 detik


Unnamed: 0,nama,bintang,komentar,Waktu,Place name,clean_text
0,Satria Sihombing,5.0,view nya menyala 🤩🔥🔥,an hour,Bobocabin Baturraden Purwokerto,view nya menyala 🤩🔥🔥
1,silas nainggolan,5.0,Penginapan yang paling menyatu dengan alam. Ca...,an hour,Bobocabin Baturraden Purwokerto,penginapan yang paling menyatu dengan alam. ca...
2,Ari Setiawan,5.0,"Bagus, hotel bernuasna baru yang berteknologi ...",2 hours,Bobocabin Baturraden Purwokerto,"bagus, hotel bernuasna baru yang berteknologi ..."
3,mutiara saragih,5.0,good,2 hours,Bobocabin Baturraden Purwokerto,good
4,hendro sebayang,5.0,So an exciting stay experience,2 hours,Bobocabin Baturraden Purwokerto,sok an exciting stay experience
...,...,...,...,...,...,...
6772,pohon bumi,4.0,"All digital technologies, capsule/pod hotel 🏨,...",2 months,Bobopod Thamrin Jakarta,"all digital technologies, capsule/pod hotel 🏨,..."
6773,Trip.com Member,2.7,"Access only by stair, can not keep lugage, if ...",3 months,Bobopod Thamrin Jakarta,"access only by stair, can not keep lugage, if ..."
6774,Sandi Prabowo,5.0,2 hari nginep disini ngga mengecewakan. Keluar...,3 months,Bobopod Thamrin Jakarta,2 hari nginep disini enggak mengecewakan. kelu...
6775,suka review,5.0,Good places to stay deket St. Sudirman didepan...,4 months,Bobopod Thamrin Jakarta,good places tapi stay dekat st. sudirman didep...


In [42]:
data_review.describe()

Unnamed: 0,bintang
count,6777.0
mean,4.46404
std,1.040103
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [43]:
import pandas as pd
from deep_translator import GoogleTranslator
from tqdm import tqdm
import time
import os
import threading

# Read the CSV file
combined_df = pd.read_csv('cleaned_data.csv')

# Function to handle timeout using threading
class TranslatorThread(threading.Thread):
    def __init__(self, text):
        threading.Thread.__init__(self)
        self.text = text
        self.translated_text = None
        self.exception = None

    def run(self):
        try:
            if self.text and isinstance(self.text, str):
                self.translated_text = GoogleTranslator(source='auto', target='en').translate(self.text)
            else:
                self.translated_text = self.text
        except Exception as e:
            self.exception = e

# Function to translate text to English with timeout and retry mechanism
def translate_to_english_with_timeout(text, timeout=60):
    max_retries = 5
    for attempt in range(max_retries):
        try:
            translator_thread = TranslatorThread(text)
            translator_thread.start()
            translator_thread.join(timeout)
            if translator_thread.is_alive():
                raise TimeoutError("Translation timed out.")
            if translator_thread.exception:
                raise translator_thread.exception
            return translator_thread.translated_text
        except Exception as e:
            print(f"Error translating text: {e}, Attempt: {attempt + 1}")
            time.sleep(5 * (attempt + 1))  # Exponential backoff
    return text

# Apply the translation function with progress bar
tqdm.pandas(desc="Translating")

# Check if there's already a progress file to continue from
if os.path.exists('combined_English_scrap_result_partial.csv'):
    translated_df = pd.read_csv('combined_English_scrap_result_partial.csv')
    start_index = len(translated_df)
    combined_df['komentar'][:start_index] = translated_df['komentar']
else:
    start_index = 0

last_save_index = start_index
for i in tqdm(range(start_index, len(combined_df)), desc="Translating"):
    try:
        combined_df.at[i, 'komentar'] = translate_to_english_with_timeout(combined_df.at[i, 'clean_text'])
    except TimeoutError:
        print(f"No progress after timeout at index {i}. Stopping.")
        break
    if i % 100 == 0 or i == len(combined_df) - 1:  # Save progress every 100 rows or at the end
        combined_df.iloc[:i + 1].to_csv('combined_English_scrap_result_partial.csv', index=False)
        last_save_index = i

# Save the final translated DataFrame to a new CSV file
combined_df.iloc[:last_save_index + 1].to_csv('combined_English_scrap_result.csv', index=False)

# Print the first few rows of the translated DataFrame
print(combined_df.head())


Translating: 0it [00:00, ?it/s]

               nama  bintang  \
0  Satria Sihombing      5.0   
1  silas nainggolan      5.0   
2      Ari Setiawan      5.0   
3   mutiara saragih      5.0   
4   hendro sebayang      5.0   

                                            komentar    Waktu  \
0                                 the view is on 🤩🔥🔥  an hour   
1  accommodation that is most united with nature....  an hour   
2  nice, new hotel with a high-tech feel, with ve...  2 hours   
3                                               good  2 hours   
4                    sok an exciting stay experience  2 hours   

                        Place name  \
0  Bobocabin Baturraden Purwokerto   
1  Bobocabin Baturraden Purwokerto   
2  Bobocabin Baturraden Purwokerto   
3  Bobocabin Baturraden Purwokerto   
4  Bobocabin Baturraden Purwokerto   

                                          clean_text  
0                               view nya menyala 🤩🔥🔥  
1  penginapan yang paling menyatu dengan alam. ca...  
2  bagus, hotel bernua




In [44]:
data_review.describe()

Unnamed: 0,bintang
count,6777.0
mean,4.46404
std,1.040103
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [45]:
combined_df.describe()

Unnamed: 0,bintang
count,6777.0
mean,4.46404
std,1.040103
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [46]:
data_review.describe() == combined_df.describe()

Unnamed: 0,bintang
count,True
mean,True
std,True
min,True
25%,True
50%,True
75%,True
max,True


In [12]:
translated_df = pd.read_csv('combined_English_scrap_result.csv')
translated_df.head()

Unnamed: 0,nama,bintang,komentar,Waktu,Place name,clean_text
0,Satria Sihombing,5.0,the view is on 🤩🔥🔥,an hour,Bobocabin Baturraden Purwokerto,view nya menyala 🤩🔥🔥
1,silas nainggolan,5.0,accommodation that is most united with nature....,an hour,Bobocabin Baturraden Purwokerto,penginapan yang paling menyatu dengan alam. ca...
2,Ari Setiawan,5.0,"nice, new hotel with a high-tech feel, with ve...",2 hours,Bobocabin Baturraden Purwokerto,"bagus, hotel bernuasna baru yang berteknologi ..."
3,mutiara saragih,5.0,good,2 hours,Bobocabin Baturraden Purwokerto,good
4,hendro sebayang,5.0,sok an exciting stay experience,2 hours,Bobocabin Baturraden Purwokerto,sok an exciting stay experience


In [80]:
location_df = pd.read_csv('bobobox_location.csv')
location_df.head()

Unnamed: 0,Nama Fasilitas,Place Id,Jenis,Provinsi,Kota,Kecamatan,Kelurahan,Link Gmaps,Latitude,Longitude
0,Bobopod Alun Alun Malang,0,Bobopod,Jawa Timur,Kota Malang,Klojen,Kauman,https://maps.app.goo.gl/zTLvQof4s9fmHmkn7,-7.983535,112.63012
1,Bobopod Alun Alun Bandung,1,Bobopod,Jawa Barat,Bandung,Regol,Balonggede,https://maps.app.goo.gl/ts8d9LSmwWVGvA3JA,-7.79483,110.365255
2,Bobopod Paskal Bandung,2,Bobopod,Jawa Barat,Bandung,Cicendo,Pasar Kaliki,https://maps.app.goo.gl/gGkjpa6MYZevfpce9,-6.910165,107.598052
3,Bobopod Cipaganti Bandung,3,Bobopod,Jawa Barat,Bandung,Cicendo,Pasar Kaliki,https://maps.app.goo.gl/Ji8pQu9cYFN2hopc8,-6.902427,107.603081
4,Bobopod Mega Mall Bekasi,4,Bobopod,Jawa Barat,Bekasi,Bekasi Selatan,Marga Jaya,https://maps.app.goo.gl/7tDZE1D7qXLoZcio6,-6.247576,106.992864


In [84]:
import warnings, logging, regex as re
import string
from datetime import datetime, timedelta
import inflect

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)

import nltk
nltk.download('stopwords')

p = inflect.engine()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def stem_text(text, stemmer):
    tokens = text.split()
    return ' '.join([stemmer.stem(token) for token in tokens])

def singularize_text(text):
    words = text.split()
    singular_words = [p.singular_noun(word) if p.singular_noun(word) else word for word in words]
    return ' '.join(singular_words)

# Scraping date
scraping_date = datetime(2024, 7, 12)

def clean_text(data: pd.DataFrame) -> pd.DataFrame:

    logging.info("Removing null values...")
    data = data.dropna()

    logging.info("Lowercasing text...")
    data["komentar"] = data["komentar"].str.lower()

    logging.info("Removing newlines...")
    data["komentar"] = data["komentar"].apply(lambda x: x.replace("\n", " "))

    logging.info("Removing punctuation...")
    data["komentar"] = data["komentar"].apply(lambda x: remove_punctuation(x))

    logging.info("Removing emojis...")
    data["komentar"] = data["komentar"].apply(lambda x: remove_emoji(x))

    logging.info("Singularizing text...")
    data["komentar"] = data["komentar"].apply(lambda x: singularize_text(x))

    return data


def convert_to_date(time_desc):
    if 'year' in time_desc:
        years = int(re.search(r'\d+', time_desc).group()) if re.search(r'\d+', time_desc) else 1
        return scraping_date - timedelta(days=years*365)
    elif 'month' in time_desc:
        months = int(re.search(r'\d+', time_desc).group()) if re.search(r'\d+', time_desc) else 1
        return scraping_date - timedelta(days=months*30)
    elif 'week' in time_desc:
        weeks = int(re.search(r'\d+', time_desc).group()) if re.search(r'\d+', time_desc) else 1
        return scraping_date - timedelta(weeks=weeks)
    elif 'day' in time_desc:
        days = int(re.search(r'\d+', time_desc).group()) if re.search(r'\d+', time_desc) else 1
        return scraping_date - timedelta(days=days)
    elif 'hour' in time_desc:
        hours = int(re.search(r'\d+', time_desc).group()) if re.search(r'\d+', time_desc) else 1
        return scraping_date - timedelta(hours=hours)
    else:
        return scraping_date

[nltk_data] Downloading package stopwords to C:\Users\Muhammad Harun
[nltk_data]     A\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
import time 

start_time = time.process_time() 

final_df = translated_df.copy()

data_review_clean = clean_text(final_df)
final_df["final_text"] = data_review_clean["komentar"]

final_df.dropna(inplace=True)
final_df.drop('komentar', axis= 1, inplace=True)

# Apply the conversion function to the time column
final_df['Waktu'] = final_df['Waktu'].apply(convert_to_date)

final_df.rename(columns={'clean_text': 'reviews', 'nama': 'name', 'bintang' : 'star', 'Waktu' : 'time'}, inplace=True)

final_df['review_id'] = final_df.index

# Merge the datasets based on the location name
merged_data = pd.merge(final_df, location_df, left_on='Place name', right_on='Nama Fasilitas', how='left')

# Display the merged data to check the results
col = ['name', 'star', 'time', 'Place Id', 'review_id', 'reviews', 'final_text']
merged_data = merged_data[col]
merged_data.head()

end_time = time.process_time() 

print(f"Waktu CPU yang digunakan: {end_time - start_time} detik")

# This data will be used for modelling in modeling.ipynb to do Aspect Based Sentiment Analysis (ABSA)
merged_data.to_csv('final_data.csv', index=False)

merged_data

INFO:root:Removing null values...
INFO:root:Lowercasing text...
INFO:root:Removing newlines...
INFO:root:Removing punctuation...
INFO:root:Removing emojis...
INFO:root:Singularizing text...


Waktu CPU yang digunakan: 41.5625 detik


Unnamed: 0,name,star,time,Place Id,review_id,reviews,final_text
0,Satria Sihombing,5.0,2024-07-11 23:00:00,14,0,view nya menyala 🤩🔥🔥,the view i on
1,silas nainggolan,5.0,2024-07-11 23:00:00,14,1,penginapan yang paling menyatu dengan alam. ca...,accommodation that i most united with nature t...
2,Ari Setiawan,5.0,2024-07-11 22:00:00,14,2,"bagus, hotel bernuasna baru yang berteknologi ...",nice new hotel with a hightech feel with very ...
3,mutiara saragih,5.0,2024-07-11 22:00:00,14,3,good,good
4,hendro sebayang,5.0,2024-07-11 22:00:00,14,4,sok an exciting stay experience,sok an exciting stay experience
...,...,...,...,...,...,...,...
6747,pohon bumi,4.0,2024-05-13 00:00:00,6,6772,"all digital technologies, capsule/pod hotel 🏨,...",all digital technology capsulepod hotel ultra ...
6748,Trip.com Member,2.7,2024-04-13 00:00:00,6,6773,"access only by stair, can not keep lugage, if ...",acces only by stair can not keep lugage if you...
6749,Sandi Prabowo,5.0,2024-04-13 00:00:00,6,6774,2 hari nginep disini enggak mengecewakan. kelu...,2 day staying here did not disappoint outside ...
6750,suka review,5.0,2024-03-14 00:00:00,6,6775,good places tapi stay dekat st. sudirman didep...,good place but stay near st sudirman in front ...


In [89]:
# This data will be used for dashboarding without preprocessed text to make it smaller in size
viz_data = merged_data.drop('final_text', axis = 1)
viz_data.to_csv('data reviews viz.csv', index = False)