In [14]:
import pandas as pd
import numpy as np
import os

from googletrans import Translator

In [15]:
# Path to the folder containing the per-place CSV files
folder_path = './data'

# One DataFrame per CSV file, each tagged with the place it belongs to
dataframes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the combined file (and of everything derived from it) stable between runs.
for filename in sorted(os.listdir(folder_path)):
    # Split off the extension rather than using str.replace('.csv', ''), which
    # would also strip a '.csv' occurring in the middle of a file name.
    place_stem, extension = os.path.splitext(filename)
    if extension != '.csv':
        continue

    # Construct full file path and read the CSV file into a DataFrame
    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path)

    # The file name (underscores turned into spaces) is the place name
    place_name = place_stem.replace('_', ' ')
    df['Place name'] = place_name

    dataframes.append(df)

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when the folder holds no CSV files.
if not dataframes:
    raise FileNotFoundError(f"No .csv files found in {folder_path!r}")

# Combine all DataFrames into one
data_review = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
data_review.to_csv('combined_scrap_result.csv', index=False)

# Display the first few rows of the combined DataFrame
display(data_review.head())

data_review.describe()

Unnamed: 0,nama,bintang,komentar,Waktu,Place name
0,Satria Sihombing,5.0,view nya menyala 🤩🔥🔥,an hour,Bobocabin Baturraden Purwokerto
1,silas nainggolan,5.0,Penginapan yang paling menyatu dengan alam. Ca...,an hour,Bobocabin Baturraden Purwokerto
2,Ari Setiawan,5.0,"Bagus, hotel bernuasna baru yang berteknologi ...",2 hours,Bobocabin Baturraden Purwokerto
3,mutiara saragih,5.0,good,2 hours,Bobocabin Baturraden Purwokerto
4,hendro sebayang,5.0,So an exciting stay experience,2 hours,Bobocabin Baturraden Purwokerto


Unnamed: 0,bintang
count,11639.0
mean,4.570393
std,0.918343
min,1.0
25%,4.6
50%,5.0
75%,5.0
max,5.0


In [16]:
# DataFrame.size counts cells (rows x columns), so the two prints show the
# cell count before and after the drop, not the number of reviews.
print(data_review.size)
# Drop reviews that have no comment text; this mutates data_review in place.
data_review.dropna(subset="komentar", inplace=True)
print(data_review.size)
data_review.describe()

58195
33885


Unnamed: 0,bintang
count,6777.0
mean,4.46404
std,1.040103
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [17]:
# Renumber the rows 0..n-1 in place and discard the previous index labels.
data_review.reset_index(drop=True, inplace=True)
data_review

Unnamed: 0,nama,bintang,komentar,Waktu,Place name
0,Satria Sihombing,5.0,view nya menyala 🤩🔥🔥,an hour,Bobocabin Baturraden Purwokerto
1,silas nainggolan,5.0,Penginapan yang paling menyatu dengan alam. Ca...,an hour,Bobocabin Baturraden Purwokerto
2,Ari Setiawan,5.0,"Bagus, hotel bernuasna baru yang berteknologi ...",2 hours,Bobocabin Baturraden Purwokerto
3,mutiara saragih,5.0,good,2 hours,Bobocabin Baturraden Purwokerto
4,hendro sebayang,5.0,So an exciting stay experience,2 hours,Bobocabin Baturraden Purwokerto
...,...,...,...,...,...
6772,pohon bumi,4.0,"All digital technologies, capsule/pod hotel 🏨,...",2 months,Bobopod Thamrin Jakarta
6773,Trip.com Member,2.7,"Access only by stair, can not keep lugage, if ...",3 months,Bobopod Thamrin Jakarta
6774,Sandi Prabowo,5.0,2 hari nginep disini ngga mengecewakan. Keluar...,3 months,Bobopod Thamrin Jakarta
6775,suka review,5.0,Good places to stay deket St. Sudirman didepan...,4 months,Bobopod Thamrin Jakarta


In [18]:
# Load the colloquial lexicon, keeping only its first two columns
slang_df = pd.read_csv('colloquial-indonesian-lexicon.csv').iloc[:, :2]

# Lookup table: slang spelling -> formal spelling
slang_dict = slang_df.set_index('slang')['formal'].to_dict()

In [19]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import warnings, logging, regex as re

# Silence library warnings and surface INFO-level progress messages emitted
# by the cleaning steps.
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)

# slang_df / slang_dict are already built by the lexicon-loading cell that
# precedes this one, so the CSV is not read a second time here.

def _collapse_repeated_chars(text: str) -> str:
    """Shrink every run of 3+ identical characters to a single character.

    Digits and whitespace are excluded so that numbers such as ``1000`` and
    the spacing between words are left untouched.
    """
    return re.sub(r'([^\d\s])\1{2,}', r'\1', text)


def _build_slang_replacer(slang_map):
    """Return a ``str -> str`` function that rewrites whole-word slang."""
    # Skip non-string entries (e.g. NaN produced by empty CSV cells); they
    # can neither be escaped into the pattern nor used as a replacement.
    pairs = {
        key: value for key, value in slang_map.items()
        if isinstance(key, str) and key and isinstance(value, str)
    }
    if not pairs:
        # An empty alternation would match the empty string everywhere.
        return lambda text: text

    # Longest keys first so a multi-word entry wins over its own prefix.
    ordered_keys = sorted(pairs, key=len, reverse=True)
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(key) for key in ordered_keys) + r')\b')
    return lambda text: pattern.sub(lambda match: pairs[match.group()], text)


def clean_text(data: pd.DataFrame, slang_map=None) -> pd.DataFrame:
    """Return a cleaned copy of ``data`` with a normalised ``komentar`` column.

    Steps applied to ``komentar``, in order: lowercase, remove the
    "(diterjemahkan oleh google)" marker, turn newlines into spaces, collapse
    runs of three or more identical characters, and replace slang words using
    ``slang_map``.

    Punctuation removal, stripping, Sastrawi stemming and stop-word removal
    are deliberately not applied.

    Caution: slang replacement is applied to every comment regardless of its
    language, so English words that coincide with a lexicon entry are
    rewritten as well.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``komentar`` column. It is not modified.
    slang_map : dict, optional
        Mapping ``slang -> formal``. Defaults to the module-level
        ``slang_dict``.

    Returns
    -------
    pd.DataFrame
        A new frame holding the rows of ``data`` whose ``komentar`` is not
        null, with the original index labels preserved. Rows with nulls in
        other columns are kept.

    Raises
    ------
    KeyError
        If ``data`` has no ``komentar`` column.
    """
    if slang_map is None:
        slang_map = slang_dict
    # Compile the (large) slang pattern once, not once per row.
    normalize_slang = _build_slang_replacer(slang_map)

    logging.info("removing null..")
    # Only a missing comment disqualifies a row; .copy() makes the result an
    # independent frame so the column assignment below is unambiguous.
    cleaned = data.dropna(subset=["komentar"]).copy()
    comments = cleaned["komentar"].astype(str)

    logging.info("lowercasing..")
    comments = comments.str.lower()
    logging.info("removing google translate..")
    comments = comments.str.replace(
        "(diterjemahkan oleh google)", "", regex=False)
    logging.info("removing newline..")
    comments = comments.str.replace("\n", " ", regex=False)
    logging.info("removing traling char..")
    comments = comments.apply(_collapse_repeated_chars)
    logging.info("Normalize Slang..")
    comments = comments.apply(normalize_slang)

    cleaned["komentar"] = comments
    return cleaned

In [20]:
import time 

# process_time() measures CPU time consumed by this process, not wall-clock time.
start_time = time.process_time() 

# Only the cleaned 'komentar' values are copied back, under 'clean_text', so
# the original comment text stays available in 'komentar'.
data_review_clean = clean_text(data_review)
# Column assignment aligns on the index: a row that is absent from
# data_review_clean ends up with NaN in 'clean_text'.
data_review["clean_text"] = data_review_clean["komentar"]

end_time = time.process_time() 

print(f"Waktu CPU yang digunakan: {end_time - start_time} detik")

# Persist the result; the translation cells reload it from this file.
data_review.to_csv('cleaned_data.csv', index=False)

data_review

INFO:root:removing null..
INFO:root:lowercasing..


INFO:root:removing google translate..
INFO:root:removing newline..
INFO:root:removing traling char..
INFO:root:Normalize Slang..


Waktu CPU yang digunakan: 70.171875 detik


Unnamed: 0,nama,bintang,komentar,Waktu,Place name,clean_text
0,Satria Sihombing,5.0,view nya menyala 🤩🔥🔥,an hour,Bobocabin Baturraden Purwokerto,view nya menyala 🤩🔥🔥
1,silas nainggolan,5.0,Penginapan yang paling menyatu dengan alam. Ca...,an hour,Bobocabin Baturraden Purwokerto,penginapan yang paling menyatu dengan alam. ca...
2,Ari Setiawan,5.0,"Bagus, hotel bernuasna baru yang berteknologi ...",2 hours,Bobocabin Baturraden Purwokerto,"bagus, hotel bernuasna baru yang berteknologi ..."
3,mutiara saragih,5.0,good,2 hours,Bobocabin Baturraden Purwokerto,good
4,hendro sebayang,5.0,So an exciting stay experience,2 hours,Bobocabin Baturraden Purwokerto,sok an exciting stay experience
...,...,...,...,...,...,...
6772,pohon bumi,4.0,"All digital technologies, capsule/pod hotel 🏨,...",2 months,Bobopod Thamrin Jakarta,"all digital technologies, capsule/pod hotel 🏨,..."
6773,Trip.com Member,2.7,"Access only by stair, can not keep lugage, if ...",3 months,Bobopod Thamrin Jakarta,"access only by stair, can not keep lugage, if ..."
6774,Sandi Prabowo,5.0,2 hari nginep disini ngga mengecewakan. Keluar...,3 months,Bobopod Thamrin Jakarta,2 hari nginep disini enggak mengecewakan. kelu...
6775,suka review,5.0,Good places to stay deket St. Sudirman didepan...,4 months,Bobopod Thamrin Jakarta,good places tapi stay dekat st. sudirman didep...


In [21]:
# Summary statistics of the numeric columns after the cleaning step.
data_review.describe()

Unnamed: 0,bintang
count,6777.0
mean,4.46404
std,1.040103
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [1]:
import pandas as pd
from deep_translator import GoogleTranslator
from tqdm import tqdm
import time
import os

# Read the cleaned reviews from 'cleaned_data.csv'
combined_df = pd.read_csv('cleaned_data.csv')

# Function to translate text to English with retry mechanism
def translate_to_english(text, max_retries=5, base_delay=5, translator=None):
    """Translate ``text`` to English, retrying on failure.

    Parameters
    ----------
    text : Any
        Value to translate. Anything that is not a non-empty ``str`` (NaN,
        ``None``, numbers, ``""``) is returned unchanged without calling the
        translation service.
    max_retries : int, default 5
        Maximum number of translation attempts.
    base_delay : float, default 5
        Seconds of linear backoff: attempt ``k`` (1-based) is followed by a
        pause of ``base_delay * k`` seconds before the next attempt.
    translator : object, optional
        Any object exposing ``translate(text) -> str``. Defaults to
        ``GoogleTranslator(source='auto', target='en')``.

    Returns
    -------
    The translated string, or ``text`` itself when it was not translatable
    or every attempt failed.
    """
    # isinstance is checked first so values whose truthiness is undefined
    # (e.g. pd.NA) are passed through instead of raising and being retried.
    if not isinstance(text, str) or not text:
        return text

    for attempt in range(max_retries):
        try:
            engine = (translator if translator is not None
                      else GoogleTranslator(source='auto', target='en'))
            return engine.translate(text)
        except Exception as e:
            print(f"Error translating text: {e}, Attempt: {attempt + 1}")
            # Linear backoff; no point in waiting after the final attempt.
            if attempt + 1 < max_retries:
                time.sleep(base_delay * (attempt + 1))
    return text

# Register tqdm's pandas integration (enables Series.progress_apply)
tqdm.pandas(desc="Translating")

# Check if there's already a progress file to continue from
if os.path.exists('combined_English_scrap_result_partial.csv'):
    translated_df = pd.read_csv('combined_English_scrap_result_partial.csv')
    # Never resume past the end of the current data set
    start_index = min(len(translated_df), len(combined_df))
    # One positional .iloc write instead of chained indexing
    # (df[col][:n] = ...), which may assign into a temporary copy and leave
    # combined_df unchanged.
    komentar_position = combined_df.columns.get_loc('komentar')
    combined_df.iloc[:start_index, komentar_position] = (
        translated_df['komentar'].to_numpy()[:start_index]
    )
else:
    start_index = 0

# Translate the remaining rows; 'komentar' is overwritten with the English
# translation of 'clean_text'.
for i in tqdm(range(start_index, len(combined_df)), desc="Translating"):
    combined_df.at[i, 'komentar'] = translate_to_english(combined_df.at[i, 'clean_text'])
    if i % 100 == 0:  # Save progress every 100 rows
        # The checkpoint holds rows 0..i, so its length is the next row to do
        combined_df.iloc[:i + 1].to_csv('combined_English_scrap_result_partial.csv', index=False)

# Save the final translated DataFrame to a new CSV file
combined_df.to_csv('combined_English_scrap_result.csv', index=False)

# Print the first few rows of the translated DataFrame
print(combined_df.head())

Translating:   0%|          | 3/6777 [00:00<24:42,  4.57it/s]

In [22]:
# from deep_translator import GoogleTranslator
# from tqdm import tqdm

# # Read the CSV file
# combined_df = pd.read_csv('cleaned_data.csv')

# # Function to translate text to English
# def translate_to_english(text):
#     try:
#         if text and isinstance(text, str):
#             translated = GoogleTranslator(source='auto', target='en').translate(text)
#             return translated
#         return text
#     except Exception as e:
#         print(f"Error translating text: {e}, The text : {text}")
#         return text

# # Apply the translation function with progress bar
# tqdm.pandas(desc="Translating")
# combined_df['komentar'] = combined_df['clean_text'].progress_apply(translate_to_english)

# # Save the translated DataFrame to a new CSV file
# combined_df.to_csv('combined_English_scrap_result.csv', index=False)

# # Print the first few rows of the translated DataFrame
# print(combined_df.head())

Translating:  11%|█         | 731/6777 [07:41<1:25:19,  1.18it/s]

In [8]:
from deep_translator import GoogleTranslator

# Load the cleaned reviews from 'cleaned_data.csv'
combined_df = pd.read_csv('cleaned_data.csv')

# Convert specified columns to string
# combined_df['nama'] = combined_df['nama'].astype(str)
# combined_df['komentar'] = combined_df['komentar'].astype(str)
# combined_df['Waktu'] = combined_df['Waktu'].astype(str)

# Function to translate text to Indonesian
def translate_to_indonesian(text):
    """Translate ``text`` to Indonesian in a single attempt.

    Values that are falsy or not ``str`` are handed back untouched. If
    anything raises along the way, the error is printed together with the
    offending text and the input is returned as is.
    """
    try:
        if not (text and isinstance(text, str)):
            return text
        engine = GoogleTranslator(source='auto', target='id')
        return engine.translate(text)
    except Exception as err:
        print(f"Error translating text: {err}, The text : {text}")
        return text

# Replace every 'komentar' value with its Indonesian translation
combined_df['komentar'] = combined_df['komentar'].map(translate_to_indonesian)

# Write the translated frame to its own CSV file
combined_df.to_csv('combined_Indonesia_scrap_result.csv', index=False)

# Preview the first rows of the translated frame
print(combined_df.head())

               nama  bintang  \
0  Satria Sihombing      5.0   
1  silas nainggolan      5.0   
2      Ari Setiawan      5.0   
3   mutiara saragih      5.0   
4   hendro sebayang      5.0   

                                            komentar    Waktu  \
0                               view nya menyala 🤩🔥🔥  an hour   
1  Akomodasi yang paling menyatu dengan alam. Kab...  an hour   
2  Bagus, hotel bernuasna baru yang berteknologi ...  2 hours   
3                                              Bagus  2 hours   
4                 Jadi pengalaman menginap yang seru  2 hours   

                        Place name  \
0  Bobocabin Baturraden Purwokerto   
1  Bobocabin Baturraden Purwokerto   
2  Bobocabin Baturraden Purwokerto   
3  Bobocabin Baturraden Purwokerto   
4  Bobocabin Baturraden Purwokerto   

                                          clean_text  
0                                     view nya nyala  
1  inap paling satu alam cabin nya canggih pakai ...  
2  bagus hotel bernuas

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Load the English-translated reviews, keeping only rows that have a comment
tes = pd.read_csv('combined_English_scrap_result.csv').dropna(subset="komentar")

# Bag-of-words model: skip terms found in more than 95% of the reviews or in
# fewer than 2 reviews, and drop English stop words
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Learn the vocabulary and build the document-term matrix
dtm = vectorizer.fit_transform(tes['komentar'])
dtm

<6732x4562 sparse matrix of type '<class 'numpy.int64'>'
	with 115589 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.decomposition import LatentDirichletAllocation

# Set the number of topics
num_topics = 5

# Create LDA model (fixed random_state so the topics are reproducible)
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# Fit the LDA model
lda.fit(dtm)

# The vocabulary is the same for every topic, so fetch it once instead of
# rebuilding the array for each printed word.
feature_names = vectorizer.get_feature_names_out()

# Display the 20 highest-weighted terms of each topic. argsort() is
# ascending, so the strongest term is printed last.
for index, topic in enumerate(lda.components_):
    print(f'Topic #{index + 1}:')
    print([feature_names[i] for i in topic.argsort()[-20:]])

Topic #1:
['okay', 'stay', 'like', 'enter', 'order', 'road', 'pay', 'night', 'bring', 'just', 'time', 'good', 'price', 'want', 'lot', 'check', 'room', 'don', 'car', 'parking']
Topic #2:
['bed', 'don', 'price', 'pod', 'bathroom', 'time', 'place', 'just', 'stay', 'sok', 'staff', 'bobobox', 'service', 'si', 'really', 'like', 'clean', 'hotel', 'room', 'good']
Topic #3:
['atmosphere', 'food', 'location', 'area', 'great', 'staff', 'nature', 'night', 'room', 'experience', 'beautiful', 'nice', 'cool', 'really', 'good', 'stay', 'place', 'bobocabin', 'view', 'cabin']
Topic #4:
['eat', 'don', 'bobobox', 'soap', 'ready', 'available', 'good', 'like', 'air', 'area', 'quite', 'hot', 'shower', 'cold', 'toilet', 'floor', 'clean', 'bathroom', 'water', 'room']
Topic #5:
['facilities', 'recommended', 'staff', 'really', 'hotel', 'city', 'suitable', 'room', 'near', 'nice', 'service', 'strategic', 'stay', 'bobobox', 'good', 'friendly', 'location', 'place', 'clean', 'comfortable']
