# Import Libraries

In [32]:
import pandas as pd
import os
import re
import string
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import requests
import csv
from io import StringIO
from sklearn.utils import resample
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from rapidfuzz import process
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Loading

In [33]:
def data_loading(root, category):
    root_path = "dataset"
    folder_path = root_path + f"/{category}"

    csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv") and f!=f"{category}.csv"]

    # List to store each DataFrame
    dfs = []

    # Read and collect DataFrames
    for file in csv_files:
        full_path = os.path.join(folder_path, file)
        # print(f"Reading: {full_path}")
        df = pd.read_csv(full_path)
        # df['item_id'] = 
        # print(df.head(5))
        dfs.append(df)
    
    # Combine all into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)

    # Show result
    # print(combined_df.head())
    print(f"\nTotal combined rows of {category}: {len(combined_df)}")

    return combined_df

In [34]:
def data_loading(root, category):
    root_path = "dataset"
    folder_path = os.path.join(root_path, category)

    # Load product list (assumes it's named {category}.csv)
    product_list_path = os.path.join(folder_path, f"{category}.csv")
    product_df = pd.read_csv(product_list_path)

    # Normalize product names for matching
    product_names = product_df['Product Name'].tolist()

    # Get review files
    csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv") and f != f"{category}.csv"]

    dfs = []
    for file in csv_files:
        full_path = os.path.join(folder_path, file)
        df = pd.read_csv(full_path)

        # Extract base name from review file
        base_name = file.replace("_reviews.csv", "").replace("_", " ").strip()

        # Find best match using fuzzy matching
        match, score, idx = process.extractOne(base_name, product_names, score_cutoff=50)
        if match:
            item_id = product_df.iloc[idx]['item_id']
            df['item_id'] = item_id
            dfs.append(df)
        else:
            print(f"[!] No good match for file: {file}")

    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"\nTotal combined rows of {category}: {len(combined_df)}")
    return combined_df


In [35]:
root_path = "dataset"

# List all CSV files in the folder
category_names = [f for f in os.listdir(root_path)]

print(category_names)

['Elektronik', 'Aksesoris', 'Kesehatan', 'Kecantikan', 'Alas_kaki', 'Pakaian_Pria', 'Pakaian_Wanita']


In [36]:
all_dfs = {
    name: data_loading(root_path, category)
    for name, category in zip(category_names, category_names)
}



Total combined rows of Elektronik: 6287

Total combined rows of Aksesoris: 9659

Total combined rows of Kesehatan: 2676

Total combined rows of Kecantikan: 17285

Total combined rows of Alas_kaki: 6969

Total combined rows of Pakaian_Pria: 11537

Total combined rows of Pakaian_Wanita: 7649


In [37]:
all_dfs['Elektronik'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6287 entries, 0 to 6286
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   review   6086 non-null   object
 1   rating   6287 non-null   object
 2   date     4447 non-null   object
 3   item_id  6287 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 196.6+ KB


In [38]:
all_dfs['Elektronik'].head()

Unnamed: 0,review,rating,date,item_id
0,Kualitas barang original dan respon penjual ju...,bintang 5,4 bulan lalu,38
1,"Seller responsif, pengiriman cepat, barangnya ...",bintang 5,5 bulan lalu,38
2,"Seller responsif, pengiriman cepat, barangnya ...",bintang 5,5 bulan lalu,38
3,Pesanan yang ditunggu-tunggu akhirnya datang j...,bintang 5,9 bulan lalu,38
4,"Unit Surface dlm Box, BNIB, dikemas rapi dan a...",bintang 5,7 bulan lalu,38


In [39]:
all_dfs['Elektronik'].describe()

Unnamed: 0,item_id
count,6287.0
mean,18.088596
std,12.751633
min,1.0
25%,3.0
50%,19.0
75%,29.0
max,38.0


# Data Preprocessing

### Date Converting

In [40]:
for name, df in all_dfs.items():
    print(f"Category: {name}")
    
    # Drop rows with NaT in parsed_date and reset index
    df.dropna(subset=['date'], inplace=True)

Category: Elektronik
Category: Aksesoris
Category: Kesehatan
Category: Kecantikan
Category: Alas_kaki
Category: Pakaian_Pria
Category: Pakaian_Wanita


In [41]:
all_dfs['Kecantikan'].head()

Unnamed: 0,review,rating,date,item_id
20,"bahannya enak, merata di muka, bikin muka fresh",bintang 5,11 bulan lalu,1
32,"bahannya ringan, mudah diserap, warnanya cocok...",bintang 5,1 bulan lalu,1
47,produk baagus bgt .. buat sehari2 cocok..packa...,bintang 5,produk baagus bgt .. buat sehari2 cocok..packa...,1
49,tipe kulitku kombinasi under tone kulitku netral,bintang 5,10 bulan lalu,1
75,"nyaman dipakai, warnanya pas dengan kulitku, m...",bintang 5,"nyaman dipakai, warnanya pas dengan kulitku, m...",1


In [42]:
# Define a function to check if a date string is valid (e.g., "3 bulan lalu")
def is_valid_date_string(s):
    if not isinstance(s, str):
        return False
    return bool(re.match(r"^\d+\s+(bulan|minggu|hari|tahun|jam)\s+lalu$", s.strip().lower()))

In [43]:
for name, df in all_dfs.items():
    print(f"Category: {name}")
    
    # Keep rows where `date` contains any of the expected time keywords
    df = df[df["date"].apply(is_valid_date_string)].copy()

    all_dfs[name] = df

Category: Elektronik
Category: Aksesoris
Category: Kesehatan
Category: Kecantikan
Category: Alas_kaki
Category: Pakaian_Pria
Category: Pakaian_Wanita


In [44]:
all_dfs['Kecantikan'].head()

Unnamed: 0,review,rating,date,item_id
20,"bahannya enak, merata di muka, bikin muka fresh",bintang 5,11 bulan lalu,1
32,"bahannya ringan, mudah diserap, warnanya cocok...",bintang 5,1 bulan lalu,1
49,tipe kulitku kombinasi under tone kulitku netral,bintang 5,10 bulan lalu,1
111,Selalu happy belanja disini. Produk Packingan ...,bintang 5,3 bulan lalu,1
112,wokeh. cucok digunakan. terima kasih ya,bintang 5,11 bulan lalu,1


In [45]:
# Replace with your actual DataFrame loading
# df = pd.read_csv("your_reviews.csv")

now = datetime.now()

def parse_relative_date(date_str):
    now = pd.Timestamp.now()
    try:
        if 'bulan' in date_str:
            n = int(re.search(r'(\d+)', date_str).group(1))
            return now - pd.DateOffset(months=n)
        elif 'minggu' in date_str:
            n = int(re.search(r'(\d+)', date_str).group(1))
            return now - pd.Timedelta(weeks=n)
        elif 'hari' in date_str:
            n = int(re.search(r'(\d+)', date_str).group(1))
            return now - pd.Timedelta(days=n)
        elif 'tahun' in date_str:
            n = int(re.search(r'(\d+)', date_str).group(1))
            return now - pd.DateOffset(years=n)
        elif 'jam' in date_str:
            n = int(re.search(r'(\d+)', date_str).group(1))
            return now - pd.Timedelta(hours=n)
        else:
            return pd.NaT
    except Exception:
        return pd.NaT

In [46]:
for name, df in all_dfs.items():
    print(f"Category: {name}")
    
    # Apply parse function first
    df["parsed_date"] = df["date"].apply(parse_relative_date)
    
    all_dfs[name] = df
    
    # Optionally: reset index
    # df.reset_index(drop=True, inplace=True)

Category: Elektronik
Category: Aksesoris
Category: Kesehatan
Category: Kecantikan
Category: Alas_kaki
Category: Pakaian_Pria
Category: Pakaian_Wanita


In [47]:
all_dfs['Kecantikan'].head()

Unnamed: 0,review,rating,date,item_id,parsed_date
20,"bahannya enak, merata di muka, bikin muka fresh",bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097113
32,"bahannya ringan, mudah diserap, warnanya cocok...",bintang 5,1 bulan lalu,1,2025-04-30 14:41:33.097169
49,tipe kulitku kombinasi under tone kulitku netral,bintang 5,10 bulan lalu,1,2024-07-30 14:41:33.097204
111,Selalu happy belanja disini. Produk Packingan ...,bintang 5,3 bulan lalu,1,2025-02-28 14:41:33.097235
112,wokeh. cucok digunakan. terima kasih ya,bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097264


### Helper Functions For Labelling

In [48]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
 
    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks

    # remove emojis
    emoji_pattern = re.compile(
        "["                               
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002700-\U000027BF"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    
    text = emoji_pattern.sub(r'', text)
    
    return text
 
def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text
 
# def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
#     text = word_tokenize(text)
#     return text
 
def tokenizingText(text):
    if pd.isna(text):
        return []
    return text.split()

def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords)
    # listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','nya','na','sih','ku',"di","ya","loh","kah","woi","woii","woy", "nih", "trus", "tuh",\
                          "yah", "ajah", "lagi", "lah", "aj", "aja", "jg", "juga", "jga", "jugaa", "yng", 'apa', "cuman", "deh",\
                            "min", "gak", "cuma",\
                            "si", "an", "dikit", "langsung"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text
 
def stemmingText(text): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
    # Membuat objek stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
 
    # Memecah teks menjadi daftar kata
    words = text.split()
 
    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in words]
 
    # Menggabungkan kata-kata yang telah distem
    stemmed_text = ' '.join(stemmed_words)
 
    return stemmed_text
 
def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

In [49]:
# Download slang dictionary from a public dataset (if available)
url = "https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/refs/heads/master/combined_slang_words.txt"
slang_dict = requests.get(url).json()

slang_dict['aku'] = 'saya'
slang_dict['ak'] = 'saya'
slang_dict['gua'] = 'saya'
slang_dict['gw'] = 'saya'

slang_dict['jlk'] = 'jelek'
slang_dict['jlek'] = 'jelek'
slang_dict['burik'] = 'jelek'
slang_dict['buriq'] = 'jelek'
slang_dict['ampas'] = 'jelek'
slang_dict['amps'] = 'jelek'
slang_dict['buruk'] = 'jelek'
slang_dict['kentang'] = 'jelek'
slang_dict['bobrok'] = 'jelek'

slang_dict['bgs'] = 'bagus'
slang_dict['wokeh'] = 'bagus'
slang_dict['bgus'] = 'bagus'
slang_dict['baguss'] = 'bagus'

slang_dict['trnyata'] = 'ternyata'

slang_dict['amann'] = 'aman'

slang_dict['syukaa'] = 'suka'

slang_dict['bgt'] = 'banget'
slang_dict['bgtt'] = 'banget'

slang_dict['kren'] = 'keren'

slang_dict['udh'] = 'udah'

slang_dict['kasi'] = 'kasih'
slang_dict['ksi'] = 'kasih'
slang_dict['ksih'] = 'kasih'

slang_dict['gk'] = 'gak'
slang_dict['ga'] = 'gak'
slang_dict['gaa'] = 'gak'
slang_dict['kagak'] = 'gak'
slang_dict['kgk'] = 'gak'
slang_dict['g'] = 'gak'
slang_dict['engga'] = 'gak'
slang_dict['tdk'] = 'gak'
slang_dict['nggk'] = 'gak'
slang_dict['no'] = 'gak'

slang_dict['jls'] = 'jelas'
slang_dict['jlas'] = 'jelas'
slang_dict['danta'] = 'jelas'

slang_dict['mntp'] = 'mantap'
slang_dict['mantul'] = 'mantap'
slang_dict['mntap'] = 'mantap'

slang_dict['lg'] = 'lagi'
slang_dict['lgi'] = 'lagi'

slang_dict['uk'] = 'ukuran'

slang_dict['ksel'] = 'kesal'
slang_dict['kesel'] = 'kesal'
slang_dict['sebel'] = 'kesal'
slang_dict['sebal'] = 'kesal'

slang_dict['bacod'] = 'bacot'
slang_dict['bct'] = 'bacot'
slang_dict['bcd'] = 'bacot'

slang_dict['goblog'] = 'goblok'
slang_dict['gblg'] = 'goblok'
slang_dict['gblk'] = 'goblok'
slang_dict['bego'] = 'goblok'
slang_dict['bgo'] = 'goblok'
slang_dict['tolol'] = 'goblok'
slang_dict['tlol'] = 'goblok'
slang_dict['idiot'] = 'goblok'

slang_dict['trun'] = 'turun'

slang_dict['brg'] = 'barang'
slang_dict['brang'] = 'barang'
slang_dict['barng'] = 'barang'

slang_dict['cm'] = 'cuma'
slang_dict['cma'] = 'cuma'
slang_dict['cman'] = 'cuma'
slang_dict['cmn'] = 'cuma'

slang_dict['yt'] = 'youtube'

slang_dict['wrnaa'] = 'warna'

slang_dict['ajg'] = 'anjing'
slang_dict['anj'] = 'anjing'
slang_dict['anjg'] = 'anjing'
slang_dict['anjir'] = 'anjing'
slang_dict['anjr'] = 'anjing'

slang_dict['leg'] = 'lambat'
slang_dict['ngeleg'] = 'lambat'
slang_dict['lemod'] = 'lambat'
slang_dict['lemot'] = 'lambat'

slang_dict['happy'] = 'senang'

slang_dict['satset'] = 'cepat'
slang_dict['cpt'] = 'cepat'

slang_dict['pass'] = 'pas'

slang_dict['sbg'] = 'sebagai'

slang_dict['wr'] = 'win rate'
slang_dict['winrate'] = 'win rate'
slang_dict['ws'] = 'win streak'
slang_dict['winstreak'] = 'win streak'

slang_dict['ori'] = 'asli'
slang_dict['original'] = 'asli'

slang_dict['kw'] = 'palsu'
slang_dict['fake'] = 'palsu'

slang_dict['ok'] = 'oke'
slang_dict['okey'] = 'oke'
slang_dict['okay'] = 'oke'

slang_dict['hps'] = 'hapus'
slang_dict['hpus'] = 'hapus'
slang_dict['uninstal'] = 'hapus'
slang_dict['uninstall'] = 'hapus'

slang_dict['dikirim'] = 'pengiriman'

# Bi-gram
# Common
slang_dict['cepat selesai'] = 'cepat'

slang_dict['gak palsu'] = 'asli'
slang_dict['gak asli'] = 'palsu'
slang_dict['gak jelas'] = 'aneh'
slang_dict['gaje'] = 'aneh'

slang_dict['suka banget'] = 'cinta'

slang_dict['tebel'] = 'tebal'

slang_dict['gak suka'] = 'jelek'
slang_dict['gak enak'] = 'jelek'
slang_dict['gak bagus'] = 'jelek'

slang_dict['murah banget'] = 'murah_banget'

slang_dict['mahal banget'] = 'mahal_banget'

slang_dict['cepet banget'] = 'cepet_banget'

slang_dict['lama banget'] = 'lambat_banget'
slang_dict['lambat banget'] = 'lambat_banget'

slang_dict['bagus banget'] = 'bagus_banget'

slang_dict['jelek banget'] = 'jelek_banget'
slang_dict['sangat jelek'] = 'jelek_banget'

slang_dict['pelayanan buruk'] = 'buruk'

slang_dict['sangat puas'] = 'sangat_puas'

slang_dict['gak puas'] = 'kecewa'

# Purchase & Delivery
slang_dict['barang datang'] = "datang"
slang_dict['barang telat'] = "lambat"
slang_dict['barang cepat'] = "cepat"
slang_dict['barang rusak'] = "rusak"
slang_dict['barang oke'] = "bagus"
slang_dict['tebal banget'] = "bagus"
slang_dict['barang bagus'] = "bagus"
slang_dict['barang jelek'] = "jelek"
slang_dict['pengiriman cepat'] = "cepat"
slang_dict['pengiriman lambat'] = "lambat"
slang_dict['pengiriman aman'] = "aman"
slang_dict['pengiriman oke'] = "bagus"

def fix_slangwords(text):
    words = text.lower().split()
    
    # Step 1: Fix unigrams
    fixed_unigrams = [slang_dict.get(word, word) for word in words]
    
    # Step 2: Check for fixed bigrams
    i = 0
    final_words = []
    while i < len(fixed_unigrams):
        if i + 1 < len(fixed_unigrams):
            bigram = f"{fixed_unigrams[i]} {fixed_unigrams[i+1]}"
            if bigram in slang_dict:
                final_words.append(slang_dict[bigram])
                i += 2
                continue
        
        final_words.append(fixed_unigrams[i])
        i += 1

    return ' '.join(final_words)

In [50]:
def preprocess_review(review_df):
    # Remove missing values before pre-processing
    review_df.dropna(subset=['review'], inplace=True)

    # Membersihkan teks dan menyimpannya di kolom 'text_clean'
    review_df['text_clean'] = review_df['review'].apply(cleaningText)
    
    # Mengubah huruf dalam teks menjadi huruf kecil dan menyimpannya di 'text_casefoldingText'
    review_df['text_casefoldingText'] = review_df['text_clean'].apply(casefoldingText)
    
    # Mengganti kata-kata slang dengan kata-kata standar dan menyimpannya di 'text_slangwords'
    review_df['text_slangwords'] = review_df['text_casefoldingText'].apply(fix_slangwords)

    # Memecah teks menjadi token (kata-kata) dan menyimpannya di 'text_tokenizingText'
    review_df['text_tokenizingText'] = review_df['text_slangwords'].apply(tokenizingText)
    
    # Menghapus kata-kata stop (kata-kata umum) dan menyimpannya di 'text_stopword'
    review_df['text_stopword'] = review_df['text_tokenizingText'].apply(filteringText)
    
    # Menggabungkan token-token menjadi kalimat dan menyimpannya di 'text_akhir'
    review_df['text_akhir'] = review_df['text_stopword'].apply(toSentence)

    return review_df

In [51]:
for name, df in all_dfs.items():
    print(f"Category: {name}")
    df = preprocess_review(df)

Category: Elektronik
Category: Aksesoris
Category: Kesehatan
Category: Kecantikan
Category: Alas_kaki
Category: Pakaian_Pria
Category: Pakaian_Wanita


In [52]:
all_dfs['Kecantikan'].head(10)

Unnamed: 0,review,rating,date,item_id,parsed_date,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir
20,"bahannya enak, merata di muka, bikin muka fresh",bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097113,bahannya enak merata di muka bikin muka fresh,bahannya enak merata di muka bikin muka fresh,bahannya enak merata di muka bikin muka fresh,"[bahannya, enak, merata, di, muka, bikin, muka...","[bahannya, enak, merata, muka, bikin, muka, fr...",bahannya enak merata muka bikin muka fresh
32,"bahannya ringan, mudah diserap, warnanya cocok...",bintang 5,1 bulan lalu,1,2025-04-30 14:41:33.097169,bahannya ringan mudah diserap warnanya cocok d...,bahannya ringan mudah diserap warnanya cocok d...,bahannya ringan mudah diserap warnanya cocok d...,"[bahannya, ringan, mudah, diserap, warnanya, c...","[bahannya, ringan, mudah, diserap, warnanya, c...",bahannya ringan mudah diserap warnanya cocok p...
49,tipe kulitku kombinasi under tone kulitku netral,bintang 5,10 bulan lalu,1,2024-07-30 14:41:33.097204,tipe kulitku kombinasi under tone kulitku netral,tipe kulitku kombinasi under tone kulitku netral,tipe kulitku kombinasi under tone kulitku netral,"[tipe, kulitku, kombinasi, under, tone, kulitk...","[tipe, kulitku, kombinasi, under, tone, kulitk...",tipe kulitku kombinasi under tone kulitku netral
111,Selalu happy belanja disini. Produk Packingan ...,bintang 5,3 bulan lalu,1,2025-02-28 14:41:33.097235,Selalu happy belanja disini Produk Packingan P...,selalu happy belanja disini produk packingan p...,selalu senang belanja disini produk packingan ...,"[selalu, senang, belanja, disini, produk, pack...","[senang, belanja, produk, packingan, pengirima...",senang belanja produk packingan pengiriman oke...
112,wokeh. cucok digunakan. terima kasih ya,bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097264,wokeh cucok digunakan terima kasih ya,wokeh cucok digunakan terima kasih ya,bagus cocok digunakan terima kasih iya,"[bagus, cocok, digunakan, terima, kasih, iya]","[bagus, cocok, terima, kasih]",bagus cocok terima kasih
136,ringan dan sesuai warna kulit,bintang 5,5 bulan lalu,1,2024-12-30 14:41:33.097363,ringan dan sesuai warna kulit,ringan dan sesuai warna kulit,ringan dan sesuai warna kulit,"[ringan, dan, sesuai, warna, kulit]","[ringan, sesuai, warna, kulit]",ringan sesuai warna kulit
153,packing bagus,bintang 5,9 bulan lalu,1,2024-08-30 14:41:33.097394,packing bagus,packing bagus,packing bagus,"[packing, bagus]","[packing, bagus]",packing bagus
155,hehe,bintang 5,7 bulan lalu,1,2024-10-30 14:41:33.097422,hehe,hehe,he,[he],[he],he
159,matap semoga cocok,bintang 5,2 bulan lalu,1,2025-03-30 14:41:33.097449,matap semoga cocok,matap semoga cocok,matap semoga cocok,"[matap, semoga, cocok]","[matap, semoga, cocok]",matap semoga cocok
166,Terimakasih paket sudah sampai sesuai pesanan,bintang 5,3 bulan lalu,99,2025-02-28 14:41:33.097475,Terimakasih paket sudah sampai sesuai pesanan,terimakasih paket sudah sampai sesuai pesanan,terimakasih paket sudah sampai sesuai pesanan,"[terimakasih, paket, sudah, sampai, sesuai, pe...","[terimakasih, paket, sesuai, pesanan]",terimakasih paket sesuai pesanan


# Labeling

In [53]:
# Membaca data kamus kata-kata positif dari GitHub
lexicon_positive = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_positive[row[0]] = int(row[1])
        # Menambahkan kata-kata positif dan skornya ke dalam kamus lexicon_positive
    lexicon_positive['aman'] = 2
    lexicon_positive['cinta'] = 5
    lexicon_positive['murah_banget'] = 5
    lexicon_positive['cepet_banget'] = 5
    lexicon_positive['bagus_banget'] = 5
    lexicon_positive['beli'] = 3
    lexicon_positive['aman'] = 4
    lexicon_positive['tebal'] = 3
    lexicon_positive['baju'] = 0
else:
    print("Failed to fetch positive lexicon data")
 
# Membaca data kamus kata-kata negatif dari GitHub
lexicon_negative = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_negative[row[0]] = int(row[1])
        # Menambahkan kata-kata negatif dan skornya dalam kamus lexicon_negative
    
    lexicon_negative['bagus'] = 0
    lexicon_negative['beli'] = 0
    lexicon_negative['pengiriman'] = 0
    lexicon_negative['cepat'] = 0
    lexicon_negative['nyangka'] = 0
    lexicon_negative['murah'] = 0
    lexicon_positive['cinta'] = 0
    lexicon_positive['pengganti'] = 0
    lexicon_positive['terang'] = -1
    lexicon_positive['gelap'] = -1
    lexicon_negative['bolong'] = -5
    lexicon_positive['mahal_banget'] = -5
    lexicon_negative['lambat_banget'] = -4
    lexicon_negative['jelek_banget'] = -5
else:
    print("Failed to fetch negative lexicon data")

In [54]:
# Fungsi untuk menentukan polaritas sentimen dari tweet
 
def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    # Inisialisasi skor sentimen ke 0
 
    for word in text:
        # Mengulangi setiap kata dalam teks
 
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]/5.
            # Jika kata ada dalam kamus positif, tambahkan skornya ke skor sentimen
 
    for word in text:
        # Mengulangi setiap kata dalam teks (sekali lagi)
 
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]/5.
            # Jika kata ada dalam kamus negatif, kurangkan skornya dari skor sentimen
 
    polarity=''
    # Inisialisasi variabel polaritas
 
    if (score >= 0.6):
        polarity = 'positive'
    elif (score <= -1.0):
        polarity = 'negative'
    else:
        polarity = 'neutral'
 
    return score, polarity
    # Mengembalikan skor sentimen dan polaritas teks

In [55]:
def labeling(preprocessed_df):
    results = preprocessed_df['text_stopword'].apply(sentiment_analysis_lexicon_indonesia)
    results = list(zip(*results))
    preprocessed_df['polarity_score'] = results[0]
    preprocessed_df['label'] = results[1]

In [56]:
for name, df in all_dfs.items():
    print(f"Category: {name}")
    df = labeling(df)

Category: Elektronik
Category: Aksesoris
Category: Kesehatan
Category: Kecantikan
Category: Alas_kaki
Category: Pakaian_Pria
Category: Pakaian_Wanita


In [57]:
all_dfs['Kecantikan'].head()

Unnamed: 0,review,rating,date,item_id,parsed_date,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir,polarity_score,label
20,"bahannya enak, merata di muka, bikin muka fresh",bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097113,bahannya enak merata di muka bikin muka fresh,bahannya enak merata di muka bikin muka fresh,bahannya enak merata di muka bikin muka fresh,"[bahannya, enak, merata, di, muka, bikin, muka...","[bahannya, enak, merata, muka, bikin, muka, fr...",bahannya enak merata muka bikin muka fresh,-1.110223e-16,neutral
32,"bahannya ringan, mudah diserap, warnanya cocok...",bintang 5,1 bulan lalu,1,2025-04-30 14:41:33.097169,bahannya ringan mudah diserap warnanya cocok d...,bahannya ringan mudah diserap warnanya cocok d...,bahannya ringan mudah diserap warnanya cocok d...,"[bahannya, ringan, mudah, diserap, warnanya, c...","[bahannya, ringan, mudah, diserap, warnanya, c...",bahannya ringan mudah diserap warnanya cocok p...,2.2,positive
49,tipe kulitku kombinasi under tone kulitku netral,bintang 5,10 bulan lalu,1,2024-07-30 14:41:33.097204,tipe kulitku kombinasi under tone kulitku netral,tipe kulitku kombinasi under tone kulitku netral,tipe kulitku kombinasi under tone kulitku netral,"[tipe, kulitku, kombinasi, under, tone, kulitk...","[tipe, kulitku, kombinasi, under, tone, kulitk...",tipe kulitku kombinasi under tone kulitku netral,0.0,neutral
111,Selalu happy belanja disini. Produk Packingan ...,bintang 5,3 bulan lalu,1,2025-02-28 14:41:33.097235,Selalu happy belanja disini Produk Packingan P...,selalu happy belanja disini produk packingan p...,selalu senang belanja disini produk packingan ...,"[selalu, senang, belanja, disini, produk, pack...","[senang, belanja, produk, packingan, pengirima...",senang belanja produk packingan pengiriman oke...,1.6,positive
112,wokeh. cucok digunakan. terima kasih ya,bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097264,wokeh cucok digunakan terima kasih ya,wokeh cucok digunakan terima kasih ya,bagus cocok digunakan terima kasih iya,"[bagus, cocok, digunakan, terima, kasih, iya]","[bagus, cocok, terima, kasih]",bagus cocok terima kasih,1.2,positive


In [58]:
all_dfs['Kecantikan'][all_dfs['Kecantikan']['label'] == 'negative'].head()

Unnamed: 0,review,rating,date,item_id,parsed_date,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir,polarity_score,label
347,Maaf kurir nya salah orang ....!,bintang 3,3 bulan lalu,99,2025-02-28 14:41:33.101576,Maaf kurir nya salah orang,maaf kurir nya salah orang,maaf kurir nya salah orang,"[maaf, kurir, nya, salah, orang]","[maaf, kurir, salah, orang]",maaf kurir salah orang,-1.0,negative
694,Make lipsticks LT Pro Matte Lip Cream no. 06 u...,bintang 5,9 bulan lalu,63,2024-08-30 14:41:33.109500,Make lipsticks LT Pro Matte Lip Cream no udah...,make lipsticks lt pro matte lip cream no udah...,make lipsticks lt profesional matte lip cream ...,"[make, lipsticks, lt, profesional, matte, lip,...","[make, lipsticks, lt, profesional, matte, lip,...",make lipsticks lt profesional matte lip cream ...,-1.4,negative
893,Baru pertama kali coba takut dempul eh ternyat...,bintang 5,1 bulan lalu,50,2025-04-30 14:41:33.112462,Baru pertama kali coba takut dempul eh ternyat...,baru pertama kali coba takut dempul eh ternyat...,baru pertama kali coba takut dempul eh ternyat...,"[baru, pertama, kali, coba, takut, dempul, eh,...","[kali, coba, takut, dempul, eh, masuk, tu, oili]",kali coba takut dempul eh masuk tu oili,-1.0,negative
952,deskripsi free 5 item tapi yg datang cuma free...,bintang 2,10 bulan lalu,50,2024-07-30 14:41:33.113952,deskripsi free item tapi yg datang cuma free ...,deskripsi free item tapi yg datang cuma free ...,deskripsi free hitam tapi yang datang cuma fre...,"[deskripsi, free, hitam, tapi, yang, datang, c...","[deskripsi, free, hitam, free, hitam]",deskripsi free hitam free hitam,-1.2,negative
1021,cushion nya baguss lumayan ngecover jugaa tp a...,bintang 5,2 bulan lalu,50,2025-03-30 14:41:33.115461,cushion nya baguss lumayan ngecover jugaa tp a...,cushion nya baguss lumayan ngecover jugaa tp a...,cushion nya bagus lumayan ngecover jugaa tapi ...,"[cushion, nya, bagus, lumayan, ngecover, jugaa...","[cushion, bagus, lumayan, ngecover, salah, amb...",cushion bagus lumayan ngecover salah ambil sha...,-1.0,negative


In [59]:
all_dfs['Kecantikan'][all_dfs['Kecantikan']['label'] == 'neutral'].head()

Unnamed: 0,review,rating,date,item_id,parsed_date,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir,polarity_score,label
20,"bahannya enak, merata di muka, bikin muka fresh",bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097113,bahannya enak merata di muka bikin muka fresh,bahannya enak merata di muka bikin muka fresh,bahannya enak merata di muka bikin muka fresh,"[bahannya, enak, merata, di, muka, bikin, muka...","[bahannya, enak, merata, muka, bikin, muka, fr...",bahannya enak merata muka bikin muka fresh,-1.110223e-16,neutral
49,tipe kulitku kombinasi under tone kulitku netral,bintang 5,10 bulan lalu,1,2024-07-30 14:41:33.097204,tipe kulitku kombinasi under tone kulitku netral,tipe kulitku kombinasi under tone kulitku netral,tipe kulitku kombinasi under tone kulitku netral,"[tipe, kulitku, kombinasi, under, tone, kulitk...","[tipe, kulitku, kombinasi, under, tone, kulitk...",tipe kulitku kombinasi under tone kulitku netral,0.0,neutral
153,packing bagus,bintang 5,9 bulan lalu,1,2024-08-30 14:41:33.097394,packing bagus,packing bagus,packing bagus,"[packing, bagus]","[packing, bagus]",packing bagus,0.4,neutral
155,hehe,bintang 5,7 bulan lalu,1,2024-10-30 14:41:33.097422,hehe,hehe,he,[he],[he],he,0.0,neutral
159,matap semoga cocok,bintang 5,2 bulan lalu,1,2025-03-30 14:41:33.097449,matap semoga cocok,matap semoga cocok,matap semoga cocok,"[matap, semoga, cocok]","[matap, semoga, cocok]",matap semoga cocok,0.2,neutral


In [60]:
all_dfs['Kecantikan'][all_dfs['Kecantikan']['label'] == 'positive'].head()

Unnamed: 0,review,rating,date,item_id,parsed_date,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir,polarity_score,label
32,"bahannya ringan, mudah diserap, warnanya cocok...",bintang 5,1 bulan lalu,1,2025-04-30 14:41:33.097169,bahannya ringan mudah diserap warnanya cocok d...,bahannya ringan mudah diserap warnanya cocok d...,bahannya ringan mudah diserap warnanya cocok d...,"[bahannya, ringan, mudah, diserap, warnanya, c...","[bahannya, ringan, mudah, diserap, warnanya, c...",bahannya ringan mudah diserap warnanya cocok p...,2.2,positive
111,Selalu happy belanja disini. Produk Packingan ...,bintang 5,3 bulan lalu,1,2025-02-28 14:41:33.097235,Selalu happy belanja disini Produk Packingan P...,selalu happy belanja disini produk packingan p...,selalu senang belanja disini produk packingan ...,"[selalu, senang, belanja, disini, produk, pack...","[senang, belanja, produk, packingan, pengirima...",senang belanja produk packingan pengiriman oke...,1.6,positive
112,wokeh. cucok digunakan. terima kasih ya,bintang 5,11 bulan lalu,1,2024-06-30 14:41:33.097264,wokeh cucok digunakan terima kasih ya,wokeh cucok digunakan terima kasih ya,bagus cocok digunakan terima kasih iya,"[bagus, cocok, digunakan, terima, kasih, iya]","[bagus, cocok, terima, kasih]",bagus cocok terima kasih,1.2,positive
136,ringan dan sesuai warna kulit,bintang 5,5 bulan lalu,1,2024-12-30 14:41:33.097363,ringan dan sesuai warna kulit,ringan dan sesuai warna kulit,ringan dan sesuai warna kulit,"[ringan, dan, sesuai, warna, kulit]","[ringan, sesuai, warna, kulit]",ringan sesuai warna kulit,0.8,positive
166,Terimakasih paket sudah sampai sesuai pesanan,bintang 5,3 bulan lalu,99,2025-02-28 14:41:33.097475,Terimakasih paket sudah sampai sesuai pesanan,terimakasih paket sudah sampai sesuai pesanan,terimakasih paket sudah sampai sesuai pesanan,"[terimakasih, paket, sudah, sampai, sesuai, pe...","[terimakasih, paket, sesuai, pesanan]",terimakasih paket sesuai pesanan,2.4,positive


In [61]:
# Drop the unused columns
for name, df in all_dfs.items():
    print(f"Category: {name}")
    df = df.drop(columns=['polarity_score', 'text_clean', 'text_casefoldingText', 'text_slangwords', 'text_tokenizingText', 'text_stopword', 'review'])
    all_dfs[name] = df
    

Category: Elektronik
Category: Aksesoris
Category: Kesehatan
Category: Kecantikan
Category: Alas_kaki
Category: Pakaian_Pria
Category: Pakaian_Wanita


In [62]:
all_dfs['Pakaian_Pria'].head()

Unnamed: 0,rating,date,item_id,parsed_date,text_akhir,label
1,bintang 5,8 bulan lalu,75,2024-09-30 14:41:33.525322,bahan bagus ukuran xxl,neutral
3,bintang 5,9 bulan lalu,75,2024-08-30 14:41:33.525401,packing plastik lapis aman kemeja dilapisi fur...,positive
9,bintang 4,4 bulan lalu,75,2025-01-30 14:41:33.525437,size pascuma ketiaknya pola jahitannya sempit,neutral
13,bintang 3,7 bulan lalu,75,2024-10-30 14:41:33.525469,baju bolong dilengan kanan,negative
15,bintang 5,10 bulan lalu,75,2024-07-30 14:41:33.525497,lebar lengan proporsionil,neutral


# Save

In [None]:
for name, df in all_dfs.items():
    output_dir = os.path.join("dataset_preprocessed", name)
    os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist

    output_path = os.path.join(output_dir, f"{name}_review.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")


Saved: dataset_preprocessed/Elektronik/Elektronik_review.csv
Saved: dataset_preprocessed/Aksesoris/Aksesoris_review.csv
Saved: dataset_preprocessed/Kesehatan/Kesehatan_review.csv
Saved: dataset_preprocessed/Kecantikan/Kecantikan_review.csv
Saved: dataset_preprocessed/Alas_kaki/Alas_kaki_review.csv
Saved: dataset_preprocessed/Pakaian_Pria/Pakaian_Pria_review.csv
Saved: dataset_preprocessed/Pakaian_Wanita/Pakaian_Wanita_review.csv


# Training Data Preparation

### Checking the label distribution

In [61]:
for name, df in all_dfs.items():
    print(f"Category: {name}")
    print(df['label'].value_counts())

Category: Elektronik
label
positive    3548
neutral      628
negative      47
Name: count, dtype: int64
Category: Aksesoris
label
positive    4197
neutral     1258
negative      60
Name: count, dtype: int64
Category: Kesehatan
label
positive    618
neutral     151
negative     14
Name: count, dtype: int64
Category: Kecantikan
label
positive    6170
neutral     2282
negative     202
Name: count, dtype: int64
Category: Alas_kaki
label
positive    3020
neutral      897
negative      60
Name: count, dtype: int64
Category: Pakaian_Pria
label
positive    4261
neutral     1319
negative     119
Name: count, dtype: int64
Category: Pakaian_Wanita
label
positive    4524
neutral     1635
negative     252
Name: count, dtype: int64


### Balancing DF

In [62]:
def balance_df(df):
    # Separate by class
    df_0 = df[df['label'] == 'negative']
    df_1 = df[df['label'] == 'neutral']
    df_2 = df[df['label'] == 'positive']

    # Find the smallest class size
    min_class_size = min(len(df_0), len(df_1), len(df_2))

    # Generate random multipliers for each class
    mult_0 = np.random.uniform(1.0, 1.5)
    mult_1 = np.random.uniform(1.0, 1.5)
    mult_2 = np.random.uniform(1.0, 1.5)

    # Compute target sample sizes
    n_0 = min(len(df_0), int(min_class_size * mult_0))
    n_1 = min(len(df_1), int(min_class_size * mult_1))
    n_2 = min(len(df_2), int(min_class_size * mult_2))

    # Downsample each class
    df_0_down = resample(df_0, replace=False, n_samples=n_0, random_state=42)
    df_1_down = resample(df_1, replace=False, n_samples=n_1, random_state=43)
    df_2_down = resample(df_2, replace=False, n_samples=n_2, random_state=44)

    # Combine and shuffle
    df_balanced = pd.concat([df_0_down, df_1_down, df_2_down])
    df_balanced = df_balanced.sample(frac=1, random_state=45).reset_index(drop=True)

    # View new distribution
    print("Random multipliers:", f"Class 0: {mult_0:.2f}, Class 1: {mult_1:.2f}, Class 2: {mult_2:.2f}")
    print(df_balanced['label'].value_counts())

    return df_balanced

In [63]:
for name, df in all_dfs.items():
    print(f"Category: {name}")
    df = balance_df(df)

Category: Elektronik
Random multipliers: Class 0: 1.07, Class 1: 1.05, Class 2: 1.28
label
positive    60
neutral     49
negative    47
Name: count, dtype: int64
Category: Aksesoris
Random multipliers: Class 0: 1.18, Class 1: 1.04, Class 2: 1.16
label
positive    69
neutral     62
negative    60
Name: count, dtype: int64
Category: Kesehatan
Random multipliers: Class 0: 1.16, Class 1: 1.01, Class 2: 1.35
label
positive    18
negative    14
neutral     14
Name: count, dtype: int64
Category: Kecantikan
Random multipliers: Class 0: 1.36, Class 1: 1.04, Class 2: 1.46
label
positive    295
neutral     209
negative    202
Name: count, dtype: int64
Category: Alas_kaki
Random multipliers: Class 0: 1.39, Class 1: 1.36, Class 2: 1.34
label
neutral     81
positive    80
negative    60
Name: count, dtype: int64
Category: Pakaian_Pria
Random multipliers: Class 0: 1.33, Class 1: 1.06, Class 2: 1.35
label
positive    160
neutral     126
negative    119
Name: count, dtype: int64
Category: Pakaian_Wanit

# Save

In [None]:
# import os

# for name, df in all_dfs.items():
#     output_dir = os.path.join("dataset_preprocessed", name)
#     os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist

#     output_path = os.path.join(output_dir, f"{name}_review.csv")
#     df.to_csv(output_path, index=False)
#     print(f"Saved: {output_path}")


Saved: dataset_preprocessed/Elektronik/Elektronik_review.csv
Saved: dataset_preprocessed/Aksesoris/Aksesoris_review.csv
Saved: dataset_preprocessed/Kesehatan/Kesehatan_review.csv
Saved: dataset_preprocessed/Kecantikan/Kecantikan_review.csv
Saved: dataset_preprocessed/Alas_kaki/Alas_kaki_review.csv
Saved: dataset_preprocessed/Pakaian_Pria/Pakaian_Pria_review.csv
Saved: dataset_preprocessed/Pakaian_Wanita/Pakaian_Wanita_review.csv
