## **Library**

In [111]:
!pip install sastrawi wordcloud transformers tqdm seaborn --quiet

In [113]:
# =============================
# Mount Google Drive
# =============================
from google.colab import drive

# =============================
# Built-in / Standard Libraries
# =============================
import re
import json
from collections import Counter

# =============================
# Data Manipulation
# =============================
import pandas as pd
import numpy as np

# =============================
# Visualization
# =============================
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# =============================
# Text Preprocessing (Bahasa Indonesia)
# =============================
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# =============================
# Machine Learning
# =============================
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# =============================
# Deep Learning - TensorFlow / Keras
# =============================
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam

# =============================
# Deep Learning - Transformers (HuggingFace)
# =============================
import torch
from transformers import AutoTokenizer, AutoModel, pipeline


In [114]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

## **Pre-Processing**

In [None]:
drive.mount('/content/drive')

# Baca data CSV ke dalam dataframe pandas
df = pd.read_csv('/content/drive/MyDrive/pdm_metopen_uas/data_penelitian_berlabel.csv')

# Load kamus normalisasi untuk text preprocessing
with open('/content/drive/MyDrive/pdm_metopen_uas/kamus_normalisasi.json', 'r') as f:
    kamus_normalisasi = json.load(f)

# Tampilkan 5 baris pertama untuk memastikan data berhasil dibaca
print(df.head())
print("Total baris:", len(df))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                              ulasan     label
0  harga under sejuta oke aja. minus cuma di pane...  negative
1  Harga sesuai dengan barangnya. Stand layar baw...  negative
2  sesuai kategori harga, mau ambil yg paling mur...  negative
3  bekerja dengan baik walau kadang sering reconn...  negative
4  Barang sampai walau hari libur, namun sayang b...  negative
Total baris: 7460


1. Case Folding

In [None]:
# Mengubah semua teks di kolom 'ulasan' menjadi huruf kecil (lowercase)
df['ulasan_preprocessed'] = df['ulasan'].str.lower()
print(df.head())

                                              ulasan     label  \
0  harga under sejuta oke aja. minus cuma di pane...  negative   
1  Harga sesuai dengan barangnya. Stand layar baw...  negative   
2  sesuai kategori harga, mau ambil yg paling mur...  negative   
3  bekerja dengan baik walau kadang sering reconn...  negative   
4  Barang sampai walau hari libur, namun sayang b...  negative   

                                 ulasan_preprocessed  
0  harga under sejuta oke aja. minus cuma di pane...  
1  harga sesuai dengan barangnya. stand layar baw...  
2  sesuai kategori harga, mau ambil yg paling mur...  
3  bekerja dengan baik walau kadang sering reconn...  
4  barang sampai walau hari libur, namun sayang b...  


2. Text Cleaning

In [None]:
def clean_text(text):
    # Hapus karakter selain huruf (besar/kecil) dan spasi
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Hilangkan multiple spaces menjadi satu spasi saja, lalu trim spasi di awal/akhir
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Terapkan fungsi clean_text pada kolom 'ulasan'
df['ulasan_preprocessed'] = df['ulasan_preprocessed'].apply(clean_text)
print(df.head())

                                              ulasan     label  \
0  harga under sejuta oke aja. minus cuma di pane...  negative   
1  Harga sesuai dengan barangnya. Stand layar baw...  negative   
2  sesuai kategori harga, mau ambil yg paling mur...  negative   
3  bekerja dengan baik walau kadang sering reconn...  negative   
4  Barang sampai walau hari libur, namun sayang b...  negative   

                                 ulasan_preprocessed  
0  harga under sejuta oke aja minus cuma di panel...  
1  harga sesuai dengan barangnya stand layar bawa...  
2  sesuai kategori harga mau ambil yg paling mura...  
3  bekerja dengan baik walau kadang sering reconn...  
4  barang sampai walau hari libur namun sayang bo...  


3. Text Normalization

In [None]:
# Pisahkan kamus frasa dan kata
kamus_frasa = {k: v for k, v in kamus_normalisasi.items() if ' ' in k}
kamus_kata = {k: v for k, v in kamus_normalisasi.items() if ' ' not in k}

# Fungsi normalisasi lengkap
def normalisasi_teks(teks, kamus_frasa, kamus_kata):
    teks = teks.lower()

    # 1. Ganti frasa dulu
    for frasa, ganti in kamus_frasa.items():
        if frasa in teks:
            teks = teks.replace(frasa, ganti)

    # 2. Ganti per kata
    kata_kata = re.findall(r'\w+', teks)
    hasil = [kamus_kata.get(kata, kata) for kata in kata_kata]

    return ' '.join(hasil)

df['ulasan_preprocessed'] = df['ulasan_preprocessed'].astype(str).apply(lambda x: normalisasi_teks(x, kamus_frasa, kamus_kata))
print(df.head())

                                              ulasan     label  \
0  harga under sejuta oke aja. minus cuma di pane...  negative   
1  Harga sesuai dengan barangnya. Stand layar baw...  negative   
2  sesuai kategori harga, mau ambil yg paling mur...  negative   
3  bekerja dengan baik walau kadang sering reconn...  negative   
4  Barang sampai walau hari libur, namun sayang b...  negative   

                                 ulasan_preprocessed  
0  harga under sejuta oke saja minus cuma di pane...  
1  harga sesuai dengan barangnya stand layar bawa...  
2  sesuai kategori harga mau ambil yang paling mu...  
3  bekerja dengan baik walau kadang sering reconn...  
4  barang sampai walau hari libur namun sayang bo...  


4. Stopword Removal

In [None]:
# Inisialisasi factory dan buat objek stopword remover
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()

# Terapkan stopword removal ke kolom 'ulasan'
df['ulasan_preprocessed'] = df['ulasan_preprocessed'].apply(stopword_remover.remove)

print(df.head())

                                              ulasan     label  \
0  harga under sejuta oke aja. minus cuma di pane...  negative   
1  Harga sesuai dengan barangnya. Stand layar baw...  negative   
2  sesuai kategori harga, mau ambil yg paling mur...  negative   
3  bekerja dengan baik walau kadang sering reconn...  negative   
4  Barang sampai walau hari libur, namun sayang b...  negative   

                                 ulasan_preprocessed  
0  harga under sejuta oke minus cuma panel va kal...  
1  harga sesuai barangnya stand layar bawaanya ka...  
2  sesuai kategori harga mau ambil paling murah c...  
3  bekerja baik kadang sering reconnect wifi jadi...  
4  barang walau hari libur sayang box sampai peny...  


In [None]:
print("Total baris (awal):", len(df))

# mengapus baris yang tidak memiliki ulasan
df = df[df["ulasan_preprocessed"].notna() & (df["ulasan_preprocessed"] != "")]
print("Total baris:", len(df))

Total baris (awal): 7460
Total baris: 7457


In [None]:
jumlah_duplikat = df.duplicated(subset=['ulasan_preprocessed', 'label']).sum()
print(jumlah_duplikat)


# Menghapus baris duplikat dari DataFrame
df.drop_duplicates(subset=['ulasan_preprocessed', 'label'], inplace=True)

# Verifikasi ulang, seharusnya hasilnya 0
print(df.duplicated().sum())
print("Total baris:", len(df))

172
0
Total baris: 7285


In [None]:
print(df['ulasan_preprocessed'].duplicated().sum())

# Menggunakan metode yang disarankan (keep=False) lalu mengurutkannya
duplikat = df[df['ulasan_preprocessed'].duplicated(keep=False)]
print(duplikat.sort_values('ulasan_preprocessed'))

7
                                  ulasan     label    ulasan_preprocessed
2368                         tidak bagus  negative                  bagus
4621                               bagus  positive                  bagus
2893                 Barang tidak sesuai  negative          barang sesuai
4741                       barang sesuai  positive          barang sesuai
2107        barang tidak sesuai pesanan   negative  barang sesuai pesanan
4701  barang sudah sesuai dengan pesanan  positive  barang sesuai pesanan
994                berfungsi dengan baik  positive         berfungsi baik
2651            tidak berfungsi dgn baik  negative         berfungsi baik
2642                             mantul.  negative           mantap betul
4625                              mantul  positive           mantap betul
2022                         Ga sesuai 👎  negative                 sesuai
5325                        Sudah sesuai  positive                 sesuai
2050                Tidak sesuai pes

In [None]:
print(df.head())

                                              ulasan     label  \
0  harga under sejuta oke aja. minus cuma di pane...  negative   
1  Harga sesuai dengan barangnya. Stand layar baw...  negative   
2  sesuai kategori harga, mau ambil yg paling mur...  negative   
3  bekerja dengan baik walau kadang sering reconn...  negative   
4  Barang sampai walau hari libur, namun sayang b...  negative   

                                 ulasan_preprocessed  
0  harga under sejuta oke minus cuma panel va kal...  
1  harga sesuai barangnya stand layar bawaanya ka...  
2  sesuai kategori harga mau ambil paling murah c...  
3  bekerja baik kadang sering reconnect wifi jadi...  
4  barang walau hari libur sayang box sampai peny...  


In [None]:
df.to_csv('/content/drive/MyDrive/pdm_metopen_uas/data_penelitian_preprocessing.csv', index=False)
print("File berhasil disimpan di Google Drive dengan nama 'data_penelitian_preprocessing.csv'")

File berhasil disimpan di Google Drive dengan nama 'data_penelitian_preprocessing.csv'


## **Data Split**

In [None]:
# Baca data
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/pdm_metopen_uas/data_penelitian_preprocessing.csv')
print("Total baris:", len(df))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total baris: 7285


In [None]:
# Menampilkan jumlah data untuk setiap label
print(df['label'].value_counts())

label
positive    4023
negative    3225
neutral       37
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# Pertama, split 70% train dan 30% sisanya (val + test)
df_train, df_temp = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)

# Kedua, split 30% tadi menjadi 15% val dan 15% test
# Karena dari total data, 15% adalah 0.5 dari sisa 30%, maka gunakan test_size=0.5
df_val, df_test = train_test_split(df_temp, test_size=0.5, stratify=df_temp['label'], random_state=42)

In [None]:
# Cek hasilnya
print("Total baris Train:", len(df_train))
print("Train:", df_train['label'].value_counts())
print('------------------------------------------')
print("Total baris Val:", len(df_val))
print("Validation:", df_val['label'].value_counts())
print('------------------------------------------')
print("Total baris Test:", len(df_test))
print("Test:", df_test['label'].value_counts())

Total baris Train: 5099
Train: label
positive    2816
negative    2257
neutral       26
Name: count, dtype: int64
------------------------------------------
Total baris Val: 1093
Validation: label
positive    604
negative    484
neutral       5
Name: count, dtype: int64
------------------------------------------
Total baris Test: 1093
Test: label
positive    603
negative    484
neutral       6
Name: count, dtype: int64


In [None]:
# Simpan ke file CSV
df_train.to_csv('/content/drive/MyDrive/pdm_metopen_uas/dataset_split/train.csv', index=False)
df_val.to_csv('/content/drive/MyDrive/pdm_metopen_uas/dataset_split/val.csv', index=False)
df_test.to_csv('/content/drive/MyDrive/pdm_metopen_uas/dataset_split/test.csv', index=False)


## **Word Embedding**

In [115]:
# Mount Google Drive
drive.mount('/content/drive')

# Load file CSV
folder = '/content/drive/MyDrive/pdm_metopen_uas/dataset_split/'
train_df = pd.read_csv(folder + 'train.csv')
val_df = pd.read_csv(folder + 'val.csv')
test_df = pd.read_csv(folder + 'test.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load IndoBERT
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p2")

In [None]:
def get_bert_embeddings(text_list, tokenizer, model, max_length=128):
    model.eval()
    embeddings = []

    for text in tqdm(text_list):
        inputs = tokenizer(text, padding='max_length', truncation=True,
                           max_length=max_length, return_tensors='pt')

        with torch.no_grad():
            outputs = model(**inputs)
            # Ambil semua token embeddings (tanpa CLS pooling)
            token_embeddings = outputs.last_hidden_state.squeeze(0).numpy()  # (max_length, 768)

        embeddings.append(token_embeddings)

    return np.array(embeddings)  # (n_samples, max_length, 768)


1. validation

In [None]:
# Validation
val_embeddings = get_bert_embeddings(df_val['ulasan_preprocessed'], tokenizer, model)
np.save('/content/drive/MyDrive/pdm_metopen_uas/embeddings/val_embeddings.npy', val_embeddings)


  return forward_call(*args, **kwargs)
100%|██████████| 1093/1093 [08:16<00:00,  2.20it/s]


2. data testing

In [None]:
test_embeddings = get_bert_embeddings(df_test['ulasan_preprocessed'], tokenizer, model)
np.save('/content/drive/MyDrive/pdm_metopen_uas/embeddings/test_embeddings.npy', test_embeddings)

  return forward_call(*args, **kwargs)
100%|██████████| 1093/1093 [08:25<00:00,  2.16it/s]


3. Train

In [None]:
train_embeddings = get_bert_embeddings(df_train['ulasan_preprocessed'], tokenizer, model)
np.save('/content/drive/MyDrive/pdm_metopen_uas/embeddings/train_embeddings.npy', train_embeddings)

  return forward_call(*args, **kwargs)
100%|██████████| 5099/5099 [38:45<00:00,  2.19it/s]


## **One-Hot Encoding**

In [116]:
# Ambil label dari dataframe
train_labels_raw = train_df['label'].tolist()
val_labels_raw = val_df['label'].tolist()
test_labels_raw = test_df['label'].tolist()

# Encode label ke angka
le = LabelEncoder()
le.fit(train_labels_raw)  # Fit hanya di data train

train_labels_enc = le.transform(train_labels_raw)
val_labels_enc = le.transform(val_labels_raw)
test_labels_enc = le.transform(test_labels_raw)

In [117]:
# One-hot encoding
train_labels = to_categorical(train_labels_enc)
val_labels = to_categorical(val_labels_enc)
test_labels = to_categorical(test_labels_enc)

In [118]:
# Simpan ke file .npy
np.save('/content/drive/MyDrive/pdm_metopen_uas/encoding/train_labels.npy', train_labels)
np.save('/content/drive/MyDrive/pdm_metopen_uas/encoding/val_labels.npy', val_labels)
np.save('/content/drive/MyDrive/pdm_metopen_uas/encoding/test_labels.npy', test_labels)