#Import Library

In [1]:
# Data handling
import pandas as pd
import numpy as np
import re

# Deep Learning & NLP
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Utility
from tqdm import tqdm

#EDA

In [2]:
# Memuat dataset
df = pd.read_csv("youtube_komentar.csv")

print("Jumlah data (baris, kolom):")
print(df.shape)

print("\nStruktur kolom dataset:")
print(df.columns.tolist())

print("\nInformasi tipe data & memori:")
print(df.info())

print("\nPratinjau data (5 baris teratas):")
df.head()

Jumlah data (baris, kolom):
(6525, 4)

Struktur kolom dataset:
['author', 'text', 'published_at', 'like_count']

Informasi tipe data & memori:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6525 entries, 0 to 6524
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        6521 non-null   object
 1   text          6525 non-null   object
 2   published_at  6525 non-null   object
 3   like_count    6525 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 204.0+ KB
None

Pratinjau data (5 baris teratas):


Unnamed: 0,author,text,published_at,like_count
0,@jokonawanto1844,Berimbang dan netral.. cerdas dan bijak dalam ...,2025-12-20T01:30:57Z,0
1,@MaksumEncum,Jangan. Menyalah2 keumat Islam. Indonesia. Bak...,2025-12-08T00:30:53Z,0
2,@alldump676,https://youtu.be/BtZZ_0pj6es?si=O1KvyXiVgBsxbMDW,2025-12-08T00:28:53Z,0
3,@SubaNus-e8r,"TIDAK,,\nSEMUA AGAMA ADALAH SAMA TAK ADA PEMBE...",2025-12-06T13:59:29Z,0
4,@sabriishak2788,Saya harap kritikan kang guru kpd KDM di ambil...,2025-10-19T07:59:15Z,0


In [3]:
print("Jumlah missing value per kolom:")
df.isnull().sum()

Jumlah missing value per kolom:


Unnamed: 0,0
author,4
text,0
published_at,0
like_count,0


In [4]:
print("Jumlah duplikat berdasarkan teks komentar:")
print(df.duplicated(subset="text").sum())

Jumlah duplikat berdasarkan teks komentar:
45


#Pembersihan Data

In [5]:
# Hapus duplikat berdasarkan teks komentar
df = df.drop_duplicates(subset="text")

print("Jumlah data setelah hapus duplikat:")
print(df.shape)

Jumlah data setelah hapus duplikat:
(6480, 4)


#Text Preprocessing

In [6]:
def preprocess_text(text):
    text = str(text)

    # 1. Penghapusan URL
    text = re.sub(r'http\S+|www\S+', '', text)

    # 2. Penghapusan mention (@username)
    text = re.sub(r'@\w+', '', text)

    # 3. Penghapusan hashtag (#)
    text = re.sub(r'#', '', text)

    # 4. Penghapusan angka
    text = re.sub(r'\d+', '', text)

    # 5. Normalisasi karakter berulang
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # 6. Penghapusan simbol berlebihan
    text = re.sub(r'[^a-zA-Z\s!?]', ' ', text)

    # 7. Penghapusan spasi ganda
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [7]:
df["clean_text"] = df["text"].apply(preprocess_text)

df[["text", "clean_text"]].head()

Unnamed: 0,text,clean_text
0,Berimbang dan netral.. cerdas dan bijak dalam ...,Berimbang dan netral cerdas dan bijak dalam me...
1,Jangan. Menyalah2 keumat Islam. Indonesia. Bak...,Jangan Menyalah keumat Islam Indonesia Bakal B...
2,https://youtu.be/BtZZ_0pj6es?si=O1KvyXiVgBsxbMDW,
3,"TIDAK,,\nSEMUA AGAMA ADALAH SAMA TAK ADA PEMBE...",TIDAK SEMUA AGAMA ADALAH SAMA TAK ADA PEMBEDA ...
4,Saya harap kritikan kang guru kpd KDM di ambil...,Saya harap kritikan kang guru kpd KDM di ambil...


Karena terdapat data kosong setelah Text Preprocessing, maka dilakukan penghapusan data kosong

In [8]:
# Hapus data kosong di clean_text
df = df[df['clean_text'].str.strip() != '']
df = df.reset_index(drop=True)

print(f"Data setelah preprocessing: {len(df)} baris")
print(f"\nContoh data bersih:")
for i in range(3):
    print(f"\nAsli: {df['text'].iloc[i][:100]}...")
    print(f"Bersih: {df['clean_text'].iloc[i][:100]}...")

Data setelah preprocessing: 6444 baris

Contoh data bersih:

Asli: Berimbang dan netral.. cerdas dan bijak dalam menyampaikan pendapatnya... Guru gembul memang seorang...
Bersih: Berimbang dan netral cerdas dan bijak dalam menyampaikan pendapatnya Guru gembul memang seorang guru...

Asli: Jangan. Menyalah2 keumat Islam. Indonesia. Bakal Bubar...
Bersih: Jangan Menyalah keumat Islam Indonesia Bakal Bubar...

Asli: TIDAK,,
SEMUA AGAMA ADALAH SAMA TAK ADA PEMBEDA YG MEMBEDAKAN MANUSIA HANYA TINGKAH LAKU,PERBUATAN P...
Bersih: TIDAK SEMUA AGAMA ADALAH SAMA TAK ADA PEMBEDA YG MEMBEDAKAN MANUSIA HANYA TINGKAH LAKU PERBUATAN PER...


#Labelling Otomatis Menggunakan IndoBERTweet

In [9]:
MODEL_NAME = "Aardiiiiy/indobertweet-base-Indonesian-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/994 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

def predict_sentiment(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        label_id = torch.argmax(probs, dim=1).item()

    return label_map[label_id]


In [11]:
tqdm.pandas()

df["label_sentimen"] = df["clean_text"].progress_apply(predict_sentiment)

100%|██████████| 6444/6444 [01:03<00:00, 101.59it/s]


In [12]:
df["label_sentimen"].value_counts()

Unnamed: 0_level_0,count
label_sentimen,Unnamed: 1_level_1
negative,4770
positive,941
neutral,733


In [13]:
df["label_sentimen"].value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
label_sentimen,Unnamed: 1_level_1
negative,74.022346
positive,14.602731
neutral,11.374922


In [14]:
df.to_csv("youtube_komentar_labeled.csv", index=False)
print("Dataset berhasil disimpan sebagai youtube_komentar_labeled.csv")

Dataset berhasil disimpan sebagai youtube_komentar_labeled.csv
