<a href="https://colab.research.google.com/github/lhidayanti/Kelompok-14-Pemrosesan-Teks/blob/main/Final/KRL/KRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import string

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# df = pd.read_csv('ScrappingKRL.csv')
df = pd.read_csv('/content/drive/MyDrive/KRL.csv')
df

In [None]:
df = df[['full_text']]
df

# Preprocessing 1

## punctuation

In [None]:
import re

def normalize_basic(text):
    """Lowercase and clean one raw tweet for downstream text processing.

    Steps, in order: lowercase; drop @mentions; drop URLs; replace every
    character that is not an ASCII letter, digit or whitespace with a space;
    squeeze runs of three or more identical letters down to one letter;
    collapse whitespace.

    Args:
        text: Raw tweet. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise turn into the literal tokens "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'@\w+', '', text)                      # remove mentions
    text = re.sub(r'(?:https?://|www\.)\S+', '', text)    # remove URLs
    # After this step only letters, digits and whitespace remain, so no
    # further punctuation pass is needed.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Squeeze elongated letters ("kaliii" -> "kali"). Only runs of 3+ are
    # touched ("heellooo" -> "heello"), and digits are left alone so numbers
    # such as "1000" are not corrupted into "10".
    text = re.sub(r'([a-z])\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()              # tidy up whitespace
    return text
df['punctuation_text'] = df['full_text'].apply(normalize_basic)
df_punct_view = df[['punctuation_text']]
df_punct_view

## Normalize

In [None]:
!pip install deep-translator

In [None]:
import re
from deep_translator import GoogleTranslator

translator = GoogleTranslator(source='en', target='id')

def translate_sentence(text):
    """Translate one sentence with the module-level ``translator``.

    The whole sentence is sent in a single request. If the request fails the
    input text is returned unchanged, so one bad row does not abort a
    DataFrame-wide ``apply``.

    Args:
        text: Sentence to translate; converted with ``str`` first.

    Returns:
        The translated text, or the stringified input when the input is
        blank, the request fails, or the translator does not return a string.
    """
    text = str(text)
    if not text.strip():
        # Nothing to translate; skip the network round trip.
        return text
    try:
        # Translate the whole sentence at once.
        translated_text = translator.translate(text)
    except Exception:
        # Best-effort fallback. KeyboardInterrupt / SystemExit are not caught,
        # so a long-running cell can still be interrupted.
        translated_text = text
    if not isinstance(translated_text, str):
        # Later steps call string methods on this value.
        translated_text = text
    return translated_text

# Terapkan ke dataframe
df['translated'] = df['punctuation_text'].apply(translate_sentence)
df_translated_view = df[['punctuation_text', 'translated']]

df_translated_view


In [None]:
df_translated_view.to_csv('translated.csv', index=False)

In [None]:
# kamus slang
kamus = pd.read_csv('/content/drive/MyDrive/colloquial-indonesian-lexicon.csv', usecols=['slang', 'formal'])
kamus_dict = dict(zip(kamus['slang'], kamus['formal']))

kamus

In [None]:
def normalize_slang(text, lexicon=None):
    """Replace slang words with their formal form, word by word.

    Args:
        text: Sentence to normalize. It is lowercased and split on whitespace.
        lexicon: Optional mapping ``slang -> formal``. When omitted, the
            module-level ``kamus_dict`` is used.

    Returns:
        The sentence with every word found in the lexicon replaced; words not
        in the lexicon are kept. Words are joined with single spaces.
    """
    mapping = kamus_dict if lexicon is None else lexicon
    words = text.lower().split()
    return ' '.join(mapping.get(w, w) for w in words)

df['normalized'] = df['translated'].apply(normalize_slang)
df_normalized_view = df[['translated', 'normalized']]
df_normalized_view

In [None]:
df_normalized_view.to_csv('normalized.csv', index=False)

# Labeling

In [None]:
from transformers import pipeline

In [None]:
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa",
    tokenizer="ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa"
)

In [None]:
# Run the sentiment model once per text and reuse the result for both columns;
# calling the pipeline separately for the label and the score doubles the
# inference time.
hf_results = sentiment_pipe(
    df['normalized'].tolist(),
    truncation=True,   # guard against texts longer than the model's maximum sequence length
    batch_size=32,
)
df['hf_label'] = [res['label'] for res in hf_results]
df['hf_score'] = [res['score'] for res in hf_results]
df_view = df[['normalized', 'hf_label', 'hf_score']]
df_view

In [None]:
print("HuggingFace label counts:")
print(df['hf_label'].value_counts())

In [None]:
# membaca file labeling final dari drive
df = pd.read_csv('/content/drive/MyDrive/LABELING FIX.csv')
df

In [None]:
# menghitung jumlah sentimen neutral, negatif, dan positif
print("Label counts:")
print(df['hf_final'].value_counts())

# Preprocessing 2

## Tokenisasi (memisahkan kalimat menjadi kata-kata)

In [None]:
# Split each normalized sentence on whitespace. Rows whose text is missing
# (NaN after reading the CSV) become an empty token list instead of raising.
df['tokens'] = df['normalized'].fillna('').astype(str).str.split()
# Drop single-character tokens.
df['tokens'] = df['tokens'].apply(lambda toks: [t for t in toks if len(t) > 1])
df_tokenized_view = df[['normalized', 'tokens']]
df_tokenized_view

## Stopword

In [None]:
# Remove manually listed stopwords from the `tokens` column
import ast
# NOTE(review): 'tidak' and 'belum' are negation words; removing them may
# change the sentiment a sentence expresses -- confirm this is intended.
manual_stopwords = [
    'yang', 'dan', 'di', 'ke', 'dari', 'itu', 'ini',
    'untuk', 'pada', 'dengan', 'karena', 'bahwa', 'saat',
    'ada', 'tidak', 'ya', 'nih', 'loh', 'sih', 'dong', 'agar', 'atau',
    'sehingga', 'telah', 'sudah', 'tersebut', 'nya', 'lah', 'pun',
    'seperti', 'sebuah', 'seorang', 'akan', 'para', 'dah', 'kek', 'jg', 'juga',
    'udah', 'udahh', 'belum', 'blm',
    'ngapa', 'ngapain', 'gimana', 'kenapa'
]

# A set gives constant-time membership checks in the per-token filter below.
manual_stopword_set = set(manual_stopwords)

df['stopwords'] = df['tokens'].apply(lambda x: [t for t in x if t not in manual_stopword_set])
df_stopword_view = df[['tokens', 'stopwords']]
df_stopword_view


## Stemming (mengubah kata menjadi kata dasar)

In [None]:
!pip install Sastrawi

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def apply_stemming(tokens):
    """Stem every token and return the result as one space-separated string.

    Args:
        tokens: Either an iterable of words, or a single string that is split
            on whitespace first.

    Returns:
        The stemmed words joined with single spaces.
    """
    words = tokens.split() if isinstance(tokens, str) else tokens
    return ' '.join(stemmer.stem(word) for word in words)

df['stemmed'] = df['stopwords'].apply(apply_stemming)
df_stemmed_view = df[['stopwords', 'stemmed']]
df_stemmed_view

# Word cloud

In [None]:
#Word Cloud KRL
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text = ' '.join(df['stemmed'])

wc = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10,5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud KRL')
plt.show()

In [None]:
# Word cloud for each sentiment label
for label in df['hf_final'].dropna().unique():   # skip rows without a label
    label_text = ' '.join(df.loc[df['hf_final'] == label, 'stemmed'].dropna().astype(str))
    if not label_text.strip():
        # A word cloud cannot be generated from empty text.
        print(f"Skipping '{label}': no text to plot")
        continue
    wc = WordCloud(width=800, height=400, background_color='white').generate(label_text)

    plt.figure(figsize=(10,5))
    plt.title(label)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()


In [None]:
# distribusi sentimen
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(x='hf_final', data=df)
plt.title('Distribusi Sentimen')
plt.show()

In [None]:
# Top 20 most frequent words
from collections import Counter

# Count words in a single pass. `sum(list_of_lists, [])` copies the growing
# list on every addition and is quadratic in the number of documents.
word_counter = Counter(
    word
    for doc in df['stemmed'].dropna()
    for word in str(doc).split()
)
word_freq = word_counter.most_common(20)

if word_freq:
    words, freqs = zip(*word_freq)
    plt.barh(words, freqs)
    plt.title("Top 20 Kata")
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so handle the empty case.
    print("Tidak ada kata untuk ditampilkan")

# Feature Engineering

## TF IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [None]:
# Join each token list back into one document string. Casting the list column
# with astype(str) would hand the vectorizer Python list reprs ("['a', 'b']")
# and rely on its tokenizer to strip the brackets and quotes.
text = df['stopwords'].apply(
    lambda toks: ' '.join(toks) if isinstance(toks, (list, tuple)) else str(toks)
)
tfidf = TfidfVectorizer(
    max_features=1000,   # keep at most 1000 features
    ngram_range=(1,2),   # unigrams and bigrams
    stop_words=None
)

X_tfidf = tfidf.fit_transform(text)


In [None]:
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out()
)

tfidf_df

# Split Data (data train dan data testing)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Kept so later cells that refer to the full feature matrix still work.
X = tfidf_df

# Use only rows that have a label; a stratified split cannot handle missing labels.
labeled = df['hf_final'].notna()
docs = df.loc[labeled, 'stopwords'].apply(
    lambda toks: ' '.join(toks) if isinstance(toks, (list, tuple)) else str(toks)
)
y = df.loc[labeled, 'hf_final']

# Split the raw documents first ...
docs_train, docs_test, y_train, y_test = train_test_split(
    docs, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ... then fit TF-IDF on the training documents only and apply it to the test
# documents. Fitting on the whole corpus before splitting lets vocabulary and
# IDF statistics from the test set leak into training.
split_tfidf = TfidfVectorizer(**tfidf.get_params())   # same settings as `tfidf`
X_train = split_tfidf.fit_transform(docs_train)
X_test = split_tfidf.transform(docs_test)

print('ukuran data latih: ', X_train.shape)
print('ukuran data uji: ', X_test.shape)

# Klasifikasi Metode Naive Bayes

## modeling

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Inisialisasi dan latih model Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

## Training data

In [None]:
y_pred_train = nb_classifier.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print("Akurasi pada data training:", accuracy_score(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

# Evaluasi

In [None]:
y_pred = nb_classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print("Akurasi pada data testing:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion matrix

In [None]:
# Confusion matrix on the test data
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 1. Predict the test set
y_pred = nb_classifier.predict(X_test)

# 2. Build the confusion matrix with a fixed label order
cm = confusion_matrix(y_test, y_pred, labels=['Negative', 'Neutral', 'Positive'])

# 3. Draw it on an explicitly created axes. ConfusionMatrixDisplay.plot()
#    opens its own figure when no `ax` is given, so a preceding plt.figure()
#    only produces an extra empty figure and its figsize is ignored.
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=['Negative', 'Neutral', 'Positive'])

fig, ax = plt.subplots(figsize=(6, 4))
disp.plot(cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix Naive Bayes")
plt.show()

In [None]:
# Confusion matrix on the training data
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 1. Predict the training set. A dedicated name is used so the test-set
#    predictions stored in `y_pred` are not overwritten.
y_pred_train = nb_classifier.predict(X_train)

# 2. Build the confusion matrix with a fixed label order
cm = confusion_matrix(y_train, y_pred_train, labels=['Negative', 'Neutral', 'Positive'])

# 3. Draw it on an explicitly created axes. ConfusionMatrixDisplay.plot()
#    opens its own figure when no `ax` is given, so a preceding plt.figure()
#    only produces an extra empty figure and its figsize is ignored.
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=['Negative', 'Neutral', 'Positive'])

fig, ax = plt.subplots(figsize=(6, 4))
disp.plot(cmap="Blues", ax=ax)
# Title names the split so this plot cannot be confused with the test-set one.
ax.set_title("Confusion Matrix Naive Bayes (Training Set)")
plt.show()

# Fine Tuning

Model BERT dilatih kembali (fine-tuning) dengan data tweet KRL agar lebih memahami pola bahasa dan sentimen khusus pada konteks KRL.

## install dan impor

In [None]:
!pip install transformers datasets torch accelerate

In [None]:
!pip install --upgrade transformers datasets torch accelerate

In [None]:
import datasets
import httpx
print(f"datasets version: {datasets.__version__}")
print(f"httpx version: {httpx.__version__}")

In [None]:
# impor
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch.nn as nn

## load data & label

In [None]:
# Load the KRL CSV that already holds the 'normalized' text column and the 'hf_final' label column
df = pd.read_csv('/content/drive/MyDrive/LABELING FIX.csv')  # adjust the path as needed

# Drop rows where the text or the label is missing, then renumber the index
df = df.dropna(subset=['normalized', 'hf_final']).reset_index(drop=True)

# Show how many rows each label has
print(df['hf_final'].value_counts())

# Encode each label as an integer id (0..n-1) and keep both lookup directions
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['hf_final'])
label2id = {l:i for i,l in enumerate(le.classes_)}
id2label = {i:l for l,i in label2id.items()}
print("label2id:", label2id)


## split training & testing

In [None]:
train_df, test_df = train_test_split(
    df[['normalized','label_id']],
    test_size=0.2,
    stratify=df['label_id'],
    random_state=42
)

print("train size:", len(train_df), "test size:", len(test_df))


## hitung class weight

In [None]:
# Count how many training rows fall in each class, ordered by label id
counts = train_df['label_id'].value_counts().sort_index()
print("class counts (train):", counts.to_dict())

# Class weights are the inverse of each class frequency, so classes with
# fewer rows get a larger weight. The tensor is created without an explicit
# device here.
class_freq = counts.values.astype(np.float32)
class_weights = 1.0 / class_freq
class_weights = class_weights / class_weights.sum() * len(class_freq)  # rescale so the weights sum to the number of classes (optional)
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("class weights:", class_weights)


## hugging face dataset

In [None]:
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# convert ke Dataset dari pandas
dataset = Dataset.from_pandas(train_df.rename(columns={'normalized':'text','label_id':'labels'}))
dataset_test = Dataset.from_pandas(test_df.rename(columns={'normalized':'text','label_id':'labels'}))


## tokenizer

In [None]:
def tokenize_fn(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Truncation is enabled and sequences are padded with the ``max_length``
    strategy at a maximum length of 128.
    """
    encoding_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(batch['text'], **encoding_options)

tokenized_train = dataset.map(tokenize_fn, batched=True)
tokenized_test = dataset_test.map(tokenize_fn, batched=True)

# buang kolom teks asli agar Trainer pakai tensors
tokenized_train = tokenized_train.remove_columns([c for c in tokenized_train.column_names if c not in ['input_ids','attention_mask','labels']])
tokenized_test = tokenized_test.remove_columns([c for c in tokenized_test.column_names if c not in ['input_ids','attention_mask','labels']])

tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

## load model

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

num_labels = len(label2id)
# Store the label names in the model config so the saved checkpoint reports
# readable class names instead of generic LABEL_0 / LABEL_1 / ... names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


## custom trainer

In [None]:
# Custom Trainer: override compute_loss to include class weights
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: Optional 1-D tensor (or list / array) with one weight
            per class, passed as ``weight`` to ``nn.CrossEntropyLoss``.
            ``None`` gives the unweighted loss.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is None:
            self.class_weights = None
        else:
            # Accept lists and numpy arrays as well as tensors.
            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; must contain ``labels``.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments passed by the Trainer; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Take the labels out of the batch so the model does not also compute
        # its own (unweighted) loss during the forward pass.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        weight = None
        if self.class_weights is not None:
            # Follow the logits' device instead of assuming where the model
            # lived when the trainer was constructed.
            weight = self.class_weights.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

## training argument & trainer start

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./indobert-krl",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available()
)

In [None]:
# compute_metrics (gunakan sklearn)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit.
    predicted = np.argmax(scores, axis=-1)
    prec, rec, f1_weighted, _support = precision_recall_fscore_support(
        gold, predicted, average='weighted', zero_division=0
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": prec,
        "recall": rec,
        "f1": f1_weighted,
    }


In [None]:
# Hold out part of the training data for per-epoch evaluation and best-model
# selection. Evaluating on the test set during training (and then loading the
# best checkpoint by that score) makes the final test metrics optimistic.
# Assumes `tokenized_train` keeps the row order of `train_df` -- it is built
# from it via Dataset.from_pandas and map; TODO confirm if that changes.
train_idx, val_idx = train_test_split(
    np.arange(len(tokenized_train)),
    test_size=0.1,
    stratify=train_df['label_id'].to_numpy(),
    random_state=42,
)
train_split = tokenized_train.select(train_idx.tolist())
val_split = tokenized_train.select(val_idx.tolist())

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=val_split,      # validation split; the test set stays untouched until the final evaluation
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=class_weights
)

# Start training
trainer.train()

## evaluasi

In [None]:
metrics = trainer.evaluate(tokenized_test)
print("Trainer evaluation metrics (test set):")
print(metrics)

## Prediksi data testing

In [None]:
preds_output = trainer.predict(tokenized_test)

y_pred = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true, y_pred)
print("Akurasi pada data testing:", accuracy)

## Classification Report

In [None]:
from sklearn.metrics import classification_report

print("Classification Report (test set):")
print(classification_report(
    y_true,
    y_pred,
    target_names=list(label2id.keys())  # nama label sesuai mapping
))

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

# Visualisasi heatmap
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=list(label2id.keys()),
            yticklabels=list(label2id.keys()))
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Test Set)")
plt.show()


In [None]:
# simpan model & tokenizer
trainer.save_model("./indobert-krl-finetuned")
tokenizer.save_pretrained("./indobert-krl-finetuned")

In [None]:
import numpy as np

np.save("krl_y_true.npy", y_true)
np.save("krl_y_pred.npy", y_pred)

print("File berhasil disimpan!")


In [None]:
#simpan file csv krl final
df.to_csv('krl_final.csv', index=False)

Perbandingan metode NB dan BERT

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Naive Bayes results (from previous execution)
y_true_nb = y_test # The y_test is still available from the Naive Bayes split
y_pred_nb = nb_classifier.predict(X_test)

print("**Naive Bayes Classification Report (Test Set):**")
print(classification_report(y_true_nb, y_pred_nb))
print(f"Naive Bayes Accuracy: {accuracy_score(y_true_nb, y_pred_nb):.4f}\n")

# Fine-tuned BERT results (from previous execution)
y_true_bert = np.load("krl_y_true.npy")
y_pred_bert = np.load("krl_y_pred.npy")

# Ensure label2id is available, or redefine if necessary
# Assuming label2id is still in scope from previous execution
# If not, you might need to re-run the `label2id` cell or load it.
if 'label2id' not in globals():
    print("label2id not found, attempting to reconstruct...")
    # This is a fallback, ideally label2id would persist or be saved/loaded.
    unique_labels = sorted(list(set(df['hf_final'].unique())))
    le = LabelEncoder()
    le.fit(unique_labels)
    label2id = {l:i for i,l in enumerate(le.classes_)}
    id2label = {i:l for l,i in label2id.items()}

print("**Fine-tuned BERT Classification Report (Test Set):**")
print(classification_report(
    y_true_bert,
    y_pred_bert,
    target_names=list(label2id.keys())
))
print(f"Fine-tuned BERT Accuracy: {accuracy_score(y_true_bert, y_pred_bert):.4f}")

 Perbandingan Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Naive Bayes Confusion Matrix
cm_nb = confusion_matrix(y_true_nb, y_pred_nb, labels=['Negative', 'Neutral', 'Positive'])
disp_nb = ConfusionMatrixDisplay(confusion_matrix=cm_nb,
                                 display_labels=['Negative', 'Neutral', 'Positive'])
disp_nb.plot(cmap="Blues", ax=axes[0])
axes[0].set_title("Confusion Matrix Naive Bayes (Test Set)")

# Fine-tuned BERT Confusion Matrix
cm_bert = confusion_matrix(y_true_bert, y_pred_bert)
disp_bert = ConfusionMatrixDisplay(confusion_matrix=cm_bert,
                                   display_labels=list(label2id.keys()))
disp_bert.plot(cmap="Blues", ax=axes[1])
axes[1].set_title("Confusion Matrix Fine-tuned BERT (Test Set)")

plt.tight_layout()
plt.show()

# Perbandingan 2 metode

In [None]:
y_true_nb = y_test         # label sebenarnya dari test KRL
y_pred_nb = nb_classifier.predict(X_test)

In [None]:
y_true_bert = np.load("krl_y_true.npy")
y_pred_bert = np.load("krl_y_pred.npy")

In [None]:
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}
label_names = ["Negative", "Neutral", "Positive"]

In [None]:
from sklearn.metrics import classification_report, accuracy_score

print("=== Naive Bayes (KRL) ===")
print(classification_report(y_true_nb, y_pred_nb))
print("Accuracy NB:", accuracy_score(y_true_nb, y_pred_nb))

print("\n=== Fine-tuned BERT (KRL) ===")
print(classification_report(y_true_bert, y_pred_bert))
print("Accuracy BERT:", accuracy_score(y_true_bert, y_pred_bert))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

labels = ["Negative", "Neutral", "Positive"]

fig, axes = plt.subplots(1, 2, figsize=(14,6))

# NB
cm_nb = confusion_matrix(y_true_nb, y_pred_nb, labels=labels)
ConfusionMatrixDisplay(confusion_matrix=cm_nb, display_labels=labels).plot(ax=axes[0], cmap="Blues")
axes[0].set_title("Naive Bayes (KRL)")

# BERT
y_true_bert_str = [id2label[label_id] for label_id in y_true_bert]
y_pred_bert_str = [id2label[label_id] for label_id in y_pred_bert]

cm_bert = confusion_matrix(y_true_bert_str, y_pred_bert_str, labels=labels)
ConfusionMatrixDisplay(confusion_matrix=cm_bert,
                       display_labels=labels).plot(ax=axes[1], cmap="Blues")
axes[1].set_title("Fine-tuned BERT (KRL)")

plt.tight_layout()
plt.show()

ya allah jujur takut banget bantuin aku  ya allah