# Mengimpor dataset MRT

In [None]:
import pandas as pd
from google.colab import drive

In [None]:

drive.mount('/content/drive')

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Pemrosesan Teks Revisi/SrappingMRT (1).csv')
df

In [None]:
# Keep only the raw tweet text. `.copy()` makes this an independent frame, so the
# column assignments done in later cells (e.g. df['punctuation_text'] = ...) write
# to a real DataFrame instead of a slice of the original (SettingWithCopyWarning).
df = df[['full_text']].copy()
df

# Preprocessing Sederhana

## Punctuation

In [None]:
import re
import string

def normalize_basic(text):
    """Apply basic text normalisation to a single tweet.

    Steps, in order: lowercase, drop @mentions, drop URLs, drop digits,
    collapse any character repeated three or more times to a single
    occurrence, replace ASCII punctuation with spaces, collapse whitespace.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            ``None`` and float NaN (pandas' missing value) are treated as
            empty text.

    Returns:
        The normalised string (possibly empty).
    """
    # Missing values must not be stringified: str(float('nan')) is the word "nan",
    # which would otherwise end up in the corpus as a real token.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()  # lowercase
    text = re.sub(r'@\w+', '', text)  # remove mentions
    # `https?://` already covers both http:// and https://
    text = re.sub(r'(?:https?://|www\.)\S+', '', text)  # remove URLs
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # 3+ repeats of a character -> one
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)  # punctuation -> space
    text = re.sub(r'\s+', ' ', text).strip()  # tidy whitespace

    return text

df['punctuation_text'] = df['full_text'].apply(normalize_basic)


##Normalisasi

In [None]:
# !pip install indoNLP

In [None]:
# from indoNLP.preprocessing import replace_slang

In [None]:
kamus_path = pd.read_csv('/content/drive/MyDrive/Pemrosesan Teks Revisi/slang_indo.csv', header=None, names=["slang","formal"])
kamus_path

In [None]:
# Build the slang -> formal lookup table. Rows where either side is missing are
# skipped: a NaN replacement value would make ' '.join() raise TypeError inside
# normalize_slang. As with dict(zip(...)), a later duplicate slang entry wins.
slang_dict = {
    str(slang): str(formal)
    for slang, formal in zip(kamus_path['slang'], kamus_path['formal'])
    if pd.notna(slang) and pd.notna(formal)
}

def normalize_slang(text):
    """Replace every whitespace-separated token found in `slang_dict`.

    Args:
        text: Input text; converted with ``str()`` before splitting.

    Returns:
        The text re-joined with single spaces, with known slang tokens
        replaced by their formal form and all other tokens left unchanged.
    """
    tokens = str(text).split()
    normalized = [slang_dict.get(tok, tok) for tok in tokens]
    return ' '.join(normalized)

df['normalized'] = df['punctuation_text'].astype(str).apply(normalize_slang)
df

# Labeling

In [None]:
from transformers import pipeline

In [None]:
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa",
    tokenizer="ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa"
)

In [None]:

# Run the classifier once per text and read both label and score from the same
# result (previously every text was sent through the model twice).
# truncation=True keeps inputs longer than the model's maximum length from raising.
texts = df['normalized'].astype(str).tolist()
hf_results = sentiment_pipe(texts, truncation=True) if texts else []

df['hf_label'] = [res['label'] for res in hf_results]
df['hf_score'] = [res['score'] for res in hf_results]
df.to_csv('labeling.csv', index=False)
df

In [None]:
print("HuggingFace label counts:")
print(df['hf_label'].value_counts())

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Pemrosesan Teks Revisi/final labeling bagus.csv')
df

In [None]:
print("HuggingFace label counts:")
print(df['hf_final'].value_counts())

#Preprocessing Lanjutan

##Tokenisasi

In [None]:
# Whitespace tokenisation, dropping single-character tokens.
# `df` was reloaded from CSV above, where an empty `normalized` string is read
# back as NaN; coerce to str first so .split() never runs on a float.
df['tokens'] = (
    df['normalized']
    .fillna('')
    .astype(str)
    .apply(lambda text: [tok for tok in text.split() if len(tok) > 1])
)
df

##Stopword

In [None]:
import ast

In [None]:
# Hand-curated Indonesian stopwords and chat particles (duplicate entries removed).
# NOTE(review): 'tidak' (negation) is in this list; dropping it may matter for
# sentiment classification - confirm this is intended.
manual_stopwords = [
    'yang', 'dan', 'di', 'ke', 'dari', 'itu', 'ini',
    'untuk', 'pada', 'dengan', 'karena', 'bahwa', 'saat',
    'ada', 'tidak', 'ya', 'nih', 'loh', 'sih', 'agar', 'atau',
    'sehingga', 'tersebut', 'eh', 'akan', 'aku', 'bisa', 'dalam', 'dia',
    'dong', 'jadi', 'kalau', 'kalo', 'kan', 'kau', 'kita', 'lagi', 'lah',
    'me', 'mereka', 'nya', 'para', 'pun', 'sama', 'sebuah',
    'seorang', 'seperti', 'sudah', 'telah', 'yg',
]

def remove_manual_stopwords(tokens_str):
    """Remove the manual stopwords from one row's tokens.

    The previous version always passed its argument to ``ast.literal_eval``.
    For a value that is already a list this raises ValueError, so the fallback
    wrapped the whole list as a single element and nothing was ever filtered.

    Args:
        tokens_str: Either a list/tuple of tokens, the string repr of such a
            list (e.g. a column read back from CSV), or plain text. Any other
            value (e.g. NaN) is treated as having no tokens.

    Returns:
        A flat list of the tokens that are not in `manual_stopwords`.
        Non-string items are passed through unchanged.
    """
    if isinstance(tokens_str, (list, tuple)):
        tokens = list(tokens_str)
    elif isinstance(tokens_str, str):
        try:
            parsed = ast.literal_eval(tokens_str)
        except (ValueError, SyntaxError):
            parsed = None
        # Not a list literal -> treat the value as raw text
        tokens = list(parsed) if isinstance(parsed, (list, tuple)) else tokens_str.split()
    else:
        tokens = []

    # Built per call so later edits to `manual_stopwords` are still honoured
    stopwords = set(manual_stopwords)
    return [word for word in tokens if not (isinstance(word, str) and word in stopwords)]

# Terapkan ke kolom
df['stopword'] = df['tokens'].apply(remove_manual_stopwords)
df[['tokens', 'stopword']]
df

##Stemming

In [None]:
!pip install Sastrawi

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import itertools

In [None]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def apply_stemming(tokens):
    """Stem every token of a row and return them as one space-joined string.

    A string input is split on whitespace first. Items that are themselves
    lists are expanded one level; anything that is not a string after that
    expansion is ignored.
    """
    items = tokens.split() if isinstance(tokens, str) else tokens

    stems = []
    for item in items:
        # Expand one level of nesting; a bare token is handled as a 1-item group
        group = item if isinstance(item, list) else [item]
        for word in group:
            if isinstance(word, str):
                stems.append(stemmer.stem(word))

    return ' '.join(stems)
# Terapkan seperti sebelumnya
df['stemmed'] = df['stopword'].apply(apply_stemming)
df

# Word Cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# word plot semua
all_words = " ".join(df['stemmed'])

wc = WordCloud(
    width=1600,
    height=800,
    background_color='white'
).generate(all_words)

plt.figure(figsize=(14,7))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title("WordCloud Keseluruhan Teks")
plt.show()

In [None]:
#word plot setiap label
labels = df['hf_final'].unique()
for label in labels:
    words = " ".join(df[df['hf_final'] == label]['stemmed'])
    wc = WordCloud(width=1600, height=800, background_color='white').generate(words)

    plt.figure(figsize=(14,7))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud Label: {label}")
    plt.show()

In [None]:
# Bar chart of the most frequent words
from collections import Counter

# Flatten all stemmed texts into one token list in a single linear pass.
# (Series.sum() over lists concatenates pairwise, which is quadratic, and it
# fails if any row is NaN.)
tokens = [
    tok
    for text in df['stemmed'].fillna('').astype(str)
    for tok in text.split()
]

# Count the 20 most frequent words
word_freq = Counter(tokens).most_common(20)

if word_freq:
    words, counts = zip(*word_freq)

    plt.figure(figsize=(12,6))
    plt.bar(words, counts)
    plt.xticks(rotation=45)
    plt.title("20 Kata Paling Sering Muncul")
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so skip the plot when empty
    print("Tidak ada kata untuk ditampilkan.")

# Feature Extraction

##TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Build the TF-IDF input from the `stopword` column.
# astype(str) turns each row's Python list into its string repr (brackets, quotes
# and commas included), so the vectorizer is tokenising that repr.
# NOTE(review): the `stemmed` column built earlier is not used here - confirm
# that unstemmed tokens are the intended features.
text = df['stopword'].astype(str)
# Unigram + bigram features, vocabulary capped at 1000 terms, no built-in stopword list.
tfidf = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1,2),
    stop_words=None
)

# NOTE(review): the vectorizer is fitted on every row before the train/test split
# done in a later cell, so vocabulary/IDF statistics include the test rows.
# Consider fitting on the training rows only and calling transform() on the test rows.
X_tfidf = tfidf.fit_transform(text)

In [None]:
# tampilkan ke dalam bentuk datframe
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out()
)

tfidf_df

# Split Data

In [None]:
# fitur = hasil TF-IDF
X = X_tfidf

# label = hasil labeling HuggingFace
y = df['hf_final']

# split 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Ukuran data latih :", X_train.shape)
print("Ukuran data uji   :", X_test.shape)


#Modeling

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [None]:
y_pred = nb_model.predict(X_test)


#Evaluasi

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 1. Predict on the held-out test split. Predicting on X_train here would report
#    training performance and overwrite `y_pred` with training predictions.
y_pred = nb_model.predict(X_test)

# 2. Build the confusion matrix with a fixed label order
nb_labels = ['Negative', 'Neutral', 'Positive']
cm = confusion_matrix(y_test, y_pred, labels=nb_labels)

# 3. Show it as a heatmap
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=nb_labels)

# Draw on an explicit axes: disp.plot() creates its own figure when no `ax` is
# given, which left the plt.figure(...) created beforehand empty.
fig, ax = plt.subplots(figsize=(6, 4))
disp.plot(cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix Naive Bayes (Test Set)")
plt.show()

# Fine-Tuning

##Label Mapping

In [None]:
# Integer ids used as fine-tuning targets
label_map = {'Negative':0, 'Neutral':1, 'Positive':2}
label_ids = df['hf_final'].map(label_map)

# .map() yields NaN for any value missing from label_map (different casing,
# stray whitespace, empty cell). Fail here with the offending values instead of
# letting NaN labels reach the trainer.
unmapped = df.loc[label_ids.isna(), 'hf_final'].unique()
if len(unmapped) > 0:
    raise ValueError(f"hf_final contains labels missing from label_map: {list(unmapped)}")

df['label_id'] = label_ids.astype(int)

##Dataset Split

In [None]:
from datasets import Dataset

# Build the Hugging Face dataset from the text and integer-label columns.
# Missing texts are replaced by '' (a NaN would reach the tokenizer as None), and
# preserve_index=False keeps the pandas index out of the dataset columns.
ft_frame = df[['normalized','label_id']].copy()
ft_frame['normalized'] = ft_frame['normalized'].fillna('').astype(str)

dataset = Dataset.from_pandas(ft_frame, preserve_index=False)
# 80/20 split with a fixed seed for reproducibility
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
!pip install transformers datasets torch accelerate

In [None]:
!pip install --upgrade transformers datasets torch accelerate

##Tokenize

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments

model_name = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)

##Prepare Dataset

In [None]:
def tokenize(batch):
    """Tokenize the `normalized` texts of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    encode_kwargs = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(batch['normalized'], **encode_kwargs)

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['normalized'])
tokenized_dataset = tokenized_dataset.rename_column("label_id", "labels")
tokenized_dataset.set_format("torch")

##Class Weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np

# Compute "balanced" class weights from the labels of the training split only.
# The resulting array has one weight per class present in that split, ordered
# like np.unique(...) (ascending label id).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(dataset['train']['label_id']),
    y=dataset['train']['label_id']
)

# Convert to a float tensor; it is later passed as `weight` to nn.CrossEntropyLoss.
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)

##Weighted Trainer

In [None]:
from transformers import Trainer
import torch.nn as nn

class WeightedTrainer(Trainer):
    """Trainer that replaces the model's loss with class-weighted cross-entropy.

    The per-class weights are read from the module-level `class_weights` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute weighted cross-entropy between the model logits and the labels.

        Args:
            model: The model being trained (possibly wrapped by the Trainer).
            inputs: Batch dict; must contain "labels".
            return_outputs: When True, also return the model outputs.
            **kwargs: Extra arguments passed by the Trainer; ignored here.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on `return_outputs`.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Follow the logits' device and dtype rather than `model.device`: a model
        # wrapped in nn.DataParallel (multi-GPU training) has no `.device`.
        weights = class_weights.to(device=logits.device, dtype=logits.dtype)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

##Train Args

In [None]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
# NOTE(review): output_dir says "roberta", but the checkpoint loaded above is
# "indobenchmark/indobert-base-p1" - consider renaming the folder.
training_args = TrainingArguments(
    output_dir="./roberta-mrt-model",       # folder for checkpoints / model output
    overwrite_output_dir=True,              # overwrite the folder if it already exists
    eval_strategy="epoch",                  # evaluate at the end of every epoch
    save_strategy="epoch",                  # save a checkpoint at the end of every epoch
    learning_rate=2e-5,                     # learning rate
    per_device_train_batch_size=16,         # training batch size per device
    per_device_eval_batch_size=16,          # evaluation batch size per device
    num_train_epochs=4,                     # number of epochs
    weight_decay=0.01,                      # weight-decay regularisation
    logging_steps=50,                       # log every 50 steps
    load_best_model_at_end=True,            # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",       # metric used to pick the best checkpoint
    greater_is_better=True                  # higher metric value is better
)


##Metrics

In [None]:
from transformers import Trainer, EvalPrediction
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Definisikan metric evaluasi
def compute_metrics(p: EvalPrediction):
    """Return accuracy plus weighted precision, recall and F1 for an evaluation pass.

    Predicted classes are the argmax over dimension 1 of `p.predictions`;
    the reference classes come from `p.label_ids`.
    """
    labels = torch.tensor(p.label_ids)
    preds = torch.tensor(p.predictions).argmax(dim=1)

    metrics = {"accuracy": accuracy_score(labels, preds)}
    # The fourth element returned (support) is not reported
    prf = precision_recall_fscore_support(labels, preds, average='weighted')
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

##Train Model

In [None]:
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

##Evaluate

In [None]:
trainer.evaluate()

##Predict

In [None]:
predictions = trainer.predict(tokenized_dataset['test'])

In [None]:
import numpy as np

y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids


##Classification Report

In [None]:
from sklearn.metrics import classification_report

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

print(classification_report(
    y_true,
    y_pred,
    target_names=['negative', 'neutral', 'positive']
))

##Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(6,4))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=['negative','neutral','positive'],
    yticklabels=['negative','neutral','positive']
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder

In [None]:
# Naive Bayes results (from previous execution)
# NOTE(review): the Naive Bayes test rows come from sklearn's train_test_split,
# while the BERT test rows come from datasets' train_test_split; they are
# different rows, so the two reports are not a like-for-like comparison.
y_true_nb = y_test # y_test is still available from the Naive Bayes split
y_pred_nb = nb_model.predict(X_test)

print("Naive Bayes Classification Report (Test Set):")
print(classification_report(y_true_nb, y_pred_nb))
print(f"Naive Bayes Accuracy: {accuracy_score(y_true_nb, y_pred_nb):.4f}\n")

# Fine-tuned BERT results (from previous execution)
y_true_bert = y_true # Use the in-memory variable
y_pred_bert = y_pred # Use the in-memory variable

if 'label2id' not in globals():
    print("label2id not found, attempting to reconstruct...")
    # Reuse the mapping that produced `label_id` for fine-tuning. Re-deriving ids
    # from the alphabetical order of the labels only matches it by coincidence.
    label2id = dict(label_map)
    id2label = {i:l for l,i in label2id.items()}

# Pin the id order and the matching display names explicitly, so the report stays
# aligned even if a class is absent from the test predictions.
bert_id2label = {idx: name for name, idx in label2id.items()}
bert_label_ids = sorted(bert_id2label)
bert_target_names = [bert_id2label[idx] for idx in bert_label_ids]

print("Fine-tuned BERT Classification Report (Test Set):")
print(classification_report(
    y_true_bert,
    y_pred_bert,
    labels=bert_label_ids,
    target_names=bert_target_names
))
print(f"Fine-tuned BERT Accuracy: {accuracy_score(y_true_bert, y_pred_bert):.4f}")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Naive Bayes Confusion Matrix
cm_nb = confusion_matrix(y_true_nb, y_pred_nb, labels=['Negative', 'Neutral', 'Positive'])
disp_nb = ConfusionMatrixDisplay(confusion_matrix=cm_nb,
                                 display_labels=['Negative', 'Neutral', 'Positive'])
disp_nb.plot(cmap="Blues", ax=axes[0])
axes[0].set_title("Confusion Matrix Naive Bayes (Test Set)")

# Fine-tuned BERT Confusion Matrix
cm_bert = confusion_matrix(y_true_bert, y_pred_bert)
disp_bert = ConfusionMatrixDisplay(confusion_matrix=cm_bert,
                                   display_labels=list(label2id.keys()))
disp_bert.plot(cmap="Blues", ax=axes[1])
axes[1].set_title("Confusion Matrix Fine-tuned BERT (Test Set)")

plt.tight_layout()
plt.show()

Save untuk Perbandingan KRL

In [None]:
df.to_csv("mrt_final.csv", index=False)

In [None]:
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

In [None]:
np.save("mrt_y_true.npy", y_true)
np.save("mrt_y_pred.npy", y_pred)