<a href="https://colab.research.google.com/github/kakaandriano0-hash/tugaskelompok/blob/main/Untitled24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip -q install -U transformers datasets accelerate evaluate scikit-learn pandas

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split


In [16]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from datasets import Dataset


In [17]:
# Read each CSV, discard rows containing any missing value, and renumber the index
# so that row positions are contiguous again.
train_df = (
    pd.read_csv("dataset_sentimen_ml_300_unique_semantic.csv")
    .dropna()
    .reset_index(drop=True)
)
test_df = (
    pd.read_csv("dataset_ml_400_unlabeled_playstore_style.csv")
    .dropna()
    .reset_index(drop=True)
)

# Lower-case the label text so differently-cased spellings collapse to one value.
train_df["label"] = train_df["label"].str.lower()

# Sanity check: a few sample rows, the class balance, and the size of the unlabelled set.
print(train_df.head())
print(train_df["label"].value_counts())
print("Test data:", len(test_df))


                                            komentar    label
0  Sumpah nagih main solo, meta-nya lumayan seimb...  positif
1  Keren sih main bareng squad, meta-nya lumayan ...  positif
2  Worth it banget push rank di ML, animasi skill...  positif
3  Seru banget main solo, animasi skill nya kelia...  positif
4  Sumpah nagih main solo, meta-nya lumayan seimb...  positif
label
positif    100
negatif    100
netral     100
Name: count, dtype: int64
Test data: 400


In [18]:
# Fixed mapping between the label strings in the data and integer class ids,
# plus its inverse for turning predicted ids back into label strings.
label2id = {"negatif": 0, "netral": 1, "positif": 2}
id2label = {v: k for k, v in label2id.items()}

# `Series.map` produces NaN for every value that is not a key of `label2id`, which
# would silently turn the target column into floats with missing entries.
# Fail loudly here instead of handing bad targets to the split/training steps.
mapped_labels = train_df["label"].map(label2id)
unmapped = train_df.loc[mapped_labels.isna(), "label"].unique().tolist()
if unmapped:
    raise ValueError(f"Labels without an id in label2id: {unmapped}")

# Explicit integer dtype: class ids must be integers for classification targets.
train_df["labels"] = mapped_labels.astype("int64")


In [19]:
# Hold out 20% of the labelled comments for validation. Stratifying on the class id
# keeps the class proportions the same in both parts; the fixed random_state makes
# the split repeatable.
_split_parts = train_test_split(
    train_df["komentar"].tolist(),
    train_df["labels"].tolist(),
    stratify=train_df["labels"],
    random_state=42,
    test_size=0.2,
)
train_texts, val_texts, train_labels, val_labels = _split_parts


In [20]:
# Identifier of the pretrained checkpoint; the same value is passed again when the
# classification model is loaded, so tokenizer and model stay matched.
model_ckpt = "indobenchmark/indobert-base-p1"
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(texts):
    """Encode texts with the module-level `tokenizer`, without padding.

    Args:
        texts: The texts to encode; passed unchanged to `tokenizer`.

    Returns:
        The encoding returned by `tokenizer`, with each sequence truncated to at
        most 128 tokens.
    """
    # Padding is intentionally not done here. The notebook builds a
    # DataCollatorWithPadding and hands it to the Trainer, and that collator pads
    # each batch to its own longest sequence. Padding at this stage as well would
    # stretch every example to the longest sequence of the whole call and make
    # the per-batch padding pointless.
    return tokenizer(
        texts,
        truncation=True,
        max_length=128
    )

# Encode the three text collections; each encoding keeps its own name.
train_enc, val_enc, test_enc = (
    tokenize(split_texts)
    for split_texts in (train_texts, val_texts, test_df["komentar"].tolist())
)

# Labelled splits carry a "labels" column next to the encoded fields;
# the unlabelled set contains the encoded fields only.
train_ds = Dataset.from_dict(dict(train_enc, labels=train_labels))
val_ds = Dataset.from_dict(dict(val_enc, labels=val_labels))
test_ds = Dataset.from_dict(test_enc)

# Collator that pads the examples of a batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [21]:
# The checkpoint ships without a classification head, so `classifier.weight` and
# `classifier.bias` are newly (randomly) initialised when the model is loaded.
# Seed torch beforehand so that initialisation is repeatable across fresh runs.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    # Derive the head size from the label mapping so the two cannot drift apart.
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Fine-tuning configuration, grouped by concern and expanded into TrainingArguments.
_training_config = {
    # Where the Trainer writes its output.
    "output_dir": "model_output",
    # Optimisation.
    "learning_rate": 1.5e-5,
    "weight_decay": 0.02,
    "warmup_ratio": 0.1,
    "label_smoothing_factor": 0.1,
    "num_train_epochs": 10,
    # Batch sizes.
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
    # Evaluate and log once per epoch; write no checkpoints; no external reporting.
    "eval_strategy": "epoch",
    "logging_strategy": "epoch",
    "save_strategy": "no",
    "report_to": "none",
}

training_args = TrainingArguments(**_training_config)


In [32]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into `(logits, labels)`. The predicted
            class of each row is the index of its largest logit along axis 1.

    Returns:
        A dict with the keys "accuracy" and "f1_macro".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(references, predicted)
    macro_f1 = f1_score(references, predicted, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

# Wire the model, datasets, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`;
    # passing the tokenizer here keeps this cell working on current releases.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Fine-tune, then report the metrics on the validation split as the cell output.
trainer.train()
trainer.evaluate()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.5646,0.431982,0.933333,0.934123
2,0.427,0.296804,1.0,1.0
3,0.2951,0.29172,1.0,1.0
4,0.2932,0.291339,1.0,1.0
5,0.2928,0.291401,1.0,1.0
6,0.2929,0.291251,1.0,1.0
7,0.2931,0.291264,1.0,1.0
8,0.2926,0.291306,1.0,1.0
9,0.2929,0.291247,1.0,1.0
10,0.2926,0.29121,1.0,1.0


{'eval_loss': 0.2912096679210663,
 'eval_accuracy': 1.0,
 'eval_f1_macro': 1.0,
 'eval_runtime': 0.0406,
 'eval_samples_per_second': 1478.473,
 'eval_steps_per_second': 49.282,
 'epoch': 10.0}

In [34]:
# Run the trained model over the unlabelled comments and keep the raw logits.
pred = trainer.predict(test_ds)
logits = pred.predictions

# Softmax across the class axis turns the logits into per-class probabilities.
probs = torch.tensor(logits).softmax(dim=1).numpy()

# Per row: the winning class id, the probability assigned to it, and its label string.
pred_ids = np.argmax(probs, axis=1)
confidence = np.max(probs, axis=1)
pred_labels = [id2label[class_id] for class_id in pred_ids]

# Add the predictions to a new frame so that `test_df` itself is left unchanged.
result_df = test_df.assign(
    predicted_label=pred_labels,
    confidence=confidence.round(4),
)

result_df.head()


Unnamed: 0,komentar,predicted_label,confidence
0,"Yang aku rasain mode classic, butuh koneksi in...",netral,0.9349
1,"Kalau dari pengalamanku game MOBA ini, hasil p...",netral,0.9392
2,"Setelah update kemarin game ini, permainannya ...",netral,0.9319
3,"Kalau main lama-lama sistem di ML, kadang juga...",netral,0.8087
4,"Pas nyobain lagi Mobile Legends, kontrolnya cu...",netral,0.5806


In [35]:
# Write the predictions to disk (without the index column) and confirm the file name.
output_csv = "hasil_prediksi_transformer.csv"
result_df.to_csv(output_csv, index=False)
print(f"Saved: {output_csv}")


Saved: hasil_prediksi_transformer.csv
