In [10]:
!pip install transformers[torch] --upgrade




In [11]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [12]:
df = pd.read_csv("dataset\dataset-shopee-final.csv")

In [13]:
label_map = {"Negatif": 0, "Netral": 1, "Positif": 2}
df["label"] = df["sentiment"].map(label_map)

In [14]:
# ✅ SPLIT dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["final_text"].tolist(),
    df["label"].tolist(),
    test_size=0.1,
    random_state=42,
    stratify=df["label"]
)

In [15]:
# ✅ LOAD IndoBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

In [16]:
# ✅ Tokenize
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [17]:
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'label': train_labels
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'label': val_labels
})

In [None]:
!pip install torch --upgrade

^C


Collecting torch
  Using cached torch-2.7.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.7.1-cp310-cp310-win_amd64.whl (216.1 MB)
   ---------------------------------------- 0.0/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.8/216.1 MB 3.7 MB/s eta 0:00:58
   ---------------------------------------- 1.6/216.1 MB 3.6 MB/s eta 0:00:59
   ---------------------------------------- 2.6/216.1 MB 4.2 MB/s eta 0:00:51
    --------------------------------------- 2.9/216.1 MB 4.2 MB/s eta 0:00:51
    --------------------------------------- 3.9/216.1 MB 3.8 MB/s eta 0:00:57
    --------------------------------------- 5.0/216.1 MB 4.0 MB/s eta 0:00:54
   - -------------------------------------- 5.8/216.1 MB 4.0 MB/s eta 0:00:54
   - -------------------------------------- 6.6/216.1 MB 4.0 MB/

  You can safely remove it manually.


In [None]:
model = BertForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=3)




In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
preds = trainer.predict(val_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
print("\nClassification Report:\n")
print(classification_report(val_labels, y_pred, target_names=label_map.keys()))