In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import re

In [5]:
# Read the train / validation / test splits from CSV files given by relative path.
df_train, df_val, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('training.csv', 'validation.csv', 'test.csv')
)

In [6]:
# 3. TEXT CLEANING (PREPROCESSING) WITH REGEX
def clean_text(text):
    """Normalize a raw text sample before tokenization.

    Steps, in order: lowercase, drop URLs, drop ``@mention`` handles and ``#``
    signs, drop every character that is not an ASCII letter or whitespace,
    then collapse whitespace runs to single spaces and trim the ends.

    Args:
        text: Raw input. Anything that is not a ``str`` (for example the NaN
            pandas yields for an empty CSV cell) is treated as missing.

    Returns:
        str: The cleaned text; ``''`` for missing input.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None; calling .lower() on them would raise.
        return ''
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # `@\w+` removes the whole handle. The previous pattern `\@w+` only matched
    # "@" followed by literal "w" characters, so handles leaked into the text.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a cleaned copy of the raw `text` column to every split.
for split_frame in (df_train, df_val, df_test):
    split_frame['clean_text'] = split_frame['text'].apply(clean_text)

In [7]:
# 4. DATASET CLASS DEFINITION
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and attaches its label.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids, aligned index-for-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``.
            Its result must expose ``.items()`` yielding tensors; a leading
            batch axis of size 1 (as Hugging Face tokenizers produce for a
            single text) is removed.
        max_len: Length every sample is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus a ``'labels'`` entry."""
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis whenever max_len == 1.
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        # Class ids are stored as int64 regardless of the label's Python/numpy type.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [8]:
# 5. TOKENIZER AND DATASETS
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# One EmotionDataset per split, built from the cleaned text and the label column.
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(frame['clean_text'].tolist(), frame['label'].tolist(), tokenizer)
    for frame in (df_train, df_val, df_test)
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Error while downloading from https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json: HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out.
Trying to resume download...


In [10]:
# 6. LOAD MODEL
# Pretrained DistilBERT checkpoint with a sequence-classification head sized for six classes.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=6,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# 7. METRIC DEFINITION
def compute_metrics(p):
    """Compute accuracy for an evaluation result.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the class is
            taken with ``argmax`` over the last axis) and ``label_ids``
            (the reference class ids).

    Returns:
        dict: ``{"accuracy": <value from accuracy_score>}``.
    """
    predicted_classes = p.predictions.argmax(-1)
    return {"accuracy": accuracy_score(p.label_ids, predicted_classes)}

In [14]:
# 8. TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir='./results',
    do_train=True,
    do_eval=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
    # Mixed precision is only valid on a CUDA device; enable it automatically
    # instead of hard-coding True, so the notebook still runs on CPU-only machines.
    fp16=torch.cuda.is_available()
)

In [15]:
# 9. TRAINER
trainer = Trainer(
    # what to train and how
    model=model,
    args=training_args,
    # metric callback applied to evaluation/prediction results
    compute_metrics=compute_metrics,
    # data: training split and validation split
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [16]:
# 10. TRAIN
# Run fine-tuning with the Trainer configured above. As the last expression of
# the cell, the returned TrainOutput is what the notebook displays.
trainer.train()



Step,Training Loss
10,1.7892
20,1.6049
30,1.6471
40,1.6462
50,1.7647
60,1.3653
70,1.3238
80,1.4464
90,1.4019
100,1.4941


TrainOutput(global_step=4000, training_loss=0.41161673424579204, metrics={'train_runtime': 9298.5725, 'train_samples_per_second': 1.721, 'train_steps_per_second': 0.43, 'total_flos': 264953696256000.0, 'train_loss': 0.41161673424579204, 'epoch': 1.0})

In [17]:
# 11. TEST-SET EVALUATION
# Reference labels come straight from the test frame; predicted classes are the
# argmax over the last axis of the scores returned by the trainer.
y_true = df_test['label'].tolist()
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=-1)

print("\n📊 Hasil Evaluasi:")
print(classification_report(y_true, y_pred))
print("Akurasi:", accuracy_score(y_true, y_pred))




📊 Hasil Evaluasi:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       581
           1       0.96      0.94      0.95       695
           2       0.81      0.85      0.83       159
           3       0.94      0.92      0.93       275
           4       0.89      0.94      0.91       224
           5       0.78      0.77      0.78        66

    accuracy                           0.93      2000
   macro avg       0.89      0.90      0.89      2000
weighted avg       0.93      0.93      0.93      2000

Akurasi: 0.9305


In [18]:
# Human-readable names for the integer class ids predicted by the model.
# NOTE(review): id 5 used to be named "others". The per-class test support in
# the evaluation report (581/695/159/275/224/66) matches the public
# dair-ai/emotion test split, where id 5 is "surprise" - confirm against the
# source of the CSV files.
label_map = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

In [19]:
# Save the model and the tokenizer after training, side by side in one directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./sentiment_model")

print("Model dan tokenizer berhasil disimpan!")

Model dan tokenizer berhasil disimpan!


In [20]:
# Load the previously saved tokenizer and model back from the same directory.
tokenizer = DistilBertTokenizer.from_pretrained("./sentiment_model")
model = DistilBertForSequenceClassification.from_pretrained("./sentiment_model")

print("Model dan tokenizer berhasil dimuat!")

Model dan tokenizer berhasil dimuat!


In [28]:
# Prediction with the loaded model
def predict_emotion(text):
    """Predict the emotion class of a single piece of text.

    Relies on the notebook-level ``clean_text``, ``tokenizer`` and ``model``.

    Args:
        text: Raw input string; it is passed through ``clean_text`` first.

    Returns:
        tuple: ``(label, probs)`` where ``label`` is the integer index of the
        highest logit and ``probs`` is the list of softmax probabilities, one
        per class.
    """
    # Text preprocessing
    clean_text_input = clean_text(text)

    # Tokenize the text
    inputs = tokenizer(clean_text_input, return_tensors="pt", truncation=True, padding=True, max_length=64)
    # The tokenizer returns CPU tensors; move them to wherever the model lives
    # so the call also works when the model is on the GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Run the prediction in eval mode (dropout off) and without gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Pick the label with the highest score; squeeze(0) drops only the batch axis
    label = torch.argmax(logits, dim=1).item()
    probs = torch.softmax(logits, dim=1).squeeze(0).tolist()

    # Return the label and the probabilities
    return label, probs

# Example usage
curhatan_user = "i'm so tired, i feel like everyone left me behind"
label, probs = predict_emotion(curhatan_user)

# Show the input, the predicted class name (looked up in label_map) and the
# per-class probabilities returned by predict_emotion.
print("Curhatan:", curhatan_user)
print("Label emosi (prediksi):", label_map[label])
print("Probabilitas semua label:", probs)


Curhatan: i'm so tired, i feel like everyone left me behind
Label emosi (prediksi): sadness
Probabilitas semua label: [0.9914470911026001, 0.0016511474968865514, 0.0009433434461243451, 0.004056474659591913, 0.0016351989470422268, 0.00026675997651182115]
