# Di sini tidak ada pre-processing, hanya training saja

# Hard Code

In [None]:
# ==============================================================================
# FINAL MODEL-TRAINING CODE (WITH FIXES FOR NaN LOSS)
# ==============================================================================
# Log line emitted before the imports below are executed.
print("Memulai sesi GPU... Mengimpor semua library yang dibutuhkan.")

# --- 1. IMPORT LIBRARY ---
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from huggingface_hub import notebook_login
import os

# --- 2. MOUNT DRIVE & DEFINE CONFIGURATION ---
print("\nMenghubungkan ke Google Drive dan mendefinisikan konfigurasi...")
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    # The previous bare `except:` also trapped KeyboardInterrupt/SystemExit
    # and reported every failure as "already mounted", hiding the real cause.
    # Mounting stays best-effort (the notebook keeps running), but the actual
    # error is now shown so a failed mount can be diagnosed.
    print(f"Gagal me-mount Google Drive: {mount_error!r}")

# !!! IMPORTANT: replace this path with the path to your own manually labelled CSV file !!!
DATASET_PATH = "/content/untuk_modelling.csv"
# Checkpoint name passed to both AutoTokenizer and AutoModelForSequenceClassification.
MODEL_NAME = "Keagannn/player-pulse-indobert-v1-WutheringWaves"
# Used as the Trainer output_dir and in the final "model available at" URL message.
HUB_MODEL_ID = "Keagannn/Model-CookingorCooked-datathon"
# Per-device batch size, applied to both training and evaluation.
BATCH_SIZE = 4
# Number of training epochs.
EPOCHS = 3
LEARNING_RATE = 5e-5 # Using a lower, safer learning rate
# Length every tokenized example is padded/truncated to.
MAX_LEN = 128

# --- 3. LOAD DATA & PREPARE X and y ---
print(f"\nMemuat dataset dari: {DATASET_PATH}")
df = pd.read_csv(DATASET_PATH)

print("Mengidentifikasi kolom fitur dan label...")
# Every column whose name starts with 'AutoLabel_' is treated as one binary
# aspect label; together they form the multi-label target.
label_columns = [col for col in df.columns if col.startswith('AutoLabel_')]
if not label_columns:
    raise ValueError("Tidak ada kolom label (diawali 'AutoLabel_') ditemukan di file CSV Anda.")
print(f"Ditemukan {len(label_columns)} kategori aspek: {label_columns}")

# Fail early with a clear message instead of a bare KeyError further down.
if 'Ulasan Bersih' not in df.columns:
    raise ValueError("Kolom teks 'Ulasan Bersih' tidak ditemukan di file CSV Anda.")


# --- IMPORTANT ADJUSTMENT 1: MAKE SURE THERE ARE NO NaN LABELS ---
# Safety step that keeps corrupt rows away from the model (a NaN label would
# propagate into the loss).
print(f"Jumlah baris sebelum membuang label NaN: {len(df)}")
df = df.dropna(subset=label_columns)
print(f"Jumlah baris setelah membuang label NaN: {len(df)}")

# Rows with a missing review text are just as harmful as rows with a missing
# label: a float NaN in the text list makes the tokenizer raise.  Drop them too
# and report how many were removed.
missing_text = df['Ulasan Bersih'].isna()
if missing_text.any():
    print(f"Membuang {int(missing_text.sum())} baris dengan teks ulasan kosong (NaN).")
    df = df[~missing_text]
# -------------------------------------------------------------


# Build the features (X) and targets (y) directly from the existing columns.
# `astype(str)` guards against reviews that pandas parsed as numbers (e.g. a
# review consisting only of digits); the tokenizer only accepts strings.
X = df['Ulasan Bersih'].astype(str).tolist()
# float32 labels: one column per entry of `label_columns`, in the same order.
y = df[label_columns].values.astype(np.float32)

print("Fitur (X) dan Target (y) berhasil disiapkan.")

# --- 4. TRAIN-TEST SPLIT & BUILD THE HUGGING FACE DATASETS ---
def _as_hf_dataset(texts, labels):
    """Wrap parallel text/label sequences in a ``Dataset`` with 'text' and 'labels' columns."""
    return Dataset.from_dict({'text': texts, 'labels': labels})


# 80/20 split with a fixed seed so the partition is reproducible.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset_dict = _as_hf_dataset(train_texts, train_labels)
# The held-out 20% is stored under the 'test' key.
test_dataset_dict = _as_hf_dataset(val_texts, val_labels)
hg_dataset = DatasetDict({
    'train': train_dataset_dict,
    'test': test_dataset_dict,
})
print("Dataset telah dibagi dan diubah ke format Hugging Face.")

# --- 5. TOKENIZATION ---
print("\nMemuat tokenizer dan melakukan tokenisasi...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch.

    Every example is truncated and padded to exactly ``MAX_LEN`` tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )


# Apply the tokenizer to both splits in batched mode, train first, then test.
tokenized_train, tokenized_test = (
    hg_dataset[split_name].map(tokenize_function, batched=True)
    for split_name in ('train', 'test')
)
print("Tokenisasi selesai.")

# --- 6. LOAD THE MODEL & DEFINE THE METRICS ---
print("\nMemuat model dan mendefinisikan metrik...")
# Index <-> label-name lookup tables, following the order of `label_columns`.
id2label = dict(enumerate(label_columns))
label2id = {name: index for index, name in id2label.items()}
# One output per label column, configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)
# --- REFINED VERSION OF COMPUTE_METRICS ---
def compute_metrics(p):
    """Compute multi-label evaluation metrics from raw model outputs.

    Args:
        p: Evaluation output exposing ``predictions`` (raw logits, or a tuple
            whose first element is the logits) and ``label_ids`` (the
            ground-truth labels).

    Returns:
        dict with the keys ``accuracy``, ``f1_micro``, ``precision_micro`` and
        ``recall_micro``.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    references = p.label_ids

    # Sigmoid maps each logit to an independent per-label probability; a label
    # is predicted as present when that probability exceeds 0.5.
    probabilities = torch.sigmoid(torch.Tensor(logits))
    binary_preds = (probabilities > 0.5).numpy().astype(int)

    # NOTE(review): on multi-label input sklearn's accuracy_score is believed
    # to be exact-match (all labels of a sample must be right) - confirm in the
    # sklearn docs before comparing it with the micro-averaged scores.
    scores = {'accuracy': accuracy_score(y_true=references, y_pred=binary_preds)}
    micro_scorers = (
        ('f1_micro', f1_score),
        ('precision_micro', precision_score),
        ('recall_micro', recall_score),
    )
    for metric_name, scorer in micro_scorers:
        scores[metric_name] = scorer(
            y_true=references, y_pred=binary_preds, average='micro', zero_division=0
        )
    return scores
print("Model dan fungsi metrik siap.")

Memulai sesi GPU... Mengimpor semua library yang dibutuhkan.





Menghubungkan ke Google Drive dan mendefinisikan konfigurasi...
Mounted at /content/drive

Memuat dataset dari: /content/untuk_modelling.csv
Mengidentifikasi kolom fitur dan label...
Ditemukan 7 kategori aspek: ['AutoLabel_Cerita', 'AutoLabel_Gameplay', 'AutoLabel_Grafis', 'AutoLabel_Bugs & Error', 'AutoLabel_Optimalisasi', 'AutoLabel_Monetisasi & Gacha', 'AutoLabel_Komunitas']
Jumlah baris sebelum membuang label NaN: 9402
Jumlah baris setelah membuang label NaN: 9402
Fitur (X) dan Target (y) berhasil disiapkan.
Dataset telah dibagi dan diubah ke format Hugging Face.

Memuat tokenizer dan melakukan tokenisasi...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Map:   0%|          | 0/7521 [00:00<?, ? examples/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Tokenisasi selesai.

Memuat model dan mendefinisikan metrik...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model dan fungsi metrik siap.


In [None]:
# --- 7. TRAINING ---
print("\nMenggunakan TrainingArguments yang sudah distabilkan...")

# --- IMPORTANT ADJUSTMENT 2: ADD GRADIENT CLIPPING ---
training_args = TrainingArguments(
    output_dir=HUB_MODEL_ID,
    # Pin the target repository explicitly.  Without it, a later
    # `trainer.push_to_hub()` derives the repo name from the basename of
    # `output_dir` under whichever account is logged in, which may not match
    # the HUB_MODEL_ID URL reported to the user after upload.
    hub_model_id=HUB_MODEL_ID,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    max_grad_norm=1.0,  # gradient-norm clipping: the "safety brake" for stability
    push_to_hub=False,  # no automatic pushes during training; upload is done manually later
    report_to="none",
)

# NOTE(review): the captured cell output shows a warning pointing at this
# Trainer(...) call - presumably the `tokenizer=` deprecation in recent
# transformers releases (replacement: `processing_class=`).  Left unchanged
# because the installed version is not visible here; confirm before switching.
trainer = Trainer(
    model=model, args=training_args, train_dataset=tokenized_train,
    eval_dataset=tokenized_test, tokenizer=tokenizer, compute_metrics=compute_metrics
)
# Authenticate with the Hugging Face Hub (interactive widget) before training.
notebook_login()
trainer.train()
print("\n--- Pelatihan Selesai ---")


Menggunakan TrainingArguments yang sudah distabilkan...


  trainer = Trainer(


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Step,Training Loss
500,0.2198
1000,0.2043
1500,0.1616
2000,0.1515
2500,0.0999
3000,0.1063
3500,0.1127
4000,0.0718
4500,0.0665
5000,0.0666



--- Pelatihan Selesai ---


In [None]:
# --- 8. EVALUATION & MANUAL UPLOAD ---
print("\nMenjalankan evaluasi final pada data test...")
# Evaluates on the Trainer's eval_dataset and reports the compute_metrics scores.
final_evaluation = trainer.evaluate()
print("Hasil Evaluasi Final:", final_evaluation)

# Predictions
# NOTE(review): this runs inference over `tokenized_test` a second time, and
# `y_pred` / `y_true` are not used by any cell visible here - confirm whether a
# later cell needs them before removing.
predictions = trainer.predict(tokenized_test)
y_pred = predictions.predictions # Raw model outputs (no sigmoid/threshold applied)
y_true = predictions.label_ids  # Ground-truth labels

# Note: Accuracy is not a suitable metric for multi-label classification in the same way as multiclass.
# The trainer.evaluate() already computes metrics like F1-score which are more appropriate.
# We will rely on the metrics from trainer.evaluate().

print("\nMengunggah model ke Hub...")
# The string argument is the commit message of the upload.
trainer.push_to_hub("Pelatihan stabil, mengunggah model terakhir.")
print(f"✅ Selesai! Model Anda sekarang tersedia di: https://huggingface.co/{HUB_MODEL_ID}")


Menjalankan evaluasi final pada data test...


Hasil Evaluasi Final: {'eval_loss': 0.06186863034963608, 'eval_accuracy': 0.885699096225412, 'eval_f1_micro': 0.9839539503845958, 'eval_precision_micro': 0.9835030549898167, 'eval_recall_micro': 0.9844052594027113, 'eval_runtime': 13.0894, 'eval_samples_per_second': 143.704, 'eval_steps_per_second': 9.015, 'epoch': 3.0}


In [None]:
# Baseline for comparison: TF-IDF features + one-vs-rest RandomForest.
# This does NOT score the fine-tuned transformer; it trains a separate classical
# model on the same texts/labels so its report can be set against the
# Trainer's evaluation metrics.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split the RAW texts first (same test_size/seed as the transformer split) so
# that the vectorizer never sees the held-out reviews.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit vocabulary and IDF weights on the training texts only; fitting on the
# full corpus before splitting leaks test-set statistics into the features.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

# Fixed seed so the baseline numbers are reproducible between runs.
model_acc = OneVsRestClassifier(RandomForestClassifier(random_state=42))
model_acc = model_acc.fit(X_train, y_train)
pred_acc = model_acc.predict(X_test)

# `target_names` shows the aspect names instead of bare indices 0..6;
# `zero_division=0` keeps the previous numeric result for undefined scores
# while silencing the repeated undefined-metric warnings.
print(classification_report(
    y_test, pred_acc, target_names=label_columns, zero_division=0
))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85       839
           1       0.96      0.99      0.97      1485
           2       0.94      0.98      0.96      1419
           3       0.96      0.99      0.97      1573
           4       0.95      0.99      0.97      1494
           5       0.96      0.99      0.98      1521
           6       0.95      0.98      0.97      1480

   micro avg       0.95      0.97      0.96      9811
   macro avg       0.94      0.96      0.95      9811
weighted avg       0.95      0.97      0.96      9811
 samples avg       0.79      0.81      0.79      9811



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
