# Import Library Dasar

In [16]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Check for a GPU; fall back to the CPU when CUDA is not available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# compute_metrics is defined here so that every later cell can use it
def compute_metrics(eval_pred):
    """Score predictions with accuracy, macro-F1 and weighted-F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``f1_macro`` and ``f1_weighted``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {"accuracy": accuracy_score(references, predicted)}
    for key, averaging in (("f1_macro", "macro"), ("f1_weighted", "weighted")):
        metrics[key] = f1_score(references, predicted, average=averaging)
    return metrics

Device: cpu


# Load Data dan Mapping Label

In [17]:
# Location of the labelled CSV. The original absolute path stays as the default
# so existing runs behave the same; set the VIBE_DATA_PATH environment variable
# to point at the file on another machine without editing the notebook.
data_path = os.environ.get(
    "VIBE_DATA_PATH",
    "C:/Users/Razy31/Documents/minHackathon/projek-analisis-sentimen-fenomena-vibecoding/data/vibe_coding_auditLabel.csv",
)

# The file is semicolon-delimited and read as UTF-8
df = pd.read_csv(data_path, sep=';', encoding='utf-8')
df.head()

Unnamed: 0,video_id,comment_id,parent_id,author,text,like_count,published_at,updated_at,reply_count,tokens,token_len,text_raw,text_trunc,sentiment_pseudo,sentiment_pseudo_id
0,2wwp3dKbGE8,Ugwj1M5bLivVlqF4KkZ4AaABAg,,musfifirah7538,"['bang', 'gimana', 'kalau', 'pemula', 'banget'...",0,2025-11-19T11:44:32Z,2025-11-19T11:44:32Z,0.0,"['bang', 'gimana', 'kalau', 'pemula', 'banget'...",21,bang gimana kalau pemula banget tapi mau belaj...,bang gimana kalau pemula banget tapi mau belaj...,netral,1.0
1,2wwp3dKbGE8,UgzZwHJ7sB8GMMgehFB4AaABAg,,advhe77,"['prinsip2', 'dasar', 'foundamentalnya', 'jgn'...",0,2025-11-18T02:28:17Z,2025-11-18T02:28:17Z,0.0,"['prinsip2', 'dasar', 'foundamentalnya', 'jgn'...",26,prinsip2 dasar foundamentalnya jgn smpe dilupa...,prinsip2 dasar foundamentalnya jgn smpe dilupa...,netral,1.0
2,2wwp3dKbGE8,UgydMEecb1Nh-PRBgq94AaABAg,,zororaka,"['sekedar', 'sharing', 'aja', 'pernah', 'disku...",0,2025-11-17T05:25:58Z,2025-11-17T05:29:49Z,0.0,"['sekedar', 'sharing', 'aja', 'pernah', 'disku...",45,sekedar sharing aja pernah diskusi tech lead b...,sekedar sharing aja pernah diskusi tech lead b...,netral,1.0
3,2wwp3dKbGE8,UgxoF9XGiQczCku-ZqJ4AaABAg,,reafterstudio,"['bukan', 'programer', 'tapi', 'butuh', 'websi...",0,2025-11-14T23:31:39Z,2025-11-14T23:31:39Z,0.0,"['bukan', 'programer', 'tapi', 'butuh', 'websi...",70,bukan programer tapi butuh website sederhana u...,bukan programer tapi butuh website sederhana u...,positif,2.0
4,2wwp3dKbGE8,UgzAbfd4HeGD1h1UxPJ4AaABAg,,kurabasakurata2575,"['biasa', 'ngehandle', 'pekerjaan', 'sendiri',...",0,2025-11-13T00:32:17Z,2025-11-13T00:32:17Z,0.0,"['biasa', 'ngehandle', 'pekerjaan', 'sendiri',...",51,biasa ngehandle pekerjaan sendiri punya gaya k...,biasa ngehandle pekerjaan sendiri punya gaya k...,netral,1.0


In [18]:
# Number of rows per value of the sentiment_pseudo column
df['sentiment_pseudo'].value_counts()

sentiment_pseudo
netral     630
negatif    134
positif    115
Name: count, dtype: int64

In [19]:
# Make sure the required columns are present
assert 'text_raw' in df.columns
assert 'sentiment_pseudo' in df.columns

# Fill missing text BEFORE casting to str. Casting first can turn a missing
# value into the literal string "nan", after which fillna("") has nothing left
# to fill and "nan" would be fed to the model as if it were a real comment.
df['text_raw'] = (
    df['text_raw']
    .fillna("")
    .astype(str)
    .str.replace('\n', ' ', regex=False)
    .str.strip()
)

# Integer class id for each sentiment name
label_map = {
    "negatif": 0,
    "netral": 1,
    "positif": 2
}

# Sentiment names are lower-cased before the lookup; names that are not keys of
# label_map come out as NaN.
df['label'] = df['sentiment_pseudo'].str.lower().map(label_map)
df['label'].value_counts().sort_index()


label
0    134
1    630
2    115
Name: count, dtype: int64

# Helper: Split, Oversampling, Tokenization

In [20]:
# Stratified train/validation split
def make_splits(df, test_size=0.2, seed=42):
    """Split ``df`` into train and validation frames, stratified by label.

    Args:
        df: Frame with ``text_raw`` and ``label`` columns.
        test_size: Passed to ``train_test_split`` as the validation share.
        seed: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        ``(train_df, val_df)``: two new frames holding only the ``text_raw``
        and ``label`` columns.
    """
    texts = df['text_raw'].tolist()
    labels = df['label'].tolist()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=seed,
        stratify=labels,
    )

    def _as_frame(text_values, label_values):
        # Rebuild a two-column frame from the split lists
        return pd.DataFrame({'text_raw': text_values, 'label': label_values})

    return _as_frame(train_texts, train_labels), _as_frame(val_texts, val_labels)

# Build the base train/validation frames with make_splits' defaults
# (test_size=0.2, seed=42) and print the per-class row counts of each split.
train_df_base, val_df_base = make_splits(df)
print("Train label dist:",train_df_base['label'].value_counts().sort_index().to_dict())
print("Val label dist:",val_df_base['label'].value_counts().sort_index().to_dict())

Train label dist: {0: 107, 1: 504, 2: 92}
Val label dist: {0: 27, 1: 126, 2: 23}


In [21]:
# Oversample the minority classes of the training set
def oversampling_minority(df, label_col='label', target='label', target_per_class=250, seed=42):
    """Duplicate rows at random so each class has at least ``target_per_class`` rows.

    Classes that already have ``target_per_class`` rows or more are kept as they
    are (nothing is down-sampled). Extra rows are drawn with replacement from
    the class's own rows; the combined frame is then shuffled and re-indexed
    from zero.

    Args:
        df: Frame that contains ``label_col``.
        label_col: Column with the class label used for grouping.
        target: Unused; kept only so existing calls keep working.
        target_per_class: Minimum number of rows per class in the result.
        seed: Seed for the resampling generator and for the final shuffle.

    Returns:
        A new, shuffled DataFrame. ``df`` itself is not modified.
    """
    if df.empty:
        # No groups to resample; pd.concat would raise on an empty list.
        return df.reset_index(drop=True)

    rng = np.random.default_rng(seed)
    parts = []

    for _, group in df.groupby(label_col):
        parts.append(group)
        shortfall = target_per_class - len(group)
        if shortfall > 0:
            # Draw row *positions*, not index labels. A label lookup returns
            # every row that shares the label, so a frame with a non-unique
            # index would come back with more rows than requested.
            extra_positions = rng.choice(len(group), size=shortfall, replace=True)
            parts.append(group.iloc[extra_positions])

    return (
        pd.concat(parts, axis=0)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )


# Tokenization helper: checkpoint name and the tokenizer loaded from it.
# The same model_name is reused when the classification models are created.
model_name = 'indolem/indobert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(batch):
    """Tokenize the ``text_raw`` entries of ``batch`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        batch: Mapping with a ``text_raw`` entry holding the texts.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = batch['text_raw']
    encoding_options = dict(truncation=True, padding='max_length', max_length=256)
    return tokenizer(texts, **encoding_options)

# WeightedTrainer & Metrics

In [22]:
class WeightedTrainer(Trainer):
    """Hugging Face ``Trainer`` whose loss is cross-entropy with optional class weights."""

    def __init__(self, class_weights=None, *args, **kwargs):
        """Initialise the base ``Trainer`` and remember the class weights.

        Args:
            class_weights: Tensor of per-class weights, or ``None`` for an
                unweighted loss.
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and return cross-entropy between its logits and the labels.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose
                ``.logits``.
            inputs: Batch mapping that includes a ``labels`` entry.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Move the weights to wherever the logits live; None means unweighted
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss = nn.functional.cross_entropy(
            logits.view(-1, self.model.config.num_labels),
            targets.view(-1),
            weight=weight,
        )

        if return_outputs:
            return loss, outputs
        return loss

# Buat `run_experiment` untuk 1 strategi

In [None]:
def run_experiment(
    strategy_name: str,
    base_train_df: pd.DataFrame,
    base_val_df: pd.DataFrame,
    oversample: bool,
    use_class_weight: bool,
    epochs: int = 5,
    batch_size: int = 8,
    lr: float = 2e-5,
    target_per_class: int = 250,
    output_root: str = "../models/experimentsIndobert",
    seed: int = 42,
):
    """Run one imbalance-handling experiment end to end.

    Steps: optionally oversample the training frame, optionally derive class
    weights, fine-tune a fresh model loaded from ``model_name``, evaluate on
    the validation frame, build a classification report, and save the model
    and tokenizer.

    Args:
        strategy_name: Label for the run; also the sub-directory of
            ``output_root`` that receives the outputs.
        base_train_df: Training frame with ``text_raw`` and ``label`` columns.
        base_val_df: Validation frame with the same columns.
        oversample: Oversample minority classes up to ``target_per_class``.
        use_class_weight: Weight the loss by normalised inverse class frequency.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.
        lr: Learning rate.
        target_per_class: Minimum rows per class when oversampling.
        output_root: Parent directory for the run's output directory.
        seed: Seed for oversampling, for model initialisation and for
            ``TrainingArguments``.

    Returns:
        dict with the run configuration, ``eval_metrics``, ``cls_report`` and
        ``output_dir``.
    """
    print(f"\n===== Experiment: {strategy_name} =====")

    # 1) Build the training frame for this strategy
    if oversample:
        train_df = oversampling_minority(
            base_train_df, target_per_class=target_per_class, seed=seed)
        print("Train label dist (oversampled):", train_df['label'].value_counts().sort_index().to_dict())
    else:
        train_df = base_train_df.copy()
        print("Train Label dist (base):", train_df['label'].value_counts().sort_index().to_dict())

    val_df = base_val_df.copy()
    print("Val label dist:", val_df['label'].value_counts().sort_index().to_dict())

    # 2) Class weights (only when requested): inverse frequency, rescaled so
    #    the weights sum to the number of classes present in train_df.
    class_weights_tensor = None
    if use_class_weight:
        label_counts = train_df['label'].value_counts().sort_index()
        counts = label_counts.values.astype(float)
        inv_freq = 1.0 / counts
        class_weights = inv_freq / inv_freq.sum() * len(counts)
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)
        print("Class Weights (0,1,2):", class_weights_tensor.tolist())
    else:
        print("Class Weigths: Not Used")

    # 3) Build the Hugging Face datasets
    train_dfs = Dataset.from_pandas(train_df.reset_index(drop=True))
    val_dfs = Dataset.from_pandas(val_df.reset_index(drop=True))

    train_dfs = train_dfs.map(tokenize_function, batched=True)
    val_dfs = val_dfs.map(tokenize_function, batched=True)

    # The raw text is no longer needed once it has been tokenized
    train_dfs = train_dfs.remove_columns(['text_raw'])
    val_dfs = val_dfs.remove_columns(['text_raw'])

    train_dfs = train_dfs.with_format("torch")
    val_dfs = val_dfs.with_format("torch")

    # 4) Load a fresh model for every experiment so runs are comparable.
    #    The classification head is not part of the checkpoint and is randomly
    #    initialised, so seed torch first; without this, two runs with the
    #    same configuration start from different weights and score differently.
    torch.manual_seed(seed)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
    ).to(device)

    # 5) Minimal TrainingArguments
    exp_output_dir = os.path.join(output_root, strategy_name)
    os.makedirs(exp_output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=exp_output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=lr,
        logging_steps=50,
        save_total_limit=1,
        weight_decay=0.01,
        seed=seed,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dfs,
        eval_dataset=val_dfs,
        compute_metrics=compute_metrics,
        class_weights=class_weights_tensor,
    )

    # 6) Train
    train_result = trainer.train()
    print("Train result:", train_result.metrics)

    # 7) Evaluate
    eval_result = trainer.evaluate()
    print("Eval result:", eval_result)

    # 8) Detailed predictions for the classification report
    pred_output = trainer.predict(val_dfs)
    y_true = pred_output.label_ids
    y_pred = np.argmax(pred_output.predictions, axis=1)

    cls_report = classification_report(
        y_true,
        y_pred,
        # Explicit ids keep the three names aligned even when a class is
        # absent from both y_true and y_pred.
        labels=[0, 1, 2],
        target_names=['negatif', 'netral', 'positif'],
        digits=3,
        output_dict=True,  # keep the report as a dictionary
    )

    # 9) Save model and tokenizer (every run is saved; pick the best later)
    trainer.save_model(exp_output_dir)
    tokenizer.save_pretrained(exp_output_dir)

    return {
        "name": strategy_name,
        "oversample": oversample,
        "use_class_weight": use_class_weight,
        "epochs": epochs,
        "batch_size": batch_size,
        "lr": lr,
        "target_per_class": target_per_class,
        "seed": seed,
        "eval_metrics": eval_result,
        "cls_report": cls_report,
        "output_dir": exp_output_dir,
    }

# Looping semua strategi & ringkasan hasil

In [24]:
# The four strategies to compare: every combination of oversampling on/off
# and class weights on/off.
strategies = [
    dict(name="baseline",              oversample=False, use_class_weight=False),
    dict(name="class_weight_only",     oversample=False, use_class_weight=True),
    dict(name="oversample_only",       oversample=True,  use_class_weight=False),
    dict(name="oversample_and_weight", oversample=True,  use_class_weight=True),
]

# Hyperparameters shared by every strategy; change them globally here
shared_hparams = dict(
    epochs=3,
    batch_size=8,
    lr=2e-5,
    target_per_class=250,
    output_root="models/experiments_indobert",
)

experiments = []
for cfg in strategies:
    result = run_experiment(
        strategy_name=cfg["name"],
        base_train_df=train_df_base,
        base_val_df=val_df_base,
        oversample=cfg["oversample"],
        use_class_weight=cfg["use_class_weight"],
        **shared_hparams,
    )
    experiments.append(result)


===== Experiment: baseline =====
Train Label dist (base): {0: 107, 1: 504, 2: 92}
Val label dist: {0: 27, 1: 126, 2: 23}
Class Weigths: Not Used


Map:   0%|          | 0/703 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.8087
100,0.6774
150,0.6774
200,0.632
250,0.5181


Train result: {'train_runtime': 2360.645, 'train_samples_per_second': 0.893, 'train_steps_per_second': 0.112, 'total_flos': 277453098994176.0, 'train_loss': 0.6621764862176144, 'epoch': 3.0}




Eval result: {'eval_loss': 0.7889192700386047, 'eval_accuracy': 0.6931818181818182, 'eval_f1_macro': 0.407843137254902, 'eval_f1_weighted': 0.6418449197860963, 'eval_runtime': 46.6246, 'eval_samples_per_second': 3.775, 'eval_steps_per_second': 0.472, 'epoch': 3.0}





===== Experiment: class_weight_only =====
Train Label dist (base): {0: 107, 1: 504, 2: 92}
Val label dist: {0: 27, 1: 126, 2: 23}
Class Weights (0,1,2): [1.2629743814468384, 0.26813146471977234, 1.468894124031067]


Map:   0%|          | 0/703 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0516
100,0.9783
150,0.955
200,0.9591
250,0.8074


Train result: {'train_runtime': 1739.0668, 'train_samples_per_second': 1.213, 'train_steps_per_second': 0.152, 'total_flos': 277453098994176.0, 'train_loss': 0.9450688470493663, 'epoch': 3.0}




Eval result: {'eval_loss': 0.9001913070678711, 'eval_accuracy': 0.6136363636363636, 'eval_f1_macro': 0.5223657434981139, 'eval_f1_weighted': 0.6401669196243254, 'eval_runtime': 33.3046, 'eval_samples_per_second': 5.285, 'eval_steps_per_second': 0.661, 'epoch': 3.0}





===== Experiment: oversample_only =====
Train label dist (oversampled): {0: 250, 1: 504, 2: 250}
Val label dist: {0: 27, 1: 126, 2: 23}
Class Weigths: Not Used


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0188
100,0.8527
150,0.8112
200,0.6949
250,0.6106
300,0.4591
350,0.4799


Train result: {'train_runtime': 2515.4715, 'train_samples_per_second': 1.197, 'train_steps_per_second': 0.15, 'total_flos': 396248807098368.0, 'train_loss': 0.6784836274606211, 'epoch': 3.0}




Eval result: {'eval_loss': 0.6938790678977966, 'eval_accuracy': 0.6761363636363636, 'eval_f1_macro': 0.563903494536406, 'eval_f1_weighted': 0.6929990262901655, 'eval_runtime': 32.7647, 'eval_samples_per_second': 5.372, 'eval_steps_per_second': 0.671, 'epoch': 3.0}





===== Experiment: oversample_and_weight =====
Train label dist (oversampled): {0: 250, 1: 504, 2: 250}
Val label dist: {0: 27, 1: 126, 2: 23}
Class Weights (0,1,2): [1.201907753944397, 0.5961844325065613, 1.201907753944397]


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0645
100,0.9535
150,0.9263
200,0.8186
250,0.7305
300,0.5705
350,0.5541


Train result: {'train_runtime': 2469.7946, 'train_samples_per_second': 1.22, 'train_steps_per_second': 0.153, 'total_flos': 396248807098368.0, 'train_loss': 0.776834593878852, 'epoch': 3.0}




Eval result: {'eval_loss': 0.9110493063926697, 'eval_accuracy': 0.5909090909090909, 'eval_f1_macro': 0.5163015708187048, 'eval_f1_weighted': 0.6262060131803122, 'eval_runtime': 33.7217, 'eval_samples_per_second': 5.219, 'eval_steps_per_second': 0.652, 'epoch': 3.0}




# Tabel Perbandingan & Pilih yang Terbaik

In [25]:
# One summary row per experiment: its configuration plus the validation metrics
rows = [
    {
        "name": exp["name"],
        "oversample": exp["oversample"],
        "class_weight": exp["use_class_weight"],
        "epochs": exp["epochs"],
        "batch_size": exp["batch_size"],
        "lr": exp["lr"],
        "eval_loss": exp["eval_metrics"].get("eval_loss", None),
        "accuracy": exp["eval_metrics"].get("eval_accuracy", None),
        "f1_macro": exp["eval_metrics"].get("eval_f1_macro", None),
        "f1_weighted": exp["eval_metrics"].get("eval_f1_weighted", None),
        "output_dir": exp["output_dir"],
    }
    for exp in experiments
]

# Best macro-F1 first
summary_df = pd.DataFrame(rows).sort_values(by="f1_macro", ascending=False)
summary_df

Unnamed: 0,name,oversample,class_weight,epochs,batch_size,lr,eval_loss,accuracy,f1_macro,f1_weighted,output_dir
2,oversample_only,True,False,3,8,2e-05,0.693879,0.676136,0.563903,0.692999,models/experiments_indobert\oversample_only
1,class_weight_only,False,True,3,8,2e-05,0.900191,0.613636,0.522366,0.640167,models/experiments_indobert\class_weight_only
3,oversample_and_weight,True,True,3,8,2e-05,0.911049,0.590909,0.516302,0.626206,models/experiments_indobert\oversample_and_weight
0,baseline,False,False,3,8,2e-05,0.788919,0.693182,0.407843,0.641845,models/experiments_indobert\baseline


In [26]:
# Pick the experiment with the highest validation macro-F1
best_exp = max(
    experiments,
    key=lambda exp: exp["eval_metrics"].get("eval_f1_macro", 0),
)
(best_exp["name"], best_exp["eval_metrics"], best_exp["output_dir"])

('oversample_only',
 {'eval_loss': 0.6938790678977966,
  'eval_accuracy': 0.6761363636363636,
  'eval_f1_macro': 0.563903494536406,
  'eval_f1_weighted': 0.6929990262901655,
  'eval_runtime': 32.7647,
  'eval_samples_per_second': 5.372,
  'eval_steps_per_second': 0.671,
  'epoch': 3.0},
 'models/experiments_indobert\\oversample_only')

# Fine Tune untuk `oversample_only`

In [28]:
def make_tokenize_fn(max_length):
    """Build a batch tokenizer that truncates and pads to ``max_length`` tokens.

    Args:
        max_length: Fixed sequence length passed to the module-level tokenizer.

    Returns:
        A function that takes a batch with a ``text_raw`` entry and returns
        the tokenizer's output for those texts.
    """
    encode_kwargs = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": max_length,
    }

    def tokenize_batch(batch):
        return tokenizer(batch['text_raw'], **encode_kwargs)

    return tokenize_batch

In [29]:
def run_experiment(
    strategy_name: str,
    base_train_df: pd.DataFrame,
    base_val_df: pd.DataFrame,
    epochs: int = 3,
    batch_size: int = 8,
    lr: float = 2e-5,
    target_per_class: int = 250,
    max_length: int = 256,
    output_root: str = "../models/fineTuneIndobert",
    oversample: bool = True,
    use_class_weight: bool = False,
    seed: int = 42,
):
    """Run one fine-tuning experiment with tunable hyperparameters.

    With the defaults this is the ``oversample_only`` recipe: minority classes
    are oversampled and the loss is unweighted. This definition replaces the
    earlier ``run_experiment`` in the kernel, so ``oversample`` and
    ``use_class_weight`` are accepted as well; the earlier strategy loop keeps
    working if it is re-run after this cell.

    Args:
        strategy_name: Label for the run; also the sub-directory of
            ``output_root`` that receives the outputs.
        base_train_df: Training frame with ``text_raw`` and ``label`` columns.
        base_val_df: Validation frame with the same columns.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.
        lr: Learning rate.
        target_per_class: Minimum rows per class when oversampling.
        max_length: Fixed token length used for truncation and padding.
        output_root: Parent directory for the run's output directory.
        oversample: Oversample minority classes (default ``True``).
        use_class_weight: Weight the loss by normalised inverse class
            frequency (default ``False``).
        seed: Seed for oversampling, for model initialisation and for
            ``TrainingArguments``.

    Returns:
        dict with the run configuration, ``eval_metrics``, ``cls_report`` and
        ``output_dir``.
    """
    print(f"\n===== EXP: {strategy_name} =====")
    print(f"epochs={epochs}, lr={lr}, target_per_class={target_per_class}, max_len={max_length}, batch={batch_size}")

    # 1) Build the training frame (oversampled unless switched off)
    if oversample:
        train_df = oversampling_minority(
            base_train_df, target_per_class=target_per_class, seed=seed)
        print("Train label dist (oversampled):", train_df["label"].value_counts().sort_index().to_dict())
    else:
        train_df = base_train_df.copy()
        print("Train label dist (base):", train_df["label"].value_counts().sort_index().to_dict())
    val_df = base_val_df.copy()

    print("Val   label dist:", val_df["label"].value_counts().sort_index().to_dict())

    # 2) Class weights (only when requested): inverse frequency, rescaled so
    #    the weights sum to the number of classes present in train_df.
    class_weights_tensor = None
    if use_class_weight:
        counts = train_df["label"].value_counts().sort_index().values.astype(float)
        inv_freq = 1.0 / counts
        class_weights_tensor = torch.tensor(
            inv_freq / inv_freq.sum() * len(counts), dtype=torch.float32)
        print("Class Weights (0,1,2):", class_weights_tensor.tolist())

    # 3) Hugging Face datasets, tokenized to the requested max_length
    tok_fn = make_tokenize_fn(max_length=max_length)

    train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
    val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True))

    train_ds = train_ds.map(tok_fn, batched=True)
    val_ds   = val_ds.map(tok_fn, batched=True)

    # The raw text is no longer needed once it has been tokenized
    train_ds = train_ds.remove_columns(["text_raw"])
    val_ds   = val_ds.remove_columns(["text_raw"])

    train_ds = train_ds.with_format("torch")
    val_ds   = val_ds.with_format("torch")

    # 4) Fresh model. The classification head is not part of the checkpoint
    #    and is randomly initialised, so seed torch first; without this, two
    #    runs with the same configuration start from different weights, which
    #    makes small metric differences between configurations meaningless.
    torch.manual_seed(seed)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
    ).to(device)

    # 5) Minimal TrainingArguments
    exp_output_dir = os.path.join(output_root, strategy_name)
    os.makedirs(exp_output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=exp_output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=lr,
        logging_steps=50,
        save_total_limit=1,
        weight_decay=0.01,
        seed=seed,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        class_weights=class_weights_tensor,  # None unless use_class_weight is set
    )

    # 6) Train and evaluate
    train_result = trainer.train()
    print("Train metrics:", train_result.metrics)

    eval_metrics = trainer.evaluate()
    print("Eval metrics:", eval_metrics)

    # 7) Classification report, kept for per-class inspection
    pred_output = trainer.predict(val_ds)
    y_true = pred_output.label_ids
    y_pred = np.argmax(pred_output.predictions, axis=1)

    cls_report = classification_report(
        y_true,
        y_pred,
        # Explicit ids keep the three names aligned even when a class is
        # absent from both y_true and y_pred.
        labels=[0, 1, 2],
        target_names=["negatif", "netral", "positif"],
        digits=3,
        output_dict=True,
    )

    # 8) Save model and tokenizer
    trainer.save_model(exp_output_dir)
    tokenizer.save_pretrained(exp_output_dir)

    return {
        "name": strategy_name,
        "oversample": oversample,
        "use_class_weight": use_class_weight,
        "epochs": epochs,
        "batch_size": batch_size,
        "lr": lr,
        "target_per_class": target_per_class,
        "max_length": max_length,
        "seed": seed,
        "eval_metrics": eval_metrics,
        "cls_report": cls_report,
        "output_dir": exp_output_dir,
    }

In [31]:
# Hyperparameter combinations to try, one tuple per run
config_fields = ("epochs", "lr", "target_per_class", "max_length")
search_configs = [
    dict(zip(config_fields, values))
    for values in (
        (3, 2e-5, 250, 256),
        (4, 2e-5, 250, 256),
        (3, 3e-5, 250, 256),
        (4, 3e-5, 250, 256),
        (3, 2e-5, 300, 256),
        (3, 2e-5, 250, 128),
    )
]

experiments_tuned = []

for cfg in search_configs:
    # The run name encodes the configuration and becomes the output sub-directory
    name = f"oversample_only_e{cfg['epochs']}_lr{cfg['lr']}_tar{cfg['target_per_class']}_ml{cfg['max_length']}"
    result = run_experiment(
        strategy_name=name,
        base_train_df=train_df_base,
        base_val_df=val_df_base,
        batch_size=8,
        output_root="../models/fineTuneIndobert",
        **cfg,
    )
    experiments_tuned.append(result)

# One summary row per tuned run: its configuration plus the validation metrics
rows = [
    {
        "name": exp["name"],
        "epochs": exp["epochs"],
        "batch_size": exp["batch_size"],
        "lr": exp["lr"],
        "target_per_class": exp["target_per_class"],
        "max_length": exp["max_length"],
        "eval_loss": exp["eval_metrics"].get("eval_loss", None),
        "accuracy": exp["eval_metrics"].get("eval_accuracy", None),
        "f1_macro": exp["eval_metrics"].get("eval_f1_macro", None),
        "f1_weighted": exp["eval_metrics"].get("eval_f1_weighted", None),
        "output_dir": exp["output_dir"],
    }
    for exp in experiments_tuned
]

# Best macro-F1 first
tuned_summary_df = pd.DataFrame(rows).sort_values(by="f1_macro", ascending=False)
tuned_summary_df


===== EXP: oversample_only_e3_lr2e-05_tar250_ml256 =====
epochs=3, lr=2e-05, target_per_class=250, max_len=256, batch=8
Train label dist (oversampled): {0: 250, 1: 504, 2: 250}
Val   label dist: {0: 27, 1: 126, 2: 23}


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0128
100,0.8418
150,0.7619
200,0.602
250,0.5281
300,0.3825
350,0.4036


Train metrics: {'train_runtime': 2501.1648, 'train_samples_per_second': 1.204, 'train_steps_per_second': 0.151, 'total_flos': 396248807098368.0, 'train_loss': 0.6225620930787747, 'epoch': 3.0}




Eval metrics: {'eval_loss': 0.7761140465736389, 'eval_accuracy': 0.6704545454545454, 'eval_f1_macro': 0.587141874293577, 'eval_f1_weighted': 0.6906305285272003, 'eval_runtime': 32.6132, 'eval_samples_per_second': 5.397, 'eval_steps_per_second': 0.675, 'epoch': 3.0}





===== EXP: oversample_only_e4_lr2e-05_tar250_ml256 =====
epochs=4, lr=2e-05, target_per_class=250, max_len=256, batch=8
Train label dist (oversampled): {0: 250, 1: 504, 2: 250}
Val   label dist: {0: 27, 1: 126, 2: 23}


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0286
100,0.8374
150,0.8205
200,0.6404
250,0.5462
300,0.3925
350,0.4059
400,0.2456
450,0.2182
500,0.2229


Train metrics: {'train_runtime': 3356.3476, 'train_samples_per_second': 1.197, 'train_steps_per_second': 0.15, 'total_flos': 528331742797824.0, 'train_loss': 0.5340192800476438, 'epoch': 4.0}




Eval metrics: {'eval_loss': 0.8879430294036865, 'eval_accuracy': 0.6875, 'eval_f1_macro': 0.5872758535986243, 'eval_f1_weighted': 0.7055277349768875, 'eval_runtime': 32.0165, 'eval_samples_per_second': 5.497, 'eval_steps_per_second': 0.687, 'epoch': 4.0}





===== EXP: oversample_only_e3_lr3e-05_tar250_ml256 =====
epochs=3, lr=3e-05, target_per_class=250, max_len=256, batch=8
Train label dist (oversampled): {0: 250, 1: 504, 2: 250}
Val   label dist: {0: 27, 1: 126, 2: 23}


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0139
100,0.8854
150,0.8081
200,0.6693
250,0.6152
300,0.3922
350,0.4626


Train metrics: {'train_runtime': 2431.1471, 'train_samples_per_second': 1.239, 'train_steps_per_second': 0.155, 'total_flos': 396248807098368.0, 'train_loss': 0.6674688959878589, 'epoch': 3.0}




Eval metrics: {'eval_loss': 0.8034815192222595, 'eval_accuracy': 0.6647727272727273, 'eval_f1_macro': 0.566556319497496, 'eval_f1_weighted': 0.6865786397203509, 'eval_runtime': 31.6012, 'eval_samples_per_second': 5.569, 'eval_steps_per_second': 0.696, 'epoch': 3.0}





===== EXP: oversample_only_e4_lr3e-05_tar250_ml256 =====
epochs=4, lr=3e-05, target_per_class=250, max_len=256, batch=8
Train label dist (oversampled): {0: 250, 1: 504, 2: 250}
Val   label dist: {0: 27, 1: 126, 2: 23}


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0533
100,0.9202
150,0.9314
200,0.8723
250,0.8048
300,0.7326
350,0.7941
400,0.6497
450,0.6136
500,0.6754


Train metrics: {'train_runtime': 3791.8725, 'train_samples_per_second': 1.059, 'train_steps_per_second': 0.133, 'total_flos': 528331742797824.0, 'train_loss': 0.8032643208428035, 'epoch': 4.0}




Eval metrics: {'eval_loss': 1.004328966140747, 'eval_accuracy': 0.6193181818181818, 'eval_f1_macro': 0.4529001970790969, 'eval_f1_weighted': 0.6314760075043033, 'eval_runtime': 44.0918, 'eval_samples_per_second': 3.992, 'eval_steps_per_second': 0.499, 'epoch': 4.0}





===== EXP: oversample_only_e3_lr2e-05_tar300_ml256 =====
epochs=3, lr=2e-05, target_per_class=300, max_len=256, batch=8
Train label dist (oversampled): {0: 300, 1: 504, 2: 300}
Val   label dist: {0: 27, 1: 126, 2: 23}


Map:   0%|          | 0/1104 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0234
100,0.8747
150,0.9166
200,0.7719
250,0.6787
300,0.5871
350,0.4481
400,0.4538


Train metrics: {'train_runtime': 3367.7435, 'train_samples_per_second': 0.983, 'train_steps_per_second': 0.123, 'total_flos': 435715819757568.0, 'train_loss': 0.7114589387092037, 'epoch': 3.0}




Eval metrics: {'eval_loss': 0.7708637714385986, 'eval_accuracy': 0.6363636363636364, 'eval_f1_macro': 0.5598212485051967, 'eval_f1_weighted': 0.6619687484748552, 'eval_runtime': 33.1371, 'eval_samples_per_second': 5.311, 'eval_steps_per_second': 0.664, 'epoch': 3.0}





===== EXP: oversample_only_e3_lr2e-05_tar250_ml128 =====
epochs=3, lr=2e-05, target_per_class=250, max_len=128, batch=8
Train label dist (oversampled): {0: 250, 1: 504, 2: 250}
Val   label dist: {0: 27, 1: 126, 2: 23}


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.0328
100,0.8886
150,0.8438
200,0.7476
250,0.64
300,0.5154
350,0.539


Train metrics: {'train_runtime': 1172.228, 'train_samples_per_second': 2.569, 'train_steps_per_second': 0.322, 'total_flos': 198124403549184.0, 'train_loss': 0.7245212131076388, 'epoch': 3.0}




Eval metrics: {'eval_loss': 0.8796199560165405, 'eval_accuracy': 0.6136363636363636, 'eval_f1_macro': 0.5074818955415971, 'eval_f1_weighted': 0.6433426060291733, 'eval_runtime': 20.6736, 'eval_samples_per_second': 8.513, 'eval_steps_per_second': 1.064, 'epoch': 3.0}




Unnamed: 0,name,epochs,batch_size,lr,target_per_class,max_length,eval_loss,accuracy,f1_macro,f1_weighted,output_dir
1,oversample_only_e4_lr2e-05_tar250_ml256,4,8,2e-05,250,256,0.887943,0.6875,0.587276,0.705528,../models/fineTuneIndobert\oversample_only_e4_...
0,oversample_only_e3_lr2e-05_tar250_ml256,3,8,2e-05,250,256,0.776114,0.670455,0.587142,0.690631,../models/fineTuneIndobert\oversample_only_e3_...
2,oversample_only_e3_lr3e-05_tar250_ml256,3,8,3e-05,250,256,0.803482,0.664773,0.566556,0.686579,../models/fineTuneIndobert\oversample_only_e3_...
4,oversample_only_e3_lr2e-05_tar300_ml256,3,8,2e-05,300,256,0.770864,0.636364,0.559821,0.661969,../models/fineTuneIndobert\oversample_only_e3_...
5,oversample_only_e3_lr2e-05_tar250_ml128,3,8,2e-05,250,128,0.87962,0.613636,0.507482,0.643343,../models/fineTuneIndobert\oversample_only_e3_...
3,oversample_only_e4_lr3e-05_tar250_ml256,4,8,3e-05,250,256,1.004329,0.619318,0.4529,0.631476,../models/fineTuneIndobert\oversample_only_e4_...


In [32]:
# Pick the tuned run with the highest validation macro-F1
best_tuned = max(
    experiments_tuned,
    key=lambda exp: exp["eval_metrics"].get("eval_f1_macro", 0),
)
(best_tuned["name"], best_tuned["eval_metrics"], best_tuned["output_dir"])

('oversample_only_e4_lr2e-05_tar250_ml256',
 {'eval_loss': 0.8879430294036865,
  'eval_accuracy': 0.6875,
  'eval_f1_macro': 0.5872758535986243,
  'eval_f1_weighted': 0.7055277349768875,
  'eval_runtime': 32.0165,
  'eval_samples_per_second': 5.497,
  'eval_steps_per_second': 0.687,
  'epoch': 4.0},
 '../models/fineTuneIndobert\\oversample_only_e4_lr2e-05_tar250_ml256')