In [1]:
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from tqdm.auto import tqdm

# ============================================================
# 1. DEVICE & SEED
# ============================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

torch.manual_seed(42)
np.random.seed(42)

# ============================================================
# 2. LOAD DATA + SPLIT 80:20
# ============================================================
df = pd.read_csv("traindata_final_fixed_rulebased.csv")

label_cols = ["admiration","amusement","gratitude","love","pride","relief","remorse"]

print("Full data shape:", df.shape)
'''
train_df, dev_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)
'''
train_df = pd.read_csv("train.csv")
dev_df   = pd.read_csv("dev.csv")

print("Train shape :", train_df.shape)
print("Dev/Test shape :", dev_df.shape)

# Kalau mau subset train, ubah N_TRAIN. Sekarang pakai semua data train.
N_TRAIN = len(train_df)

train_df_small = train_df.sample(
    n=min(N_TRAIN, len(train_df)),
    random_state=42
).reset_index(drop=True)

dev_df_full = dev_df.reset_index(drop=True)

print("Subset Train shape:", train_df_small.shape)
print("Full Dev/Test shape:", dev_df_full.shape)

X_train_texts = train_df_small["text"].astype(str).tolist()
X_dev_texts   = dev_df_full["text"].astype(str).tolist()

y_train = train_df_small[label_cols].values.astype("float32")
y_dev   = dev_df_full[label_cols].values.astype("float32")

# ============================================================
# 3. TOKENISASI SEDERHANA + VOCAB
# ============================================================
from collections import Counter

def tokenize(text):
    text = text.lower()
    # ambil token alfanumerik sederhana
    tokens = re.findall(r"\b\w+\b", text)
    return tokens

# Bangun vocab dari TRAIN SAJA (jangan dari dev)
counter = Counter()
for txt in X_train_texts:
    counter.update(tokenize(txt))

MAX_VOCAB_SIZE = 30000  # bisa diubah
most_common = counter.most_common(MAX_VOCAB_SIZE)

# 0 = PAD, 1 = UNK
word2idx = {"<PAD>": 0, "<UNK>": 1}
for i, (word, freq) in enumerate(most_common, start=2):
    word2idx[word] = i

idx2word = {idx: word for word, idx in word2idx.items()}

vocab_size = len(word2idx)
print("Vocab size:", vocab_size)

def numericalize(tokens, word2idx):
    return [word2idx.get(tok, word2idx["<UNK>"]) for tok in tokens]

MAX_LEN = 50  # panjang sequence, bisa diubah

def encode_texts(texts, word2idx, max_len):
    all_ids = []
    for txt in texts:
        toks = tokenize(txt)
        ids = numericalize(toks, word2idx)
        # pad / truncate
        if len(ids) < max_len:
            ids = ids + [word2idx["<PAD>"]] * (max_len - len(ids))
        else:
            ids = ids[:max_len]
        all_ids.append(ids)
    return np.array(all_ids, dtype="int64")

print("Encoding train texts...")
X_train_ids = encode_texts(X_train_texts, word2idx, MAX_LEN)
print("Encoding dev texts...")
X_dev_ids   = encode_texts(X_dev_texts,   word2idx, MAX_LEN)

# ============================================================
# 4. DATASET & DATALOADER
# ============================================================
X_train_tensor = torch.tensor(X_train_ids, dtype=torch.long)
y_train_tensor = torch.tensor(y_train,     dtype=torch.float32)

X_dev_tensor   = torch.tensor(X_dev_ids,   dtype=torch.long)
y_dev_tensor   = torch.tensor(y_dev,       dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
dev_dataset   = TensorDataset(X_dev_tensor,   y_dev_tensor)

batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader   = DataLoader(dev_dataset,   batch_size=128, shuffle=False)

print("Train batches:", len(train_loader))
print("Dev batches:", len(dev_loader))

# ============================================================
# 5. MODEL TEXTCNN
# ============================================================
class TextCNN(nn.Module):
    def __init__(self,
                 vocab_size,
                 embed_dim,
                 num_labels,
                 kernel_sizes=(3, 4, 5),
                 num_filters=100,
                 dropout=0.5,
                 pad_idx=0):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=pad_idx
        )

        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embed_dim,
                out_channels=num_filters,
                kernel_size=k
            )
            for k in kernel_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_labels)

    def forward(self, x):
        # x: [batch, seq_len]
        embedded = self.embedding(x)          # [B, L, E]
        embedded = embedded.transpose(1, 2)   # [B, E, L]

        conv_outs = [torch.relu(conv(embedded)) for conv in self.convs]
        # conv_out: [B, num_filters, L_out]

        pooled = [torch.max(co, dim=2)[0] for co in conv_outs]
        # pooled: [B, num_filters]

        cat = torch.cat(pooled, dim=1)        # [B, num_filters * len(kernel_sizes)]
        cat = self.dropout(cat)
        logits = self.fc(cat)                 # [B, num_labels]
        return logits

num_labels = len(label_cols)

embed_dim   = 100
kernel_sizes = (3, 4, 5)
num_filters = 100
dropout     = 0.5

model = TextCNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_labels=num_labels,
    kernel_sizes=kernel_sizes,
    num_filters=num_filters,
    dropout=dropout,
    pad_idx=word2idx["<PAD>"]
).to(device)

optimizer = Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

print(model)

# ============================================================
# 6. EVALUATION FUNCTION
# ============================================================
def evaluate(threshold=0.5):
    model.eval()
    all_logits = []
    all_labels = []

    with torch.no_grad():
        for ids, labels in dev_loader:
            ids    = ids.to(device)
            labels = labels.to(device)

            logits = model(ids)

            all_logits.append(logits.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_logits = np.concatenate(all_logits, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    probs = 1 / (1 + np.exp(-all_logits))  # sigmoid
    preds = (probs >= threshold).astype(int)

    micro = f1_score(all_labels, preds, average="micro")
    macro = f1_score(all_labels, preds, average="macro")

    return micro, macro, probs, preds, all_labels

# ============================================================
# 7. TRAINING LOOP + SIMPAN BEST MODEL (BERDASARKAN MICRO-F1 DEV)
# ============================================================
epochs = 15

best_micro = -1.0
best_state = None

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0.0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}")
    for ids, labels in pbar:
        ids    = ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(ids)
        loss   = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * ids.size(0)
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    train_loss = total_loss / len(train_loader.dataset)
    dev_micro, dev_macro, _, _, _ = evaluate(threshold=0.5)

    print(f"\nEpoch {epoch} done | train_loss={train_loss:.4f} | "
          f"dev_micro={dev_micro:.4f} | dev_macro={dev_macro:.4f}")

    if dev_micro > best_micro:
        best_micro = dev_micro
        best_state = model.state_dict().copy()
        print(f"  → New best model (dev Micro-F1={best_micro:.4f})")

# restore best model
if best_state is not None:
    model.load_state_dict(best_state)
    print("\nBest model restored!")

# ============================================================
# 8. CARI THRESHOLD GLOBAL TERBAIK DI FULL DEV
# ============================================================
print("\nSearching best global threshold on FULL dev...")

best_t = 0.5
best_t_micro = -1.0

for t in np.arange(0.1, 0.9, 0.05):
    micro_t, _, _, _, _ = evaluate(threshold=t)
    print(f"t={t:.2f} → Dev Micro-F1 = {micro_t:.4f}")
    if micro_t > best_t_micro:
        best_t_micro = micro_t
        best_t = t

print(f"\nBEST threshold: {best_t:.2f}")
print(f"BEST dev Micro-F1: {best_t_micro:.4f}")

# ============================================================
# 9. FINAL EVAL + PER-LABEL F1 DI FULL DEV
# ============================================================
final_micro, final_macro, final_probs, final_preds, final_true = evaluate(threshold=best_t)

print("\n========================")
print(" FINAL DEV PERFORMANCE  ")
print("========================")
print(f"Micro-F1 (t={best_t:.2f}): {final_micro:.2f}")
print(f"Macro-F1 (t={best_t:.2f}): {final_macro:.2f}\n")

for i, col in enumerate(label_cols):
    f1 = f1_score(final_true[:, i], final_preds[:, i])
    print(f"{col.capitalize()} F1: {f1:.2f}")

# ============================================================
# 10. SAVE dev_predictions_textcnn.csv
# ============================================================
output_df = dev_df_full.copy()
for i, col in enumerate(label_cols):
    output_df[col] = final_preds[:, i]

output_df.to_csv("dev_predictions_textcnn.csv", index=False)
print("\nSaved dev_predictions_textcnn.csv!")


Using device: cuda
Full data shape: (16386, 8)
Train shape : (25196, 8)
Dev/Test shape : (3149, 8)
Subset Train shape: (25196, 8)
Full Dev/Test shape: (3149, 8)
Vocab size: 20278
Encoding train texts...
Encoding dev texts...
Train batches: 394
Dev batches: 25
TextCNN(
  (embedding): Embedding(20278, 100, padding_idx=0)
  (convs): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=7, bias=True)
)


Epoch 1/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 1 done | train_loss=0.1752 | dev_micro=0.6986 | dev_macro=0.5085
  → New best model (dev Micro-F1=0.6986)


Epoch 2/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 2 done | train_loss=0.1156 | dev_micro=0.7707 | dev_macro=0.5622
  → New best model (dev Micro-F1=0.7707)


Epoch 3/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 3 done | train_loss=0.0985 | dev_micro=0.7725 | dev_macro=0.5743
  → New best model (dev Micro-F1=0.7725)


Epoch 4/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 4 done | train_loss=0.0878 | dev_micro=0.7896 | dev_macro=0.5927
  → New best model (dev Micro-F1=0.7896)


Epoch 5/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 5 done | train_loss=0.0791 | dev_micro=0.7906 | dev_macro=0.5896
  → New best model (dev Micro-F1=0.7906)


Epoch 6/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 6 done | train_loss=0.0716 | dev_micro=0.8020 | dev_macro=0.5955
  → New best model (dev Micro-F1=0.8020)


Epoch 7/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 7 done | train_loss=0.0645 | dev_micro=0.7984 | dev_macro=0.6269


Epoch 8/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 8 done | train_loss=0.0577 | dev_micro=0.7966 | dev_macro=0.6357


Epoch 9/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 9 done | train_loss=0.0527 | dev_micro=0.7858 | dev_macro=0.6134


Epoch 10/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 10 done | train_loss=0.0473 | dev_micro=0.7747 | dev_macro=0.6091


Epoch 11/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 11 done | train_loss=0.0432 | dev_micro=0.7893 | dev_macro=0.6142


Epoch 12/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 12 done | train_loss=0.0377 | dev_micro=0.7889 | dev_macro=0.6037


Epoch 13/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 13 done | train_loss=0.0353 | dev_micro=0.7838 | dev_macro=0.6179


Epoch 14/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 14 done | train_loss=0.0329 | dev_micro=0.7816 | dev_macro=0.5976


Epoch 15/15:   0%|          | 0/394 [00:00<?, ?it/s]


Epoch 15 done | train_loss=0.0303 | dev_micro=0.7742 | dev_macro=0.5942

Best model restored!

Searching best global threshold on FULL dev...
t=0.10 → Dev Micro-F1 = 0.7776
t=0.15 → Dev Micro-F1 = 0.7873
t=0.20 → Dev Micro-F1 = 0.7917
t=0.25 → Dev Micro-F1 = 0.7895
t=0.30 → Dev Micro-F1 = 0.7858
t=0.35 → Dev Micro-F1 = 0.7841
t=0.40 → Dev Micro-F1 = 0.7808
t=0.45 → Dev Micro-F1 = 0.7808
t=0.50 → Dev Micro-F1 = 0.7742
t=0.55 → Dev Micro-F1 = 0.7719
t=0.60 → Dev Micro-F1 = 0.7692
t=0.65 → Dev Micro-F1 = 0.7645
t=0.70 → Dev Micro-F1 = 0.7604
t=0.75 → Dev Micro-F1 = 0.7583
t=0.80 → Dev Micro-F1 = 0.7519
t=0.85 → Dev Micro-F1 = 0.7424

BEST threshold: 0.20
BEST dev Micro-F1: 0.7917

 FINAL DEV PERFORMANCE  
Micro-F1 (t=0.20): 0.79
Macro-F1 (t=0.20): 0.64

Admiration F1: 0.72
Amusement F1: 0.79
Gratitude F1: 0.92
Love F1: 0.80
Pride F1: 0.22
Relief F1: 0.17
Remorse F1: 0.82

Saved dev_predictions_textcnn.csv!
