In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam, AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from tqdm import tqdm
import re
import os

# --- KONFIGURASI UMUM ---
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
SEED = 42
BATCH_SIZE = 16
LSTM_EPOCHS = 5
BERT_EPOCHS = 3
MAX_LEN = 64

# Reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)

print(f"[System] Using device: {DEVICE}")

# ==========================================
# 1. DATA PREPARATION UTILS
# ==========================================
class DataHandler:
    def __init__(self, filepath):
        self.filepath = filepath
        self.df = None
    
    def load_and_clean(self):
        # Load Data
        try:
            self.df = pd.read_csv(self.filepath)
        except:
            self.df = pd.read_csv('archive (8).zip/sentimentdataset.csv')
            
        # Clean Whitespace
        for col in self.df.select_dtypes(include=['object']).columns:
            self.df[col] = self.df[col].str.strip()
            
        # Normalize Labels
        def map_label(txt):
            txt = str(txt).lower()
            if any(x in txt for x in ['positive', 'happy', 'joy', 'love', 'excitement']):
                return 2 # Positive
            elif any(x in txt for x in ['negative', 'sad', 'anger', 'hate', 'fear']):
                return 0 # Negative
            return 1 # Neutral
            
        self.df['label'] = self.df['Sentiment'].apply(map_label)
        
        # Simple text cleaning for LSTM
        def clean_text(text):
            text = str(text).lower()
            text = re.sub(r'[^a-z0-9\s]', '', text) # Hapus simbol aneh
            return text
            
        self.df['clean_text'] = self.df['Text'].apply(clean_text)
        
        return self.df

# Load Data Sekali Saja
handler = DataHandler('sentimentdataset.csv')
df = handler.load_and_clean()

# Split Data
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=SEED, stratify=df['label']
)

print(f"[Data] Train size: {len(X_train_raw)} | Test size: {len(X_test_raw)}")

# ==========================================
# 2. MODEL A: LSTM (Long Short-Term Memory)
# ==========================================
print("\n" + "="*30)
print("BUILDING MODEL A: LSTM")
print("="*30)

# A. Build Vocabulary (Manual Tokenization)
class Vocabulary:
    def __init__(self, texts, min_freq=2):
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.min_freq = min_freq
        self.build_vocab(texts)
        
    def build_vocab(self, texts):
        counter = Counter()
        for text in texts:
            counter.update(text.split())
            
        idx = 2
        for word, freq in counter.items():
            if freq >= self.min_freq:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                idx += 1
                
    def encode(self, text, max_len):
        tokens = text.split()
        indices = [self.word2idx.get(w, 1) for w in tokens] # 1 is UNK
        
        # Padding / Truncating
        if len(indices) < max_len:
            indices += [0] * (max_len - len(indices)) # 0 is PAD
        else:
            indices = indices[:max_len]
            
        return torch.tensor(indices, dtype=torch.long)

    def __len__(self):
        return len(self.word2idx)

# Init Vocab based on Train Data
vocab = Vocabulary(X_train_raw)
print(f"[LSTM] Vocabulary size: {len(vocab)} unique words")

# B. LSTM Dataset
class LSTMDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.vocab = vocab
        self.max_len = max_len
        
    def __getitem__(self, idx):
        x = self.vocab.encode(self.texts[idx], self.max_len)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y
        
    def __len__(self):
        return len(self.texts)

# C. LSTM Architecture
class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128, output_dim=3):
        super(SimpleLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # x shape: [batch_size, max_len]
        embedded = self.embedding(x) 
        # embedded shape: [batch_size, max_len, embed_dim]
        
        # LSTM Output
        # output: [batch, len, hidden], (hidden_state, cell_state)
        _, (hidden, _) = self.lstm(embedded)
        
        # Ambil hidden state terakhir dari layer terakhir
        # hidden shape: [1, batch_size, hidden_dim] -> squeeze -> [batch_size, hidden_dim]
        last_hidden = hidden[-1]
        
        logits = self.fc(last_hidden)
        return logits

# D. Setup LSTM Training
train_ds_lstm = LSTMDataset(X_train_raw, y_train, vocab, MAX_LEN)
test_ds_lstm = LSTMDataset(X_test_raw, y_test, vocab, MAX_LEN)

train_loader_lstm = DataLoader(train_ds_lstm, batch_size=BATCH_SIZE, shuffle=True)
test_loader_lstm = DataLoader(test_ds_lstm, batch_size=BATCH_SIZE, shuffle=False)

model_lstm = SimpleLSTM(vocab_size=len(vocab)).to(DEVICE)
optim_lstm = Adam(model_lstm.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# ==========================================
# 3. MODEL B: BERT (Transformer)
# ==========================================
print("\n" + "="*30)
print("BUILDING MODEL B: BERT")
print("="*30)

# A. Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# B. BERT Dataset
class BERTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        
    def __len__(self):
        return len(self.texts)

# C. Setup BERT Training
train_ds_bert = BERTDataset(X_train_raw, y_train, tokenizer, MAX_LEN)
test_ds_bert = BERTDataset(X_test_raw, y_test, tokenizer, MAX_LEN)

train_loader_bert = DataLoader(train_ds_bert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_bert = DataLoader(test_ds_bert, batch_size=BATCH_SIZE, shuffle=False)

# Menggunakan BERT Base (lebih powerful dari DistilBERT, tapi lebih berat)
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model_bert.to(DEVICE)
optim_bert = AdamW(model_bert.parameters(), lr=2e-5)

# ==========================================
# 4. TRAINING ENGINE (Unified)
# ==========================================
def train_engine(model_name, model, loader, optimizer, epochs, is_bert=False):
    print(f"\n[Training] Starting {model_name} for {epochs} epochs...")
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        # Progress bar
        loop = tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        
        for batch in loop:
            optimizer.zero_grad()
            
            if is_bert:
                # BERT Input
                input_ids = batch['input_ids'].to(DEVICE)
                mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)
                outputs = model(input_ids, attention_mask=mask, labels=labels)
                loss = outputs.loss
            else:
                # LSTM Input
                x, y = batch
                x, y = x.to(DEVICE), y.to(DEVICE)
                preds = model(x)
                loss = criterion(preds, y)
                
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())
            
        print(f"Epoch {epoch+1} | Avg Loss: {total_loss/len(loader):.4f}")

def evaluate_engine(model_name, model, loader, is_bert=False):
    model.eval()
    preds_list, labels_list = [], []
    
    with torch.no_grad():
        for batch in loader:
            if is_bert:
                input_ids = batch['input_ids'].to(DEVICE)
                mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)
                outputs = model(input_ids, attention_mask=mask)
                preds = torch.argmax(outputs.logits, dim=1)
            else:
                x, y = batch
                x, y = x.to(DEVICE), y.to(DEVICE)
                logits = model(x)
                preds = torch.argmax(logits, dim=1)
                labels = y
            
            preds_list.extend(preds.cpu().numpy())
            labels_list.extend(labels.cpu().numpy())
            
    acc = accuracy_score(labels_list, preds_list)
    print(f"\n[{model_name}] Validation Accuracy: {acc*100:.2f}%")
    return acc

# ==========================================
# 5. EXECUTION
# ==========================================

# --- RUN LSTM ---
train_engine("LSTM", model_lstm, train_loader_lstm, optim_lstm, LSTM_EPOCHS, is_bert=False)
acc_lstm = evaluate_engine("LSTM", model_lstm, test_loader_lstm, is_bert=False)

# --- RUN BERT ---
train_engine("BERT", model_bert, train_loader_bert, optim_bert, BERT_EPOCHS, is_bert=True)
acc_bert = evaluate_engine("BERT", model_bert, test_loader_bert, is_bert=True)

# ==========================================
# 6. FINAL COMPARISON
# ==========================================
print("\n" + "="*30)
print("FINAL RESULT COMPARISON")
print("="*30)
print(f"1. LSTM Accuracy : {acc_lstm*100:.2f}%")
print(f"2. BERT Accuracy : {acc_bert*100:.2f}%")

if acc_bert > acc_lstm:
    diff = acc_bert - acc_lstm
    print(f"\nConclusion: BERT outperformed LSTM by {diff*100:.2f}%.")
    print("Reason: BERT uses 'Attention Mechanism' to understand context better than LSTM's sequential processing.")
else:
    print("\nConclusion: LSTM performed comparably to BERT.")

[System] Using device: cpu
[Data] Train size: 585 | Test size: 147

BUILDING MODEL A: LSTM
[LSTM] Vocabulary size: 908 unique words

BUILDING MODEL B: BERT


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Training] Starting LSTM for 5 epochs...


                                                                                                                       

Epoch 1 | Avg Loss: 0.7792


                                                                                                                       

Epoch 2 | Avg Loss: 0.6634


                                                                                                                       

Epoch 3 | Avg Loss: 0.6652


                                                                                                                       

Epoch 4 | Avg Loss: 0.6647


                                                                                                                       

Epoch 5 | Avg Loss: 0.6651

[LSTM] Validation Accuracy: 75.51%

[Training] Starting BERT for 3 epochs...


                                                                                                                       

Epoch 1 | Avg Loss: 0.6726


                                                                                                                       

Epoch 2 | Avg Loss: 0.4697


                                                                                                                       

Epoch 3 | Avg Loss: 0.3608

[BERT] Validation Accuracy: 79.59%

FINAL RESULT COMPARISON
1. LSTM Accuracy : 75.51%
2. BERT Accuracy : 79.59%

Conclusion: BERT outperformed LSTM by 4.08%.
Reason: BERT uses 'Attention Mechanism' to understand context better than LSTM's sequential processing.


In [5]:
import pandas as pd

# Data hasil training kamu
results = {
    'Model Architecture': ['Custom LSTM', 'DistilBERT'],
    'Accuracy': ['75.51%', '79.59%'],
    'Difference': ['-', '+ 4.08%'],
    'Mechanism': ['Sequential (RNN)', 'Self-Attention (Transformer)']
}

# Buat DataFrame
comparison_table = pd.DataFrame(results)

# Tampilkan dengan style highlight
print("üèÜ DEEP LEARNING BENCHMARK RESULTS üèÜ")
display(comparison_table.style.applymap(lambda x: 'background-color: lightgreen' if '79.59%' in str(x) else '', subset=['Accuracy']))

üèÜ DEEP LEARNING BENCHMARK RESULTS üèÜ


  display(comparison_table.style.applymap(lambda x: 'background-color: lightgreen' if '79.59%' in str(x) else '', subset=['Accuracy']))


Unnamed: 0,Model Architecture,Accuracy,Difference,Mechanism
0,Custom LSTM,75.51%,-,Sequential (RNN)
1,DistilBERT,79.59%,+ 4.08%,Self-Attention (Transformer)


In [7]:
import os
import torch
import pickle

# Konfigurasi Path Penyimpanan
BASE_DIR = './saved_models'
BERT_PATH = os.path.join(BASE_DIR, 'distilbert_v1')
LSTM_PATH = os.path.join(BASE_DIR, 'custom_lstm_v1')

# Membuat folder jika belum ada
os.makedirs(BERT_PATH, exist_ok=True)
os.makedirs(LSTM_PATH, exist_ok=True)

print(f"[System] Initiating save process to '{BASE_DIR}'...")

# ---------------------------------------------------------
# 1. MENYIMPAN BERT (Model + Tokenizer)
# ---------------------------------------------------------
if 'model_bert' in globals() and 'tokenizer' in globals():
    print(f"[BERT] Saving model artifacts to {BERT_PATH}...")
    
    # Menyimpan konfigurasi dan bobot model
    model_bert.save_pretrained(BERT_PATH)
    
    # Menyimpan tokenizer (vocab.txt, config, dll)
    tokenizer.save_pretrained(BERT_PATH)
    
    print("[BERT] Successfully saved.")
else:
    print("[BERT] Model object not found in memory. Skipping.")

# ---------------------------------------------------------
# 2. MENYIMPAN LSTM (Weights + Vocabulary + Config)
# ---------------------------------------------------------
if 'model_lstm' in globals() and 'vocab' in globals():
    print(f"[LSTM] Saving model artifacts to {LSTM_PATH}...")
    
    # a. Simpan Bobot Model (State Dict)
    weights_file = os.path.join(LSTM_PATH, 'weights.pth')
    torch.save(model_lstm.state_dict(), weights_file)
    
    # b. Simpan Vocabulary (Penting untuk mapping kata -> angka)
    vocab_file = os.path.join(LSTM_PATH, 'vocab.pkl')
    with open(vocab_file, 'wb') as f:
        pickle.dump(vocab, f)
        
    # c. Simpan Hyperparameters (Agar saat load tahu dimensinya)
    config = {
        'vocab_size': len(vocab),
        'embed_dim': 100,   # Sesuaikan dengan config training
        'hidden_dim': 128,  # Sesuaikan dengan config training
        'output_dim': 3
    }
    config_file = os.path.join(LSTM_PATH, 'config.pkl')
    with open(config_file, 'wb') as f:
        pickle.dump(config, f)
        
    print("[LSTM] Successfully saved.")
else:
    print("[LSTM] Model object not found in memory. Skipping.")

print("[System] All processes completed.")

[System] Initiating save process to './saved_models'...
[BERT] Saving model artifacts to ./saved_models\distilbert_v1...
[BERT] Successfully saved.
[LSTM] Saving model artifacts to ./saved_models\custom_lstm_v1...
[LSTM] Successfully saved.
[System] All processes completed.
