In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('punkt')
import time
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

import torch
import torch.nn.functional as F

from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     /home/basilmusyaffa19/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from datasets import load_dataset
dataset = load_dataset("nyu-mll/glue", "cola")

training_data = dataset["train"]
validation_data = dataset["validation"]

In [3]:
df_train = pd.DataFrame(training_data)
df_test = pd.DataFrame(validation_data) # Sebagai data test, karena memiliki label

In [4]:
df_train = df_train.drop('idx', axis=1)
df_test = df_test.drop('idx', axis=1)

Pre-processing

In [5]:
def cleaning_text(text):
    text = text.lower()
    text = ' '.join(text.split())
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'([.,!?])', r'\1 ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [6]:
df_train['sentence']=df_train['sentence'].apply(cleaning_text)
df_test['sentence']=df_test['sentence'].apply(cleaning_text)

In [7]:
df_train

Unnamed: 0,sentence,label
0,"our friends won't buy this analysis, let alone...",1
1,one more pseudo generalization and i'm giving ...,1
2,one more pseudo generalization or i'm giving up.,1
3,"the more we study verbs, the crazier they get.",1
4,day by day the facts are getting murkier.,1
...,...,...
8546,poseidon appears to own a dragon,0
8547,digitize is my happiest memory,0
8548,it is easy to slay the gorgon.,1
8549,i had the strangest feeling that i knew you.,1


In [8]:
print("Jumlah data sebelum:", len(df_train))

# 1. Menghapus data kosong dan NaN sekaligus
df_train = df_train.replace('', np.nan).dropna(subset=['sentence'])
# 2. Menghapus data yang memiliki tipe float -> Teksnya hanya angka
df_train = df_train[~df_train['sentence'].apply(lambda x: isinstance(x, float))]
# 3. Menghapus duplikat, dengan mempertahankan data pertama
df_train = df_train.drop_duplicates(subset=['sentence'], keep='first')

print("Jumlah data setelah:", len(df_train))

Jumlah data sebelum: 8551
Jumlah data setelah: 8530


In [9]:
print("Jumlah data sebelum:", len(df_test))

# 1. Menghapus data kosong dan NaN sekaligus
df_test = df_test.replace('', np.nan).dropna(subset=['sentence'])
# 2. Menghapus data yang memiliki tipe float -> Teksnya hanya angka
df_test = df_test[~df_test['sentence'].apply(lambda x: isinstance(x, float))]
# 3. Menghapus duplikat, dengan mempertahankan data pertama
df_test = df_test.drop_duplicates(subset=['sentence'], keep='first')

print("Jumlah data setelah:", len(df_test))

Jumlah data sebelum: 1043
Jumlah data setelah: 1039


In [10]:
label = df_train['label'].value_counts()
print(label)

label
1    6012
0    2518
Name: count, dtype: int64


In [11]:
max_length = int(df_train['sentence'].str.len().max())
max_length

232

In [12]:
def prepare_data_loader(df, tokenizer, max_length, val_size, batch_size, random_state=42, use_weighted_sampler=True):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    encoded_data = tokenizer(
        df['sentence'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    
    # Split data
    input_ids = encoded_data['input_ids'].numpy()  
    attention_mask = encoded_data['attention_mask'].numpy()
    labels = df['label'].values  
    
    # Train test split
    train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
        input_ids, attention_mask, labels,
        test_size=val_size,
        random_state=random_state,
        stratify=labels
    )
    
    # Convert back to tensors after split
    train_inputs = torch.tensor(train_inputs)
    val_inputs = torch.tensor(val_inputs)
    train_masks = torch.tensor(train_masks)
    val_masks = torch.tensor(val_masks)
    train_labels = torch.tensor(train_labels, dtype=torch.long)
    val_labels = torch.tensor(val_labels, dtype=torch.long)
    
    # Create dataset
    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    
    # Create dataloader
    if use_weighted_sampler:
        # Hitung sample weights
        sample_weights = compute_sample_weights(train_labels)
        
        # Gunakan Weighted Random Sampler
        train_sampler = WeightedRandomSampler(
            weights=sample_weights, 
            num_samples=len(train_dataset), 
            replacement=True
        )
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
    else:
        # Jika tidak menggunakan weighted sampler
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Validation loader tetap sama
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader

In [13]:
def prepare_data_loader(df, tokenizer, max_length, val_size, batch_size, random_state=42):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    encoded_data = tokenizer(
        df['sentence'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    
    # Split data
    input_ids = encoded_data['input_ids'].numpy()  
    attention_mask = encoded_data['attention_mask'].numpy()
    labels = df['label'].values  
    
    # Train test split
    train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
        input_ids, attention_mask, labels,
        test_size=val_size,
        random_state=random_state,
        stratify=labels
    )
    
    # Convert back to tensors after split
    train_inputs = torch.tensor(train_inputs)
    val_inputs = torch.tensor(val_inputs)
    train_masks = torch.tensor(train_masks)
    val_masks = torch.tensor(val_masks)
    train_labels = torch.tensor(train_labels, dtype=torch.long)
    val_labels = torch.tensor(val_labels, dtype=torch.long)
    
    # Create dataset
    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    
    # Hitung sample weights untuk train_labels
    class_counts = torch.bincount(train_labels)
    sample_weights = 1.0 / class_counts[train_labels].float()
    sample_weights = sample_weights / sample_weights.sum()
    
    train_sampler = WeightedRandomSampler(
        weights=sample_weights, 
        num_samples=len(train_dataset),
        replacement=True
    )
    
    # Create dataloader
    train_loader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        sampler=train_sampler
    )
    val_loader = DataLoader(
        val_dataset, 
        batch_size=batch_size, 
        shuffle=False
    )
    
    return train_loader, val_loader

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [15]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
base_model = RobertaModel.from_pretrained('roberta-large')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
class ClassificationModel(nn.Module):
    def __init__(self, base_model, num_labels, class_weights):
        super(ClassificationModel, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(0.2)  # 0.2 for RoBERTa, 0.3 for GPT2
        self.classifier = nn.Linear(self.base_model.config.hidden_size, num_labels)
        self.class_weights = class_weights

        self.attention_layer = nn.Sequential(
            nn.Linear(self.base_model.config.hidden_size, self.base_model.config.hidden_size),
            nn.Tanh(),
            nn.Linear(self.base_model.config.hidden_size, 1),
            nn.Softmax(dim=1)
        )

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )

        hidden_states = outputs.last_hidden_state
        # Calculate attention scores using the complex attention layer
        attention_scores = self.attention_layer(hidden_states)
        
        # Handle attention mask
        if attention_mask is not None:
            # Expand attention mask to match attention scores shape
            attention_mask = attention_mask.unsqueeze(-1)
            # Apply mask by multiplication
            attention_scores = attention_scores * attention_mask
            # Normalize scores after masking
            attention_scores = attention_scores / (attention_scores.sum(dim=1, keepdim=True) + 1e-9)

        # Apply attention pooling
        pooled_output = torch.sum(hidden_states * attention_scores, dim=1)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            weights = self.class_weights.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=weights)
            loss = loss_fct(logits, labels)

        return loss, logits

In [17]:
class EarlyStopping:
    def __init__(self, patience, min_delta, verbose=True):
        self.patience = patience
        self.min_delta = min_delta
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        
    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0

In [18]:
def plot_learning_curves(train_losses, val_losses, train_f1_scores, val_f1_scores, train_mcc_scores, val_mcc_scores):
    plt.figure(figsize=(12, 5))
    
    # Plot Loss
    plt.subplot(1, 3, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Learning Curves - Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    # Plot F1 Score
    plt.subplot(1, 3, 2)
    plt.plot(train_f1_scores, label='Training F1 Score')
    plt.plot(val_f1_scores, label='Validation F1 Score')
    plt.title('Learning Curves - F1 Score')
    plt.xlabel('Epoch')
    plt.ylabel('F1 Score')
    plt.legend()
    
    # Plot MCC Score
    plt.subplot(1, 3, 3)
    plt.plot(train_mcc_scores, label='Training MCC Score')
    plt.plot(val_mcc_scores, label='Validation MCC Score')
    plt.title('Learning Curves - MCC Score')
    plt.xlabel('Epoch')
    plt.ylabel('MCC Score')
    plt.legend()
    
    
    plt.tight_layout()
    plt.savefig('learning_curves.png')
    plt.close()

In [19]:
def train_model(model, train_loader, val_loader, optimizer, device, epochs, save_path, patience, min_delta):
    model = model.to(device)
    best_val_loss = float('inf')
    best_val_f1 = 0
    best_epoch = -1
    
    train_losses = []
    val_losses = []
    train_f1_scores = []
    val_f1_scores = []
    train_mcc_scores = []
    val_mcc_scores = []
    
    total_training_steps = len(train_loader) * epochs
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=total_training_steps
    )
    
    early_stopping = EarlyStopping(patience=patience, min_delta=min_delta)
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        start_time = time.time()
        total_train_loss = 0
        train_preds = []
        train_true_labels = []
        
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            optimizer.zero_grad()
            # Perhatikan perubahan di sini
            loss, logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss.backward()
            optimizer.step()
            scheduler.step() 
            
            total_train_loss += loss.item()
            
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            train_preds.extend(preds)
            train_true_labels.extend(labels.cpu().numpy())
        
        avg_train_loss = total_train_loss / len(train_loader)
        train_f1 = f1_score(train_true_labels, train_preds, average='weighted')
        train_mcc = matthews_corrcoef(train_true_labels, train_preds)
        
        # Validation phase
        model.eval()
        total_val_loss = 0
        val_preds = []
        val_true_labels = []
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                
                loss, logits = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                total_val_loss += loss.item()
                
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_true_labels.extend(labels.cpu().numpy())
        
        avg_val_loss = total_val_loss / len(val_loader)
        val_f1 = f1_score(val_true_labels, val_preds, average='weighted')
        val_mcc = matthews_corrcoef(val_true_labels, val_preds)
        
        # Simpan history
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        train_f1_scores.append(train_f1)
        val_f1_scores.append(val_f1)
        train_mcc_scores.append(train_mcc)
        val_mcc_scores.append(val_mcc)
        
        elapsed_time = time.time() - start_time
        
        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Training Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Training F1 Score: {train_f1:.4f}")
        print(f"Validation F1 Score: {val_f1:.4f}")
        print(f"Training MCC Score: {train_mcc:.4f}")
        print(f"Validation MCC Score: {val_mcc:.4f}")
        print(f"Time: {elapsed_time:.2f}s")
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_epoch = epoch + 1
            print(f"Best Validation Loss: {best_val_loss:.4f}")
            
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(), 
            }, save_path)
        else:
            print(f"Best Validation Loss {best_val_loss:.4f} (epoch {best_epoch})")
        
        early_stopping(avg_val_loss)
        if early_stopping.early_stop:
            print(f"Early stopping triggered after epoch {epoch + 1}")
            break
    
    plot_learning_curves(train_losses, val_losses, train_f1_scores, val_f1_scores, train_mcc_scores, val_mcc_scores)
    
    return model

In [20]:
def evaluate_model(model, dataloader, device):
    model.eval()
    all_predictions = []
    all_labels = []
    total_loss = 0.0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            loss, logits = model(input_ids, attention_mask, labels)
            
            total_loss += loss.item()
            
            predictions = torch.argmax(logits, dim=-1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    average_loss = total_loss / len(dataloader)
    
    accuracy = np.mean(np.array(all_predictions) == np.array(all_labels))
    print(f"Test Loss: {average_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    
    # Confusion Matrix
    cm = confusion_matrix(all_labels, all_predictions)
    print("\nConfusion Matrix:")
    print(cm)
    
    # Classification Report
    report = classification_report(all_labels, all_predictions)
    print("\nClassification Report:")
    print(report)
    
    mcc = matthews_corrcoef(all_labels, all_predictions)
    print("\nMCC Score:")
    print(mcc)

In [21]:
BATCH_SIZE=32
EPOCHS=7

In [22]:
train_loader, val_loader = prepare_data_loader(
    df_train,
    tokenizer,
    max_length=max_length,
    val_size=0.1,
    batch_size=BATCH_SIZE
)

In [23]:
def compute_class_weights(labels):
    labels_tensor = torch.tensor(labels.values, dtype=torch.long) 
    
    class_counts = torch.bincount(labels_tensor)
    total_samples = len(labels)
    weights = total_samples / (len(class_counts) * class_counts.float())
    return weights

class_weights = compute_class_weights(df_train['label'])
print(class_weights)

tensor([1.6938, 0.7094])


In [24]:
optimizer = AdamW(
    base_model.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=1e-4
)

In [25]:
num_labels = 2
classification_model = ClassificationModel(base_model=base_model, 
                                           num_labels=num_labels,
                                           class_weights=class_weights)

In [26]:
model = train_model(model=classification_model, 
                    train_loader=train_loader,
                    val_loader=val_loader,
                    optimizer=optimizer, 
                    device=device, 
                    epochs=EPOCHS,
                    save_path='/home/basilmusyaffa19/CoLA/Grammatical/model-2/With Attention Pooling/roBERTa_lowercase_best_checkpoint.pt',
                    patience=3,
                    min_delta=1e-4)

Training Epoch 1/7: 100%|██████████| 240/240 [03:42<00:00,  1.08it/s]
Validation Epoch 1/7: 100%|██████████| 27/27 [00:07<00:00,  3.46it/s]


Epoch 1/7
Training Loss: 0.4276
Validation Loss: 0.4232
Training F1 Score: 0.7407
Validation F1 Score: 0.8579
Training MCC Score: 0.5235
Validation MCC Score: 0.6615
Time: 230.20s
Best Validation Loss: 0.4232


Training Epoch 2/7: 100%|██████████| 240/240 [03:40<00:00,  1.09it/s]
Validation Epoch 2/7: 100%|██████████| 27/27 [00:07<00:00,  3.45it/s]


Epoch 2/7
Training Loss: 0.2420
Validation Loss: 0.4479
Training F1 Score: 0.8894
Validation F1 Score: 0.8615
Training MCC Score: 0.7840
Validation MCC Score: 0.6669
Time: 228.82s
Best Validation Loss 0.4232 (epoch 1)
EarlyStopping counter: 1 out of 3


Training Epoch 3/7: 100%|██████████| 240/240 [03:40<00:00,  1.09it/s]
Validation Epoch 3/7: 100%|██████████| 27/27 [00:07<00:00,  3.46it/s]


Epoch 3/7
Training Loss: 0.1528
Validation Loss: 0.6118
Training F1 Score: 0.9321
Validation F1 Score: 0.8703
Training MCC Score: 0.8664
Validation MCC Score: 0.6869
Time: 228.70s
Best Validation Loss 0.4232 (epoch 1)
EarlyStopping counter: 2 out of 3


Training Epoch 4/7: 100%|██████████| 240/240 [03:40<00:00,  1.09it/s]
Validation Epoch 4/7: 100%|██████████| 27/27 [00:07<00:00,  3.47it/s]


Epoch 4/7
Training Loss: 0.0964
Validation Loss: 0.6278
Training F1 Score: 0.9570
Validation F1 Score: 0.8762
Training MCC Score: 0.9152
Validation MCC Score: 0.7036
Time: 228.52s
Best Validation Loss 0.4232 (epoch 1)
EarlyStopping counter: 3 out of 3
Early stopping triggered after epoch 4


In [27]:
def load_saved_model(model, save_path, device):
    checkpoint = torch.load(save_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device)
    model.eval() 
    return model

In [28]:
best_model = load_saved_model(
    model=classification_model, 
    save_path='/home/basilmusyaffa19/CoLA/Grammatical/model-2/With Attention Pooling/roBERTa_lowercase_best_checkpoint.pt',
    device=device
)

In [29]:
max_length_test = int(df_test['sentence'].str.len().max())

encoded_data_test = tokenizer(
        df_test['sentence'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

test_inputs = encoded_data_test['input_ids']  
test_masks = encoded_data_test['attention_mask']
test_labels = torch.tensor(df_test['label'].values, dtype=torch.long)
    
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [30]:
evaluate_model(best_model, 
               test_loader, 
               device)

Evaluating: 100%|██████████| 33/33 [00:09<00:00,  3.47it/s]

Test Loss: 0.4262
Accuracy: 0.8412

Confusion Matrix:
[[244  76]
 [ 89 630]]

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.76      0.75       320
           1       0.89      0.88      0.88       719

    accuracy                           0.84      1039
   macro avg       0.81      0.82      0.82      1039
weighted avg       0.84      0.84      0.84      1039


MCC Score:
0.6318637210946597



