In [36]:
import torch
import pandas as pd
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import re
from nltk.corpus import stopwords

In [37]:
# Preprocess text
def preprocess_text(text):
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stopwords.words('indonesian')])
    
    # Remove special characters and symbols
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    # Convert to lowercase
    text = text.lower()
    
    return text

In [38]:
# Define RCNN model
class RCNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(RCNN, self).__init__()
        self.bert = BertModel.from_pretrained('indolem/indobert-base-uncased')
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        lstm_out, _ = self.lstm(embeddings)
        concatenated = torch.cat((embeddings, lstm_out), dim=2)
        logits = self.fc(concatenated[:, -1, :])
        return logits

In [39]:
# Define RCNN dataset
class RCNNDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.loc[index, 'text_clean']
        label = self.data.loc[index, 'polarity']
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }


In [40]:
# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')

In [41]:
# Load labeled training data
train_dataset_path = "clean.csv"
train_data = pd.read_csv(train_dataset_path)

# Load unlabeled data
unlabeled_dataset_path = "IKN_Nusantara.csv"
unlabeled_data = pd.read_csv(unlabeled_dataset_path)

In [43]:
# Preprocess labeled training data
train_data['text_clean'] = train_data['text_clean'].apply(preprocess_text)

# Preprocess unlabeled data
unlabeled_data['title'] = unlabeled_data['title'].apply(preprocess_text)

In [None]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
# Define hyperparameters
embedding_dim = 768
hidden_dim = 128
num_classes = 3
num_epochs = 5
batch_size = 32
learning_rate = 1e-3

In [None]:
# Create RCNN model instance
model = RCNN(embedding_dim, hidden_dim, num_classes).to(device)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Split labeled data into train and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [None]:
# Create RCNN datasets
train_dataset = RCNNDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

TypeError: RCNNDataset.__init__() got an unexpected keyword argument 'text_column'

In [None]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Training loop
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

# Training loop
num_epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RCNNModel()  # Replace with your RCNN model
model.to(device)

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = train_correct / train_total
    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = val_correct / val_total
    val_loss /= len(val_loader)

    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"Train Loss: {train_loss:.4f} - Train Accuracy: {train_accuracy:.4f} - "
          f"Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")


NameError: name 'RCNNModel' is not defined