In [100]:
import torch
import torch.nn as nn
import pandas as pd
import regex as re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Cleaning Text

In [101]:
def noise_remove(text, remove_stopwords=False):
    """Clean raw article text before it is tokenised.

    The steps, in order: strip URLs / hashtags, blank out non-ASCII characters,
    cut source-site boilerplate ("Simak...", "Lihat juga...", "[Gambas:...", ...)
    from the point it appears to the end of that line, flatten line breaks,
    drop the "NARASI:" label and ``:shortcode:`` tokens, lowercase, replace
    punctuation/underscores with spaces and delete digit runs.

    Args:
        text (str): Raw input text.
        remove_stopwords (bool): When True, additionally tokenise with NLTK and
            drop Indonesian stopwords. Defaults to False.

            NOTE(review): the previous version of this function built the
            stopword-filtered string but then returned the unfiltered one, so
            stopwords were never actually removed. The default keeps that
            returned value unchanged; the checkpoint loaded later in this
            notebook was presumably trained on text cleaned the same way -
            confirm against the training notebook before enabling this flag.

    Returns:
        str: The cleaned, lowercased text with leading/trailing whitespace
        removed. Because digits are deleted after whitespace is collapsed, a
        removed number can leave two consecutive spaces when
        ``remove_stopwords`` is False.
    """
    url_or_hashtag = r'http[s]?://\S+|www\.\S+|#\S+'

    # Each pattern deletes from its marker to the end of the line ('.' does not
    # cross '\n'), so these must run before line breaks are flattened.
    trailing_boilerplate = (
        r"'Simak.*",
        r'"Simak.*',
        r"'Baca halaman.*",
        r'\[Gambas:.*',
        r'Lihat juga.*',
        r'Lihat video.*',
        r'Lihat Video.*',
        r"'Artikel.*",
        r'"Artikel.*',
        r"'NEXT:.*",
        r'"NEXT:.*',
        r" {4}Halaman.*",
    )

    text = re.sub(url_or_hashtag, ' ', text)
    text = re.sub(r'[^\x00-\x7F]', ' ', text)
    for pattern in trailing_boilerplate:
        text = re.sub(pattern, '', text)

    text = text.replace('\n', ' ').replace('\r', ' ')
    text = re.sub(url_or_hashtag, ' ', text)
    text = re.sub('NARASI:', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r':\w+?:', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)

    if not remove_stopwords:
        # Skip tokenisation entirely: its result would be thrown away, and it
        # would make the function depend on NLTK data being installed.
        return text.strip()

    # A set gives constant-time membership checks instead of scanning a list
    # for every token.
    stop_words_nltk = set(stopwords.words('indonesian'))
    filter_words = [word for word in word_tokenize(text) if word not in stop_words_nltk]
    return ' '.join(filter_words).strip()

### Load Tokenizer

In [102]:
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

# Download the IndoBERT tokenizer and write its vocabulary into the current
# directory so the WordPiece tokenizer below can be built from that file.
bert_tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
bert_tokenizer.save_vocabulary('.')

tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=True)
tokenizer.enable_padding()
# Cap the encoded length so long articles cannot produce more tokens than the
# encoder has position embeddings for (which makes the forward pass fail).
# NOTE(review): 512 is the usual BERT-base limit - confirm it matches
# indobert_model.config.max_position_embeddings and the length used in training.
tokenizer.enable_truncation(max_length=512)

In [103]:
def tokenize_text(text):
    """Encode one string with the module-level ``tokenizer``.

    Args:
        text (str): Text to encode.

    Returns:
        dict: ``'input_ids'`` and ``'attention_mask'`` tensors, each built from
        a one-element list so that a leading batch dimension of 1 is present.
    """
    encoding = tokenizer.encode(text)

    # Wrapping each sequence in a list adds the batch dimension.
    ids = torch.tensor([encoding.ids])
    mask = torch.tensor([encoding.attention_mask])

    return {'input_ids': ids, 'attention_mask': mask}

In [104]:
from transformers import BertModel

# Fetch the pre-trained IndoBERT encoder by its hub identifier.
model_name = 'indobenchmark/indobert-base-p1'
indobert_model = BertModel.from_pretrained(model_name)

# Freeze the encoder: none of its parameters should receive gradients.
for param in indobert_model.parameters():
    param.requires_grad_(False)

In [105]:
# Definisikan model LSTM
class LSTMClassifier(nn.Module):
    """Single-layer LSTM followed by a fully connected head and an output layer.

    Args:
        input_size (int): Feature size of each timestep fed to the LSTM.
        lstm_hidden_size (int): Hidden size of the LSTM.
        fc_hidden_sizes (list[int]): Widths of the fully connected layers; must
            contain at least one entry.
        output_size (int): Number of output logits.
        dropout_prob (float): Dropout probability used after the first fully
            connected layer. Defaults to 0.6.

    The submodule names (``lstm``, ``fc``, ``output_layer``) and the order of
    layers inside ``fc`` determine the state-dict keys, so they must stay as
    they are for saved checkpoints to load.
    """

    def __init__(self, input_size, lstm_hidden_size, fc_hidden_sizes, output_size, dropout_prob=0.6):
        super().__init__()
        self.lstm_hidden_size = lstm_hidden_size

        # Recurrent encoder; batch_first=True means inputs are (batch, seq, feature).
        self.lstm = nn.LSTM(input_size, lstm_hidden_size, num_layers=1, batch_first=True)

        # Widths of the dense stack, starting from the LSTM output width.
        layer_sizes = [lstm_hidden_size] + fc_hidden_sizes
        print(layer_sizes)

        # First dense stage: Linear -> BatchNorm -> Dropout -> ReLU.
        first_width = layer_sizes[1]
        stack = [
            nn.Linear(layer_sizes[0], first_width),
            nn.BatchNorm1d(first_width),
            nn.Dropout(dropout_prob),
            nn.ReLU(),
        ]

        # Remaining dense stages: plain Linear -> ReLU between consecutive widths.
        for width_in, width_out in zip(layer_sizes[1:-1], layer_sizes[2:]):
            stack.extend((nn.Linear(width_in, width_out), nn.ReLU()))

        self.fc = nn.Sequential(*stack)

        # Final projection from the last dense width to the logits.
        self.output_layer = nn.Linear(fc_hidden_sizes[-1], output_size)

    def forward(self, input_ids):
        """Return logits for a batch of sequences.

        ``input_ids`` is passed straight to the LSTM, so despite its name it is
        the per-timestep feature tensor rather than integer token ids.
        """
        sequence_out, _ = self.lstm(input_ids)

        # Keep only the LSTM output at the final timestep of each sequence.
        final_step = sequence_out[:, -1, :]

        return self.output_layer(self.fc(final_step))

# Hyper-parameters of the LSTM head. The LSTM input width is taken from the
# IndoBERT configuration so it matches the encoder's hidden size.
input_size = indobert_model.config.hidden_size
hidden_size = 768
fc_hidden_sizes = [128, 32, 32]
output_size = 2  # Number of classes (hoax and non-hoax)

# Build the LSTM classification head.
lstm_model = LSTMClassifier(
    input_size=input_size,
    lstm_hidden_size=hidden_size,
    fc_hidden_sizes=fc_hidden_sizes,
    output_size=output_size,
)

[768, 128, 32, 32]


In [106]:
# Definisikan model gabungan
class IndoBERT_LSTM(nn.Module):
    """Combined model: an IndoBERT encoder feeding an LSTM classification head.

    Args:
        indobert_model: Encoder module; called with ``input_ids`` and
            ``attention_mask`` keyword arguments, and the first element of its
            output is used.
        lstm_model: Head module that maps the encoder output to logits.

    The attribute names ``indobert`` and ``lstm`` are the state-dict prefixes
    and must not change if existing checkpoints are to be loaded.
    """

    def __init__(self, indobert_model, lstm_model):
        super().__init__()
        self.indobert = indobert_model
        self.lstm = lstm_model

    def forward(self, input_ids, attention_mask):
        """Run the encoder without gradient tracking, then the head; return logits."""
        # The encoder is used as a fixed feature extractor, so no graph is built for it.
        with torch.no_grad():
            last_hidden_state = self.indobert(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )[0]

        return self.lstm(last_hidden_state)

# Build the combined IndoBERT + LSTM model.
model = IndoBERT_LSTM(indobert_model=indobert_model, lstm_model=lstm_model)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [107]:
def predict_hoax(input_text, model_source, model_name):
    """Classify a piece of text as hoax or non-hoax with a saved checkpoint.

    Args:
        input_text (str): Raw text to classify.
        model_source (str): Directory that contains the checkpoint file.
        model_name (str): File name of the checkpoint (a saved state dict).

    Returns:
        str: ``"Prediksi Berita : Non-Hoax"`` when the predicted class index is
        0, otherwise ``"Prediksi Berita : Hoax"``.

    Notes:
        The combined model wraps the module-level ``indobert_model`` and
        ``lstm_model`` objects, so loading the checkpoint overwrites their
        weights in place. The checkpoint is read from disk on every call.
    """
    import os  # local import so this cell does not depend on the import cell

    # Clean and tokenise the input.
    cleaned_text = noise_remove(input_text)
    tokenized = tokenize_text(cleaned_text)

    # Move the inputs to the target device once.
    input_ids_tensor = tokenized['input_ids'].to(device)
    attention_mask_tensor = tokenized['attention_mask'].to(device)

    # Load the weights. map_location lets a checkpoint saved on a GPU be loaded
    # on a CPU-only machine (and vice versa).
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source (consider weights_only=True where the installed torch
    # version supports it).
    checkpoint_path = os.path.join(model_source, model_name)
    model = IndoBERT_LSTM(indobert_model, lstm_model)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()  # inference mode: disables dropout, BatchNorm uses running stats
    model.to(device)

    # Predict.
    with torch.no_grad():
        logits = model(input_ids_tensor, attention_mask_tensor)

    predicted_class = torch.argmax(logits, dim=1).item()

    # Interpret the class index.
    if predicted_class == 0:
        return "Prediksi Berita : Non-Hoax"
    return "Prediksi Berita : Hoax"

In [108]:
# 1. Input teks manual
# 1. Read the text to test from the user.
input_text = input("Masukkan teks yang ingin diuji: ")

# 2. Run the prediction with the saved checkpoint.
hasil_prediksi = predict_hoax(
    input_text,
    model_source="Dropout Batch 64 p 0.6",
    model_name="indobert_lstm_modelcombine_64_0.6.pth",
)

# 3. Show the result.
print(hasil_prediksi)

Prediksi Berita : Hoax
