In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim

In [None]:
# Load the news corpus and keep only the columns used for topic classification.
DATA_PATH = "/kaggle/input/vietnamese-online-news-csv-dataset/Fixed_news_dataset.csv"
REQUIRED_COLUMNS = ['content', 'title', 'topic']

# Build the cleaned frame in a single chain instead of calling
# dropna(inplace=True) on a column subset: the in-place variant mutates a frame
# derived from another one, which can raise SettingWithCopyWarning in pandas.
# The trailing .copy() makes `df` an independent frame, so adding columns to it
# later is unambiguous.
df = (
    pd.read_csv(DATA_PATH)[REQUIRED_COLUMNS]
    .dropna(subset=REQUIRED_COLUMNS)  # drop rows missing any of the three fields
    .copy()
)


In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: Indexable, sized collection of raw texts.
        labels: Indexable collection of labels, aligned with ``texts``; each
            entry is converted with ``torch.tensor(..., dtype=torch.long)``.
        tokenizer: Object exposing ``encode_plus`` whose result can be indexed
            by ``'input_ids'`` and ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'labels'`` (a ``torch.long``
            tensor built from the label).
        """
        raw_text, target = self.texts[idx], self.labels[idx]

        # Tokenizer options: pad/truncate to max_len and return PyTorch tensors.
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **encode_options)

        # Collapse each returned tensor to one dimension so the DataLoader can
        # stack samples into a batch.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Combine title and content into the single text field fed to the tokenizer.
df['text'] = df['title'] + " " + df['content']

# Encode topic names as integer class ids; label_encoder.classes_ keeps the
# id -> topic mapping for later use.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['topic'])

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
)

# Tokenizer choice: the dataset path indicates Vietnamese news. The English
# 'bert-base-uncased' tokenizer lowercases and strips accents, which removes
# Vietnamese tone marks and collapses distinct words into one token sequence.
# The multilingual *cased* vocabulary keeps diacritics intact.
TOKENIZER_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_NAME)
max_len = 512  # every sample is padded/truncated to this many tokens

# Datasets tokenize lazily per item; only the training loader shuffles.
train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Hybrid CNN-LSTM

In [None]:
class HybridCNNLSTM(nn.Module):
    """Text classifier: embedding -> 1-D convolution -> BiLSTM -> max-over-time -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each token embedding.
        num_classes: Number of output logits.
        cnn_out_channels: Number of convolution filters (also the LSTM input size).
        lstm_hidden_size: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
        kernel_size: Width of the convolution window.
        dropout: Dropout probability applied to the pooled features before the
            final linear layer.
    """

    def __init__(self, vocab_size, embed_size, num_classes, cnn_out_channels=128, lstm_hidden_size=128, lstm_layers=1, kernel_size=3, dropout=0.5):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Derive the padding from kernel_size instead of hard-coding 1. With the
        # default kernel_size=3 this is still 1; for other odd kernel sizes the
        # convolution now preserves the sequence length (even sizes yield
        # length + 1) instead of shrinking it, and short inputs no longer fail.
        self.conv = nn.Conv1d(
            in_channels=embed_size,
            out_channels=cnn_out_channels,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(
            input_size=cnn_out_channels,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.fc = nn.Linear(lstm_hidden_size * 2, num_classes)  # *2 for bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch_size, sequence_length)``.

        Returns:
            Tensor of shape ``(batch_size, num_classes)`` holding unnormalised logits.
        """
        # Embedding lookup, then move channels ahead of time for Conv1d.
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  # (batch_size, embed_size, sequence_length)

        # CNN: local n-gram features.
        x = self.relu(self.conv(x))
        x = x.permute(0, 2, 1)  # (batch_size, sequence_length, cnn_out_channels)

        # LSTM: per-position outputs from both directions.
        x, _ = self.lstm(x)

        # Max pooling over the time dimension. Every position takes part,
        # because no attention mask is passed to this model.
        x, _ = torch.max(x, dim=1)

        # Fully connected layer with dropout.
        x = self.dropout(x)
        return self.fc(x)


In [None]:
# Sizes taken from the tokenizer and the fitted label encoder.
vocab_size = len(tokenizer.vocab)            # entries in the tokenizer vocabulary
embed_size = 128                             # width of the learned token embeddings
num_classes = len(label_encoder.classes_)    # one output logit per encoded topic

# Build the network and move it to the selected device, then create the
# loss function and the optimizer over all model parameters.
model = HybridCNNLSTM(
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_classes=num_classes,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [None]:
num_epochs = 10

# Per-epoch history read by the plotting cell. These lists were previously
# appended to without being created, which raised NameError on a fresh kernel.
# Creating them here also makes re-running this cell start from a clean history.
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    The pass is a training pass when ``optimizer`` is given (gradients enabled,
    weights updated) and an evaluation pass otherwise (eval mode, no gradients).
    The loss is the mean of the per-batch losses; accuracy is correct / seen.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, preds = torch.max(outputs, dim=1)
            # .item() keeps the counters as plain Python ints on the host.
            correct += torch.sum(preds == labels).item()
            seen += labels.size(0)

    return loss_sum / len(loader), correct / seen


for epoch in range(num_epochs):
    # train
    avg_train_loss, train_accuracy = _run_epoch(
        model, train_loader, criterion, device, optimizer=optimizer
    )
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # valid
    avg_val_loss, val_accuracy = _run_epoch(model, val_loader, criterion, device)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch [{epoch + 1}/{num_epochs}], "
          f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")


In [None]:
# Plot the training history side by side: loss on the left, accuracy on the right.
# (The stray triple quote that followed plt.show() was a syntax error and is removed.)
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 6))

# (axis, train series, validation series, quantity name)
panels = [
    (ax_loss, train_losses, val_losses, 'Loss'),
    (ax_acc, train_accuracies, val_accuracies, 'Accuracy'),
]

for ax, train_values, val_values, quantity in panels:
    # 1-based x values so the axis matches the epoch numbers printed during training.
    ax.plot(range(1, len(train_values) + 1), train_values, label=f'Train {quantity}')
    ax.plot(range(1, len(val_values) + 1), val_values, label=f'Validation {quantity}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(quantity)
    ax.set_title(f'{quantity} Over Epochs')
    ax.legend()

plt.show()