# Import library

In [None]:
!pip install underthesea



In [None]:
from __future__ import print_function
import numpy as np
import pandas as pd
from underthesea import word_tokenize

In [185]:
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Lambda, Embedding, Convolution1D, LSTM, Concatenate, Input, TimeDistributed, Flatten
from keras.datasets import imdb
import keras.backend as K
from keras.optimizers import Adadelta
from keras.preprocessing import sequence as sq
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

In [None]:
from sklearn import preprocessing
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Download raw data and handle word tokenize

In [None]:
!kaggle datasets download -d phamtheds/news-dataset-vietnameses
!unzip -q /content/news-dataset-vietnameses

Dataset URL: https://www.kaggle.com/datasets/phamtheds/news-dataset-vietnameses
License(s): copyright-authors
Downloading news-dataset-vietnameses.zip to /content
100% 307M/308M [00:04<00:00, 71.6MB/s]
100% 308M/308M [00:04<00:00, 80.4MB/s]


In [None]:
data = pd.read_csv('/content/Dataset_articles_NoID.csv', header=0)
data

In [None]:
data_remove_summary_NaN = data.dropna(subset=["Summary"])

In [None]:
data_remove_summary_NaN.isnull().any()

URL          False
Title        False
Summary      False
Contents      True
Date         False
Author(s)     True
Category     False
Tags         False
dtype: bool

In [None]:
data_remove_summary_NaN.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313140 entries, 0 to 313319
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   URL        313140 non-null  object
 1   Title      313140 non-null  object
 2   Summary    313140 non-null  object
 3   Contents   307598 non-null  object
 4   Date       313140 non-null  object
 5   Author(s)  312455 non-null  object
 6   Category   313140 non-null  object
 7   Tags       313140 non-null  object
dtypes: object(8)
memory usage: 21.5+ MB


In [None]:
raw_data = data_remove_summary_NaN[["Summary", "Category"]]

In [None]:
raw_data

Unnamed: 0,Summary,Category
0,"Lâm Đồng - Lãnh đạo thành phố Bảo Lộc, Lâm Đồn...",Bất động sản
1,TPHCM - Việc không thể cưỡng chế thuế của hai ...,Bất động sản
2,"Hiện trên địa bàn tỉnh Ninh Bình có 32 khu, cụ...",Bất động sản
3,Hoàn công nhà ở với ý nghĩa là điều kiện để đư...,Bất động sản
4,Có rất nhiều lý do khiến những dự án thấp nội ...,Bất động sản
...,...,...
313315,"Bà Dương Thị Tuyết ở thị trấn Khe Sanh, huyện ...",Tấm Lòng Vàng
313316,"Ngày 25.7, Đại diện Chương trình “Tấm lưới ngh...",Tin hoạt động
313317,"Ngày 17.7, lãnh đạo Quỹ TLV Lao Động và LĐLĐ t...",Tin hoạt động
313318,Nhà báo Trần Đình Chính (tức Trần Hoài Thu) - ...,Tấm Lòng Vàng


In [None]:
import re
def handle_sentence(raw_string):
    print("handle for", raw_string)
    return word_tokenize(raw_string.replace("-", ""), format="text")

In [None]:
raw_data["Summary"] = raw_data["Summary"].apply(lambda content: handle_sentence(str(content)))

In [None]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313140 entries, 0 to 313319
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Summary   313140 non-null  object
 1   Category  313140 non-null  object
dtypes: object(2)
memory usage: 7.2+ MB


In [None]:
raw_data[:10000].to_csv("/content/processing_data.csv", header=True, encoding="utf8")

# Load processing data

In [None]:
processing_data = pd.read_csv("/content/processing_data.csv", index_col=0, names=["summary", "category"])

In [None]:
label_encoder = preprocessing.LabelEncoder()
list_label = set(processing_data["category"])
le = label_encoder.fit(processing_data["category"])
processing_data["category"] = le.transform(processing_data["category"])

In [None]:
values_counts_of_category = processing_data["category"].value_counts()

In [None]:
values_counts_of_category

category
1     6184
14    3397
4      242
19      76
5       22
8       22
0       20
6        7
16       5
2        4
12       4
11       2
9        2
7        2
10       2
17       2
3        2
13       1
18       1
15       1
Name: count, dtype: int64

In [165]:
selected_label = list(values_counts_of_category[values_counts_of_category > 3000].index)

In [166]:
print("List of category which have number of new larger than 3000:")
list(le.inverse_transform(selected_label))

List of category which have number of new larger than 3000:


['Bất động sản', 'Thời sự']

In [172]:
selected_data = processing_data[processing_data["category"].isin(selected_label)]

In [175]:
selected_data["category"].unique()

array([ 1, 14])

# Set parameters

In [134]:
max_features = 21540
maxlen = 400
batch_size = 8
embedding_dims = 200
nb_filter = 150
filter_length = 3
hidden_dims = 100

# Create Model

In [135]:
def max_1d(X):
    return K.max(X, axis=1)

In [219]:
def LSTM_and_CNN_model():
    model = Sequential()

    input_layer = Input(shape=(maxlen,), dtype='int32', name="main_input")
    emb_layer = Embedding(max_features,
                          embedding_dims,
                          input_length=maxlen
                          )(input_layer)

    # CNN block
    con3_layer = Convolution1D(filters=nb_filter, kernel_size=3,
                        padding='same',
                        activation='relu',
                        strides = 1
                        )(emb_layer)
    pool_con3_layer = Lambda(max_1d, output_shape=(nb_filter,))(con3_layer)

    con4_layer = Convolution1D(filters=nb_filter, kernel_size=5,
                        padding='same',
                        activation='relu',
                        strides = 1
                        )(emb_layer)
    pool_con4_layer = Lambda(max_1d, output_shape=(nb_filter,))(con4_layer)

    con5_layer = Convolution1D(filters=nb_filter,
                        kernel_size=7,
                        padding='same',
                        activation='relu',
                        strides = 1
                        )(emb_layer)
    pool_con5_layer = Lambda(max_1d, output_shape=(nb_filter,))(con5_layer)

    cnn_block = Concatenate()([pool_con3_layer, pool_con4_layer, pool_con5_layer])

    # LSTM block
    x = Embedding(max_features, embedding_dims, input_length=maxlen)(input_layer)
    lstm_block = LSTM(128)(x)

    cnn_lstm_block = Concatenate()([lstm_block, cnn_block])

    dense_layer = Dense(hidden_dims*2, activation='sigmoid')(cnn_lstm_block)
    output_layer = Dense(2, trainable=True, activation='softmax')(dense_layer)

    model = Model(inputs=[input_layer], outputs=[output_layer])
    adadelta = Adadelta(learning_rate=0.1, rho=0.95, epsilon=1e-06)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adamax",
                  metrics=['accuracy'])

    return model

In [220]:
model = LSTM_and_CNN_model()
model.summary()

Model: "model_26"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 main_input (InputLayer)     [(None, 400)]                0         []                            
                                                                                                  
 embedding_58 (Embedding)    (None, 400, 200)             4308000   ['main_input[0][0]']          
                                                                                                  
 conv1d_81 (Conv1D)          (None, 400, 150)             90150     ['embedding_58[0][0]']        
                                                                                                  
 conv1d_82 (Conv1D)          (None, 400, 150)             150150    ['embedding_58[0][0]']        
                                                                                           

# Train model

In [221]:
X_train, X_test, y_train, y_test = train_test_split(selected_data["summary"], selected_data["category"], test_size=0.2)

vectorizer = TfidfVectorizer(max_features = max_features)
vectorizer = vectorizer.fit(X_train)

In [222]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [223]:
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_train = sq.pad_sequences(sequences_train, maxlen=maxlen)

sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_test = sq.pad_sequences(sequences_test, maxlen=maxlen)

y_train = y_train.replace(14, 0)
y_train_encoded = to_categorical(y_train, num_classes=2)

y_test = y_test.replace(14, 0)
y_test_encoded = to_categorical(y_test, num_classes=2)

In [224]:
sequences_train.shape, y_train_encoded.shape, sequences_test.shape, y_test_encoded.shape

((7664, 400), (7664, 2), (1917, 400), (1917, 2))

In [225]:
model.fit(sequences_train, y_train_encoded)



<keras.src.callbacks.History at 0x7a49f0b180a0>

In [226]:
model.compile(loss='categorical_crossentropy',
                  optimizer="adamax",
                  metrics=['accuracy'])
score, acc = model.evaluate(sequences_test, y_test_encoded)
print("Acc:", acc)

Acc: 0.9598330855369568
