## Members

# Pre-processing

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from vncorenlp import VnCoreNLP

In [2]:
# Path to the VnCoreNLP jar; it is passed to VnCoreNLP(...) when the
# word-segmentation annotator is created later in the notebook.
vncorenlp_file = "VnCoreNLP/VnCoreNLP-1.1.1.jar"

In [3]:
# Read the news CSV into a DataFrame and preview its first rows.
csv_path = "vfnd/CSV/vn_news_223_tdlfr.csv"
data_df = pd.read_csv(csv_path)
data_df.head()

Unnamed: 0,text,domain,label
0,Thủ tướng Abe cúi đầu xin lỗi vì hành động phi...,binhluan.biz,1
1,Thủ tướng Nhật cúi đầu xin lỗi vì tinh thần ph...,www.ipick.vn,1
2,Choáng! Cơ trưởng đeo khăn quàng quẩy banh nóc...,tintucqpvn.net,1
3,Chưa bao giờ nhạc Kpop lại dễ hát đến thế!!!\n...,tintucqpvn.net,1
4,"Đại học Hutech sẽ áp dụng cải cách ""Tiếq Việt""...",www.gioitreviet.net,1


In [4]:
# Print column names, non-null counts and dtypes of the loaded frame.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    223 non-null    object
 1   domain  223 non-null    object
 2   label   223 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 5.4+ KB


#### List of Vietnamese stopwords

In [5]:
# Load the stop-word list: one entry per line of the file, with surrounding
# whitespace stripped from each entry.
with open('data/vietnamese_stopwords.txt', encoding="utf8") as stopword_file:
    vn_stopwords = [entry.strip() for entry in stopword_file.read().splitlines()]

#### Dữ liệu có bị lặp không? (Is the data duplicated?)

In [6]:
# Count fully duplicated rows (identical text, domain and label).
# The index is the auto-generated RangeIndex from read_csv, so checking the
# index itself for duplicates can never report anything other than 0.
data_df.duplicated().sum()

0

In [7]:
# Count missing values per column.
data_df.isnull().sum()

text      0
domain    0
label     0
dtype: int64

In [8]:
# Work on a real copy: the cleaning loop that follows writes into X_df in
# place, and a plain alias / view would leak those edits back into data_df
# (or fail outright when pandas hands out a read-only array).
test_df = data_df.copy()
# Features: the "text" column as a writable (n_rows, 1) array. The 2-D shape is
# kept on purpose because the cleaning loop indexes it as X_df[i][0] and the
# array is flattened afterwards. Selecting by name avoids depending on the
# exact number and order of columns.
X_df = test_df[["text"]].to_numpy(copy=True)
# Targets: the "label" column as a 1-D array.
Y_df = test_df["label"].to_numpy()

In [9]:
# URL matcher (case-insensitive via the inline (?i) flag), compiled once
# instead of being looked up on every iteration.
URL_PATTERN = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')
# Raw string: '\W' in a normal string literal is an invalid escape sequence.
NON_WORD_PATTERN = re.compile(r'\W+')


def clean_text(text):
    """Normalize one raw article string.

    Steps, in order: replace links with a space, drop every digit character,
    collapse each run of non-word characters (punctuation, whitespace) into a
    single space, strip the ends, and lowercase.

    Args:
        text: The raw text of one article.

    Returns:
        The cleaned, lowercased text.
    """
    # Remove links
    text = URL_PATTERN.sub(" ", text)
    # Remove all digits (str.isdigit, so non-ASCII digit characters go too)
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Remove all special characters and punctuation
    text = NON_WORD_PATTERN.sub(' ', text)
    return text.strip().lower()


# X_df is an (n_rows, 1) array, so each row holds its text at position 0.
for index, test_str in enumerate(X_df):
    X_df[index][0] = clean_text(test_str[0])

In [10]:
# Collapse the array to 1-D so each element is one text string.
X_df = X_df.flatten()

In [11]:
# Remove stop words.
# A set gives constant-time membership tests; scanning the stop-word list for
# every token of every document is needlessly slow.
stopword_set = set(vn_stopwords)
for index, test_str in enumerate(X_df):
    kept_words = [w for w in test_str.split(' ') if w not in stopword_set]
    X_df[index] = ' '.join(kept_words)

In [None]:
# Alternative with the full annotation pipeline and a larger JVM heap:
# annotator = VnCoreNLP(vncorenlp_file, annotators="wseg,pos,ner,parse", max_heap_size='-Xmx2g')
annotator = VnCoreNLP(vncorenlp_file)
with annotator:
    for index, filtered_text in enumerate(X_df):
        # tokenize() returns one token list per sentence. Join the tokens of
        # every sentence: taking only element [0] would drop everything after
        # the first sentence and raise IndexError when no sentence comes back
        # (e.g. for an empty text).
        sentences = annotator.tokenize(filtered_text)
        X_df[index] = ' '.join(token for sentence in sentences for token in sentence)
# Preview a few segmented documents rather than dumping the whole array.
print(X_df[:5])

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Use CountVectorizer to convert text data into numerical values
# (word-level counts, vocabulary limited to at most 5000 features).
cv = CountVectorizer(analyzer='word', max_features=5000)
# toarray() yields a plain ndarray; todense() would return np.matrix, which
# newer scikit-learn estimators refuse as input.
data = cv.fit_transform(X_df).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back on older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Document-term matrix: one row per document, one column per vocabulary term.
df_data = pd.DataFrame(data, columns=feature_names)
df_data

Unnamed: 0,abe,accent,acepromazin,adn,aff,afp,air,airbus,airlines,al,...,ổn_định_thiết,ớn,ớt,ục,ủng_hộ,ủng_hộ_tra,ức_chế,ứng,ứng_dụng,ứng_viên
0,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,0,0,0,0,0,0,0,0,6,0,...,0,0,0,0,0,0,0,0,0,0
219,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(data, Y_df, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Show the pieces in the order: train features, train labels, test features, test labels.
for part in (X_train, y_train, X_test, y_test):
    print(part)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 0 0
 1 0 1 1 0 0 0 1 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0
 0 0 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 0
 0 1 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0
 1 0 0 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 1 1 1 0 1 0 0 1 0 0 1 0]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 4 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[1 0 0 0 0 0 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1
 1 1 0 1 0 0 0 0]


In [15]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree. random_state pins the fitted tree so that a
# fresh run reproduces the same predictions; it uses the same seed value as
# the train/test split.
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtc.fit(X_train, y_train)
# Predicted labels for the held-out documents.
y_pred = dtc.predict(X_test)
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0], dtype=int64)

In [16]:
from sklearn.metrics import confusion_matrix

# Rows correspond to the true labels, columns to the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[22,  5],
       [10,  8]], dtype=int64)

In [17]:
# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
from wordcloud import WordCloud
from textwrap import wrap

# Function for generating word clouds
def generate_wordcloud(data, title):
    """Draw a word cloud from word frequencies and show it.

    Args:
        data: Frequencies handed to ``WordCloud.generate_from_frequencies``.
        title: Accepted but currently not rendered; no title is drawn.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    cloud = WordCloud(
        width=400,
        height=330,
        max_words=150,
        colormap="Dark2",
    )
    cloud_image = cloud.generate_from_frequencies(data)

    plt.figure(figsize=(10, 8))
    plt.imshow(cloud_image, interpolation='bilinear')
    # Hide the axes so only the cloud itself is visible.
    plt.axis("off")
    plt.show()
  
# Flip the document-term matrix so terms become rows and documents become columns.
# NOTE(review): df_data is rebound to its own transpose, so running this cell a
# second time flips it back — confirm whether a separate name is wanted.
df_data = df_data.T
df_data
# Disabled: plot one word cloud per column of the transposed frame.
# for index,product in enumerate(df_data.columns):
#     print(index, product)
#     generate_wordcloud(df_data[product].sort_values(ascending=False),product)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,213,214,215,216,217,218,219,220,221,222
abe,4,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accent,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acepromazin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
adn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aff,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ủng_hộ_tra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ức_chế,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
ứng,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ứng_dụng,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### EDA

### Build model

### Deploy