In [None]:
import json
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import os
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
# Preprocess a document
def text_preprocess(document):
    """Normalize raw text before it is split into tokens.

    The text is lowercased, every run of whitespace is collapsed into a
    single space, and leading/trailing whitespace is removed.

    Args:
        document: Raw text as a ``str``.

    Returns:
        The normalized string.
    """
    lowered = document.lower()
    # Collapse redundant whitespace (spaces, tabs, newlines) into one space
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Load the stop-word list (one entry per line).
# The encoding is pinned so the text decodes identically on every platform
# (assumes the data files are UTF-8 encoded -- TODO confirm).
with open('stopword.txt', encoding='utf-8') as file:
    stopword = file.read().split('\n')
# Frozen-set copy of the list for O(1) membership tests while filtering tokens.
stopword_set = frozenset(stopword)

list_labels = ['batdongsan', 'chinhtri', 'congnghe', 'doingoai', 'doisong', 'dulich', 'giaitri', 'giaoduc', 'khoahoc', 'kinhte', 'phapluat', 'quansu', 'thethao', 'vanhoa', 'xahoi']


def load_corpus(labels, suffix, stopwords):
    """Read one JSON file per label and return tokenized documents with labels.

    For every label the file ``'content_' + label + suffix + '.json'`` is
    loaded; each record's ``'message'`` and ``'feature'`` fields are joined,
    normalized with ``text_preprocess``, split on single spaces and stripped
    of stop words.

    Args:
        labels: Iterable of label names; each one selects a file and becomes
            the target value of every record in that file.
        suffix: File-name suffix selecting the data set, e.g. ``'_1000'``
            (data set 2) or ``'_mini'`` (data set 1).
        stopwords: Container of tokens to drop (anything supporting ``in``).

    Returns:
        A ``(documents, targets)`` tuple: ``documents`` is a list of token
        lists and ``targets`` the parallel list of label names.
    """
    documents = []
    targets = []
    for label_ in labels:
        with open('content_' + label_ + suffix + '.json', encoding='utf-8') as json_file:
            records = json.load(json_file)
        print(label_, len(records))
        for record in records:
            # Join the two fields with a space so the last word of 'message'
            # is not fused with the first word of 'feature'; text_preprocess
            # collapses any resulting double whitespace.
            raw_text = record['message'] + ' ' + record['feature']
            tokens = text_preprocess(raw_text).split(' ')
            documents.append([word for word in tokens if word not in stopwords])
            targets.append(label_)
    return documents, targets


# Read data set 2 ('content_<label>_1000.json').
# To use data set 1 instead, pass '_mini' as the suffix.
X, y = load_corpus(list_labels, '_1000', stopword_set)

In [None]:
# Join each token list back into one space-separated string so that every
# document is a single str.
# - No loop variable named `list`, so the builtin is not shadowed for the rest
#   of the notebook.
# - Entries that are already strings are kept as-is, which makes the cell safe
#   to re-run (re-joining a str would split it into single characters).
# - Unlike manual `word + ' '` concatenation, join leaves no trailing space.
X_train1 = [doc if isinstance(doc, str) else ' '.join(doc) for doc in X]
X = X_train1

# Train/test split ratio is 8:2 (20% of the documents are held out for testing)
test_percent = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_percent, random_state=42)

In [None]:
# Encode the string labels as integers.
# The encoder is fitted on the training labels only and then applied to both
# splits, so train and test share one label -> integer mapping.
label_encoder = LabelEncoder().fit(y_train)
y_train, y_test = (label_encoder.transform(labels) for labels in (y_train, y_test))

In [None]:
# Train the model with Naive Bayes
start_time = time.time()
# Pipeline: unigram counts -> TF-IDF weighting -> multinomial Naive Bayes
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
                    ])

text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training Naive Bayes in', train_time, 'seconds.')

# Directory where the fitted pipelines are pickled
MODEL_PATH = "models"

# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safe to re-run when the directory already exists.
os.makedirs(MODEL_PATH, exist_ok=True)

# Save model; the context manager guarantees the file is flushed and closed
with open(os.path.join(MODEL_PATH, "naive_bayes.pkl"), 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [None]:
# Naive Bayes: reload the pickled pipeline and evaluate it on the test split.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(os.path.join(MODEL_PATH, "naive_bayes.pkl"), 'rb') as model_file:
    model = pickle.load(model_file)
y_pred = model.predict(X_test)
print(y_pred)
print(y_test)
print('Naive Bayes, Accuracy =', np.mean(y_pred == y_test))

# scikit-learn metrics take (y_true, y_pred) in that order.
# With average='micro' on single-label data, F1 coincides with accuracy.
print('Naive Bayes, Accuracy_score =', accuracy_score(y_test, y_pred))
print('Naive Bayes, F1_Score =', f1_score(y_test, y_pred, average='micro'))

In [None]:
# Check on Google Colab: upload file(s) through the browser dialog.
# The import is guarded so that a full "Run All" does not stop here when the
# notebook is executed outside Colab.
try:
    from google.colab import files
except ImportError:
    files = None
    # Presumably files.upload() returns a mapping of uploaded files; an empty
    # dict stands in for "nothing uploaded" -- TODO confirm if `uploaded` is used.
    uploaded = {}
    print('google.colab is not available; skipping file upload.')
else:
    uploaded = files.upload()

In [None]:
# Classify a single raw document read from 'input_data.txt' with the
# currently loaded `model`.
from underthesea import word_tokenize

# Encoding is pinned so the text decodes identically on every platform
# (assumes the input file is UTF-8 encoded -- TODO confirm).
with open('input_data.txt', encoding='utf-8') as f:
    data = f.read()
# Collapse redundant whitespace before word segmentation
test = re.sub(r'\s+', ' ', data).strip()
# The segmented text is wrapped in a list because predict() takes a
# collection of documents.
test_doc = [word_tokenize(test, format = 'text')]

predict_doc = model.predict(test_doc)

# Map the predicted integer back to its label name, then list all classes
print(label_encoder.classes_[predict_doc])
print(label_encoder.classes_)
# Not every classifier exposes class probabilities (e.g. an SVC built without
# probability=True), so guard the call in case `model` was replaced by a later
# cell before this one is run.
if hasattr(model, 'predict_proba'):
    print(model.predict_proba(test_doc)[0]*100, "%")
else:
    print('Class probabilities are not available for the loaded model.')

In [None]:
# Train the model with SVM
from sklearn.svm import SVC

start_time = time.time()
# Pipeline: unigram counts -> TF-IDF weighting -> support vector classifier
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(gamma='scale'))
                    ])
print("Training...")
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training SVM in', train_time, 'seconds.')

# Save model. The directory is (re)created here so this cell does not rely on
# another cell having made it, and the context manager guarantees the file is
# flushed and closed.
os.makedirs(MODEL_PATH, exist_ok=True)
with open(os.path.join(MODEL_PATH, "svm.pkl"), 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [None]:
# SVM: reload the pickled pipeline and evaluate it on the test split.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(os.path.join(MODEL_PATH, "svm.pkl"), 'rb') as model_file:
    model = pickle.load(model_file)
y_pred = model.predict(X_test)
print('SVM, Accuracy =', np.mean(y_pred == y_test))
# scikit-learn metrics take (y_true, y_pred) in that order
print('SVM, Accuracy_score =', accuracy_score(y_test, y_pred))
print('SVM, F1_Score =', f1_score(y_test, y_pred, average='micro'))

In [None]:
# Train the model with Logistic Regression
from sklearn.linear_model import LogisticRegression

start_time = time.time()
# Pipeline: unigram counts -> TF-IDF weighting -> logistic regression
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases -- confirm the target version before upgrading.
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(solver='lbfgs',
                                                multi_class='auto',
                                                max_iter=10000))
                    ])
text_clf = text_clf.fit(X_train, y_train)

train_time = time.time() - start_time
print('Done training Linear Classifier in', train_time, 'seconds.')

# Save model. The directory is (re)created here so this cell does not rely on
# another cell having made it, and the context manager guarantees the file is
# flushed and closed.
os.makedirs(MODEL_PATH, exist_ok=True)
with open(os.path.join(MODEL_PATH, "linear_classifier.pkl"), 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [None]:
# Linear classifier (logistic regression): reload the pickled pipeline and
# evaluate it on the test split.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(os.path.join(MODEL_PATH, "linear_classifier.pkl"), 'rb') as model_file:
    model = pickle.load(model_file)
y_pred = model.predict(X_test)
print('Linear Classifier, Accuracy =', np.mean(y_pred == y_test))
# scikit-learn metrics take (y_true, y_pred) in that order
print('LR, Accuracy_score =', accuracy_score(y_test, y_pred))
print('LR, F1_Score =', f1_score(y_test, y_pred, average='micro'))