In [71]:
import os
import re
import unicodedata

from pyvi import ViTokenizer

In [72]:
def remove_duplicates(text):
    """Collapse a run of the same word repeated back-to-back into one occurrence.

    Matching is case-insensitive; the first spelling in the run is the one kept.

    :param text: input string.
    :return: the string with consecutive duplicate words collapsed.
    """
    repeated_word = re.compile(r'\b(\w+)(\s+\1\b)+', flags=re.IGNORECASE)
    return repeated_word.sub(r'\1', text)

In [73]:
def remove_html_tags(text):
    """Strip HTML tags: every ``<...>`` span with at least one character inside.

    :param text: input string.
    :return: the string with the tag markup removed (inner text is kept).
    """
    tag_pattern = r'<[^>]+>'
    without_tags = re.sub(tag_pattern, '', text)
    return without_tags

In [74]:
def remove_code(text):
    """Remove ``<script>`` and ``<style>`` elements together with their content.

    The previous patterns used ``.?`` (zero or one character) where a lazy
    ``.*?`` was intended, so any element with attributes or more than one
    character of content was left untouched. Tag names are matched
    case-insensitively and the content may span several lines.

    :param text: input string, possibly containing HTML.
    :return: the string with script/style elements removed.
    """
    flags = re.DOTALL | re.IGNORECASE
    # `\b` keeps the match on the exact tag name; `[^>]*` skips any attributes.
    text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', text, flags=flags)
    text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=flags)
    return text

In [75]:
def remove_special_characters(text):
    """Delete unwanted characters while leaving a ``__label`` prefix intact.

    Two things are removed:
      * every character that is not a word character, whitespace or ``:``;
      * the literal ``_label`` when it is NOT preceded by another underscore.

    ``__label`` survives because ``_`` is a word character and the look-behind
    blocks the second alternative on the inner underscore.

    :param text: input string.
    :return: the filtered string.
    """
    unwanted = re.compile(r'[^\w\s:]|(?<!_)_label')
    return unwanted.sub('', text)

In [76]:
def remove_curly_braces_content(text):
    """Remove every ``{{...}}`` span, braces included.

    The match is lazy and does not cross a newline.

    :param text: input string.
    :return: the string without double-curly-brace spans.
    """
    double_brace_span = re.compile(r'{{.*?}}')
    return double_brace_span.sub('', text)

In [77]:
def format_content(text, inputFileWasReplaced):
    """Normalise one cleaned line into ``__label__<label> <content>`` form.

    Steps, in order: drop every ``:``, drop the word ``content`` (plus any
    whitespace after it), restore a leading ``_label__`` to ``__label__``,
    turn the underscore that follows ``__label__<label>`` into a space, then
    collapse whitespace runs to a single space.

    The label is matched literally. It used to be interpolated into a regex
    pattern and replacement template, so a label containing regex
    metacharacters (``.``, ``(``, ``+``, ``\\`` ...) either matched the wrong
    text or raised ``re.error``.

    :param text: the line to format.
    :param inputFileWasReplaced: label name derived from the input file name.
    :return: the formatted line, always terminated by a single newline.
    """
    text = text.replace(':', '')
    text = re.sub(r'content\s*', '', text)
    text = re.sub(r'^_label__', '__label__', text)
    label = '__label__' + inputFileWasReplaced
    # Plain string replacement: no regex interpretation of the label.
    text = text.replace(label + '_', label + ' ')
    text = re.sub(r'\s+', ' ', text)
    return text + '\n'

In [78]:
def remove_startswith_or_endswith_underscore_except_label(text, inputFileWasReplaced):
    """Trim one leading and one trailing underscore from each word.

    A word exactly equal to ``'__label__' + inputFileWasReplaced`` keeps its
    leading underscore. The trailing-underscore rule applies to every word.
    Words are re-joined with single spaces.

    :param text: whitespace-separated words.
    :param inputFileWasReplaced: label name used to build the protected token.
    :return: the re-joined, trimmed words.
    """
    protected_token = '__label__' + inputFileWasReplaced

    def _trim(token):
        # At most one underscore is removed from each side.
        if token != protected_token and token.startswith('_'):
            token = token[1:]
        return token[:-1] if token.endswith('_') else token

    return ' '.join(_trim(token) for token in text.split())

In [79]:
def remove_stopwords(output_file_path, inputFileWasReplaced):
    """Copy a cleaned file into ``tonghop/`` with stop words filtered out.

    Stop words are read from ``stopwords.txt`` in the current working
    directory, one entry per line. Each whitespace-separated token of the
    input is dropped when its lower-cased form is in that set. The result is
    written to ``tonghop/tonghop_<inputFileWasReplaced>.txt``, overwriting it.

    :param output_file_path: path of the cleaned file to read.
    :param inputFileWasReplaced: label name used in the output file name.
    """
    tonghop_directory = 'tonghop'
    if not os.path.exists(tonghop_directory):
        os.makedirs(tonghop_directory)

    with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
        stopwords = set(stop_file.read().splitlines())

    filetonghop = os.path.join(tonghop_directory, 'tonghop_' + inputFileWasReplaced + '.txt')

    with open(output_file_path, 'r', encoding='utf-8') as source:
        with open(filetonghop, 'w', encoding='utf-8') as target:
            for line in source:
                # Keep only the tokens whose lower-cased form is not a stop word.
                kept = filter(lambda token: token.lower() not in stopwords, line.split())
                # Write the filtered line to the aggregated output file.
                target.write(' '.join(kept) + '\n')

In [80]:
def loaddicchar() -> dict:
    """Build the character lookup table used by ``convert_unicode``.

    Both literals below are split on ``|`` and paired by position: the i-th
    entry of ``char1252`` becomes a key whose value is the i-th entry of
    ``charutf8``.

    NOTE(review): the two literals render identically. Presumably ``char1252``
    stores a different encoding of each letter (e.g. base letter + combining
    mark) than ``charutf8`` -- verify the raw code points before editing
    either string, and keep both lists the same length.

    :return: dict mapping each ``char1252`` entry to its ``charutf8`` entry.
    """
    dic = {}
    # Source forms (keys of the mapping).
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    # Target forms (values of the mapping), in the same order as char1252.
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic
# Built once at cell execution; convert_unicode reads this module-level table.
dicchar = loaddicchar()

In [81]:
def convert_unicode(txt: str) -> str:
    """Replace each listed Vietnamese letter form with its ``dicchar`` value.

    Every substring matching one of the alternatives in the pattern is looked
    up in the module-level ``dicchar`` table and substituted; everything else
    is left as is.

    NOTE(review): the alternatives are presumably the same forms used as keys
    in ``loaddicchar`` -- a match that is not a key of ``dicchar`` would raise
    ``KeyError``. Verify the raw code points before editing the pattern.

    :param txt: input text.
    :return: text with the matched forms replaced.
    """
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar[x.group()], txt)

# Vowel table: one row per base vowel. Columns 0-5 hold the vowel without a
# tone mark followed by its five toned forms; column 6 is an ASCII spelling
# that is NOT indexed below.
bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
# One entry per tone column of the table above (not used elsewhere in this section).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse index: vowel character -> (row, column) in bang_nguyen_am.
nguyen_am_to_ids = {}

for i, row in enumerate(bang_nguyen_am):
    # Skip the last column: it is the ASCII spelling, not a vowel character.
    for j, vowel in enumerate(row[:-1]):
        nguyen_am_to_ids[vowel] = (i, j)

In [82]:
def is_valid_vietnam_word(word):
    """Return True when all vowels of ``word`` form one contiguous run.

    A character counts as a vowel when it is a key of ``nguyen_am_to_ids``.
    Words with zero or one vowel are accepted.

    :param word: the word to check.
    :return: False as soon as two successive vowels are not adjacent.
    """
    vowel_positions = [pos for pos, ch in enumerate(word) if ch in nguyen_am_to_ids]
    return all(
        later - earlier == 1
        for earlier, later in zip(vowel_positions, vowel_positions[1:])
    )

In [83]:
def chuan_hoa_dau_tu_tieng_viet(word: str) -> str:
    """Re-place the tone mark of a single word using the rules coded below.

    The word is returned unchanged when ``is_valid_vietnam_word`` rejects it,
    or when fewer than two vowel positions were recorded and no ``qu``/``gi``
    cluster was seen. Otherwise every toned vowel is first reduced to its
    toneless base (column 0 of ``bang_nguyen_am``) and the last tone index
    seen is written back onto one vowel chosen by the branches below.

    Only characters present in ``nguyen_am_to_ids`` are treated as vowels;
    that table holds lower-case letters only.

    :param word: a single word.
    :return: the word with its tone mark repositioned, or the original word.
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0  # tone column (0 = no tone) of the last toned vowel seen
    nguyen_am_index = []  # positions of the vowels that may carry the tone
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            # Not a vowel from the table.
            continue
        elif x == 9:  # row of 'u': check for a preceding 'q' (cluster "qu")
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # row of 'i': check for a preceding 'g' (cluster "gi")
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            # Remember the tone and strip it from this vowel.
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        # Record the position, except index 1 once a qu/gi cluster has been
        # flagged -- presumably because that u/i belongs to the initial cluster.
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)
    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                # Two-letter word ("qu"/"gi"): the tone goes on the second letter.
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                # Prefer the character at index 2 when it is a vowel;
                # otherwise put the tone back on the i/u of the cluster.
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        # Fewer than two candidate vowels and no qu/gi cluster: keep the input.
        return word

    # Rows 4 and 8 take priority: the first such vowel found receives the tone.
    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            # for index2 in nguyen_am_index:
            #     if index2 != index:
            #         x, y = nguyen_am_to_ids[chars[index]]
            #         chars[index2] = bang_nguyen_am[x][0]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            # The word ends with the second vowel: tone on the first vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            # chars[nguyen_am_index[1]] = bang_nguyen_am[x][0]
        else:
            # Something follows the vowel pair: tone on the second vowel.
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # Three or more vowels: tone on the second one.
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
        # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[2]]]
        # chars[nguyen_am_index[2]] = bang_nguyen_am[x][0]
    return ''.join(chars)

In [84]:
def chuan_hoa_dau_cau_tieng_viet(sentence):
    """
        Convert a Vietnamese sentence to the old-style tone-mark placement.

        The sentence is lower-cased and split on whitespace. For each word,
        leading and trailing punctuation (Unicode category ``P*``) is set
        aside; if what remains consists only of letters and dots and ends
        with a letter, it is normalised with ``chuan_hoa_dau_tu_tieng_viet``
        and the punctuation is re-attached. Any other word is kept as is.

        The previous implementation relied on ``\\p{P}`` / ``\\p{L}``, which
        the standard ``re`` module rejects (``re.error: bad escape \\p``), and
        on splitting by ``/``, which silently dropped slashes from words.

        :param sentence: input sentence.
        :return: the lower-cased sentence with normalised words, joined by
                 single spaces.
    """
    def _is_punctuation(ch):
        return unicodedata.category(ch).startswith('P')

    words = sentence.lower().split()
    for index, word in enumerate(words):
        # Peel punctuation off both ends of the word.
        start, end = 0, len(word)
        while start < end and _is_punctuation(word[start]):
            start += 1
        while end > start and _is_punctuation(word[end - 1]):
            end -= 1
        core = word[start:end]
        # Equivalent of the intended `[\p{L}.]*\p{L}+`: letters and dots
        # only, with a letter in last position.
        if core and core[-1].isalpha() and all(ch.isalpha() or ch == '.' for ch in core):
            words[index] = word[:start] + chuan_hoa_dau_tu_tieng_viet(core) + word[end:]
    return ' '.join(words)

In [85]:
def remove_html(txt):
    """Strip every ``<...>`` span from ``txt``, including an empty ``<>``.

    :param txt: input string.
    :return: the string without the angle-bracket spans.
    """
    angle_span = re.compile(r'<[^>]*>')
    return angle_span.sub('', txt)

In [86]:
def clean_text_from_file(folder_path):
    """Clean every file of ``folder_path`` and write the results to disk.

    For each regular file in ``folder_path``:
      * the label is the file name without its first 9 and last 11 characters;
      * lines starting with ``Title`` and empty lines are skipped;
      * each remaining line goes through the cleaning steps below and is
        appended to ``result/<label>.txt``;
      * ``remove_stopwords`` is then run on that result file.

    Fixes compared with the previous version:
      * files are read from ``folder_path`` (the directory used to be
        hard-coded as ``'output'``, ignoring the argument);
      * the label is obtained by slicing; ``str.replace`` removed every
        occurrence of the prefix/suffix text, not only the ends;
      * the result path is computed before the line loop, so it is never
        unbound or left over from a previous file when no line is written;
      * the result file is opened once per input file and no longer shadows
        the input handle;
      * directory entries that are not regular files (e.g.
        ``.ipynb_checkpoints``) are skipped instead of crashing ``open``;
      * ``remove_code`` runs before tags are stripped and ``{{...}}`` removal
        runs before braces are stripped; in the old order both steps could
        never match anything.

    NOTE: the result file is opened in append mode (unchanged), so running
    this twice without clearing ``result/`` duplicates its content.

    :param folder_path: directory containing the raw input files.
    """
    result_directory = 'result'
    os.makedirs(result_directory, exist_ok=True)

    for inputFile in os.listdir(folder_path):
        input_path = os.path.join(folder_path, inputFile)
        if not os.path.isfile(input_path):
            continue

        # Drop the 11-character suffix, then the 9-character prefix.
        inputFileWasReplaced = inputFile[:-11][9:]
        output_file = os.path.join(result_directory, inputFileWasReplaced + '.txt')

        with open(input_path, 'r', encoding='utf-8') as source, \
                open(output_file, 'a', encoding='utf-8') as sink:
            for text in source:
                if text.startswith('Title') or text == '\n':
                    continue

                # Collapse consecutively repeated words.
                cleaned_text = remove_duplicates(text)

                # Remove script/style elements while their tags still exist.
                cleaned_text = remove_code(cleaned_text)

                # Remove {{...}} spans while the braces still exist.
                cleaned_text = remove_curly_braces_content(cleaned_text)

                # Strip the remaining HTML tags.
                cleaned_text = remove_html_tags(cleaned_text)

                cleaned_text = convert_unicode(cleaned_text)

                cleaned_text = ViTokenizer.tokenize(cleaned_text)

                cleaned_text = cleaned_text.lower()

                # Drop unwanted characters but keep the "__label" prefix.
                cleaned_text = remove_special_characters(cleaned_text)

                cleaned_text = remove_startswith_or_endswith_underscore_except_label(cleaned_text, inputFileWasReplaced)

                cleaned_text = format_content(cleaned_text, inputFileWasReplaced)

                # Append the cleaned line to the result file.
                sink.write(cleaned_text)

        remove_stopwords(output_file, inputFileWasReplaced)

In [87]:
# Run the whole cleaning pipeline on the files of the 'output' directory.
# Results are appended to files under 'result/', and remove_stopwords writes
# its filtered copies under 'tonghop/'.
output_directory = 'output'
clean_text_from_file(output_directory)

In [88]:
# with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
#     stopwords = set(stop_file.read().splitlines())
# with open('result/chính_phủ.txt', 'r', encoding='utf-8') as input_file:
#     for line in input_file:
#         # split the line into words and check whether each one is in the stopwords set
#         words = line.split()
#         print([word for word in words if word.lower() in stopwords])
#         words = [word for word in words if word.lower() not in stopwords]

# result_directory = 'result'
# if not os.path.exists(result_directory):
#     os.makedirs(result_directory)
# output_file = os.path.join(result_directory, 'chính_phủ' + '.txt')
# remove_stopwords(output_file, 'chính_phủ')

# stopwords = ['thông']
# paragraph = 'thông cáo thông_tin thông_báo'

# words = paragraph.split()
# for word in words:
#     if word.lower() in stopwords:
#         print(word)