In [1]:
import re
import unicodedata

import openpyxl
import pandas as pd

In [2]:
def excel_to_list2d(file_path: str, sheet_name: str):
    """Read one worksheet of an Excel workbook into a list of rows.

    The workbook is opened with ``data_only=True``, so formula cells yield
    their stored results rather than the formula text.

    Args:
        file_path: Path to the workbook file.
        sheet_name: Name of the worksheet to read.

    Returns:
        A list with one inner list of cell values per worksheet row, in
        sheet order. An empty worksheet yields an empty list.

    Raises:
        KeyError: If the workbook has no worksheet called ``sheet_name``.
    """
    wb = openpyxl.open(filename=file_path, data_only=True)
    ws = wb[sheet_name]
    # Use iter_rows() instead of slicing ``ws[1:ws.max_row]``: when the sheet
    # has a single row the slice returns that row itself (a flat tuple of
    # cells) rather than a tuple of rows, which breaks the nested iteration.
    list2d = [list(row) for row in ws.iter_rows(values_only=True)]

    return list2d

In [3]:
def list2d_to_new_excel(list2d: list, new_path: str):
    """Write a list of rows to the active sheet of a new workbook and save it.

    Args:
        list2d: Rows to write; element ``[r][c]`` is stored in worksheet
            row ``r + 1``, column ``c + 1`` (worksheet indices are 1-based).
        new_path: Destination path handed to ``Workbook.save``.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    # Start both counters at 1 so they can be used directly as cell coordinates.
    for excel_row, values in enumerate(list2d, start=1):
        for excel_col, value in enumerate(values, start=1):
            sheet.cell(row=excel_row, column=excel_col, value=value)

    workbook.save(new_path)

In [4]:
# Lookup table used by remove_diacritic(): each key is a plain ASCII letter and
# its list holds the lowercase Vietnamese variants (tone marks, breve,
# circumflex, horn, and "đ") that are replaced by that letter. Every list
# starts with the base letter itself.
# NOTE(review): the entries look like single precomposed characters; text that
# arrives in decomposed form would not match them as-is — confirm the input form.
unikey = {
    "a": [
        "a", "á", "à", "ả", "ã", "ạ",
        "ă", "ắ", "ằ", "ẳ", "ẵ", "ặ",
        "â", "ấ", "ầ", "ẩ", "ẫ", "ậ"
    ],
    "d": [
        "d", "đ"
    ],
    "e": [
        "e", "é", "è", "ẻ", "ẽ", "ẹ",
        "ê", "ế", "ề", "ể", "ễ", "ệ"
    ],
    "i": [
        "i", "í", "ì", "ỉ", "ĩ", "ị"
    ],
    "o": [
        "o", "ó", "ò", "ỏ", "õ", "ọ",
        "ô", "ố", "ồ", "ổ", "ỗ", "ộ",
        "ơ", "ớ", "ờ", "ở", "ỡ", "ợ"
    ],
    "u": [
        "u", "ú", "ù", "ủ", "ũ", "ụ",
        "ư", "ứ", "ừ", "ử", "ữ", "ự"
    ],
    "y": [
        "y", "ý", "ỳ", "ỷ", "ỹ", "ỵ"
    ]
}

In [5]:
def remove_diacritic(txt: str):
    """Turn text into a lowercase ASCII slug made of ``a-z``, ``0-9`` and ``_``.

    Steps: lowercase the text, replace every accented variant listed in the
    module-level ``unikey`` table with its base letter, turn every remaining
    character outside ``a-z0-9`` into ``_``, then collapse runs of ``_`` into
    one. Leading and trailing underscores are kept.

    Example:
        ``"01. Thành phố Cà Mau"`` -> ``"01_thanh_pho_ca_mau"``

    Args:
        txt: Text to convert.

    Returns:
        The converted string.
    """
    # Compose combining sequences (NFC) before the lookup. Without this, text
    # stored in decomposed form (base letter + combining mark) never matches
    # the table and each combining mark ends up as a stray "_".
    txt = unicodedata.normalize("NFC", txt.lower())
    for key, values in unikey.items():
        for val in values:
            # Literal substring replacement; the table entries are plain
            # characters, so no regex interpretation is wanted here.
            txt = txt.replace(unicodedata.normalize("NFC", val), key)
    txt = re.sub("[^a-z0-9]", "_", txt)
    txt = re.sub("_+", "_", txt)

    return txt

In [12]:
# Quick check of remove_diacritic() on a sample name; the resulting slug is the cell output.
a = "01. Thành phố Cà Mau"
remove_diacritic(a)

'01_thanh_pho_ca_mau'

In [7]:
# Load all rows of the "Data" sheet as a list of lists.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
table = excel_to_list2d(r"C:\05_Project\DDCI Ca Mau 2023\3. DSDN\2. Xu ly mau\cm2023_phanbo.xlsx", "Data")

In [8]:
# The first sheet row supplies the column names; the remaining rows are the data.
df = pd.DataFrame(table[1:], columns=table[0])

In [9]:
# Group rows by the "donvi_01" column. By default groupby() leaves out rows whose key is missing (dropna=True).
groups = df.groupby(by="donvi_01")

In [10]:
# Folder that receives the exported per-group workbooks.
# NOTE(review): absolute, machine-specific path; presumably it must already exist — confirm.
target_dir = r"C:\05_Project\DDCI Ca Mau 2023\3. DSDN\2. Xu ly mau\Processing\Step 8"

In [11]:
# Write "donvi_01" groups to one workbook each, visiting them in descending
# order of row count.
# dropna=True keeps this index in step with `groups`: groupby() leaves out rows
# whose key is missing, so a NaN entry here would make get_group() raise
# KeyError (and remove_diacritic() fail on a non-string name).
for idx in df.value_counts(subset=["donvi_01"], dropna=True).index:
    grp_name = idx[0]  # each index entry is a tuple of key values; take the only one
    grp_df = groups.get_group(grp_name)
    file_name = remove_diacritic(grp_name)
    grp_df.to_excel(target_dir+"\\"+file_name+".xlsx", index=False)
    # NOTE(review): this unconditional break stops after the first group, so
    # only one workbook is written — looks like a leftover from a trial run;
    # confirm before removing.
    break