In [1]:
import json
import re
from unidecode import unidecode
import sys

In [2]:
def get_place_name(full_name):
    """Return a place name with its leading administrative-unit label removed.

    One leading label (e.g. "Tỉnh", "Quận", "Phường") followed by a single
    whitespace character is stripped. If what remains consists solely of
    decimal digits, leading zeros are dropped ("009" -> "9").

    Args:
        full_name: Full place name, e.g. "Quận 01" or "Tỉnh Hà Giang".

    Returns:
        The bare place name as a string.
    """
    # Strip the administrative-unit prefix; \A restricts the match to the
    # very start of the string, so labels inside a name are left alone.
    no_unit_name = re.sub(
        r"\A(Thành phố|Tỉnh|Huyện|Quận|Thị xã|Xã|Phường|Thị trấn)\s",
        "",
        full_name,
    )
    # Convert 009 -> 9, 0003 -> 3.
    # isdecimal() is used instead of isnumeric(): isnumeric() also accepts
    # characters such as "½" or "²" that int() cannot parse, which would
    # raise ValueError here.
    if no_unit_name.isdecimal():
        return str(int(no_unit_name))
    return no_unit_name

In [3]:
def create_doc(doc_raw):
    """Build the output record for one raw administrative-unit entry.

    Reads ``doc_raw['name']`` and ``doc_raw['type']`` and returns a dict with:
      * ``name``  - the name after ``get_place_name`` cleaning,
      * ``alias`` - the lower-cased name passed through ``unidecode`` with
        every character outside ``a-z0-9`` removed,
      * ``type``  - copied from the raw entry unchanged.
    """
    place_name = get_place_name(doc_raw['name'])
    # Lower-case first, then ASCII-fold, then keep only letters and digits.
    ascii_name = unidecode(place_name.lower())
    alias = re.sub(r"[^a-z0-9]", "", ascii_name)
    return {
        "name": place_name,
        "alias": alias,
        "type": doc_raw['type'],
    }

In [4]:
def transform(fin_name, fout_name):
    """Convert a raw administrative-unit JSON file into a nested, id-keyed one.

    The input's top-level ``data`` list holds level-1 entries; each carries a
    ``level2s`` list whose entries carry a ``level3s`` list. Every entry is
    converted with ``create_doc`` and stored under its id, rendered as
    ``str(int(id))`` so that leading zeros are dropped.

    Args:
        fin_name: Path of the raw JSON file to read.
        fout_name: Path of the JSON file to write (overwritten if it exists).
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # locale encoding; the output below is written as UTF-8 as well.
    with open(fin_name, 'r', encoding='utf-8') as json_file:
        dvhc_raw = json.load(json_file)['data']

    dvhc_res = {}
    for level1 in dvhc_raw:
        level1_doc = create_doc(level1)
        level1_doc['level2s'] = {}
        for level2 in level1['level2s']:
            level2_doc = create_doc(level2)
            level2_doc['level3s'] = {}
            for level3 in level2['level3s']:
                level3_key = str(int(level3['level3_id']))
                level2_doc['level3s'][level3_key] = create_doc(level3)

            level2_key = str(int(level2['level2_id']))
            level1_doc['level2s'][level2_key] = level2_doc
        dvhc_res[str(int(level1['level1_id']))] = level1_doc

    # Write file (non-ASCII characters are emitted as \uXXXX escapes).
    with open(fout_name, 'w', encoding='utf-8') as out_file:
        json.dump(dvhc_res, out_file, indent=4, ensure_ascii=True)

In [5]:
# Produce one cleaned output file per raw snapshot: (input path, output path).
for raw_path, out_path in (
    ('./raw/dvhcvn_2020_raw.json', 'dvhcvn_2020.json'),
    ('./raw/dvhcvn_2024_raw.json', 'dvhcvn_2024.json'),
):
    transform(raw_path, out_path)