In [None]:
# !pip3 install numpy
# !pip3 install pandas
# !pip3 install csv
# !pip3 install openpyxl

In [8]:
import csv
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Build the workbook locations with pathlib so the separators are correct on
# every OS (the original backslash-separated literals only resolve on Windows).
DATA_DIR = Path("datasets")
north_workbook = DATA_DIR / "พจนานุกรมภาษาถิ่นเหนือ.xlsx"

# Sheets of the northern workbook that are skipped on import.
excluding_sheets = ['อักษรย่อชนิดคำ', 'Example', 'note']

# Open the northern workbook once: the same handle lists the sheet names and
# reads the selected sheets, and the context manager closes the file afterwards.
with pd.ExcelFile(north_workbook) as north_book:
    all_sheets = north_book.sheet_names
    importing_sheet = [sheet for sheet in all_sheets if sheet not in excluding_sheets]
    # header=None keeps the first spreadsheet row as data; columns are labelled 0, 1, 2, ...
    north = pd.read_excel(north_book, sheet_name=importing_sheet, header=None)

# sheet_name=None loads every sheet into a {sheet name: DataFrame} dict.
northeast = pd.read_excel(DATA_DIR / "พจนานุกรมภาษาถิ่นอีสาน.xlsx", sheet_name=None, header=None)
south = pd.read_excel(DATA_DIR / "พจนานุกรมภาษาถิ่นใต้.xlsx", sheet_name=None, header=None)

In [10]:
# north_list = [alphabet for alphabet in north.keys()]

# print(north_list)

['ข', 'ก', 'ฃ', 'ค', 'ฅ', 'ง', 'จ', 'ช', 'ซ', 'ฌ', 'ญ', 'ด', 'ฏ', 'ฐ', 'ณ', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ป ต่อ', 'ปร (ผ)', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ว', 'ศ', 'ส', 'ห', 'อ', 'อย', 'ฮ']


In [11]:
# 1 inspecting the first few rows from each dataset

# Print the first rows of every sheet, one regional dictionary after another
# (north, then northeast, then south).
for dictionary in (north, northeast, south):
    for name, sheet in dictionary.items():
        print(f"Sheet name: {name}")
        print(sheet.head(), "\n")

Sheet name: ข
        0                                                  1    2
0  ช่อง 1                                                  2    3
1  ขกอยาก                                      ก. กระโดดขึ้น  NaN
2      ขง                            น. กรุง เมืองอาณาบริเวณ  NaN
3     NaN     น. กรง - สิ่งที่ทำเป็นซี่ๆ สำหรับขัง สัตว์เล็ก  NaN
4     NaN  ก. กลัด - อาการที่เลือดหรือหนองคั่งอยู่ในผิวหร...  NaN 

Sheet name: ก
        0                                              1    2    3
0  ช่อง 1                                              2    3  4.0
1      กก                        น. โคน อย่าง โคนต้นไม้   NaN  NaN
2     NaN                           น. ชื่อเรียกนกเงือก   NaN  NaN
3     NaN                น. ชื่อแม่น้ำใน จังหวัดเชียงราย  NaN  NaN
4     NaN  น.ชื่อหญ้าชนิดหนึ่งใช้ทำเลื่อ เรียกว่าเลื่อกก  NaN  NaN 

Sheet name: ฃ
         0                                             1
0  ฃาบอยาบ  ก. เซ่นผี สังเวยผี เซ่นผีบรรพชนในพิธีแต่งงาน 

Sheet name: ค
                     0    

In [12]:
# 2 identifying and shifting misaligned words

def align_headwords(df):
    """Shift cells one column to the left on rows whose headword cell is empty.

    The frame is expected to use the integer column labels 0, 1, 2, ... that
    ``pd.read_excel(..., header=None)`` produces.

    * Two-column frames: on every row where column 0 is null, column 1 is
      moved into column 0 and column 1 is cleared.
    * Frames with three or more columns: on every row where column 0 is null
      and column 1 is not, column 1 moves to column 0, column 2 moves to
      column 1, and column 2 receives column 3 (or is cleared when there is
      no column 3). Columns from 3 onwards are left as they are.
    * Frames with fewer than two columns have nothing to shift and are
      returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        One sheet of a dictionary workbook.

    Returns
    -------
    pandas.DataFrame
        A new frame with the shifted cells; ``df`` itself is not modified.
    """
    n_cols = df.shape[1]
    if n_cols < 2:
        # No column 1 to pull a value from (the row-wise version raised KeyError here).
        return df.copy()

    # Work on an object-typed copy: strings can then be moved into columns that
    # pandas inferred as float because they were entirely empty, and the
    # caller's frame is never mutated.
    aligned = df.astype({column: object for column in range(min(n_cols, 3))})

    missing_headword = aligned[0].isna()
    if n_cols == 2:
        aligned.loc[missing_headword, 0] = aligned.loc[missing_headword, 1].to_numpy()
        aligned.loc[missing_headword, 1] = None
        return aligned

    shift = missing_headword & aligned[1].notna()
    # Assign left to right so each source column is read before it is overwritten.
    aligned.loc[shift, 0] = aligned.loc[shift, 1].to_numpy()
    aligned.loc[shift, 1] = aligned.loc[shift, 2].to_numpy()
    aligned.loc[shift, 2] = aligned.loc[shift, 3].to_numpy() if n_cols > 3 else None
    return aligned

# Run the headword alignment over every sheet of each regional dictionary.
north = dict((name, align_headwords(sheet)) for name, sheet in north.items())
northeast = dict((name, align_headwords(sheet)) for name, sheet in northeast.items())
south = dict((name, align_headwords(sheet)) for name, sheet in south.items())


In [19]:
# 3 splitting POS from definitions

# Part-of-speech abbreviations recognised at the start of a definition cell.
pos_pattern = r"(น\.|ก\.|ว\.|สัน\.|สำนวน\.|ลัก\.|ล\.)"

def split_pos_from_definition(df):
    """Split a leading part-of-speech tag off the text held in column 1.

    For every row whose column 1 is a string that starts (after optional
    leading whitespace) with one of the tags in ``pos_pattern``, the tag is
    left in column 1 and the remaining text, stripped of surrounding
    whitespace, is written to column 2. Rows without a leading tag are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet with integer column labels; column 1 holds the tag + definition.

    Returns
    -------
    pandas.DataFrame
        A new frame that always contains columns 1 and 2 (created empty when
        absent from ``df``); ``df`` itself is not modified.
    """
    tag_at_start = re.compile(pos_pattern)
    result = df.copy()

    # Guarantee that both columns exist even when no row matches, so later
    # steps can rely on column 2 being present.
    for column in (1, 2):
        if column not in result.columns:
            result[column] = None

    tags = result[1].tolist()
    definitions = result[2].tolist()
    for position, cell in enumerate(tags):
        if not isinstance(cell, str):
            # Empty or numeric cells cannot carry a tag.
            continue
        text = cell.lstrip()
        found = tag_at_start.match(text)
        if found:
            tags[position] = found.group(0)
            definitions[position] = text[found.end():].strip()

    # Assign plain arrays so the values are written positionally, without
    # index alignment.
    result[1] = pd.Series(tags, dtype=object).to_numpy()
    result[2] = pd.Series(definitions, dtype=object).to_numpy()
    return result

# Separate the part-of-speech tag from the definition in every sheet.
north = dict((name, split_pos_from_definition(sheet)) for name, sheet in north.items())
northeast = dict((name, split_pos_from_definition(sheet)) for name, sheet in northeast.items())
south = dict((name, split_pos_from_definition(sheet)) for name, sheet in south.items())


In [21]:
# 4 handling multiple definitions and POS

# A run of Thai digits followed by a period, e.g. "๑." or "๑๐."; the digits
# are captured so re.split returns them alongside the text between markers.
thai_numbers_pattern = r"([๐๑๒๓๔๕๖๗๘๙]+)\."

def split_multiple_definitions(df):
    """Expand numbered definitions into one row per definition.

    Column 2 is split on Thai-numeral markers. A cell with at least one
    marker yields one output row per marker, formatted as
    ``"<number>. <text>"`` and repeating the row's headword (column 0) and
    POS (column 1). Cells without a marker, and non-string cells, produce a
    single row with the original values.

    NOTE(review): text that precedes the first marker in a cell is not
    carried over to the output - confirm that this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet with integer column labels 0 (headword), 1 (POS) and
        2 (definition). Missing columns are treated as empty.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``Headword``, ``POS`` and ``Definition``.
    """
    marker = re.compile(thai_numbers_pattern)
    # reindex guarantees that columns 0-2 exist; absent ones are filled with NaN.
    core = df.reindex(columns=[0, 1, 2])

    new_rows = []
    for headword, pos, definition in core.itertuples(index=False, name=None):
        pieces = marker.split(definition) if isinstance(definition, str) else [definition]
        if len(pieces) > 1:
            # With a capturing group, re.split returns
            # [preamble, number, text, number, text, ...].
            for number, text in zip(pieces[1::2], pieces[2::2]):
                new_rows.append([headword, pos, f"{number}. {text.strip()}"])
        else:
            new_rows.append([headword, pos, definition])
    return pd.DataFrame(new_rows, columns=['Headword', 'POS', 'Definition'])

# Expand numbered definitions into separate rows for every sheet.
north = dict((name, split_multiple_definitions(sheet)) for name, sheet in north.items())
northeast = dict((name, split_multiple_definitions(sheet)) for name, sheet in northeast.items())
south = dict((name, split_multiple_definitions(sheet)) for name, sheet in south.items())



In [23]:
# 5 merging alternative forms

# Phrases that flag a definition cell as an alternative-form note.
alt_forms_keywords = ["ก็เรียก", "ก็ว่า"]

def merge_alternative_forms(df):
    """Fold alternative-form rows into the row that precedes them.

    A row whose third column contains one of ``alt_forms_keywords`` is not
    kept as a row of its own: its third-column text is appended, after
    ``", "``, to the third column of the last kept row, or replaces it when
    that value is not a string. Only the third-column text of such a row is
    carried over. If a sheet starts with such a row there is nothing to
    merge into, so the row is kept as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose third column holds the definition text.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``. Frames with fewer than
        three columns are returned as an unchanged copy.
    """
    if df.shape[1] < 3:
        # No third column to inspect.
        return df.copy()

    merged = []
    for record in df.itertuples(index=False, name=None):
        current = list(record)
        definition = current[2]
        is_alt_form = any(keyword in str(definition) for keyword in alt_forms_keywords)
        # `merged` is empty only for a leading alternative-form row; the
        # original indexed new_df[-1] unconditionally and raised IndexError.
        if is_alt_form and merged:
            previous = merged[-1]
            if isinstance(previous[2], str):
                previous[2] += f", {definition}"
            else:
                previous[2] = f"{definition}"
        else:
            merged.append(current)

    return pd.DataFrame(merged, columns=df.columns)

# Merge alternative-form rows into their preceding entry in every sheet.
north = dict((name, merge_alternative_forms(sheet)) for name, sheet in north.items())
northeast = dict((name, merge_alternative_forms(sheet)) for name, sheet in northeast.items())
south = dict((name, merge_alternative_forms(sheet)) for name, sheet in south.items())


In [26]:
# 6 cleaning up

def final_cleanup(df):
    """Return only the rows in which Headword, POS and Definition are all present.

    Rows with a missing value in any of the three columns are left out; the
    remaining rows keep their original index labels and all of their columns.
    """
    required_columns = ['Headword', 'POS', 'Definition']
    is_complete = df[required_columns].notna().all(axis=1)
    return df[is_complete]

# Drop incomplete rows from every sheet of each regional dictionary.
north = dict((name, final_cleanup(sheet)) for name, sheet in north.items())
northeast = dict((name, final_cleanup(sheet)) for name, sheet in northeast.items())
south = dict((name, final_cleanup(sheet)) for name, sheet in south.items())


In [30]:
# Write one CSV file per sheet, named cleaned_<region>_<sheet name>.csv.
# pathlib keeps the paths OS-independent; the original backslash-joined
# f-strings only produced valid paths on Windows.
output_path = Path("output") / "csv_test"
# to_csv does not create missing directories, so make sure the target exists.
output_path.mkdir(parents=True, exist_ok=True)

for region, sheets in (("north", north), ("northeast", northeast), ("south", south)):
    for sheet_name, df in sheets.items():
        # index=False leaves the row index out of the file; utf-8-sig prepends a BOM.
        df.to_csv(output_path / f"cleaned_{region}_{sheet_name}.csv", index=False, encoding='utf-8-sig')

In [17]:
# Display the 'ข' sheet of the southern dictionary.
# To inspect the other dictionaries, use north['ข'] or northeast['ข'] instead.
south['ข']

Unnamed: 0,0,1,2
0,ขก,ก.,
1,ขกแข็ก,ว.,
2,ขด,น.,
3,ขดแข็ดคดแค็ด,ว.,
4,ขน,น.,
...,...,...,...
1085,ดอแมว,ก็เรียก.,
1086,ไขว,ก.,
1087,ไขว้,ว.,
1088,ไขวใจ,ก.,


In [None]:
"""
1. xlsx --> csv files (or sheets)
2. run function
3. run for loop that applies this function into every xlsx
4. the output should be each csv file
5. for example -- print(f"{dict}_{dict[alphabet]}")
5.1 assuming dict is pd.read_excel as an input
5.2 dict[alphabet] is ก-ฮ
6 the output thus has 26*3 (78) csv files
"""

"""
function


"""

In [None]:
"""get csv inside xlsx"""
# for alphabet, content in south.items():
    # print(south[alphabet])

"""get key values--ก.-ฮ."""

north_list = [alphabet for alphabet in north.keys()]
northeast_list = [alphabet for alphabet in south.keys()]
south_list = [alphabet for alphabet in south.keys()]

kokai = south[south_list[0]]
kokai.head()

# for i in range(1, len(kokai)):
#     if pd.isna(kokai.iloc[i, 0]) and not pd.isna(kokai.iloc[i, 1]):
#         kokai.iloc[i, 0] = kokai.iloc[i, 1]
#         kokai.iloc[i, 1] = None

# kokai

In [None]:
# south.columns = ["Word", "POS_and_Definition"]

# south1 = south.loc[:, ~south.apply(lambda col: col.isna().all())]

# south = south.dropna(how="all")

# south_dict = south.to_dict(orient='records')

# def shift_values_left(d):
#     values = [v for v in d.values() if pd.notna(v)]
#     return {i: values[i] if i < len(values) else np.nan for i in d.keys()}

# south = pd.DataFrame([shift_values_left(d) for d in south_dict], index=["Word", "POS_Definition"])

# south[["POS", "Definition"]] = south["Definition"].str.split(" ", n=1, expand=True)

# south = south.drop("POS_Definition", axis=1)

In [None]:
# ver1

# def split_definition(row):
#     word = row[0]
#     definition = row[1] if pd.notna(row[1]) else ''
#     parts = re.split(r',\s*(?=(?:น\.|ก\.|ว\.|สัน\.|สำ\.|ลัก\.))', definition) 
#     result = []
#     for part in parts:
#         match = re.match(r'(น\.|ก\.|ว\.|สัน\.|สำ\.|ลัก\.)\s*(.*)', part)
#         if match:
#             pos, def_part = match.groups()
#             sub_parts = re.split(r'(?<=\.)\s*\d+\)\.\s*', def_part)
#             for sub_part in sub_parts:
#                 if sub_part.strip():
#                     result.append([word, pos.strip(), sub_part.strip()])
#         else:
#             result.append([word, '', part.strip()])
#     return result

# ver 2

# def split_definition(row):
#     word = str(row[0]) if pd.notna(row[0]) else ''
#     definition = str(row[1]) if pd.notna(row[1]) else ''
#     if not isinstance(definition, str):
#         return [[word, '', str(definition)]]
#     parts = re.split(r'(?:(?<=\s)|^)(?=(?:น\.|ก\.|ว\.|สัน\.|สำ\.|ลัก\.)|\d+\s+(?:น\.|ก\.|ว\.|สัน\.|สำ\.|ลัก\.))', definition)
#     result = []
#     current_pos = ''
#     for part in parts:
#         pos_match = re.match(r'^(?:\d+\s+)?(น\.|ก\.|ว\.|สัน\.|สำ\.|ลัก\.)', part)
#         if pos_match:
#             current_pos = pos_match.group(1)
#             def_part = part[pos_match.end():].strip()
#         else:
#             def_part = part.strip()
#         sub_parts = re.split(r'(?<=\.)\s*(?:\d+\.|\d+\)\.)\s*', def_part)
#         for sub_part in sub_parts:
#             if sub_part.strip():
#                 result.append([word, current_pos.strip(), sub_part.strip()])
#     return result if result else [[word, '', definition]]

# ver 3

# def split_definition(row):
#     word = str(row[0]) if pd.notna(row[0]) else ''
#     definition = str(row[1]) if pd.notna(row[1]) else ''
    
#     if not isinstance(definition, str):
#         return [word, '', '', str(definition)]
    
#     def remove_thai_numbers(s):
#         return re.sub(r'^[๑๒๓๔๕๖๗๘๙๐]+\.\s*', '', s)
    
#     pos_splits = re.split(r'(?:(?<=\s)|^)(?=(น\.|ก\.|ว\.|สัน\.|สำ\.|ลัก\.))', definition)
    
#     alternative_forms = []
#     pos_definitions = []
    
#     for i, part in enumerate(pos_splits):
#         if part in ['น.', 'ก.', 'ว.', 'สัน.', 'สำ.', 'ลัก.']:
#             current_pos = part
#         else:
#             number_splits = re.split(r'(?:(?<=\s)|^)(?=[๑๒๓๔๕๖๗๘๙๐]+\.)', part)
            
#             for sub_part in number_splits:
#                 clean_part = remove_thai_numbers(sub_part.strip())
#                 if clean_part:
#                     if i == 0:
#                         alternative_forms.append(clean_part)
#                     else:
#                         pos_definitions.append((current_pos.strip(), clean_part))
    
#     alt_forms_str = ', '.join(alternative_forms) if alternative_forms else ''
    
#     if pos_definitions:
#         pos_str = pos_definitions[0][0]
#         def_str = ', '.join([d[1] for d in pos_definitions])
#         return [word, alt_forms_str, pos_str, def_str]
#     else:
#         return [word, alt_forms_str, '', '']