In [None]:
# !pip3 install numpy
# !pip3 install pandas
# !pip3 install csv
# !pip3 install openpyxl

In [1]:
import csv
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Build the workbook paths with pathlib so the separators are correct on every
# OS (backslash-joined literals only resolve as nested paths on Windows).
dataset_dir = Path("datasets")
north_workbook_path = dataset_dir / "พจนานุกรมภาษาถิ่นเหนือ.xlsx"
northeast_workbook_path = dataset_dir / "พจนานุกรมภาษาถิ่นอีสาน.xlsx"
south_workbook_path = dataset_dir / "พจนานุกรมภาษาถิ่นใต้.xlsx"

# Sheets of the northern workbook that are skipped on import.
excluding_sheets = ['อักษรย่อชนิดคำ', 'Example', 'note']

# Open the northern workbook once: list its sheets, then read the wanted ones
# through the same handle. The context manager closes the file afterwards.
with pd.ExcelFile(north_workbook_path) as north_workbook:
    all_sheets = north_workbook.sheet_names
    importing_sheet = [sheet for sheet in all_sheets if sheet not in excluding_sheets]
    north = pd.read_excel(north_workbook, sheet_name=importing_sheet, header=None)

# sheet_name=None loads every sheet; like the list form above, the result is a
# dict mapping sheet name -> DataFrame. header=None keeps integer column labels.
northeast = pd.read_excel(northeast_workbook_path, sheet_name=None, header=None)
south = pd.read_excel(south_workbook_path, sheet_name=None, header=None)

north

{'ข':                       0                                                  1  \
 0                ช่อง 1                                                  2   
 1                ขกอยาก                                      ก. กระโดดขึ้น   
 2                    ขง                            น. กรุง เมืองอาณาบริเวณ   
 3                   NaN     น. กรง - สิ่งที่ทำเป็นซี่ๆ สำหรับขัง สัตว์เล็ก   
 4                   NaN  ก. กลัด - อาการที่เลือดหรือหนองคั่งอยู่ในผิวหร...   
 ...                 ...                                                ...   
 1203          ไข้หวัดนก  น. ไข้ที่เกิดจากเชื้อไวรัสที่แพร่เชื้อมาจากสัต...   
 1204               ไขว่                  น. เรียกกับข้าวของแห้งว่า ของไขว่   
 1205                NaN  ก. ไขว้กัน ก่ายกัน สานอย่างลานไม้ไผ่เป็นตาๆ เช...   
 1206  ไขว่ขว้างขวิดขวาง                          ว. ระเกะระกะ ไม่มีระเบียบ   
 1207             ไขว่ผี       ก. เซ่นผี สังเวยผี เซ่นผีบรรพชนในพิธีแต่งงาน   
 
         2  
 0       3  
 1     NaN  
 2    

In [None]:
# north_list = [alphabet for alphabet in north.keys()]

# print(north_list)

In [2]:
pos_pattern = r"(น\.|ก\.|ว\.|สัน\.|สำนวน\.|ลัก\.|ล\.)"
alt_forms_keywords = ["ก็ว่า", "ก็เรียก"]

def clean_alternative_forms(definition):
    """Normalise the alternative-form markers "ก็ว่า" / "ก็เรียก" in a definition.

    Args:
        definition: Definition text (str) that may contain the markers.

    Returns:
        The text with
        * "ก็ว่า =" rewritten to "ก็ว่า. ",
        * "ก็เรียก, บางถิ่นเรียก" rewritten to "ก็เรียก. ",
        * every marker not already followed by "." given a trailing ".",
        * leading/trailing whitespace stripped.
    """
    # These rewrites must run before the period is added: once a marker reads
    # "ก็ว่า." the patterns below (which expect "=" or "," right after the
    # marker) can no longer match.
    definition = re.sub(r"ก็ว่า\s*=\s*", "ก็ว่า. ", definition)
    definition = re.sub(r"ก็เรียก,\s*บางถิ่นเรียก\s*", "ก็เรียก. ", definition)

    # Add the period only where it is missing, so a marker that already reads
    # "ก็ว่า." in the middle of the text is not turned into "ก็ว่า..".
    definition = re.sub(r"(ก็ว่า|ก็เรียก)(?!\.)", r"\1.", definition)

    return definition.strip()

def align_and_split_pos(df):
    """Carry headwords down, split "POS + definition" cells, merge alternative forms.

    Pass 1 walks the rows of ``df`` (columns are read by the integer labels
    0, 1 and 2):
      * an empty cell in column 0 inherits the last non-empty headword;
      * column 1 is split on ``pos_pattern``; each POS marker found becomes its
        own ``[headword, pos, definition]`` row. Text in front of the first
        marker is not carried into any row;
      * when no marker is found (or column 1 is empty) the row is kept as
        ``[headword, column 1, column 2]``.

    Pass 2 looks for rows whose definition contains one of
    ``alt_forms_keywords``. Such a row is normalised with
    ``clean_alternative_forms`` and appended to the definition of the row
    before it instead of being emitted on its own.

    NOTE(review): ``pos_pattern`` is unanchored, so the same abbreviations are
    also matched inside running text - confirm this is acceptable for the data.
    NOTE(review): the merge targets the preceding row even when that row has a
    different headword - confirm this is intended.

    Args:
        df: Raw sheet with integer column labels 0, 1, 2.

    Returns:
        DataFrame with columns ``Headword``, ``POS``, ``Definition``.
    """
    new_rows = []
    current_headword = None

    for _, row in df.iterrows():
        if pd.notnull(row[0]):
            current_headword = row[0]

        cell = row[1]
        # With one capture group re.split returns [before, pos, text, pos, text, ...],
        # so the POS markers sit at odd indexes and their texts right after them.
        parts = re.split(pos_pattern, str(cell)) if pd.notnull(cell) else []
        if len(parts) > 1:
            for pos, definition in zip(parts[1::2], parts[2::2]):
                new_rows.append([current_headword, pos.strip(), definition.strip()])
        else:
            new_rows.append([current_headword, cell, row[2]])

    final_rows = []
    for headword, pos, definition in new_rows:
        is_alt_form = any(keyword in str(definition) for keyword in alt_forms_keywords)
        if not is_alt_form:
            final_rows.append([headword, pos, definition])
            continue

        cleaned = clean_alternative_forms(str(definition))
        if not final_rows:
            # Nothing to merge into yet: keep the row rather than losing it.
            final_rows.append([headword, pos, cleaned])
            continue

        previous = final_rows[-1]
        if pd.notnull(previous[2]):
            # Format instead of "+=" so a non-string previous value (for
            # example a number read from the sheet) cannot raise TypeError.
            previous[2] = f"{previous[2]}, {cleaned}"
        else:
            previous[2] = cleaned

    return pd.DataFrame(final_rows, columns=['Headword', 'POS', 'Definition'])

# Run align_and_split_pos over every sheet of the three dictionaries.
# Each name is rebound to its processed result, so executing this cell again
# feeds already-processed frames back into the function.
north = {sheet_name: align_and_split_pos(df) for sheet_name, df in north.items()}
northeast = {sheet_name: align_and_split_pos(df) for sheet_name, df in northeast.items()}
south = {sheet_name: align_and_split_pos(df) for sheet_name, df in south.items()}

# pathlib supplies the right separator on every OS, and mkdir keeps to_csv
# from failing when the output folder does not exist yet.
output_path = Path("output") / "csv_test2"
output_path.mkdir(parents=True, exist_ok=True)

# One CSV per sheet, named cleaned_<region>_<sheet>.csv. utf-8-sig writes a BOM.
for region, sheets in (("north", north), ("northeast", northeast), ("south", south)):
    for sheet_name, df in sheets.items():
        df.to_csv(output_path / f"cleaned_{region}_{sheet_name}.csv", index=False, encoding='utf-8-sig')

In [2]:
# 2 identifying and shifting misaligned words

pos_pattern = r"(น\.|ก\.|ว\.|สัน\.|สำนวน\.|ลัก\.|ล\.)"

def align_and_split_pos(df):
    """Fill missing headwords downward and split "POS + definition" cells.

    Columns are read by the integer labels 0, 1 and 2. A row whose column 0 is
    empty reuses the last headword seen. Column 1 is split on ``pos_pattern``:
    every marker found yields one ``[headword, pos, definition]`` row, and a
    row without any marker (or with an empty column 1) is passed through as
    ``[headword, column 1, column 2]``.

    Returns a DataFrame with columns ``Headword``, ``POS``, ``Definition``.
    """
    records = []
    last_headword = None

    for _, entry in df.iterrows():
        first_cell = entry[0]
        second_cell = entry[1]

        if not pd.isnull(first_cell):
            last_headword = first_cell

        # re.split with a single capture group gives
        # [before, marker, text, marker, text, ...].
        pieces = re.split(pos_pattern, str(second_cell)) if pd.notnull(second_cell) else []

        if len(pieces) > 1:
            for marker, text in zip(pieces[1::2], pieces[2::2]):
                records.append([last_headword, marker.strip(), text.strip()])
            continue

        records.append([last_headword, second_cell, entry[2]])

    return pd.DataFrame(records, columns=['Headword', 'POS', 'Definition'])


# Run align_and_split_pos over every sheet of the three dictionaries.
# Each name is rebound to its processed result, so executing this cell again
# feeds already-processed frames back into the function.
north = {sheet_name: align_and_split_pos(df) for sheet_name, df in north.items()}
northeast = {sheet_name: align_and_split_pos(df) for sheet_name, df in northeast.items()}
south = {sheet_name: align_and_split_pos(df) for sheet_name, df in south.items()}

# pathlib supplies the right separator on every OS, and mkdir keeps to_csv
# from failing when the output folder does not exist yet.
output_path = Path("output") / "csv_test"
output_path.mkdir(parents=True, exist_ok=True)

# One CSV per sheet, named cleaned_<region>_<sheet>.csv. utf-8-sig writes a BOM.
for region, sheets in (("north", north), ("northeast", northeast), ("south", south)):
    for sheet_name, df in sheets.items():
        df.to_csv(output_path / f"cleaned_{region}_{sheet_name}.csv", index=False, encoding='utf-8-sig')

In [None]:
# 1 inspecting the first few rows from each dataset

# Preview the first rows of every sheet, visiting the dictionaries in the
# order north, northeast, south.
for sheets in (north, northeast, south):
    for sheet_name, df in sheets.items():
        print(f"Sheet name: {sheet_name}")
        print(df.head(), "\n")

In [7]:
# Regular expression to detect POS
pos_pattern = r"(น\.|ก\.|ว\.|สัน\.|สำนวน\.|ลัก\.|ล\.)"
alt_forms_keywords = ["ก็ว่า", "ก็เรียก"]  # Keywords to detect alternative forms

def align_and_split_pos(df):
    """Carry headwords down, split "POS + definition" cells, merge alternative forms.

    Pass 1 walks the rows of ``df`` (columns are read by the integer labels
    0, 1 and 2):
      * an empty cell in column 0 inherits the last non-empty headword;
      * column 1 is split on ``pos_pattern``; each POS marker found becomes its
        own ``[headword, pos, definition]`` row. Text in front of the first
        marker is not carried into any row;
      * when no marker is found (or column 1 is empty) the row is kept as
        ``[headword, column 1, column 2]``.

    Pass 2 looks for rows whose definition contains one of
    ``alt_forms_keywords`` and appends that definition to the row before it
    instead of emitting the row on its own.

    NOTE(review): ``pos_pattern`` is unanchored, so the same abbreviations are
    also matched inside running text - confirm this is acceptable for the data.
    NOTE(review): the merge targets the preceding row even when that row has a
    different headword - confirm this is intended.

    Args:
        df: Raw sheet with integer column labels 0, 1, 2.

    Returns:
        DataFrame with columns ``Headword``, ``POS``, ``Definition``.
    """
    new_rows = []
    current_headword = None  # Last non-empty headword, reused for rows without one

    for _, row in df.iterrows():
        if pd.notnull(row[0]):
            current_headword = row[0]

        cell = row[1]
        # With one capture group re.split returns [before, pos, text, pos, text, ...],
        # so the POS markers sit at odd indexes and their texts right after them.
        parts = re.split(pos_pattern, str(cell)) if pd.notnull(cell) else []
        if len(parts) > 1:
            for pos, definition in zip(parts[1::2], parts[2::2]):
                new_rows.append([current_headword, pos.strip(), definition.strip()])
        else:
            # No POS marker found (or empty cell): keep the row as it is.
            new_rows.append([current_headword, cell, row[2]])

    final_rows = []
    for headword, pos, definition in new_rows:
        is_alt_form = any(keyword in str(definition) for keyword in alt_forms_keywords)
        if not is_alt_form:
            final_rows.append([headword, pos, definition])
            continue

        if not final_rows:
            # Nothing to merge into yet: keep the row rather than losing it.
            final_rows.append([headword, pos, definition])
            continue

        addition = str(definition).strip()
        previous = final_rows[-1]
        if pd.notnull(previous[2]):
            # Format instead of "+=" so a non-string previous value (for
            # example a number read from the sheet) cannot raise TypeError.
            previous[2] = f"{previous[2]}, {addition}"
        else:
            previous[2] = addition

    return pd.DataFrame(final_rows, columns=['Headword', 'POS', 'Definition'])

# Apply align_and_split_pos to every sheet of the three dictionaries.
# Each name is rebound to its processed result, so executing this cell again
# feeds already-processed frames back into the function.
north = {sheet_name: align_and_split_pos(df) for sheet_name, df in north.items()}
northeast = {sheet_name: align_and_split_pos(df) for sheet_name, df in northeast.items()}
south = {sheet_name: align_and_split_pos(df) for sheet_name, df in south.items()}

# pathlib supplies the right separator on every OS, and mkdir keeps to_csv
# from failing when the output folder does not exist yet.
output_path = Path("output") / "csv_test2"
output_path.mkdir(parents=True, exist_ok=True)

# One CSV per sheet, named cleaned_<region>_<sheet>.csv. utf-8-sig writes a BOM.
for region, sheets in (("north", north), ("northeast", northeast), ("south", south)):
    for sheet_name, df in sheets.items():
        df.to_csv(output_path / f"cleaned_{region}_{sheet_name}.csv", index=False, encoding='utf-8-sig')

In [None]:
# 3 splitting POS from definitions

pos_pattern = r"(น\.|ก\.|ว\.|สัน\.|สำนวน\.|ลัก\.|ล\.)"

def split_pos_from_definition(df):
    """Move a leading POS marker out of the second column into its own cell.

    For every row whose second column (as text) starts with ``pos_pattern``,
    the second column is replaced by the marker and the third column by the
    remaining text, stripped. Other rows are left as they are.

    Columns are addressed by position, so the function behaves the same for
    frames labelled 0/1/2 and for frames with named columns. The input frame
    is not modified; a new frame is returned.

    Args:
        df: Frame with at least three columns.

    Returns:
        A copy of ``df`` with the second and third columns updated.
    """
    result = df.copy()

    # Work on plain lists and assign whole columns back at the end; this avoids
    # writing strings cell-by-cell into a column that may hold only NaN.
    pos_values = result.iloc[:, 1].tolist()
    definition_values = result.iloc[:, 2].tolist()

    for i, cell in enumerate(pos_values):
        text = str(cell)
        # re.match anchors at the start, so only a leading marker counts.
        found = re.match(pos_pattern, text)
        if found:
            pos_values[i] = found.group(0)
            definition_values[i] = text[found.end():].strip()

    result[result.columns[1]] = pos_values
    result[result.columns[2]] = definition_values
    return result

# Apply split_pos_from_definition to each sheet of the three dictionaries and
# rebind the names to the results.
north, northeast, south = (
    {sheet_name: split_pos_from_definition(df) for sheet_name, df in sheets.items()}
    for sheets in (north, northeast, south)
)

north['ข']

In [None]:
# 4 handling multiple definitions and POS

thai_numbers_pattern = r"([๑๒๓๔๕๖๗๘๙])\."

def split_multiple_definitions(df):
    """Give every Thai-numbered sense of a definition its own row.

    The third column (as text) is split on ``thai_numbers_pattern`` (one Thai
    digit ๑-๙ followed by "."). Each numbered sense becomes a row
    ``[headword, pos, "<digit>. <text>"]`` that repeats the row's first two
    columns. Rows without a numbered marker are passed through unchanged.
    Text in front of the first marker is not carried into any row.

    Columns are read by position, so frames labelled 0/1/2 and frames labelled
    Headword/POS/Definition are both accepted without relying on integer keys
    being treated as positions on label-indexed rows.

    Args:
        df: Frame whose first three columns are headword, POS and definition.

    Returns:
        DataFrame with columns ``Headword``, ``POS``, ``Definition``.
    """
    new_rows = []
    # name=None yields plain tuples, which are positional by construction.
    for row in df.itertuples(index=False, name=None):
        headword, pos, definition = row[0], row[1], row[2]

        # One capture group -> [before, digit, text, digit, text, ...].
        parts = re.split(thai_numbers_pattern, str(definition))
        if len(parts) > 1:
            for number, text in zip(parts[1::2], parts[2::2]):
                new_rows.append([headword, pos, f"{number}. {text.strip()}"])
        else:
            new_rows.append([headword, pos, definition])

    return pd.DataFrame(new_rows, columns=['Headword', 'POS', 'Definition'])

# Apply split_multiple_definitions to each sheet of the three dictionaries and
# rebind the names to the results.
north, northeast, south = (
    {sheet_name: split_multiple_definitions(df) for sheet_name, df in sheets.items()}
    for sheets in (north, northeast, south)
)

north['ข']

In [23]:
# 5 merging alternative forms

alt_forms_keywords = ["ก็เรียก", "ก็ว่า"]

def merge_alternative_forms(df):
    """Fold alternative-form rows into the row that precedes them.

    A row counts as an alternative form when its third column (as text)
    contains one of ``alt_forms_keywords``. Its definition is appended to the
    previous kept row's definition, separated by ", ". If that previous
    definition is not a string, it is replaced instead. An alternative-form
    row with no kept row before it is emitted as a normal row.

    Args:
        df: Frame whose third column holds the definition text.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    merged_rows = []

    for i in range(len(df)):
        row_values = df.iloc[i].tolist()
        definition = row_values[2]
        is_alt_form = any(keyword in str(definition) for keyword in alt_forms_keywords)

        # Merge only when there is an earlier row; indexing merged_rows[-1] on
        # an empty list would raise IndexError for a leading alternative form.
        if is_alt_form and merged_rows:
            previous = merged_rows[-1]
            if isinstance(previous[2], str):
                previous[2] += f", {definition}"
            else:
                previous[2] = f"{definition}"
        else:
            merged_rows.append(row_values)

    return pd.DataFrame(merged_rows, columns=df.columns)

# Apply merge_alternative_forms to each sheet of the three dictionaries and
# rebind the names to the results.
north, northeast, south = (
    {sheet_name: merge_alternative_forms(df) for sheet_name, df in sheets.items()}
    for sheets in (north, northeast, south)
)


In [26]:
# 6 cleaning up

def final_cleanup(df):
    """Return the rows of ``df`` that have a value in all three output columns.

    Rows with a missing ``Headword``, ``POS`` or ``Definition`` are removed.
    The input frame is left untouched; ``dropna`` produces a new one.
    """
    required_columns = ['Headword', 'POS', 'Definition']
    return df.dropna(subset=required_columns)

# Apply final_cleanup to each sheet of the three dictionaries and rebind the
# names to the results.
north, northeast, south = (
    {sheet_name: final_cleanup(df) for sheet_name, df in sheets.items()}
    for sheets in (north, northeast, south)
)


In [30]:
# pathlib supplies the right separator on every OS, and mkdir keeps to_csv
# from failing when the output folder does not exist yet.
output_path = Path("output") / "csv_test"
output_path.mkdir(parents=True, exist_ok=True)

# One CSV per sheet, named cleaned_<region>_<sheet>.csv. utf-8-sig writes a BOM.
for region, sheets in (("north", north), ("northeast", northeast), ("south", south)):
    for sheet_name, df in sheets.items():
        df.to_csv(output_path / f"cleaned_{region}_{sheet_name}.csv", index=False, encoding='utf-8-sig')

In [None]:
# north['ข']
# northeast['ข']
# Display the frame stored under the sheet key 'ข' in `south`; the two
# commented lines above are the same lookup for the other dictionaries.
south['ข']

In [None]:
"""
1. xlsx --> csv files (or sheets)
2. run function
3. run for loop that applies this function into every xlsx
4. the output should be each csv file
5. for example -- print(f"{dict}_{dict[alphabet]}")
5.1 assuming dict is pd.read_excel as an input
5.2 dict[alphabet] is ก-ฮ
6 the output thus has 26*3 (78) csv files
"""

"""
function


"""

In [None]:
"""get csv inside xlsx"""
# for alphabet, content in south.items():
    # print(south[alphabet])

"""get key values--ก.-ฮ."""

# Sheet names (the dict keys) of each dictionary, in load order.
north_list = list(north.keys())
# Built from `northeast`; the earlier version read `south.keys()` here, which
# made this list a copy of south_list.
northeast_list = list(northeast.keys())
south_list = list(south.keys())

# First sheet of the southern dictionary, used as a small sample to inspect.
kokai = south[south_list[0]]
kokai.head()

# for i in range(1, len(kokai)):
#     if pd.isna(kokai.iloc[i, 0]) and not pd.isna(kokai.iloc[i, 1]):
#         kokai.iloc[i, 0] = kokai.iloc[i, 1]
#         kokai.iloc[i, 1] = None

# kokai