# Outline

## North

- change from "สรรพ." to "ส."
- change from "สำ." to "สำนวน."
- change from "ลัก." to "น."
- if an interjection entry already has a POS, leave it unchanged; otherwise change r"อุทาน\.|คำอุทาน\." to "อ."
- remove any note or comment annotating the data in columns 4-5
- ค้างทุง "ความเปรียบ"

## South

- change from "สรรพ." to "ส."
- change from "ลัก." to "น."
- change from "อุ" to "อ."

## Isan

- change from "สรรพ." to "ส."
- change from "สำ." to "สำนวน."
- change from "ลัก." to "น."
- change from "อุ" to "อ."
- remove any note or comment in 5th column
- clear the first row

## Central

# Main Program

## Load the Data

In [None]:
# downloading packages

# !pip3 install numpy
# !pip3 install pandas
# csv is part of the Python standard library, so it needs no pip install
# !pip3 install openpyxl

In [None]:
# loading the data

import pandas as pd
import numpy as np
import os
import re

# north_path = r"datasets\พจนานุกรมภาษาถิ่นเหนือ.xlsx"
# isan_path = r"datasets\พจนานุกรมภาษาถิ่นอีสาน.xlsx"
# south_path = r"datasets\พจนานุกรมภาษาถิ่นใต้.xlsx"
# orst_path = r"datasets\ราชบัณฑิตฯ"

# second datasets
# Paths are assembled with os.path.join so they resolve on every platform;
# a hard-coded backslash is a directory separator on Windows only.
north_path = os.path.join("datasets_copy", "พจนานุกรมภาษาถิ่นเหนือ.xlsx")
isan_path = os.path.join("datasets_copy", "พจนานุกรมภาษาถิ่นอีสาน.xlsx")
south_path = os.path.join("datasets_copy", "พจนานุกรมภาษาถิ่นใต้.xlsx")
orst_path = os.path.join("datasets_copy", "ราชบัณฑิตฯ")

def load_sheets(file_path, exclude_sheets):
    """Read every sheet of an Excel workbook except the excluded ones.

    Args:
        file_path: Path of the workbook to read.
        exclude_sheets: Container of sheet names to skip.

    Returns:
        A dict mapping each remaining sheet name to its DataFrame. Sheets are
        read with ``header=None``, so the first row is kept as data.
    """
    # Open the workbook once and reuse the handle for both the sheet listing
    # and the read; the context manager closes the file afterwards.
    with pd.ExcelFile(file_path) as workbook:
        importing_sheets = [
            sheet for sheet in workbook.sheet_names
            if sheet not in exclude_sheets
        ]
        return pd.read_excel(workbook, sheet_name=importing_sheets, header=None)

# north_df = load_sheets(north_path, ['อักษรย่อชนิดคำ', 'Example', 'note'])

# sheet_name=None asks pandas for every sheet, so each *_df below is a dict of
# DataFrames keyed by sheet name; header=None keeps the first row as data.
north_df = pd.read_excel(north_path, sheet_name=None, header=None)
isan_df = pd.read_excel(isan_path, sheet_name=None, header=None)
south_df = pd.read_excel(south_path, sheet_name=None, header=None)

def load_excel(directory):
    """Read every Excel workbook that sits directly inside ``directory``.

    Sub-directories, Office lock files (``~$...``) and files without a
    spreadsheet extension are skipped instead of being handed to pandas,
    where they would raise.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A dict mapping each file name without its extension to the DataFrame
        read from that file with ``header=None``. Entries are inserted in
        sorted file-name order.
    """
    excel_suffixes = (".xlsx", ".xlsm", ".xls", ".xlsb", ".ods")
    excel_files = {}
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        name, suffix = os.path.splitext(filename)
        if filename.startswith("~$") or suffix.lower() not in excel_suffixes:
            continue
        if not os.path.isfile(file_path):
            continue
        excel_files[name] = pd.read_excel(file_path, header=None)
    return excel_files

# Load the workbooks found in the ORST directory into a dict keyed by file
# name without extension.
orst_df = load_excel(orst_path)

## Inspect the Data

In [None]:
# run to copy

# dict.copy() is shallow: the new dict would still hold the very same
# DataFrame objects, so any in-place edit would leak into the loaded data.
# Copy every sheet as well so re-running this cell really resets the state.
north = {sheet: frame.copy() for sheet, frame in north_df.items()}
isan = {sheet: frame.copy() for sheet, frame in isan_df.items()}
south = {sheet: frame.copy() for sheet, frame in south_df.items()}
orst = {sheet: frame.copy() for sheet, frame in orst_df.items()}

In [None]:
# inspect the data

def inspect(num=10, *dfs):
    """Print a preview of every sheet in each given dictionary.

    Args:
        num: Number of leading rows shown per sheet.
        *dfs: Dictionaries mapping a sheet name to a DataFrame.
    """
    for sheets in dfs:
        for sheet_name, frame in sheets.items():
            preview = frame.head(num)
            print(f"{sheet_name}\n{preview}\n")

# Preview the first 10 rows of every ORST sheet.
inspect(10, orst)
# inspect(10, north, south, isan, orst)

## Cleaning

In [None]:
# cleaning

def cleaning(*dfs):
    """Drop header-like rows from every sheet and reshape known datasets.

    Each argument is a dict mapping a sheet name to a DataFrame and is
    modified in place (the sheets are replaced by their cleaned versions).

    * Every sheet: rows in which any cell contains one of the ``removed``
      marker strings are dropped.
    * The module-level ``north`` dictionary: fully empty rows are dropped and
      only the first two columns are kept.
    * The module-level ``orst`` dictionary: the last column is dropped, a
      missing first cell is filled from the second cell, the second column is
      dropped, the index is reset and the columns are named ``word`` and
      ``meaning`` (so exactly two columns must remain at that point).

    Args:
        *dfs: Dictionaries of DataFrames to clean.
    """
    # NOTE(review): markers are matched as substrings of str(cell), so '2' and
    # '3' remove every row with a cell containing those digits anywhere --
    # confirm that this is intended and not meant as an exact header match.
    removed = [
        'ช่อง 1', 'ลูกคำ/ ความหมาย', 'แม่คำ', 'ความหมายลูกคำ คำสื่อ',
        'headword', '2', '3']

    def is_marker_row(row):
        return any(string in str(val) for string in removed for val in row)

    for df in dfs:
        for key in df.keys():
            frame = df[key]
            frame = frame[~frame.apply(is_marker_row, axis=1)]

            # Identity, not equality: `==` on two dicts compares their
            # DataFrames by value, which raises ValueError as soon as both
            # dicts share a key.
            if df is north:
                frame = frame.dropna(how='all')
                frame = frame.drop(frame.columns[2:], axis=1)

            elif df is orst:
                # remove last column (copy so the edits below do not write
                # into a slice of the previous frame)
                frame = frame.iloc[:, :-1].copy()
                first_col, second_col = frame.columns[0], frame.columns[1]

                # if the first cell is null, replace with second cell
                frame[first_col] = frame[first_col].fillna(frame[second_col])

                # drop the second column
                frame = frame.drop(columns=[second_col])

                # reset the index
                frame = frame.reset_index(drop=True)
                frame.columns = ['word', 'meaning']

            df[key] = frame

# Only the ORST data is cleaned in this run; the call that also covers the
# regional dictionaries is kept below, commented out.
cleaning(orst)
# cleaning(north, isan, south, orst)

# inspect(20, orst)
# inspect(10, north, south, isan, orst)

## Split POS

In [None]:
# orst

def rearrange(df=None):
    """Split each ORST entry into one row per sense with word/POS/definition.

    For every sheet the ``meaning`` text is split on ``;`` into one row per
    piece, duplicate rows are dropped, and a leading POS abbreviation is
    separated from the rest of the text. Pieces that do not start with a POS
    abbreviation get an empty POS and keep their whole text as definition.
    The first two columns (``word``, ``meaning``) are dropped at the end,
    leaving ``คำ``, ``POS`` and ``ความหมาย``.

    Args:
        df: Dict mapping a sheet name to a DataFrame with ``word`` and
            ``meaning`` as its first two columns. Defaults to the module-level
            ``orst`` dictionary, looked up when the function is called.

    The dictionary is updated in place; nothing is returned.
    """
    if df is None:
        # Resolve the default at call time so a re-created `orst` is honoured.
        df = orst

    pos_pattern = r"^(น\.|ก\.|ว\.|สัน\.|สำนวน\.|ล\.|ส\.|อ\.)(.*)$"

    for key in df.keys():
        # Work on a copy so the caller's original frame object is untouched.
        frame = df[key].copy()
        frame["คำ"] = frame['word']
        frame['meaning'] = frame['meaning'].str.split(';')
        frame = frame.explode('meaning')
        frame = frame.drop_duplicates()
        frame = frame.reset_index(drop=True)

        # split by POS
        frame[["POS", "ความหมาย"]] = frame['meaning'].str.extract(
            pos_pattern, expand=True)
        # Assign back instead of chained fillna(inplace=True), which may act
        # on a temporary and leave the frame unchanged.
        frame["POS"] = frame["POS"].fillna("")
        # Without a POS the regex captured nothing: fall back to the full
        # meaning text (the previous positional row[2] picked the headword).
        frame["ความหมาย"] = frame["ความหมาย"].where(
            frame["POS"] != "", frame['meaning'])

        # remove first two columns
        df[key] = frame.iloc[:, 2:]

# Called without an argument: operates on the module-level `orst` dictionary.
rearrange()

# def clean_whitespace(text):
#     if isinstance(text, str): 
#         return ' '.join(text.split())
#     return text

# for key in orst.keys():
#   for column in orst[key].columns:
#     if orst[key][column].dtype == 'object':
#       orst[key][column] = orst[key][column].apply(clean_whitespace)

def clean_whitespace(df):
    """Normalise whitespace in the string cells of every sheet.

    In each object-dtype column, string values have leading/trailing
    whitespace removed and inner whitespace runs collapsed to one space;
    non-string values are left as they are.

    Args:
        df: Dict mapping a sheet name to a DataFrame; its frames are edited.

    Returns:
        The same dictionary that was passed in.
    """
    def squeeze(text):
        if not isinstance(text, str):
            return text
        return ' '.join(text.split())

    for frame in df.values():
        for column in frame.columns:
            series = frame[column]
            if series.dtype != 'object':
                continue
            frame[column] = series.apply(squeeze)
    return df

# clean_whitespace edits the dictionary it is given and returns that same dict.
orst = clean_whitespace(orst)

# inspect(10, orst)

# Spot-check one sheet; 'หมวด_ก(3881)' must be an existing key of orst.
orst['หมวด_ก(3881)'].head(n=10)

In [None]:
# original

# A single capturing group that matches any one of the POS abbreviations.
# Because the group is capturing, re.findall returns the abbreviation itself
# and re.split keeps it as a separate item in its result.
pattern = r"(น\.|ก\.|ว\.|สัน\.|สำนวน\.|ล\.|ส\.|อ\.)"
# The bare string below is only a legend for the abbreviations (it is not
# assigned to anything): น. noun, ก. verb, ว. modifier (adjective/adverb),
# สัน. conjunction, สำนวน. idiom, ล. classifier, ส. pronoun, อ. interjection.
"""
น. --> นาม
ก. --> กริยา
ว. --> วิเศษณ์
สัน. --> สันธาน
สำนวน. --> สำนวน
ล. --> ลักษณนาม
ส. --> สรรพนาม
อ. --> อุทาน
"""

def split_pos_and_definition(df):
    """Name the columns of every sheet ``คำ`` / ``POS`` / ``ความหมาย``.

    Args:
        df: Dict mapping a sheet name to a DataFrame with exactly three
            columns (pandas raises ValueError on any other column count).

    Returns:
        The same dictionary, with each sheet's column labels replaced.

    NOTE(review): the nested ``split_row`` helper is defined but not applied
    to any sheet here; only the column labels change. Confirm whether the
    row-wise split should be re-enabled for the regional dictionaries.
    """
    def split_row(row):
        """Split one raw row into a (word, POS, definition) Series.

        All non-null cells are joined with spaces, then the text is cut at
        the first POS abbreviation matched by the module-level ``pattern``.
        """
        alt_forms = ["ก็ว่า.", "ก็เรียก."]

        text = ' '.join(str(val) for val in row if pd.notna(val))

        # Disabled normalisation steps, kept for reference:
        # split_text = re.split(r";", text)
        # print(split_text)

        # replace POS
        # text = re.sub(r'ลัก\.', r'ล.', text)
        # text = re.sub(r'สรรพ\.', r'ส.', text)
        # text = re.sub(r'คำอุทาน\.|อุทาน\.', r'อ.', text)

        # remove alternative forms
        # text = re.sub(r'\sดู\s.+', '', text)
        # text = re.sub(r'\(ดู\s*-\s*.+\)$', '', text)
        # text = re.sub(r'\(ดูเพิ่มเติมที่ .*\)', '', text)
        # text = re.sub(r'"([^"]*)"', r'\1', text)

        # Distinct POS abbreviations, in order of first appearance.
        matches = list(dict.fromkeys(re.findall(pattern, text)))
        if len(matches) > 1:
            # Abbreviations without their trailing dot.
            later_pos = matches[1][:-1]
            earlier_pos = matches[0][:-1]

            # >1 POS & 1 meaning: the two abbreviations sit next to each
            # other, so drop the first occurrence of the later-found one.
            if re.search(rf'{later_pos}\.\s?{earlier_pos}\.|{earlier_pos}\.\s?{later_pos}\.', text):
                text = re.sub(rf'{later_pos}\.', '', text, count=1)

            # >1 POS & >1 meaning: cut everything from the later-found
            # abbreviation to the end of the text.
            else:
                text = re.sub(rf'{later_pos}\..*$', '', text, count=1)

        elif len(matches) < 1:
            # check "ก็ว่า." and "ก็เรียก."
            # Test each phrase for membership; `any(alt_forms) in text`
            # evaluated `True in text` and raised TypeError.
            if any(form in text for form in alt_forms):
                print(text)
            else:
                text = ''

        # typos
        text = text.replace("“", "\"").replace("”", "\"").replace("''", "\"").replace("‘", "'").replace("’", "'")

        # re.split with a capturing pattern yields
        # [before, POS, after, POS, after, ...].
        parts = re.split(pattern, text)
        word = parts[0].strip()
        pos = parts[1].strip() if len(parts) > 1 else ''
        definition = ' '.join(parts[2:]).strip() if len(parts) > 2 else ''
        return pd.Series([word, pos, definition])

    for key in df.keys():
        df[key].columns = ['คำ', 'POS', 'ความหมาย']
    return df

# Only the ORST dictionary is processed in this run; the regional calls are
# kept commented out.
# north = split_pos_and_definition(north)
# isan = split_pos_and_definition(isan)
# south = split_pos_and_definition(south)
orst = split_pos_and_definition(orst)

inspect(20, orst)
# inspect(10, north, isan, south, orst)

In [None]:
def fill_blank_words(iter=2, *dfs):
    """Forward-fill blank cells in every sheet of every given dictionary.

    Per sheet and per pass: rows that are entirely empty are dropped, empty
    strings are turned into NaN, and every NaN is replaced by the closest
    value above it in the same column (all columns are filled, not only the
    word column).

    Args:
        iter: Number of passes over each dictionary.
        *dfs: Dictionaries mapping a sheet name to a DataFrame; each one is
            updated in place.

    Returns:
        The first dictionary passed (``None`` when none is given), which is
        what the function has always returned.
    """
    for df in dfs:
        for _ in range(iter):
            for key in df.keys():
                frame = df[key].dropna(how='all')
                frame = frame.replace("", np.nan)
                frame = frame.infer_objects(copy=False)
                df[key] = frame.ffill(axis=0)
    # Return only after the loop: returning inside it stopped after the first
    # dictionary and left all the others unprocessed.
    return dfs[0] if dfs else None

# Two passes over the ORST dictionary; the sheets are updated in place, so the
# return value is not needed here.
fill_blank_words(2, orst)
# fill_blank_words(2, north, isan, south, orst)

inspect(10, orst)
# inspect(10, north, isan, south, orst)

## Output

In [None]:
# output

def df_to_csv(dfs, region, path):
    """Write every sheet of a dictionary to ``<path>/<region>_<sheet>.csv``.

    Args:
        dfs: Dict mapping a sheet name to the DataFrame to export.
        region: Prefix used in each output file name.
        path: Output directory; it is created when it does not exist yet.

    Files are written without the index and encoded as UTF-8 with a BOM
    (``utf-8-sig``).
    """
    if path:
        # to_csv refuses to write into a missing directory.
        os.makedirs(path, exist_ok=True)
    for sheet, df in dfs.items():
        file = os.path.join(path, f"{region}_{sheet}.csv")
        df.to_csv(file, index=False, encoding='utf-8-sig')

# Datasets to export, keyed by the prefix used in the output file names.
regions = {
    # 'north': north,
    # 'isan': isan,
    # 'south': south,
    'cen': orst
    }

# os.path.join keeps the output folder valid on every platform; a hard-coded
# backslash is a directory separator on Windows only.
output_dir = os.path.join("output", "csv_test2")

for region, dfs in regions.items():
    df_to_csv(dfs, region, output_dir)

# Test